This is the final compilation of the logistic models produced through manual parameter tuning. Very few actual features are involved here!

In [659]:
"""

Inbound Data:

timestamp                 datetime64[ns, US/Pacific]
solar_curtailment                            float64
solar                                        float64
net_load                                     float64
load                                         float64
generation                                   float64
renewables                                   float64
wind_curtailment                             float64
is_weekday                                      bool
t_mean                                       float32
t_wmean                                      float32
t_wmax                                       float32
t_wmin                                       float32
t_absmax                                     float32
t_absmin                                     float32
dswrf_mean                                   float32
dswrf_absmax                                 float32
dswrf_wmean                                  float32
capacity_mw                                  float64
installed_capacity                           float64
solar_capacity_factor                        float64
curtailment_event_0.01                          bool
curtailment_event_0.05                          bool
curtailment_event_0.10                          bool

dtype: object

"""
import altair as alt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

from loguru import logger
from src.conf import settings


# Folder (under settings.DATA_DIR) holding the processed training parquet files
# that the cells below read.
INPUT_DIR = settings.DATA_DIR / "processed/training/"


def load_driven_model(df, y, random_state=None):
    """Fit a no-intercept logistic regression on load and calendar features.

    The classifier sees the standardized ``load`` column plus one-hot encoded
    ``is_weekday`` and ``month`` (month number derived from ``timestamp``).

    NOTE: a second function with this same name is defined later in this
    file; once both definitions have run, the later one is the one called.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``timestamp`` (datetime-like), ``load`` and
        ``is_weekday``. The frame is not modified.
    y : array-like
        Target labels aligned row-for-row with ``df``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded (non-reproducible) shuffle.

    Returns
    -------
    tuple
        ``(clf, X_test, y_test)``: the fitted pipeline and the held-out 20%
        of the features and labels.
    """
    # Derive the month on a copy so the caller's frame is left untouched.
    features = df.assign(month=df["timestamp"].dt.month)

    numeric = ["load"]
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler())
        ]
    )
    categoricals = ["is_weekday", "month"]
    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric),
            ("cat", categorical_transformer, categoricals)
        ]
    )

    X = features[numeric + categoricals].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # fit_intercept is an estimator hyper-parameter, so it is set on the
    # LogisticRegression constructor. Pipeline.fit only accepts fit parameters
    # in the "<step>__<param>" form and rejects a bare fit_intercept=... keyword.
    clf = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LogisticRegression(fit_intercept=False)),
        ]
    )
    clf.fit(X_train, y_train)
    return clf, X_test, y_test


def load_driven_model(df, y, random_state=None):
    """Fit a logistic regression on load and calendar features only.

    The classifier sees the standardized ``load`` column plus one-hot encoded
    ``is_weekday`` and ``month`` (month number derived from ``timestamp``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``timestamp`` (datetime-like), ``load`` and
        ``is_weekday``. The frame is not modified.
    y : array-like
        Target labels aligned row-for-row with ``df``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded (non-reproducible) shuffle.

    Returns
    -------
    tuple
        ``(clf, X_test, y_test)``: the fitted pipeline and the held-out 20%
        of the features and labels.
    """
    # Derive the month on a copy so the caller's frame is left untouched
    # (previously a "month" column was written into the caller's DataFrame).
    features = df.assign(month=df["timestamp"].dt.month)

    numeric = ["load"]
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler())
        ]
    )
    categoricals = ["is_weekday", "month"]
    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric),
            ("cat", categorical_transformer, categoricals)
        ]
    )

    X = features[numeric + categoricals].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    clf = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LogisticRegression()),
        ]
    )
    clf.fit(X_train, y_train)
    return clf, X_test, y_test


if __name__ == "__main__":

    # Step 1: read data!
    data = pd.read_parquet(INPUT_DIR / "1_labeled_curtailment_events.parquet")

    # Step 2: select the label columns named curtailment_event_<d.dd>. The dot
    # is escaped so it matches a literal "." only, not any character.
    cutoff_cols = data.columns[data.columns.str.match(r"curtailment_event_\d\.\d\d")]

    # Step 3: fit one load-driven classifier per label column and keep both
    # the model and its predictions on the held-out rows.
    predictions = {}
    classifiers = {}
    for col in cutoff_cols:
        y = data[col]
        clf, X_test, y_test = load_driven_model(data, y)
        # One probability column per class, named after str(class)
        # (e.g. "False"/"True" for boolean labels), plus the actual label.
        p = pd.DataFrame(
            clf.predict_proba(X_test),
            columns=list(map(str, clf.classes_)),
        ).assign(true_value=y_test.values)
        predictions[col] = p
        classifiers[col] = clf

In [660]:
# One bar histogram per target column: the "True" probability column in
# 0.05-wide bins, coloured by the label stored in true_value.
charts = [
    alt.Chart(frame, title=f"{name}")
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        alt.X("True", bin=alt.Bin(step=.05)),
        alt.Y("count(True)"),
        alt.Color("true_value"),
    )
    for name, frame in predictions.items()
]
alt.hconcat(*charts)

In [535]:

def weather_driven_model_a(df, y, random_state=None):
    """Fit a logistic regression on load, two weather columns and calendar features.

    Numeric inputs (standardized): ``load``, ``t_mean`` and ``dswrf_mean``.
    Categorical inputs (one-hot encoded): ``is_weekday`` and ``month``
    (month number derived from ``timestamp``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``timestamp`` (datetime-like) and the columns listed
        above. The frame is not modified.
    y : array-like
        Target labels aligned row-for-row with ``df``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded (non-reproducible) shuffle.

    Returns
    -------
    tuple
        ``(clf, X_test, y_test)``: the fitted pipeline and the held-out 20%
        of the features and labels.
    """
    # Derive the month on a copy so the caller's frame is left untouched
    # (previously a "month" column was written into the caller's DataFrame).
    features = df.assign(month=df["timestamp"].dt.month)

    numeric = ["load", "t_mean", "dswrf_mean"]
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler())
        ]
    )
    categoricals = ["is_weekday", "month"]
    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric),
            ("cat", categorical_transformer, categoricals)
        ]
    )

    X = features[numeric + categoricals].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    clf = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LogisticRegression()),
        ]
    )
    clf.fit(X_train, y_train)
    return clf, X_test, y_test


if __name__ == "__main__":

    # Step 1: read data!
    data = pd.read_parquet(INPUT_DIR / "1_labeled_curtailment_events.parquet")

    # Step 2: select the label columns named curtailment_event_<d.dd>. The dot
    # is escaped so it matches a literal "." only, not any character.
    cutoff_cols = data.columns[data.columns.str.match(r"curtailment_event_\d\.\d\d")]

    # Step 3: fit one weather-driven (variant a) classifier per label column
    # and keep both the model and its predictions on the held-out rows.
    predictions = {}
    classifiers = {}
    for col in cutoff_cols:
        y = data[col]
        clf, X_test, y_test = weather_driven_model_a(data, y)
        # One probability column per class, named after str(class)
        # (e.g. "False"/"True" for boolean labels), plus the actual label.
        p = pd.DataFrame(
            clf.predict_proba(X_test),
            columns=list(map(str, clf.classes_)),
        ).assign(true_value=y_test.values)
        predictions[col] = p
        classifiers[col] = clf


In [536]:
# One bar histogram per target column: the "True" probability column with
# automatic binning, coloured by the label stored in true_value.
charts = [
    alt.Chart(frame, title=f"{name}")
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        alt.X("True", bin=True),
        alt.Y("count(True)"),
        alt.Color("true_value"),
    )
    for name, frame in predictions.items()
]
alt.hconcat(*charts)

In [654]:
from sklearn import svm

def weather_driven_model_b(df, y, random_state=None):
    """Fit an SVC (with probability estimates) on load, weather and calendar features.

    Numeric inputs (standardized): ``load``, ``t_mean``, ``t_wmin``,
    ``t_wmax``, ``dswrf_wmean`` and ``dswrf_absmax``.
    Categorical inputs (one-hot encoded): ``is_weekday`` and ``month``
    (month number derived from ``timestamp``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``timestamp`` (datetime-like) and the columns listed
        above. The frame is not modified.
    y : array-like
        Target labels aligned row-for-row with ``df``.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` and ``svm.SVC`` (whose
        probability estimates involve randomised shuffling). ``None`` keeps
        the original unseeded behavior.

    Returns
    -------
    tuple
        ``(clf, X_test, y_test)``: the fitted pipeline and the held-out 20%
        of the features and labels.
    """
    # Derive the month on a copy so the caller's frame is left untouched
    # (previously a "month" column was written into the caller's DataFrame).
    features = df.assign(month=df["timestamp"].dt.month)

    numeric = ["load", "t_mean", "t_wmin", "t_wmax", "dswrf_wmean", "dswrf_absmax"]
    numeric_transformer = Pipeline(
        steps=[
            ("scaler", StandardScaler())
        ]
    )
    categoricals = ["is_weekday", "month"]
    categorical_transformer = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric),
            ("cat", categorical_transformer, categoricals)
        ]
    )

    X = features[numeric + categoricals].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # probability=True is required so the fitted pipeline exposes predict_proba.
    clf = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", svm.SVC(probability=True, random_state=random_state)),
        ]
    )
    clf.fit(X_train, y_train)
    return clf, X_test, y_test



if __name__ == "__main__":

    # Step 1: read data!
    data = pd.read_parquet(INPUT_DIR / "1_labeled_curtailment_events.parquet")

    # Step 2: select the label columns named curtailment_event_<d.dd>. The dot
    # is escaped so it matches a literal "." only, not any character.
    cutoff_cols = data.columns[data.columns.str.match(r"curtailment_event_\d\.\d\d")]

    # Step 3: fit one weather-driven (variant b) classifier per label column
    # and keep both the model and its predictions on the held-out rows.
    predictions = {}
    classifiers = {}
    for col in cutoff_cols:
        y = data[col]
        clf, X_test, y_test = weather_driven_model_b(data, y)
        # One probability column per class, named after str(class)
        # (e.g. "False"/"True" for boolean labels), plus the actual label.
        p = pd.DataFrame(
            clf.predict_proba(X_test),
            columns=list(map(str, clf.classes_)),
        ).assign(true_value=y_test.values)
        predictions[col] = p
        classifiers[col] = clf

In [655]:
# One bar histogram per target column: the "True" probability column in
# 0.05-wide bins, coloured by the label stored in true_value.
charts = [
    alt.Chart(frame, title=f"{name}")
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        alt.X("True", bin=alt.Bin(step=.05)),
        alt.Y("count(True)"),
        alt.Color("true_value"),
    )
    for name, frame in predictions.items()
]
alt.hconcat(*charts)

In [505]:
# Label balance among the held-out rows stored for the curtailment_event_0.01 target.
predictions["curtailment_event_0.01"].loc[:, "true_value"].value_counts()

True     49
False    22
Name: true_value, dtype: int64

In [474]:
# Add two derived ratios to a copy of the data, then chart the monthly mean of
# solar_curtailment / solar. The per-installed-capacity ratio is attached to
# the charted frame but not encoded on any axis.
curtailment_ratios = data.assign(
    pct_curtailed=data.eval("solar_curtailment/solar"),
    curtailed_per_mw_installed=data.eval("solar_curtailment/installed_capacity"),
)
alt.Chart(curtailment_ratios).mark_bar().encode(
    x=alt.X("yearmonth(timestamp)"),
    y=alt.Y("mean(pct_curtailed)"),
)

In [466]:
# Show the timestamp alongside the raw columns that feed the curtailment ratios.
curtailment_inputs = ["timestamp", "solar_curtailment", "solar", "installed_capacity"]
data.loc[:, curtailment_inputs]

Unnamed: 0,timestamp,solar_curtailment,solar,installed_capacity
0,2017-02-01 00:00:00-08:00,706.721955,53045.729532,10192.63
1,2017-02-02 00:00:00-08:00,24.079722,41689.418186,10192.63
2,2017-02-03 00:00:00-08:00,72.433650,44321.327353,10192.63
3,2017-02-04 00:00:00-08:00,5821.548055,53032.304312,10192.63
4,2017-02-05 00:00:00-08:00,2.112333,33350.929800,10192.63
...,...,...,...,...
349,2019-05-27 00:00:00-07:00,37029.533524,64339.707821,12338.86
350,2019-05-28 00:00:00-07:00,5296.214229,99557.727339,12338.86
351,2019-05-29 00:00:00-07:00,6063.860798,103321.775049,12338.86
352,2019-05-30 00:00:00-07:00,638.454621,107952.109967,12338.86
