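The input of this notebook is a dataset that has already been processed and labeled: each row carries a boolean `curtailment_event` label alongside the features used to model it.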

In [35]:
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm

from src.conf import settings

# Directory, under the project's configured DATA_DIR, that holds the processed
# training data.
TRAINING_DIR = settings.DATA_DIR / "processed/training/"

# Read the labeled dataset from Parquet into a DataFrame; every later cell
# works from this `data` frame.
data = pd.read_parquet(TRAINING_DIR / "0_labeled_data.parquet")

In [30]:
# Display the first rows as a quick check of the columns that were loaded.
data.head()

Unnamed: 0_level_0,timestamp,is_weekday,t,dswrf,SUNSD,curtailment_event,solar_curtailment,load
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2017-02-01 00:00:00-08:00,False,282.630219,204.382217,2130985.0,False,8480.663464,7404797.0
1,2017-02-02 00:00:00-08:00,False,282.44043,155.364441,1721099.0,False,288.956667,7367050.0
2,2017-02-03 00:00:00-08:00,False,283.654236,162.99556,1762346.0,False,869.203799,7272859.0
3,2017-02-04 00:00:00-08:00,True,283.166656,197.639999,1947685.0,True,69858.576657,6611103.0
4,2017-02-05 00:00:00-08:00,True,283.199097,174.813339,2019408.0,False,25.348,6661248.0


In [39]:
class ModelRun:
    """Fit a binomial GLM on a random train/test split and score its predictions.

    The data passed in is copied and split once, at construction time, into a
    held-out ``test`` frame and a ``training`` frame. ``run`` fits the model on
    the training rows, predicts the held-out rows and stores the confusion
    counts, accuracy and precision in ``performance``.

    Attributes:
        formula: Formula string handed to ``smf.glm``.
        threshold: Probability cutoff above which a row is predicted positive.
        test: Held-out rows sampled from ``data``.
        training: All rows of ``data`` whose index is not in ``test``.
        result: Fitted model results (set by ``run``).
        predictions: ``test`` rows with an added ``probability`` column
            (set by ``run``).
        performance: Dict of evaluation metrics (set by ``compute_performance``).
    """

    def __init__(self, formula, data, threshold=.8, test_fraction=.2, random_state=None):
        """Store the configuration and split ``data`` into training and test sets.

        Args:
            formula: Model formula; the response must be ``curtailment_event``
                for ``run`` to be able to score the predictions.
            data: DataFrame containing every column referenced by ``formula``.
            threshold: Probability cutoff for predicting a positive event.
            test_fraction: Share of rows held out for testing, strictly between
                0 and 1. The row count is truncated to an integer.
            random_state: Seed forwarded to ``DataFrame.sample``. The default
                ``None`` keeps the split random on every construction; pass an
                integer to make the split (and so the metrics) reproducible.

        Raises:
            ValueError: If ``test_fraction`` is not strictly between 0 and 1.
        """
        if not 0 < test_fraction < 1:
            raise ValueError("test_fraction must be strictly between 0 and 1")

        data = data.copy()
        self.formula = formula
        self.threshold = threshold
        self.test = data.sample(int(len(data) * test_fraction), random_state=random_state)
        # Everything that was not sampled into the test set is used for training.
        self.training = data[~data.index.isin(self.test.index)]

    def compute_performance(self, predictions):
        """Compute confusion counts, accuracy and precision for scored rows.

        A row is predicted positive when ``probability > threshold`` and
        negative when ``probability <= threshold``; rows with a missing
        probability fall into neither group. Empty confusion-matrix cells are
        reported as 0 rather than raising.

        Args:
            predictions: DataFrame with a ``probability`` column and a boolean
                ``curtailment_event`` column holding the true label.

        Side effects:
            Sets ``self.performance`` to a dict with the keys
            ``true_positives``, ``false_positives``, ``true_negatives``,
            ``false_negatives``, ``accuracy`` and ``precision``. ``precision``
            is NaN when nothing is predicted positive and ``accuracy`` is NaN
            when ``predictions`` is empty.
        """
        cutoff = self.threshold
        probability = predictions["probability"]
        actual = predictions["curtailment_event"]

        predicted_positive = probability > cutoff
        predicted_negative = probability <= cutoff
        is_event = actual.eq(True)
        is_non_event = actual.eq(False)

        # Boolean masks are summed instead of looking labels up in
        # value_counts(), so a cell with no rows yields 0 instead of a KeyError.
        true_positives = int((predicted_positive & is_event).sum())
        false_positives = int((predicted_positive & is_non_event).sum())
        true_negatives = int((predicted_negative & is_non_event).sum())
        false_negatives = int((predicted_negative & is_event).sum())

        total = len(predictions)
        flagged = true_positives + false_positives

        accuracy = (true_positives + true_negatives) / total if total else float("nan")
        # Precision is undefined when the model flags nothing as positive.
        precision = true_positives / flagged if flagged else float("nan")

        self.performance = {
            "true_positives": true_positives,
            "false_positives": false_positives,
            "true_negatives": true_negatives,
            "false_negatives": false_negatives,
            "accuracy": accuracy,
            "precision": precision
        }

    def run(self):
        """Fit the GLM on the training rows and evaluate it on the test rows.

        Side effects:
            Sets ``self.result`` (fitted model), ``self.predictions`` (test rows
            plus a ``probability`` column) and, through ``compute_performance``,
            ``self.performance``.
        """
        result = smf.glm(
            self.formula,
            self.training,
            family=sm.families.Binomial()
        ).fit()
        self.result = result

        # De-label our test data so the true label cannot leak into prediction.
        predictions = result.predict(
            self.test.drop(columns=["curtailment_event"])
        )
        predictions.name = "probability"

        # The raw prediction is treated as the probability of False: per the
        # original author, patsy reverses the categorical designations of the
        # C(curtailment_event) response. Invert it to get the probability of an
        # event. NOTE(review): this holds only while the response is written as
        # C(curtailment_event); confirm before changing the formula.
        predictions = 1 - predictions

        # Attach the probabilities to the labeled test rows by index so they
        # can be compared with the true labels.
        predictions = self.test.merge(predictions, left_index=True, right_index=True)

        self.predictions = predictions
        self.compute_performance(predictions)

In [65]:
# Model the curtailment label with month and weekday flag as categorical terms
# and load, t and dswrf as numeric terms; rows scoring above 0.8 are predicted
# as curtailment events.
# NOTE: no seed is passed for the train/test split, so the metrics shown below
# will differ from run to run.
model_run = ModelRun(
    formula="C(curtailment_event) ~ C(timestamp.dt.month) + C(is_weekday) + load + t + dswrf",
    data=data,
    threshold=0.8,
)
model_run.run()

# Last expression of the cell: display the evaluation metrics.
model_run.performance

{'true_positives': 2,
 'false_positives': 1,
 'true_negatives': 51,
 'false_negatives': 18,
 'accuracy': 0.7361111111111112,
 'precision': 0.6666666666666666}