## Imports

In [24]:
# imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

## Train Model

In [25]:
def train_model(inputFile):
    """Fit a random-forest classifier on one labelled CSV file.

    The file at ``inputFile`` is read with pandas and must contain the
    columns ``time``, ``Measure``, ``kWh`` and ``label``. ``time`` is parsed
    and replaced by ``year``/``month``/``day``/``hour``/``weekday`` columns,
    ``Measure`` and ``label`` are integer-encoded with ``LabelEncoder`` and
    ``kWh`` is standardised with ``StandardScaler``.

    The rows are split 80/20 with a fixed seed and only the 80% portion is
    used for fitting; the remaining 20% is discarded.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    frame = pd.read_csv(inputFile)

    # Expand the timestamp into calendar features. They are appended in this
    # exact order, which fixes the feature column order the model is fit on.
    stamps = pd.to_datetime(frame['time'])
    for part in ('year', 'month', 'day', 'hour', 'weekday'):
        frame[part] = getattr(stamps.dt, part)
    frame = frame.drop(columns='time')

    # NOTE(review): the encoders and the scaler are fitted here and then
    # thrown away, so evaluation code cannot reuse the same mappings/scale.
    frame['Measure'] = LabelEncoder().fit_transform(frame['Measure'])
    frame['kWh'] = StandardScaler().fit_transform(frame[['kWh']])
    frame['label'] = LabelEncoder().fit_transform(frame['label'])

    features = frame.drop(columns='label')
    target = frame['label']

    # Only the training part of the split is needed inside this function.
    X_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    return forest

## Evaluation

In [26]:
def eval_model(fileName, model=None):
    """Score a fitted classifier on one labelled CSV and print the result.

    The file ``'../data/supervised-dropped/' + fileName`` is read once,
    preprocessed the same way as the training data (calendar features from
    ``time``, integer-encoded ``Measure``, standardised ``kWh``) and passed
    to ``model.predict``. The accuracy against the integer-encoded ``label``
    column is printed, followed by the encoded labels and the predictions.

    Args:
        fileName: Name of the CSV file inside ``../data/supervised-dropped/``.
        model: Fitted estimator exposing ``predict``. When omitted, the
            module-level ``model`` variable is used, which keeps existing
            ``eval_model(fileName)`` calls working.

    Raises:
        NameError: If ``model`` is omitted and no global ``model`` exists.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-global model; passing
        # the model explicitly avoids relying on hidden kernel state.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    # NOTE(review): this reads from '../data/...' while the training cell in
    # this notebook uses '../data_new/...' paths -- confirm this is intended.
    test_data = pd.read_csv('../data/supervised-dropped/' + fileName)

    # Encode the ground-truth labels once, from the frame already in memory
    # (the file used to be read a second time just to obtain this column).
    # NOTE(review): the encoder is refit on this file, so the integer codes
    # only line up with the model's classes if both files contain the same
    # set of labels -- verify.
    y_encoded = LabelEncoder().fit_transform(test_data['label'])

    # Preprocess: calendar features are appended in the same order as in
    # training so the feature columns match what the model was fit on.
    timestamps = pd.to_datetime(test_data['time'])
    for part in ('year', 'month', 'day', 'hour', 'weekday'):
        test_data[part] = getattr(timestamps.dt, part)
    test_data['Measure'] = LabelEncoder().fit_transform(test_data['Measure'])
    test_data['kWh'] = StandardScaler().fit_transform(test_data[['kWh']])
    features = test_data.drop(['time', 'label'], axis=1)

    # Predict
    new_pred = model.predict(features)

    # Calculate the accuracy of the model on the new data
    new_accuracy = accuracy_score(y_encoded, new_pred)
    print("Accuracy on new data:", new_accuracy)
    print(y_encoded)
    print(new_pred)

In [27]:
def eval_holdout(model):
    """Score a fitted classifier on the holdout CSV and print the result.

    ``../data_new/holdout_v3.csv`` is read once, preprocessed the same way as
    the training data (calendar features from ``time``, integer-encoded
    ``Measure``, standardised ``kWh``) and passed to ``model.predict``. The
    accuracy against the integer-encoded ``label`` column is printed,
    followed by the encoded labels and the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
    """
    holdout_path = '../data_new/holdout_v3.csv'
    test_data = pd.read_csv(holdout_path)

    # Encode the ground-truth labels once, from the frame already in memory
    # (the file used to be read a second time just to obtain this column).
    # NOTE(review): the encoder is refit on the holdout file, so the integer
    # codes only line up with the model's classes if both files contain the
    # same set of labels -- verify.
    y_encoded = LabelEncoder().fit_transform(test_data['label'])

    # Preprocess: calendar features are appended in the same order as in
    # training so the feature columns match what the model was fit on.
    timestamps = pd.to_datetime(test_data['time'])
    for part in ('year', 'month', 'day', 'hour', 'weekday'):
        test_data[part] = getattr(timestamps.dt, part)
    test_data['Measure'] = LabelEncoder().fit_transform(test_data['Measure'])
    test_data['kWh'] = StandardScaler().fit_transform(test_data[['kWh']])
    features = test_data.drop(['time', 'label'], axis=1)

    # Predict
    new_pred = model.predict(features)

    # Calculate the accuracy of the model on the new data
    new_accuracy = accuracy_score(y_encoded, new_pred)
    print("Accuracy on new data:", new_accuracy)
    print(y_encoded)
    print(new_pred)

## Automated Evaluation

In [31]:
# model = train_model('../data_new/supervised-dropped/Sub - Feeder F02.csv')# ../data_new/supervised-dropped/BS4 - Main 4R.csv

In [32]:
# with open('../models/supervised_15m_rfc_model.pickle', 'wb') as f:
    # pickle.dump(model, f)

In [30]:
# Every labelled dataset takes one turn as the training source.
labeled_paths = [
    '../data_new/supervised-dropped/AB3 - Main 3L.csv',
    '../data_new/supervised-dropped/BS1 - Main 1L.csv',
    '../data_new/supervised-dropped/BS4 - Main 4R.csv',
    '../data_new/supervised-dropped/Sub - Feeder F02.csv',
    '../data_new/holdout_v3.csv',
]

for path in labeled_paths:
    print(f"Model: {path}")
    # Kept as a module-level name: eval_model() looks up `model` from the
    # notebook's global namespace rather than receiving it as an argument.
    model = train_model(path)

    # Score the freshly trained model on each feeder file, then on the holdout set.
    for feeder_file in (
        'AB3 - Main 3L.csv',
        'BS1 - Main 1L.csv',
        'BS4 - Main 4R.csv',
        'Sub - Feeder F02.csv',
    ):
        eval_model(feeder_file)
    eval_holdout(model)

Model: ../data_new/supervised-dropped/AB3 - Main 3L.csv
Accuracy on new data: 0.9466975666280417
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Accuracy on new data: 0.9495301079011487
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Accuracy on new data: 0.8893320039880359
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Accuracy on new data: 0.8429766171701656
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Accuracy on new data: 0.8656994047619048
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Model: ../data_new/supervised-dropped/BS1 - Main 1L.csv
Accuracy on new data: 0.7740440324449595
[0 0 0 ... 0 0 0]
[0 0 2 ... 0 0 0]
Accuracy on new data: 0.968673860076575
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Accuracy on new data: 0.8638085742771685
[0 0 0 ... 0 0 0]
[0 0 0 ... 1 1 0]
Accuracy on new data: 0.7441542925413893
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
Accuracy on new data: 0.8962053571428571
[0 0 0 ... 0 0 0]
[2 2 2 ... 0 0 0]
Model: ../data_new/supervised-dropped/BS4 - Main 4R.csv
Accuracy on new data: 0.8264194669756663
[0 0 0 ... 0 0 0]
[0 0