In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
# Load the data into a pandas dataframe
path = '../data_new/supervised-dropped/'
files=[]
df2 = pd.read_csv('../data_new/train_data_v3.csv')
fac_loc_list = df2.loc[:, 'Measure'].unique()

for fac_loc in fac_loc_list:
    cur_df = df2[df2['Measure'] == fac_loc]
    if len(cur_df['label'].unique()) > 1:
        # print('Current Measure: ' + fac_loc)
        files.append(fac_loc)
print(files)

# Load model
with open('../models/supervised_rfc_model.pickle', 'rb') as f:
    model = pickle.load(f)

['AB3 - Main 3L', 'Sub - Feeder F02', 'BS4 - Main 4R', 'BS1 - Main 1L']


In [18]:
# Train a logistic regression classifier on the labeled data
# clf = LogisticRegression(random_state=42)

for j in range (3):
    print(f"Training on: {files[j]}")
    df = pd.read_csv(path+files[j]+'.csv')
    
    # Split the data into a labeled set and an unlabeled set
    train_df, unlabeled_df = train_test_split(df, test_size=0.5, stratify=df['label']) # , random_state=42
    model.fit(train_df[['kWh']], train_df['label']) # clf.fit(train_df[['kWh']], train_df['label'])

    
    # Evaluate the classifier's accuracy on the labeled data
    train_preds = model.predict(train_df[['kWh']]) # train_preds = clf.predict(train_df[['kWh']])
    train_acc = accuracy_score(train_df['label'], train_preds)
    print('Training accuracy:', train_acc)
        
    # Use the classifier to predict labels for the unlabeled data
    unlabeled_preds = model.predict(unlabeled_df[['kWh']]) # unlabeled_preds = clf.predict(unlabeled_df[['kWh']])
    
    # Select the most confident predictions and add them to the labeled data
    max_confidence = np.max(model.predict_proba(unlabeled_df[['kWh']]), axis=1) # max_confidence = np.max(clf.predict_proba(unlabeled_df[['kWh']]), axis=1)
    new_labeled_df = unlabeled_df[max_confidence > 0.85].copy()
    new_labeled_df['label'] = unlabeled_preds[max_confidence > 0.85]
    prev_labeled_df = new_labeled_df.copy()

    for i in range(10):
        # Add the new labeled data to the training set
        train_df = pd.concat([train_df, new_labeled_df], axis=0)
    
        # Retrain the classifier on the updated labeled data
        model.fit(train_df[['kWh']], train_df['label']) # clf.fit(train_df[['kWh']], train_df['label'])
    
        # Evaluate the classifier's accuracy on the labeled data
        train_preds = model.predict(train_df[['kWh']]) # train_preds = clf.predict(train_df[['kWh']])
        train_acc = accuracy_score(train_df['label'], train_preds)
        print('Training accuracy:', train_acc)
    
        # Use the classifier to predict labels for the unlabeled data
        unlabeled_preds = model.predict(unlabeled_df[['kWh']]) # unlabeled_preds = clf.predict(unlabeled_df[['kWh']])
    
        # Select the most confident predictions and add them to the labeled data
        max_confidence = np.max(model.predict_proba(unlabeled_df[['kWh']]), axis=1) # max_confidence = np.max(clf.predict_proba(unlabeled_df[['kWh']]), axis=1)
        new_labeled_df = unlabeled_df[max_confidence > 0.95].copy()
        new_labeled_df['label'] = unlabeled_preds[max_confidence > 0.95]
    
        # Check for convergence by comparing the new labeled data to the previous iteration
        if new_labeled_df.equals(prev_labeled_df):
            break
        prev_labeled_df = new_labeled_df.copy()

Training on: AB3 - Main 3L
Training accuracy: 0.8538926413082119
Training accuracy: 0.9097034053460271
Training accuracy: 0.9315912117177098
Training accuracy: 0.9449528996830215
Training on: Sub - Feeder F02
Training accuracy: 0.7958904109589041
Training accuracy: 0.8620609847827345
Training accuracy: 0.8914439233186102
Training accuracy: 0.9105138695994895
Training accuracy: 0.9239140841923851
Training on: BS4 - Main 4R
Training accuracy: 0.8277861069465268
Training accuracy: 0.8894875248540824
Training accuracy: 0.9134953308565117
Training accuracy: 0.9292460578186597


In [19]:
def plot_eval(test_df, y_pred):
    test_df['predicted_label'] = model.predict(test_df[['kWh']])
    sns.scatterplot(x=test_df.index, y=test_df['kWh'], hue=y_pred)
    plt.title(f'Prediction: {files[3]}')
    plt.show()

In [20]:
test_df = pd.read_csv(path+files[3]+'.csv')  # load the test data

# Use the classifier to predict labels for the test data
y_pred = model.predict(test_df[['kWh']])
test_acc = accuracy_score(test_df['label'], y_pred)
print('Test accuracy:', test_acc)


# Use the classifier to predict labels for the test data
# plot_eval(test_df, y_pred)

Test accuracy: 0.7374316457929088
