### Prework

In [1]:
# !pip install boto3

**Import Packages**

In [2]:
import boto3
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import dotenv
import sys
import datetime
import pickle
import io
import warnings
from pulse import Pulse, get_metadata, get_median_pulse, distance_from_median, cumulative_distance_from_median # need pulse.py in same folder
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score

**Set up AWS Connection**

In [3]:
# Switch into the directory that contains the .env file.
# NOTE(review): this absolute path is machine-specific and must be edited
# before the notebook is run on another machine.
working_directory = '/Users/colinobrien/Desktop'
os.chdir(working_directory)

# Load .env, then pull the two AWS credential values out of the environment.
dotenv.load_dotenv()
access_key_id = os.environ.get('access_key_id')
access_key_secret = os.environ.get('access_key_secret')

# S3 resource handle used by the later S3 read/upload cells.
s3 = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=access_key_secret,
)

**Load Runs** - Still requires manual process, more troubleshooting needed

In [4]:
# Build one Pulse object per run CSV found under the capstone prefix in S3,
# keyed by the run's `str_time` metadata value.
runs = {}

for s3_object in s3.Bucket('sds-capstone-jlab').objects.filter(Prefix='RFQ_UVA_Capstone/RFQ_UVA_Capstone'):

    # Only CSV objects whose key contains 'Run' are loaded in this cell.
    if not (s3_object.key.endswith('.csv') and 'Run' in s3_object.key):
        continue

    run_meta = get_metadata(s3_object)  # metadata from filename
    # The first two lines of each file are skipped and the 'Signal Name' column is discarded.
    run_meta['data'] = pd.read_csv(s3_object.get()['Body'], skiprows=2).drop(columns='Signal Name')
    runs[run_meta['str_time']] = Pulse(
        run_meta['data'],
        run_meta['time'],
        run_meta.get('fault_type'),  # .get(): the key may be absent from the metadata
        run_meta['interval'],
        run_meta['result'],
    )

# Set pulses on every run. A plain loop replaces `_ = [... for ...]`: the list was
# never used, and assigning to `_` in IPython stops it from tracking the last output.
for run in runs.values():
    run.set_pulses()

# split into runs and faults, save to s3 as pickle


**Load Faults**

In [5]:
# Load the fault pulses from the consolidated pickle stored in S3.
# NOTE(review): pickle.loads can execute arbitrary code embedded in the payload;
# this is only safe while write access to this bucket/object is trusted.
faults = pickle.loads(s3.Bucket("sds-capstone-jlab").Object('consolidated-data/faults.pickle').get()['Body'].read())

### Run Logistic Regression

List of components

In [6]:
# Column names of the first loaded run's `.data`
# (assumes every pulse exposes the same columns — TODO confirm).
components = runs[list(runs)[0]].data.columns

Get median pulse

In [7]:
# Reference pulse computed from the runs only; `faults` is not passed in.
median_pulse = get_median_pulse(runs)

Put all pulses in single dict

In [8]:
# Merge both dicts into one; if a key appears in both, the `faults` entry wins.
all_pulses = {**runs, **faults}

In [9]:
# all_pulses = runs | faults

**Calculate and Normalize L2 Values**

In [10]:
# Distance of every pulse (runs and faults) from the median pulse, as computed by
# pulse.cumulative_distance_from_median. Later cells index the result by component
# column and read its `time` and `result` columns.
L2_values = cumulative_distance_from_median(all_pulses, median_pulse)

In [11]:
# Express each component's L2 value as its relative deviation from the column
# mean, (x - mean) / mean, then discard every column that contains a NaN.
normalized = (
    L2_values[components]
    .pipe(lambda l2: (l2 - l2.mean()) / l2.mean())
    .dropna(axis='columns')
)

# From here on `components` lists only the columns that survived the NaN filter.
components = normalized.columns

# Carry the timestamp and the Run/Fault label over from the raw table.
normalized = normalized.assign(time=L2_values.time, result=L2_values.result)

**Save Normalized DF to AWS**

In [12]:
# Write the normalized table (without the index) to the current working
# directory, which was set with os.chdir in the AWS set-up cell.
normalized.to_csv('normalized-l2s.csv', index=False)

In [13]:
# Push the local CSV to the shared bucket under the consolidated-data prefix.
bucket = 'sds-capstone-jlab'
filename = 'consolidated-data/normalized-l2s.csv'  # destination key inside the bucket
s3.meta.client.upload_file(
    Filename='normalized-l2s.csv',
    Bucket=bucket,
    Key=filename,
)

########################################################################################################################################################################

# Start of New Code

########################################################################################################################################################################

In [113]:
# Start the tree-model data from the un-normalized L2 table.
# Note: this binds a second name to the same DataFrame; it is not a copy.
tree_data = L2_values

In [19]:
# Remove the `time` column so it is not used as a model feature.
# Derived from L2_values rather than from tree_data itself, so re-running this
# cell does not raise KeyError on an already-dropped column.
tree_data = L2_values.drop(columns='time')

In [20]:
# Display the feature table; the last column, `result`, is the Run/Fault label.
tree_data

Unnamed: 0,+ CB I,+ CB V,+ DC I,- CB I,- CB V,- DC I,A FLUX,A+ IGBT I,A+* IGBT I,AC L1 I,...,DTL1 Kly,DTL2 Kly,DV/DT,Gate In,MOD I,MOD V,Spare 2,Spare 3,Spare 4,result
0,3.260522e+08,4.414723e+05,2.234895e+06,3.575379e+08,1.644504e+06,2.205337e+06,454.490234,8.019751e+06,8.494054e+06,1.684538e+08,...,0.208982,0.668123,239.492860,0.000,8.070443e+04,1.178059e+02,14899.120169,15111.673495,14901.789914,Run
1,3.495603e+08,1.034532e+06,1.176299e+06,3.824735e+08,2.870456e+06,1.158547e+06,1070.534595,7.747017e+06,8.020463e+06,7.538326e+08,...,0.101715,0.288959,331.380211,0.000,5.651510e+04,1.296821e+02,15036.352122,14488.784709,14465.202849,Run
2,3.659673e+08,1.465824e+06,2.433918e+06,3.958175e+08,3.640460e+06,2.404235e+06,485.939708,5.917347e+06,6.364391e+06,6.579064e+08,...,0.136289,0.376288,236.298105,99.992,5.263326e+04,8.018332e+01,14450.629534,14456.036356,14642.480445,Run
3,3.572112e+08,5.559499e+05,2.182731e+06,3.925506e+08,1.576345e+06,2.166040e+06,312.466933,5.468347e+06,5.964451e+06,1.617844e+08,...,0.235908,0.532662,195.425854,99.992,4.884831e+04,7.498063e+01,15168.901182,15526.131352,14532.064170,Run
4,3.500073e+08,1.200225e+06,1.329651e+06,3.827001e+08,3.051042e+06,1.298665e+06,659.401696,6.090040e+06,6.401805e+06,7.446594e+08,...,0.119968,0.325712,240.693028,99.992,4.939026e+04,8.821072e+01,14975.093375,14500.746911,14368.278449,Run
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,3.521858e+08,4.589227e+06,6.832627e+08,3.902238e+08,5.449679e+06,6.801564e+08,5947.889486,1.695223e+07,1.763020e+07,2.251152e+05,...,0.547872,1.409775,497.903485,99.992,1.525644e+05,2.042325e+04,15292.627185,15382.943750,14649.886870,Fault
130,3.641413e+08,1.265861e+06,1.912246e+06,4.187409e+08,7.441878e+05,1.870462e+06,1564.019479,1.278216e+07,1.318180e+07,5.593693e+08,...,0.293938,0.142370,386.213082,99.992,1.420228e+05,3.679087e+02,14596.689671,15432.107027,14411.406034,Fault
131,3.251070e+08,8.901428e+05,1.278806e+06,3.714151e+08,4.955778e+05,1.253083e+06,2755.122506,1.390919e+07,1.451057e+07,2.376683e+06,...,0.159007,0.191024,496.480021,99.992,1.224144e+05,3.302045e+02,16268.850009,16466.225471,15754.281522,Fault
132,3.478586e+08,1.358071e+06,2.281253e+06,3.977681e+08,7.380655e+05,2.248938e+06,684.301095,1.500071e+07,1.586496e+07,3.733042e+08,...,0.240195,0.222741,238.284291,99.992,1.462043e+05,2.057235e+02,14711.476293,15034.213416,14599.187316,Fault


In [46]:
## pulling in libraries
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
from sklearn.model_selection import train_test_split

In [22]:
# Shorter alias for the feature table (same object as tree_data, not a copy).
df = tree_data

In [23]:
# Separate the predictors from the Run/Fault response column.
df_x = df.drop(columns='result')
df_y = df['result']

## Doing a Test / Training split 

In [112]:
# Hyper-parameter grid for the gradient-boosting search.
learning_rate_list = np.arange(0.01, 0.2, 0.01)  # 0.01 up to, but not including, 0.20
max_depth_list = np.arange(1, 6)                 # depths 1 through 5
n_estimators_list = np.arange(150, 200, 10)      # 150 through 190; kept narrow because the search is slow
# subsample_list = maybe do one for this?

# Cross-validation scheme shared by every candidate:
# 10 stratified folds repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

In [47]:
# Hold out 25% of the rows as the test set; random_state makes the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so the Run/Fault ratio is not
# forced to match between the train and test splits.
X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, test_size=0.25, random_state=2021)

In [52]:
# Exhaustive grid search over learning rate, tree depth and number of trees.
# Every candidate is scored on the training split only, by accuracy under the
# repeated stratified CV scheme held in `cv`.
global_df_col_names = ['Learning_Rate', 'Max_Depth', 'Number_Of_Tress', 'mean_score', 'std_of_scores']

# One record per candidate, turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call, so growing the frame inside the loop is avoided.
grid_records = []

for rate in learning_rate_list:
    for depth in max_depth_list:
        for n_trees in n_estimators_list:
            # Candidate model. random_state pins the estimator's internal randomness
            # so that re-running the search reproduces the same scores (same seed
            # value as the train/test split).
            local_model = GradientBoostingClassifier(
                n_estimators=n_trees,
                learning_rate=rate,
                max_depth=depth,
                random_state=2021,
            )

            # 10-fold CV repeated 3 separate times -> 30 accuracy scores for this candidate.
            local_n_scores = cross_val_score(local_model, X_train, Y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

            # Mean and standard deviation across the 30 splits.
            local_mean_score = np.mean(local_n_scores)
            local_std = np.std(local_n_scores)

            grid_records.append({'Learning_Rate': rate,
                                 'Max_Depth': depth,
                                 'Number_Of_Tress': n_trees,
                                 'mean_score': local_mean_score,
                                 'std_of_scores': local_std})

# `columns=` fixes the column order and still yields the expected (empty) schema
# if a grid list is empty.
training_global_df = pd.DataFrame(grid_records, columns=global_df_col_names)

# training_global_df

Unnamed: 0,Learning_Rate,Max_Depth,Number_Of_Tress,mean_score,std_of_scores
0,0.01,1,150,0.723333,0.105462
1,0.01,1,160,0.723333,0.105462
2,0.01,1,170,0.723333,0.105462
3,0.01,1,180,0.726667,0.109341
4,0.01,1,190,0.726667,0.109341
...,...,...,...,...,...
470,0.19,5,150,0.750000,0.150000
471,0.19,5,160,0.760000,0.140475
472,0.19,5,170,0.733333,0.155635
473,0.19,5,180,0.760000,0.135647


In [106]:
# Rank the candidates from highest to lowest mean CV accuracy and show the top row.
training_global_df = training_global_df.sort_values(by='mean_score', ascending=False)
best_tuning_parameters_train = training_global_df.iloc[0]
best_tuning_parameters_train

Learning_Rate          0.18
Max_Depth                 2
Number_Of_Tress         180
mean_score             0.82
std_of_scores      0.130128
Name: 433, dtype: object

#### Above are the parameters for the model with the highest mean accuracy. I am concerned about the standard deviation: although a mean accuracy of 82% across the 30 CV scores for those parameters (0.18 learning rate, depth 2, 180 trees) is pretty good, the distribution is wide — a score just one standard deviation below the mean is already under 70% (0.82 − 0.13 ≈ 0.69).

## Now trying with test

In [104]:
# Refit on the whole training split with the winning grid-search row, read from
# `best_tuning_parameters_train` instead of re-typed by hand, so this cell cannot
# drift from the search result.
# The casts are needed because a row taken out of the results frame may carry the
# integer parameters as floats or generic objects.
# random_state makes the fitted model (and its test score) reproducible.
model_best_parameters = GradientBoostingClassifier(
    n_estimators=int(best_tuning_parameters_train['Number_Of_Tress']),
    learning_rate=float(best_tuning_parameters_train['Learning_Rate']),
    max_depth=int(best_tuning_parameters_train['Max_Depth']),
    random_state=2021,
).fit(X_train, Y_train)

In [107]:
# Accuracy of the refit model on the held-out test split.
model_best_parameters.score(X_test, Y_test)  # Returns the mean accuracy on the given test data and labels.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

0.8823529411764706

#### Test accuracy is pretty high (about 88%), but again, given the high standard deviation in cross-validation, I am not convinced this model would perform as well on brand-new data.

In [115]:
# Line up each test-set prediction with its true label, one row per observation.
results = list(model_best_parameters.predict(X_test))
true_values = list(Y_test)
df_of_test_results = pd.DataFrame({'prediction': results, 'actual': true_values})

def True_Positive(row):
    """Return 1 if the row was predicted 'Run' and actually is 'Run', else 0.

    `row` is anything indexable by the keys 'prediction' and 'actual'
    (e.g. one row of the test-results frame). 'Run' is the positive class.
    """
    is_match = row['prediction'] == 'Run' and row['actual'] == 'Run'
    return 1 if is_match else 0
def True_Negative(row):
    """Return 1 if the row was predicted 'Fault' and actually is 'Fault', else 0.

    `row` is anything indexable by the keys 'prediction' and 'actual'.
    'Fault' is the negative class.
    """
    is_match = row['prediction'] == 'Fault' and row['actual'] == 'Fault'
    return 1 if is_match else 0
def False_Positive(row):
    """Return 1 if the row was predicted 'Run' but actually is 'Fault', else 0.

    `row` is anything indexable by the keys 'prediction' and 'actual'.
    """
    is_miss = row['prediction'] == 'Run' and row['actual'] == 'Fault'
    return 1 if is_miss else 0
def False_Negative(row):
    """Return 1 if the row was predicted 'Fault' but actually is 'Run', else 0.

    `row` is anything indexable by the keys 'prediction' and 'actual'.
    """
    is_miss = row['prediction'] == 'Fault' and row['actual'] == 'Run'
    return 1 if is_miss else 0

# Flag every test observation with the confusion-matrix cell it belongs to
# (one 0/1 column per cell).
for outcome_label, outcome_flagger in (
    ('True Positive', True_Positive),
    ('True Negative', True_Negative),
    ('False Positive', False_Positive),
    ('False Negative', False_Negative),
):
    df_of_test_results[outcome_label] = df_of_test_results.apply(outcome_flagger, axis=1)

# Share of ALL test observations that falls in each cell.
# Note: these are not conditional rates such as FP / (FP + TN); because they share
# one denominator, true positive + true negative equals overall accuracy.
n_test_rows = len(df_of_test_results)
true_positive_rate = sum(df_of_test_results['True Positive']) / n_test_rows
true_negative_rate = sum(df_of_test_results['True Negative']) / n_test_rows
false_positive_rate = sum(df_of_test_results['False Positive']) / n_test_rows
false_negative_rate = sum(df_of_test_results['False Negative']) / n_test_rows

summary_table = pd.DataFrame(
    {
        'true positive': true_positive_rate,
        'true negative': true_negative_rate,
        'false positive': false_positive_rate,
        'false negative': false_negative_rate,
    },
    index=[0],
)
summary_table['accuracy'] = summary_table['true positive'] + summary_table['true negative']

In [116]:
# Show the one-row summary: the four confusion-matrix shares plus accuracy.
summary_table

Unnamed: 0,true positive,true negative,false positive,false negative,accuracy
0,0.588235,0.294118,0.058824,0.058824,0.882353


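#### False positives (faults predicted as "Run") make up around 6% of all test observations. Note that this is a share of the whole test set, not the conventional false positive rate FP / (FP + TN), which would be about 17% here (0.059 / (0.059 + 0.294)). Potential next steps would be implementing a cost for false positives. Also, seeing whether other boosting methods or different parameters lower the std would be something to explore.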