# Analysis with Random Forest class 1st shot
### Throwing in all potential imports - including the kitchen sink (needs clean-up)

In [1]:
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import plotly.graph_objs as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

#from scipy import stats
#import pydotplus
#import graphviz

from imblearn.datasets import make_imbalance
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from load_data import load_parse_save
from hlp_analysis import resample_split_data, print_prediction_results

### Load data & mark any prior & fail metrics as Failed set 

In [2]:
# Load the parsed data set. A previously saved pickle is preferred because it
# saves time; otherwise the csv file is processed via load_parse_save.
pkl_file = './parsed_dataframe.pkl'
if os.path.isfile(pkl_file):
    # NOTE(security): unpickling can execute arbitrary code. Only load a cache
    # file that this project wrote itself.
    prev_df = pd.read_pickle(pkl_file)
else:
    # Registers tqdm's progress_* methods on pandas objects
    # (presumably used inside load_parse_save -- confirm in load_data).
    tqdm.pandas(desc='load & parse csv')
    # Reuse pkl_file so the path checked above and the path saved here
    # cannot drift apart.
    prev_df = load_parse_save(save_file=pkl_file, debug=True)

### Add column Fail_set & mark it
This is a bit too close for comfort. 
Ideally, we would cut off data at least 7 days prior to Failure if possible, 
as one does NOT want to wait until last min. 
#### So, this will be an improvement for later
Only the data prev to fail is one group & other group is no Failure + post Failure
#### N.B.: This only works because there is only ONE failure for each device.
Must deal with multiple Fails on a device with a bit more sophistication

In [3]:
# Mark rows whose daysDelta is <= 0 as the "Fail set" (per the notes above:
# the readings prior to a failure). Vectorized comparison instead of a
# row-wise apply; NaN compares False, exactly as the per-row lambda did.
prev_df['Fail_set'] = prev_df['daysDelta'] <= 0.0

# look at devices level & create group level Stat!
### Need to think of better stats. The idea is that extreme values on the metrics are likely indicators of an issue

In [4]:
def summarize_device(x):
    """Collapse all rows of one device into a single row of summary statistics.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows belonging to one group of ``prev_df.groupby(['device'])``.
        Must contain the columns ``failure``, ``date`` and ``metric1`` ..
        ``metric9``.

    Returns
    -------
    pandas.Series
        ``Fails`` (True when the summed ``failure`` column is > 0), ``count``
        (number of non-null ``date`` values), then for every metric N in 1..9
        the entries ``mN_max``, ``mN_mean``, ``mN_min`` and ``mN_std``, in
        that order.
    """
    stats = {
        'Fails': bool(x['failure'].sum() > 0),
        'count': x['date'].count(),
    }
    # One max/mean/min/std quadruple per metric column, metric1 .. metric9.
    for n in range(1, 10):
        metric = x['metric{}'.format(n)]
        stats['m{}_max'.format(n)] = metric.max()
        stats['m{}_mean'.format(n)] = metric.mean()
        stats['m{}_min'.format(n)] = metric.min()
        stats['m{}_std'.format(n)] = metric.std()
    return pd.Series(stats)


# Keep the original short name available for any cell that still refers to it.
f = summarize_device

ggrp = prev_df.groupby(['device'])
groupd_prev_stat = ggrp.apply(f)

### Split the data into Train & Test, then use Random Forest to Predict
#### Not sure RF is a good model as results are volatile

In [5]:
# Resample the per-device statistics and split them into train / test sets.
# NOTE(review): the captured outputs of two identical calls in this notebook
# report different fail counts, so this helper appears to draw a fresh random
# split on every call -- confirm in hlp_analysis whether it accepts a seed.
X_train, X_test, y_train, y_test = resample_split_data(groupd_prev_stat, resample_ratio=.5, debug=True)

# Pin the forest's own randomness (bootstrap samples and feature sub-sampling)
# so that, for a given split, re-running this cell gives the same model.
RF_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training set.
RF_clf.fit(X_train, y_train)

# Predict the labels of the held-out test set.
y_pred = RF_clf.predict(X_test)

Distribution before imbalancing: Counter({False: 1063, True: 106})
Distribution after  imbalancing: Counter({True: 106, False: 53})


## Accuracy of the model
#### N.B. General Accuracy is extremely misleading as imbalance class issue comes up
Only 20% ~ 50% of True fails & F1_score 25% ~ 55% achieved w/o imbalance adj
This climbs to around 80~90+% with adj
#### This is a business decision, as some businesses (e.g. electronics manufacturing) cannot tolerate any fails/downtime.
In this case all, or nearly all, potential fails must be investigated, even if a few dozen non-failures creep in for every REAL failure.

In [6]:
# Report how the Random Forest predictions compare with the held-out labels.
print_prediction_results(y_train, y_test, y_pred)

Result from confusion_matrix
[[ 8  6]
 [ 3 31]]

Training data had 111 data points & 72 fails
Testing  data had 48 data points & 34 fails
Out of 34 Fails, predicted 31 or 91.2% correctly
Accuracy: 81.250%
*F1_score = 87.3%


# Boosted Models
For imbalanced data usual techniques are:
- Random over & under Resampling (Either can disregard useful data w/ undersample or overfit w/ oversample)
- Clustered over-sampling (outperforms under-sampling, but often overfits)
- SMOTE (Synthetic Minority Over-sampling Technique): Really difficult with high-dimensional data
- Bagging: Only works if the base classifiers' distribution is not extreme
- Various Boosting (Ada, Gradient, & XG): Ada Sensitive, XG fast & less prone to issues

In [7]:
# Draw a new resampled train / test split for the boosted model.
X_train, X_test, y_train, y_test = resample_split_data(
    groupd_prev_stat,
    resample_ratio=0.5,
    debug=True,
)

# AdaBoost alternative, kept for comparison with XGBoost:
#boost_clf = AdaBoostClassifier( DecisionTreeClassifier(max_depth=1), n_estimators=200 )

# Fit a default-configured XGBoost classifier on the training split ...
boost_clf = XGBClassifier()
boost_clf.fit(X_train, y_train)

# ... and predict the labels of the test split.
predictions = boost_clf.predict(X_test)

Distribution before imbalancing: Counter({False: 1063, True: 106})
Distribution after  imbalancing: Counter({True: 106, False: 53})


### *** Volatile F1 65-75% wo/ imbalance adj BUT climbs to 85-90% w/ adj ***
- The results are better F1 score & hit ratio, but still very data random selection dependent.
- Likely results of using max/min from the metrics + needs better agg_metrics/factors
- The confusion matrix seems to indicate that the model is still dominated by the majority class.
- This indicates that results may be better with either resampling or SMOTE
- So, reduced resample ratio to 1/2 of the minor classification, which is a bit extreme
- Ada or XG boost do about 3+/-% better on F1 score vs RF
- Highest F1 was about 98%, which is unusually high, but not surprising given the small data set

In [8]:
# Report how the boosted model's predictions compare with the held-out labels
# (same hlp_analysis helper as used for the Random Forest run).
print_prediction_results(y_train, y_test, predictions)

Result from confusion_matrix
[[13  0]
 [ 6 29]]

Training data had 111 data points & 71 fails
Testing  data had 48 data points & 35 fails
Out of 35 Fails, predicted 29 or 82.9% correctly
Accuracy: 87.500%
*F1_score = 90.6%
