# Analysis with Random Forest class 1st shot
### Throwing in all potential imports - including the kitchen sink (needs clean up)

In [1]:
import os
import pandas as pd
import numpy as np
#import pydotplus

import plotly.graph_objs as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

from tqdm import tqdm
from pathlib import Path

from load_data import load_parse_save
#from scipy import stats
#import pydotplus
#import graphviz
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelBinarizer
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.datasets import make_imbalance
from collections import Counter
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def resample_split_data(df, resample_ratio=1.0, test_size=0.3, seed=None, debug=False):
    """
    Under-sample the dominant (non-failing) class, then split into train & test sets.

    Boosting alone does NOT quite handle the class imbalance, so the dominant
    class is cut down to a multiple of the minor (failing) class first.
    Only train & test sets are produced: data availability is too limited for a
    validation set, and that is not part of this exercise.

    Parameters
    ----------
    df : DataFrame holding a boolean 'Fails' column plus the 'count' and
        'm1'..'m9' max/mean/min feature columns.
    resample_ratio : number of dominant-class rows to keep, expressed as a
        multiple of the minor-class count (capped at what is available).
    test_size : fraction of the resampled rows assigned to the test set.
    seed : random state for both the under-sampling and the train/test split.
        When None, the under-sampling keeps its historical fixed state (42)
        and the split stays unseeded, exactly as before.
    debug : when True, print the class distribution before & after resampling.

    Returns
    -------
    X_train, X_test, y_train, y_test as returned by train_test_split.
    """
    feature_cols = ['count'] + [
        'm{}_{}'.format(idx, stat)
        for idx in range(1, 10)
        for stat in ('max', 'mean', 'min')
    ]
    y = df['Fails']
    X = df[feature_cols]

    minor_count = y.sum()
    # keep the smaller of: all available dominant rows, or the requested
    # multiple of the minor-class count
    dominant_resample_count = min(int(minor_count * resample_ratio),
                                  y.shape[0] - minor_count)
    X_res, y_res = make_imbalance(
        X, y,
        sampling_strategy={True: minor_count, False: dominant_resample_count},
        # previously hard-coded to 42; an explicit seed now takes precedence
        random_state=42 if seed is None else seed,
    )
    if debug:
        print('Distribution before imbalancing: {}'.format(Counter(y)))
        print('Distribution after  imbalancing: {}'.format(Counter(y_res)))
    # split into (1 - test_size) training data & test_size test data;
    # seed=None leaves the split random, a given seed makes it reproducible
    return train_test_split(X_res, y_res, test_size=test_size, random_state=seed)

def print_prediction_results(test_y, predict_y, train_y=None):
    """Print a quick summary of how well the model predicted the test set.

    F1 is the important figure here, as catching fails is what reduces
    downtime; plain accuracy is NOT as informative with imbalanced classes.

    Parameters
    ----------
    test_y : true labels of the test set.
    predict_y : labels predicted by the model for the test set.
    train_y : optional training labels, used only for the size/fail-count
        line. When omitted, the module-level ``y_train`` is used if it exists
        (the historical behaviour); if neither is available that line is
        skipped.
    """
    conf_mtx = confusion_matrix(test_y, predict_y)
    print("Result from confusion_matrix\n{}\n".format(conf_mtx))

    if train_y is None:
        # backward compatibility: earlier callers relied on a global y_train
        train_y = globals().get('y_train')
    if train_y is not None:
        print("Training data had {} data points & {} fails".format(train_y.shape[0], train_y.sum()))
    print("Testing  data had {} data points & {} fails".format(test_y.shape[0], test_y.sum()))

    # row 1 of the confusion matrix holds the actual fails; [1, 1] are the hits
    actual_fails = conf_mtx[1, :].sum()
    print("Out of {} Fails, predicted {} or {:,.1f}% correctly".format(
        actual_fails, conf_mtx[1, 1], 100 * conf_mtx[1, 1] / actual_fails))

    print("Accuracy: {:,.3f}%".format(100 * metrics.accuracy_score(test_y, predict_y)))
    # 'binary' must go to `average`; passed positionally it lands in `labels`
    print("*F1_score = {:,.1f}%".format(100 * f1_score(test_y, predict_y, average='binary')))

### Load data & mark any prior & fail metrics as Failed set 

In [3]:
# Load the parsed DataFrame: reuse the pickled copy when present (saves time),
# otherwise build it from the csv file via load_parse_save
pkl_file = './parsed_dataframe.pkl'
if os.path.isfile(pkl_file):
    # NOTE: unpickling can execute arbitrary code - only load a pickle that
    # this project wrote itself
    prev_df = pd.read_pickle(pkl_file)
else:
    tqdm.pandas(desc='load & parse csv')
    # pass pkl_file (not a second literal) so the path written here is the
    # same one the existence check above looks for
    prev_df = load_parse_save(save_file=pkl_file, debug=True)

### Add column Fail_set & mark it
This is a bit too close for comfort. 
Ideally, we would cut off data at least 7 days prior to Failure if possible, 
as one does NOT want to wait until last min. 
#### So, this will be an improvement for later
Only the data prev to fail is one group & other group is no Failure + post Failure
#### N.B.: This only works because there is only ONE failure for each device.
Must deal with multiple Fails on a device with a bit more sophistication

In [4]:
# Flag rows whose daysDelta is <= 0 (presumably the readings up to and
# including the failure date - confirm against load_parse_save).
# Vectorized comparison replaces the row-wise apply / `True if ... else False`.
prev_df['Fail_set'] = prev_df['daysDelta'] <= 0.0

# look at devices level & create group level Stat!
### Need to think of better stats The idea is that extreme values on the metrics are likely indicator of an issue

In [5]:
# devices = pd.DataFrame(prev_df.device.value_counts().reset_index())
    
def f(x):
    """Collapse the rows of one device group into a single Series of stats.

    Produces 'Fails' (any failure recorded in the group), 'count' (number of
    non-null dates) and the max / mean / min / std of metric1..metric9.
    """
    summary = {
        'Fails': bool(x['failure'].sum() > 0),
        'count': x['date'].count(),
    }
    # same key order as before: m<i>_max, m<i>_mean, m<i>_min, m<i>_std
    for idx in range(1, 10):
        metric = x['metric{}'.format(idx)]
        summary['m{}_max'.format(idx)] = metric.max()
        summary['m{}_mean'.format(idx)] = metric.mean()
        summary['m{}_min'.format(idx)] = metric.min()
        summary['m{}_std'.format(idx)] = metric.std()
    return pd.Series(summary)


# one row of summary statistics per device
ggrp = prev_df.groupby(['device'])
groupd_prev_stat = ggrp.apply(f)
#groupd_prev_stat.head()

### Split the data into Train & Test, then use Random Forest to Predict
#### Not sure RF is a good model as results are volatile

In [6]:
# Under-sample the non-failing devices to half the failing count, then split
X_train, X_test, y_train, y_test = resample_split_data(
    groupd_prev_stat,
    resample_ratio=0.5,
    debug=True,
)

# Build a 100-tree Random Forest and train it on the training split
# (fit() returns the fitted estimator itself)
RF_clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Predict the fail / no-fail label for every row of the test split
y_pred = RF_clf.predict(X_test)

Distribution before imbalancing: Counter({False: 1063, True: 106})
Distribution after  imbalancing: Counter({True: 106, False: 53})


## Accuracy of the model
#### N.B. General Accuracy is extremely misleading as imbalance class issue comes up
Only 20% ~ 50% of True fails & F1_score 25% ~ 55% achieved
Needs higher % on the True fails & F1 + Results needs to be less ***Volatile***
#### This is business decision as some business cannot tolerate any fails/downtime like electronic manufacturing.
In this case nearly all, if not all, potential fails must be investigated, even if a few dozen UnFails (false alarms) creep in for every REAL Failure.

In [7]:
# Print the confusion matrix, accuracy & F1 score of the Random Forest
# predictions (y_pred) against the true test labels
print_prediction_results(y_test, y_pred)

Result from confusion_matrix
[[10  5]
 [ 6 27]]

Training data had 111 data points & 73 fails
Testing  data had 48 data points & 33 fails
Out of 33 Fails, predicted 27 or 81.8% correctly
Accuracy: 77.083%
*F1_score = 83.1%


# Boosted Models
For imbalanced data usual techniques are:
- Random over & under Resampling (undersampling can disregard useful data; oversampling can overfit)
- Clustered over-sampling (outperforms undersampling, but often overfits)
- SMOTE (Synthetic Minority Over-sampling Technique): Really difficult with hi-dim data
- Bagging: Only works if the base classifiers' distribution is not extreme
- Various Boosting (Ada, Gradient, & XG): Ada Sensitive, XG fast & less prone to issues

In [8]:
# Fresh under-sampled train / test split for the boosted model
X_train, X_test, y_train, y_test = resample_split_data(
    groupd_prev_stat,
    resample_ratio=0.5,
    debug=True,
)

# Alternative kept for reference:
#boost_clf = AdaBoostClassifier( DecisionTreeClassifier(max_depth=1), n_estimators=200 )

# XGBoost with default settings, trained on the training split
# (fit() returns the fitted estimator itself)
boost_clf = XGBClassifier().fit(X_train, y_train)

# Predicted fail / no-fail label for every row of the test split
predictions = boost_clf.predict(X_test)

Distribution before imbalancing: Counter({False: 1063, True: 106})
Distribution after  imbalancing: Counter({True: 106, False: 53})


### *** Volatile F1 65~75% ***
- The results are better F1 score & hit ratio, but still very data random selection dependent.
- Likely results of using max/min from the metrics + needs better agg_metrics/factors
- The confusion matrix seems to indicate that model is still dominated by dominant classifier.
- Indicating that results may be better with either sampling or SMOTE

In [9]:
# Print the confusion matrix, accuracy & F1 score of the boosted model's
# predictions against the true test labels
print_prediction_results(y_test, predictions)

Result from confusion_matrix
[[10  9]
 [ 1 28]]

Training data had 111 data points & 77 fails
Testing  data had 48 data points & 29 fails
Out of 29 Fails, predicted 28 or 96.6% correctly
Accuracy: 79.167%
*F1_score = 84.8%
