### Based on Steps 1-5. Only doing feature selection among a2, a7 and a4 derived features

In [2]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE

from sklearn.linear_model import RandomizedLasso

%matplotlib inline
cmap_bold = ListedColormap(['#00FF00','#FF0000'])

In [4]:
sys.path.append('../utils')

In [5]:
import DataAggregation as da
import AlgoUtils as au

In [6]:
algos_dd = {
    "LogisticRegression": {"C": 1e9},
    "LogisticRegressionB": {"C": 1e9, "class_weight":'balanced'},
    "KNeighborsClassifier": {"n_neighbors": 7},
    "LinearDiscriminantAnalysis": {},
    "QuadraticDiscriminantAnalysis": {},
    "SVC": {}
}

fcols = ["d_mean:d_std:d_max:l_range",
         "d_mean:d_std:l_range",
         "d_std:l_range",
         "l_range",
         "d_std",
         "d_max"]
algos_str = ["LogisticRegression", 
             "LogisticRegressionB", 
             "KNeighborsClassifier",
             "LinearDiscriminantAnalysis",
             "QuadraticDiscriminantAnalysis"]

In [7]:
a2 = da.GetFrames("../data/device_failure.csv", "a2")
a7 = da.GetFrames("../data/device_failure.csv", "a7")
a4 = da.GetFrames("../data/device_failure.csv", "a4", ldays=-30, lday_strict=False)
tdf = a2.df_sfeature.drop("failure", axis=1).join(a7.df_sfeature.drop("failure", axis=1)).join(a4.df_sfeature)

In [8]:
tdf.head()

Unnamed: 0,a2d_max,a2d_mean,a2d_std,a2l_range,a7d_max,a7d_mean,a7d_std,a7l_range,a4d_max,a4d_mean,a4d_std,a4l_range,failure
S1F01085,0,0,0,0,0,0,0,0,0,0,0,0,0
S1F013BB,0,0,0,0,0,0,0,0,0,0,0,0,0
S1F0166B,0,0,0,0,0,0,0,0,0,0,0,0,0
S1F01E6Y,0,0,0,0,0,0,0,0,0,0,0,0,0
S1F01JE0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Baseline of what I think will work the best

In [10]:
algo_str = "QuadraticDiscriminantAnalysis"
scols = ["a2d_std", "a7d_std","a7l_range","a7d_mean", "a7d_max"]
analysisdf = au.do_clf_validate_new(tdf, algo_str,algos_dd[algo_str], scols, "failure")

Cross-val-score(roc_auc) = 0.72
Cross-val-score(accuracy) = 0.94
Cross-val-score(recall)   = 0.42
Cross-val-score(precision)= 0.80
Cross-val-score(f1)       = 0.80


### Baseline with all features

In [11]:
algo_str = "QuadraticDiscriminantAnalysis"
scols = tdf.columns[:-1]
analysisdf = au.do_clf_validate_new(tdf, algo_str,algos_dd[algo_str], scols, "failure")

Cross-val-score(roc_auc) = 0.78
Cross-val-score(accuracy) = 0.93
Cross-val-score(recall)   = 0.56
Cross-val-score(precision)= 0.65
Cross-val-score(f1)       = 0.65


### Recall is definitely much better with a4 derived Features!
### Loss in precision might be worth it if we can do hypothesis testing from field data
### Let's see if we can get similar or better performance with lesser features

### Variance Threshold based Feature selection

In [16]:
X = tdf[tdf.columns[:-1]]
y = tdf["failure"]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_new = sel.fit_transform(X)
sel.get_support()
cnt = 0
allcols = tdf.columns[:-1]
for decision in sel.get_support():
    if decision == True:
        print allcols[cnt]
    cnt = cnt + 1

a2l_range
a7l_range
a4d_max
a4l_range


In [17]:
#Adding a2l_range, a4d_max and a4l_range based on above analysis
scols = ["a2l_range", "a2d_std", "a7d_std","a7l_range","a4d_max", "a4l_range"]
algo_str = "QuadraticDiscriminantAnalysis"
scols = ["a2l_range", "a2d_std", "a7d_std","a7l_range","a4d_max", "a4l_range"]
analysisdf = au.do_clf_validate_new(tdf, algo_str,algos_dd[algo_str], scols, "failure")

Cross-val-score(roc_auc) = 0.78
Cross-val-score(accuracy) = 0.93
Cross-val-score(recall)   = 0.51
Cross-val-score(precision)= 0.65
Cross-val-score(f1)       = 0.65


### Variance Threshold Feature Selection Summary
##### a2l_range, a4d_max and a4l_range: Definitely adding more sensitivity (expense of precision)
##### But for same precision, All features have recall = 0.56 (better than 0.51)

### Randomized Lasso  
#### (Surprisingly, of no use: TBD: Need to understand why)

In [23]:
fcols = tdf.columns[:-1] 
X = tdf[fcols]
Y = tdf["failure"]
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X, Y)
 
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), 
                 fcols), reverse=True)

Features sorted by their score:
[(0.0, 'a7l_range'), (0.0, 'a7d_std'), (0.0, 'a7d_mean'), (0.0, 'a7d_max'), (0.0, 'a4l_range'), (0.0, 'a4d_std'), (0.0, 'a4d_mean'), (0.0, 'a4d_max'), (0.0, 'a2l_range'), (0.0, 'a2d_std'), (0.0, 'a2d_mean'), (0.0, 'a2d_max')]


### Recursive Feature Elimination (QDA doesn't support RFE). Use LogisticRegression instead

In [24]:
fcols = [x for x in tdf.columns[:-1]]
X = tdf[fcols]
Y = tdf["failure"]
clf = LogisticRegression()
rfe = RFE(clf, n_features_to_select=2)
rfe.fit(X,Y)
 
print "Features sorted by their rank:"
print sorted(zip(rfe.ranking_, fcols))

Features sorted by their rank:
[(1, 'a2l_range'), (1, 'a7l_range'), (2, 'a7d_max'), (3, 'a4d_mean'), (4, 'a4l_range'), (5, 'a7d_std'), (6, 'a7d_mean'), (7, 'a4d_std'), (8, 'a2d_std'), (9, 'a2d_mean'), (10, 'a2d_max'), (11, 'a4d_max')]


In [25]:
#a4d_mean: Need to add this
#a2l_range, a4l_range: Already added this based on Variance Threshold analysis
#a7l_range, a7d_max, a7d_std, a7d_mean: This is already in baseline
algo_str = "QuadraticDiscriminantAnalysis"
scols = ["a2l_range", "a2d_std", "a7d_std","a7l_range","a4d_max", "a4l_range", "a4d_mean"]
analysisdf = au.do_clf_validate_new(tdf, algo_str,algos_dd[algo_str], scols, "failure")


Cross-val-score(roc_auc) = 0.79
Cross-val-score(accuracy) = 0.93
Cross-val-score(recall)   = 0.55
Cross-val-score(precision)= 0.64
Cross-val-score(f1)       = 0.64


#### Summary for RFE
##### Adding a4d_mean definitely boosting recall, without too much impact on precision

### Correlation of failure with all variables

In [27]:
fcdf = pd.DataFrame(tdf.corr()["failure"])
fcdf.loc[:,"failure_abs"] = fcdf.failure.map(lambda x: abs(x))
fcdf.sort_values(by="failure_abs", ascending=False, inplace=True)
fcdf.drop(["failure_abs"], axis=1, inplace=True)
fcdf

Unnamed: 0,failure
failure,1.0
a4l_range,0.482712
a7d_std,0.453876
a7l_range,0.447489
a7d_mean,-0.421852
a2l_range,0.386144
a2d_std,0.346241
a2d_max,0.31964
a2d_mean,-0.287751
a4d_std,0.247814


#### For QDA, what matters most is difference in correlation between different classes

In [29]:
tdf[tdf["failure"] == 0].corr()

Unnamed: 0,a2d_max,a2d_mean,a2d_std,a2l_range,a7d_max,a7d_mean,a7d_std,a7l_range,a4d_max,a4d_mean,a4d_std,a4l_range,failure
a2d_max,1.0,0.876467,0.979157,0.924475,0.280718,0.004785,0.11643,0.155355,0.011553,0.055736,0.029822,0.115258,
a2d_mean,0.876467,1.0,0.830472,0.820134,0.405954,-0.00897,0.178529,0.236265,0.013675,0.06489,0.039523,0.153533,
a2d_std,0.979157,0.830472,1.0,0.959579,0.248684,0.012098,0.098964,0.130937,0.009871,0.057031,0.026434,0.105819,
a2l_range,0.924475,0.820134,0.959579,1.0,0.42138,-0.018684,0.189667,0.254386,0.019428,0.066123,0.047078,0.163158,
a7d_max,0.280718,0.405954,0.248684,0.42138,1.0,-0.158516,0.497531,0.706093,0.006534,-0.027512,0.066296,0.276579,
a7d_mean,0.004785,-0.00897,0.012098,-0.018684,-0.158516,1.0,-0.925136,-0.782458,0.00054,0.014731,-0.011642,-0.04501,
a7d_std,0.11643,0.178529,0.098964,0.189667,0.497531,-0.925136,1.0,0.957153,0.002103,-0.018053,0.033861,0.139098,
a7l_range,0.155355,0.236265,0.130937,0.254386,0.706093,-0.782458,0.957153,1.0,0.003069,-0.030069,0.047045,0.194575,
a4d_max,0.011553,0.013675,0.009871,0.019428,0.006534,0.00054,0.002103,0.003069,1.0,0.971616,0.985479,0.365307,
a4d_mean,0.055736,0.06489,0.057031,0.066123,-0.027512,0.014731,-0.018053,-0.030069,0.971616,1.0,0.938878,0.264671,


In [30]:
tdf[tdf["failure"] == 1][fcols].corr()

Unnamed: 0,a2d_max,a2d_mean,a2d_std,a2l_range,a7d_max,a7d_mean,a7d_std,a7l_range,a4d_max,a4d_mean,a4d_std,a4l_range
a2d_max,1.0,-0.1877,0.379686,0.565935,0.100255,-0.111395,0.163974,0.172496,0.129601,0.096996,0.189036,0.291658
a2d_mean,-0.1877,1.0,-0.976761,-0.908298,0.010513,0.222996,-0.192964,-0.170859,0.094696,0.195499,-0.03245,-0.39156
a2d_std,0.379686,-0.976761,1.0,0.973922,0.022193,-0.222135,0.207586,0.188264,-0.060944,-0.157794,0.071098,0.430994
a2l_range,0.565935,-0.908298,0.973922,1.0,0.044539,-0.2238,0.221222,0.207453,-0.028156,-0.125853,0.104044,0.448053
a7d_max,0.100255,0.010513,0.022193,0.044539,1.0,-0.127484,0.322978,0.454068,0.077046,0.277936,0.172314,0.15487
a7d_mean,-0.111395,0.222996,-0.222135,-0.2238,-0.127484,1.0,-0.962195,-0.910083,-0.200717,-0.16655,-0.258298,-0.212284
a7d_std,0.163974,-0.192964,0.207586,0.221222,0.322978,-0.962195,1.0,0.985359,0.229747,0.242056,0.301807,0.218112
a7l_range,0.172496,-0.170859,0.188264,0.207453,0.454068,-0.910083,0.985359,1.0,0.228868,0.267453,0.312119,0.22358
a4d_max,0.129601,0.094696,-0.060944,-0.028156,0.077046,-0.200717,0.229747,0.228868,1.0,0.852552,0.894497,0.197314
a4d_mean,0.096996,0.195499,-0.157794,-0.125853,0.277936,-0.16655,0.242056,0.267453,0.852552,1.0,0.802666,-0.07844


In [32]:
#Based on: a2d_std Vs a2d_mean: +ve corr for good. -ve corr for bad
#Possibly no improvement because of lack of signal in this selection.
#May need to cross validate
algo_str = "QuadraticDiscriminantAnalysis"
scols = ["a2l_range", "a2d_std", "a2d_mean", "a7d_std","a7l_range","a4d_max", "a4l_range", "a4d_mean"]
analysisdf = au.do_clf_validate_new(tdf, algo_str,algos_dd[algo_str], scols, "failure")

Cross-val-score(roc_auc) = 0.79
Cross-val-score(accuracy) = 0.93
Cross-val-score(recall)   = 0.53
Cross-val-score(precision)= 0.64
Cross-val-score(f1)       = 0.64


In [35]:
algo_str = "QuadraticDiscriminantAnalysis"
scols = tdf.columns[:-1]
analysisdf = au.do_clf_validate_new(tdf, algo_str,algos_dd[algo_str], scols, "failure")

Cross-val-score(roc_auc) = 0.78
Cross-val-score(accuracy) = 0.93
Cross-val-score(recall)   = 0.56
Cross-val-score(precision)= 0.65
Cross-val-score(f1)       = 0.65


### OVERALL SUMMARY
#### Looks like need to use all Features derived from a2, a7 and a4