In [1]:
import pandas as pd
import numpy as np

### 1.Split prepared data from Milestone 1 into training and testing

#### Read and merge data

In [2]:
#read two data files
df = pd.read_excel('secom.xlsx',header= None)
label = pd.read_excel('secom_labels.xlsx', header = None)

In [3]:
#merge two data sets
label.columns = ['class', 'time']
df = df.add_prefix('sensor_')
data = pd.concat([label,df], axis=1)
data.head()

Unnamed: 0,class,time,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,...,sensor_580,sensor_581,sensor_582,sensor_583,sensor_584,sensor_585,sensor_586,sensor_587,sensor_588,sensor_589
0,-1,19/07/2008 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,-1,19/07/2008 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,1,19/07/2008 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,-1,19/07/2008 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,-1,19/07/2008 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


#### Clean and prepare data

Clean dataset, impute missing values. 

First, check whether target attribute has NaN, if yes, drop NaN in class, since imputing class label is not that reasonable.

In [4]:
print(sum(data['class'].isnull()))

0


Target varibles have no NaN. 

Check whether time_stamp has NaN. Again, this one is not reasonable to impute values. 

In [5]:
print(sum(data['time'].isnull()))

0


No missing values in time_stamp.

Next step, we can loop through each column, and impute missing values with median.

In [6]:
for i in range(0,590):
    name = 'sensor_'+str(i) 
    HasNan =data[name].isnull()
    if sum(HasNan) > 0:
        data.loc[HasNan, name] =  np.mean(data[name])

Data is cleaned with missing values.

There is no more missing values.  Let's take care of time_stamp data. Time_stamp data is a little bit messy, has different kind of format

In [7]:
#convert time_stamp to datetime format
data['time'] = pd.to_datetime(data['time'])

Now, the data is cleaned and ready for further exploration.

#### Handle class imbalance problem

In [8]:
# import SMOTE package
from collections import Counter
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import train_test_split

In [9]:
# handle imbalance data set using SMOTE
X = data.iloc[:,2:]
y= data['class']

#split data first, then apply SMOTE
X_train, X_test, y_train, y_test = train_test_split (X,y, random_state =0)

print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_sample(X_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))

Original dataset shape Counter({-1: 1463, 1: 104})
Resampled dataset shape Counter({-1: 1092, 1: 1092})


### 2. Build a decision tree model that detects faulty products
### 3. Build an ensemble model that detects faulty products
### 4. Build an SVM model

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

dt = DecisionTreeClassifier().fit(X_res, y_res)
dt_pred = dt.predict(X_test)

rf = RandomForestClassifier().fit(X_res, y_res)
rf_pred = rf.predict(X_test)

svc = SVC().fit(X_res, y_res)
svc_pred = svc.predict(X_test)


### 5 .Evaluate all three models

In [11]:
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score

def roc (pred, clf):
    roc = roc_auc_score(y_test, pred)
    print ('roc for ', clf, ' is: %.2f'%roc)

def f1 (pred, clf):
    f1 = f1_score (y_test, pred)
    print ('F1 for ', clf, ' is: %.2f'%f1)

def recall (pred, clf):
    sco = recall_score (y_test, pred)
    print ('Recall for ', clf, ' is: %.2f'%sco)
    
def precision (pred, clf):
    sco = precision_score (y_test, pred)
    print ('Precision for ', clf, ' is: %.2f'%sco)
    

In [12]:
roc(dt_pred, 'Decision tree')
f1(dt_pred, 'Decision tree')
recall(dt_pred, 'Decision tree')
precision(dt_pred, 'Decision tree')

roc for  Decision tree  is: 0.58
F1 for  Decision tree  is: 0.17
Recall for  Decision tree  is: 0.29
Precision for  Decision tree  is: 0.12


In [13]:
roc(rf_pred, 'Random Forest')
f1(rf_pred, 'Random Forest')
recall(rf_pred, 'Random Forest')
precision(rf_pred, 'Random Forest')

roc for  Random Forest  is: 0.49
F1 for  Random Forest  is: 0.00
Recall for  Random Forest  is: 0.00
Precision for  Random Forest  is: 0.00


In [14]:
roc(svc_pred, 'SVC')
f1(svc_pred, 'SVC')
recall(svc_pred, 'SVC')
precision(svc_pred, 'SVC')

roc for  SVC  is: 0.45
F1 for  SVC  is: 0.07
Recall for  SVC  is: 0.33
Precision for  SVC  is: 0.04


Using the default model, performance of decision tree, random forest and SVC are all bad. Hyperparameters need to be optimized. GridSearchCV will beused to search for optimized paramters.

### All default models perform very poor, let's try to use GridSearchCV to optimize the parameters (use f1 as scoring method)

##### GridSearch on DecisionTree

In [15]:
from sklearn.model_selection import GridSearchCV

dt_para = {'max_depth': np.arange(1,5),'min_samples_leaf': np.arange(1,5), 'criterion':['gini', 'entropy']}
dt_cv = GridSearchCV(DecisionTreeClassifier(), dt_para, scoring = 'f1').fit(X_res, y_res)
print('Best decision tree model:', dt_cv.best_params_)

Best decision tree model: {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 3}


In [16]:
dt_cv = DecisionTreeClassifier(criterion ='gini', max_depth = 4, min_samples_leaf= 3).fit(X_res, y_res)
dt_cv_pred = dt_cv.predict(X_test)
roc(dt_cv_pred, 'Decision Tree')
f1(dt_cv_pred, 'Decision Tree')
recall(dt_cv_pred, 'Decision Tree')
precision(dt_cv_pred, 'Decision Tree')

roc for  Decision Tree  is: 0.59
F1 for  Decision Tree  is: 0.14
Recall for  Decision Tree  is: 0.48
Precision for  Decision Tree  is: 0.08


The overall perfornance for the best decision tree model does not improve a lot, suggesting that search range might not have the optimal parameters. 

###### GridSearch on RandomForest

In [17]:
rf_para = {'n_estimators': [80,100,120], 
           'max_depth': np.arange(1,5),
           'min_samples_leaf': np.arange(1,5), 
           'criterion':['gini', 'entropy']}
rf_cv =GridSearchCV(RandomForestClassifier(), rf_para, scoring ='f1').fit(X_res, y_res)
print('Best random forest model:', rf_cv.best_params_)

Best random forest model: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1, 'n_estimators': 80}


In [19]:
rf_cv = RandomForestClassifier(criterion = 'entropy', max_depth = 4, min_samples_leaf = 1, n_estimators=80).fit(X_res, y_res)
rf_cv_pred = rf_cv.predict(X_test)
roc(rf_cv_pred, 'Random Forest')
f1(rf_cv_pred, 'Random Forest')
recall(rf_cv_pred, 'Random Forest')
precision(rf_cv_pred, 'Random Forest')

roc for  Random Forest  is: 0.64
F1 for  Random Forest  is: 0.21
Recall for  Random Forest  is: 0.43
Precision for  Random Forest  is: 0.14


The overall perfornance for the best random forest model improves a lot, and its performance is better than the best decision tree model.

##### GridSearch on SVC

In [26]:
cv_params = {'kernel':['rbf'], 'C':[0.001,0.01,1], }
svc_cv = GridSearchCV(SVC(), cv_params, scoring ='f1').fit(X_res, y_res)
print('Best model:', svc_cv.best_params_)

Best model: {'C': 1, 'kernel': 'rbf'}


In [27]:
svc_cv = SVC(kernel = 'rbf', C=1).fit(X_res, y_res)
# svc_cv = SVC().fit(X_res, y_res)
svc_cv_pred = svc_cv.predict(X_test)
roc(svc_cv_pred, 'SVC')
f1(svc_cv_pred, 'SVC')
recall(svc_cv_pred, 'SVC')
precision(svc_cv_pred, 'SVC')

roc for  SVC  is: 0.45
F1 for  SVC  is: 0.07
Recall for  SVC  is: 0.33
Precision for  SVC  is: 0.04


The best model using GridSearchCV is rbf with C=1,which surprisingly is the same as the default model. 

##### 6. Solicit specific feedback on your code 

1. How to determine which model to be used in general? 
2. How to determine which metrics to be used when comparing different models? I understand that stick with one metrics is necessary, but I am not sure should I choose roc or f1, or others?
3. What can I if the performance is still not good after GridSearchCV?