In [25]:
import pandas as pd
import numpy as np 
import random
from datetime import datetime
import seaborn as sns
from xgboost import XGBClassifier
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from collections import Counter 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
from sklearn import svm
import imblearn
from imblearn.over_sampling import SMOTE

# Uploading the data

In [None]:
path_file = 'C:\\Users\\vivi\\Documents\\AWS\\proyect\\data\\device_failure.csv'

In [None]:
import csv
row_file = []
with open(path_file, newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        row_file.append(row)

In [None]:
df = pd.DataFrame(row_file[1:], columns = row_file[0], dtype = float) 
date_list = list(df['date']) 
date_time = [datetime.strptime(j,'%Y-%m-%d') for j in date_list]
df['date_time']= date_time
df=df.sort_values(['device','date_time'])
unique_device = df['device'].drop_duplicates()

In [None]:
dif_days=[]
for device in unique_device:
    sample_data = df[df['device']==device]
    dif_days_device = list(np.arange(0,sample_data.shape[0]))
    dif_days=dif_days+dif_days_device
df['day'] = dif_days    

In [None]:
df.to_csv('C:\\Users\\vivi\\Documents\\AWS\\proyect\\data\\device_failure_dataframe.csv',index=False)

In [None]:
df.head()

# Exploratory Data Analysis

In [None]:
df=pd.read_csv('C:\\Users\\vivi\\Documents\\AWS\\proyect\\data\\device_failure_dataframe.csv')

In [None]:
print('Minimum date:',min(df['date']))
print('Maxiimum date:',max(df['date']))

In [None]:
varx=['attribute1','attribute2','attribute3','attribute4','attribute5','attribute6','attribute7','attribute8','attribute9','day']
df[varx].describe()

In [None]:
trans_data = pd.pivot_table(df[['date','device','failure']], values='failure', index=['device'], columns=['date'], aggfunc=np.sum)
trans_data_vec = list(trans_data.sum(axis = 0, skipna = True)) 
trans_data_vec =[sum(trans_data_vec[:j])/sum(trans_data_vec) for j in range(len(trans_data_vec))]
plt.plot(trans_data_vec)
plt.plot(total_device_day)
plt.ylabel('Total number of failures')
plt.show()

In [None]:
sns.boxplot(x='failure', y='attribute1', 
                 data=df, 
                 palette="colorblind")


# Preparing data from the model 

In [2]:
df=pd.read_csv('C:\\Users\\vivi\\Documents\\AWS\\proyect\\data\\device_failure_dataframe.csv')

In [3]:
# total number of unique devices
df['device'].drop_duplicates().shape[0]

1169

In [4]:
fail_devices = list(df[df['failure']==1.0]['device'].drop_duplicates())
print(len(fail_devices))
no_fail_devices = list(set(list(df['device'].drop_duplicates()))-set(fail_devices))

106


In [5]:
y_fail = list(np.repeat(1,len(fail_devices)))+list(np.repeat(0,len(no_fail_devices)))
X_fail = fail_devices +  no_fail_devices

In [210]:
X_trainf, X_testf, y_trainf, y_testf = train_test_split(X_fail, y_fail, test_size=0.25, random_state=42,stratify=y_fail)

In [211]:
var = ['attribute1','attribute2','attribute3','attribute4','attribute5','attribute6','attribute7','attribute8','attribute9']
train = df[df['device'].isin(X_trainf)]
X_train = train[var]
y_train = train['failure']

test = df[df['device'].isin(X_testf)]
X_test = test[var]
y_test = test['failure']

In [171]:
print(Counter(list(y_trainf)))
print(Counter(list(y_testf)))

Counter({0: 637, 1: 64})
Counter({0: 426, 1: 42})


In [172]:
print(74/(774+74)*100)
print((32/(32+319))*100)

8.726415094339622
9.116809116809117


# Balancing the training dataset

In [212]:
# transform the dataset
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_trainup, y_trainup = oversample.fit_resample(X_train, y_train)

In [191]:
Counter(y_trainup)

Counter({0.0: 98424, 1.0: 98424})

# Random Forest 

In [222]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
clf.fit(X_trainup, y_trainup)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [223]:
pd.DataFrame({'var':X_trainup.columns,'feature_importance':clf.feature_importances_}).sort_values(['feature_importance'],ascending=False)

Unnamed: 0,var,feature_importance
3,attribute4,0.340032
1,attribute2,0.291658
6,attribute7,0.16385
7,attribute8,0.163495
8,attribute9,0.021722
4,attribute5,0.012007
5,attribute6,0.004298
2,attribute3,0.001823
0,attribute1,0.001115


In [224]:
predictions=clf.predict(X_test)
confusion_matrix(y_test, predictions)

array([[30318,  2782],
       [    7,    20]], dtype=int64)

In [225]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(y_test, predictions)
print("AUC: %.2f%%" % (auc * 100.0))
F1=f1_score(y_test, predictions)
print("F1-score: %.2f%%" % (F1 * 100.0))
recall = recall_score(y_test, predictions)
print("recall: %.2f%%" % (recall * 100.0))
precision = precision_score(y_test, predictions)
print("precision: %.2f%%" % (precision * 100.0))

Accuracy: 91.58%
AUC: 82.83%
F1-score: 1.41%
recall: 74.07%
precision: 0.71%


In [37]:
Counter(y_train)

Counter({0.0: 84951, 1.0: 74})

# KNeighbors model 

In [219]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_trainup, y_trainup)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [220]:
predictions = neigh.predict(X_test)
confusion_matrix(y_test, predictions)

array([[26045,  7055],
       [   18,     9]], dtype=int64)

In [221]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(y_test, predictions)
print("AUC: %.2f%%" % (auc * 100.0))
F1=f1_score(y_test, predictions)
print("F1-score: %.2f%%" % (F1 * 100.0))
recall = recall_score(y_test, predictions)
print("recall: %.2f%%" % (recall * 100.0))
precision = precision_score(y_test, predictions)
print("precision: %.2f%%" % (precision * 100.0))

Accuracy: 78.65%
AUC: 56.01%
F1-score: 0.25%
recall: 33.33%
precision: 0.13%


# XGBoost model

In [213]:
model = XGBClassifier()

In [214]:
model.fit(X_trainup, y_trainup)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [215]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [216]:
confusion_matrix(y_test, predictions)

array([[32955,   145],
       [   22,     5]], dtype=int64)

In [217]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(y_test, predictions)
print("AUC: %.2f%%" % (auc * 100.0))
F1=f1_score(y_test, predictions)
print("F1-score: %.2f%%" % (F1 * 100.0))
recall = recall_score(y_test, predictions)
print("recall: %.2f%%" % (recall * 100.0))
precision = precision_score(y_test, predictions)
print("precision: %.2f%%" % (precision * 100.0))

Accuracy: 99.50%
AUC: 59.04%
F1-score: 5.65%
recall: 18.52%
precision: 3.33%


In [227]:
from sklearn.model_selection import cross_val_score
model = XGBClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

NameError: name 'mean' is not defined

# Deep learning  model 

In [146]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split



In [161]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [179]:
model = tf.keras.Sequential()
model.add(layers.Dense(150, input_shape=(9,),activation='relu'))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(80, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [166]:
X_trainup.shape

(169902, 9)

In [180]:
model.fit(X_trainup, y_trainup, epochs=20, batch_size=2000, validation_data=(X_test, y_test))

Train on 145912 samples, validate on 51474 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1de3c8dcf98>

In [None]:
df[var].shape

In [62]:
df['failure'].shape

(124494,)

In [65]:
predict_XGboost = model.predict(df[var])
confusion_matrix(df['failure'], predict_XGboost)

df['y_pred']=predict_XGboost

In [64]:
accuracy = accuracy_score(df['failure'], predict_XGboost)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(df['failure'], predict_XGboost)
print("AUC: %.2f%%" % (auc * 100.0))
F1=f1_score(df['failure'], predict_XGboost, average='weighted')
print("F1-score: %.2f%%" % (F1 * 100.0))
recall = recall_score(df['failure'], predict_XGboost, average='weighted')
print("recall: %.2f%%" % (recall * 100.0))
precision = precision_score(df['failure'], predict_XGboost, average='weighted')
print("precision: %.2f%%" % (precision * 100.0))

Accuracy: 99.77%
AUC: 81.98%
F1-score: 99.83%
recall: 99.77%
precision: 99.90%


In [68]:
X_fail

['S1F023H2',
 'S1F03YZM',
 'S1F09DZQ',
 'S1F0CTDN',
 'S1F0DSTY',
 'S1F0F4EB',
 'S1F0GG8X',
 'S1F0GJW3',
 'S1F0GKFX',
 'S1F0GKL6',
 'S1F0GPFZ',
 'S1F0GSD9',
 'S1F0GSHB',
 'S1F0J5JH',
 'S1F0JD7P',
 'S1F0JGJV',
 'S1F0L0DW',
 'S1F0LCTV',
 'S1F0LCVC',
 'S1F0LD15',
 'S1F0LD2C',
 'S1F0P3G2',
 'S1F0PJJW',
 'S1F0QF3R',
 'S1F0QY11',
 'S1F0RR35',
 'S1F0RRB1',
 'S1F0RSZP',
 'S1F0S2WJ',
 'S1F0S4CA',
 'S1F0S4EG',
 'S1F0S4T6',
 'S1F0S57T',
 'S1F0S65X',
 'S1F0T2LA',
 'S1F0TQCV',
 'S1F10E6M',
 'S1F11MB0',
 'S1F13589',
 'S1F135TN',
 'S1F136J0',
 'S1F13H80',
 'W1F03D4L',
 'W1F03DP4',
 'W1F08EDA',
 'W1F0F6BN',
 'W1F0FKWW',
 'W1F0FW0S',
 'W1F0GCAZ',
 'W1F0KCP2',
 'W1F0M35B',
 'W1F0M4BZ',
 'W1F0NZZZ',
 'W1F0P114',
 'W1F0PAXH',
 'W1F0PNA5',
 'W1F0Q8FH',
 'W1F0SGHR',
 'W1F0T034',
 'W1F0T074',
 'W1F0T0B1',
 'W1F0TA59',
 'W1F0VDH2',
 'W1F0WBTM',
 'W1F0X4FC',
 'W1F0X5GW',
 'W1F0Z1W9',
 'W1F0Z3KR',
 'W1F0Z4EA',
 'W1F11ZG9',
 'W1F1230J',
 'W1F13SRV',
 'W1F14XGD',
 'W1F15S4D',
 'W1F19BPT',
 'W1F1BFP5',
 'W1F1BS0H',

In [69]:
df[df['device']=='S1F023H2'].sort_values(['device','date_time'])

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,date_time,day,y_pred
407,2015-01-01,S1F023H2,0.0,141503600.0,0.0,0.0,1.0,19.0,494462.0,16.0,16.0,3.0,2015-01-01,0,0.0
408,2015-01-02,S1F023H2,0.0,161679800.0,0.0,0.0,1.0,19.0,495730.0,16.0,16.0,3.0,2015-01-02,1,0.0
409,2015-01-03,S1F023H2,0.0,182358672.0,0.0,0.0,1.0,19.0,496974.0,16.0,16.0,3.0,2015-01-03,2,0.0
410,2015-01-04,S1F023H2,0.0,204752808.0,0.0,0.0,1.0,19.0,497559.0,16.0,16.0,3.0,2015-01-04,3,0.0
411,2015-01-05,S1F023H2,0.0,226982888.0,0.0,0.0,1.0,19.0,498753.0,16.0,16.0,3.0,2015-01-05,4,0.0
412,2015-01-06,S1F023H2,0.0,10387472.0,0.0,0.0,1.0,19.0,499964.0,16.0,16.0,3.0,2015-01-06,5,0.0
413,2015-01-07,S1F023H2,0.0,30083248.0,0.0,0.0,1.0,19.0,501239.0,16.0,16.0,3.0,2015-01-07,6,0.0
414,2015-01-08,S1F023H2,0.0,55079280.0,0.0,0.0,1.0,19.0,502542.0,16.0,16.0,3.0,2015-01-08,7,0.0
415,2015-01-09,S1F023H2,0.0,78898848.0,0.0,0.0,1.0,19.0,503812.0,16.0,16.0,3.0,2015-01-09,8,0.0
416,2015-01-10,S1F023H2,0.0,107573856.0,0.0,0.0,1.0,19.0,505079.0,16.0,16.0,3.0,2015-01-10,9,0.0
