In [451]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold

In [502]:
# delete the colums in the dateframe that contains all NAs
def drop_colomn_with_all_na(df):
    colomns = list()
    for i in range(0, len(df.columns)):
        if df.iloc[:, i].isnull().all():
            colomns.append(i)
    df.drop(df.columns[colomns],axis=1,inplace=True)

In [503]:
# Perform data clean and data normalization
def extract_and_clean_data(file):
    df = pd.read_csv(file)
    # replace string 'na'  with NaN
    test = df.replace('na', np.nan)
    X_test = test.iloc[:, 1:]
    # set float type to calculate mean 
    X_test = X_test.astype(float)
    X_test = X_test.fillna(X_test.mean())
    # perform normalization(max-min method)
    X_test = X_test.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    drop_colomn_with_all_na(X_test)
    
    y_test = test.iloc[:, 0]

    return X_test, y_test
    

In [504]:
X_train, y_train = extract_and_clean_data('./data/ida_2016_training_set_update.csv')

In [505]:
# A glance of the traning data set for features
X_train

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0.027925,0.003496,9.999998e-01,3.261769e-08,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.015918,0.013067,7.421784e-03,8.179508e-03,1.073013e-02,0.001321,0.003800,0.000000,0.000000,0.000000
1,0.012036,0.003496,0.000000e+00,2.220573e-05,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.005407,0.004716,3.019031e-03,4.272917e-03,4.228511e-03,0.000679,0.005064,0.000394,0.000000,0.000000
2,0.014942,0.003496,1.070067e-07,1.164918e-08,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.003559,0.004232,4.364196e-03,7.130884e-03,1.014768e-02,0.001321,0.004937,0.000135,0.000000,0.000000
3,0.000004,0.000000,3.285295e-08,7.688457e-09,0.0,0.000498,0.0,0.0,0.0,0.000005,...,0.000003,0.000001,5.970003e-07,7.660803e-07,3.163775e-07,0.000000,0.000000,0.000000,0.008299,0.027923
4,0.022164,0.003496,6.420405e-07,5.335323e-08,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.007981,0.006086,4.171776e-03,6.044861e-03,9.078578e-03,0.002605,0.022523,0.000320,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0.055707,0.003496,3.116337e-07,2.166747e-08,0.0,0.000000,0.0,0.0,0.0,0.000040,...,0.012812,0.015013,1.328221e-02,2.121074e-02,3.226316e-02,0.006002,0.046641,0.007503,0.000000,0.000000
59996,0.000832,0.003496,9.999999e-01,2.609416e-08,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000136,0.000179,2.174522e-04,1.191324e-03,4.302733e-06,0.000000,0.000000,0.000000,0.000000,0.000000
59997,0.000041,0.000000,9.999998e-01,2.096852e-09,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000010,0.000010,4.652485e-06,2.507172e-06,4.619111e-06,0.000022,0.000000,0.000000,0.000000,0.000000
59998,0.029234,0.003496,9.999998e-01,5.754693e-08,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.008974,0.005897,3.575600e-03,3.930061e-03,6.151643e-03,0.001380,0.041639,0.101946,0.000000,0.000000


In [506]:
# A glance of training data set for labels
y_train

0        neg
1        neg
2        neg
3        neg
4        neg
        ... 
59995    neg
59996    neg
59997    neg
59998    neg
59999    neg
Name: class, Length: 60000, dtype: object

In [507]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

print('X train dataset shape: {}'.format(X_train.shape))
print('X test dataset shape: {}'.format(y_train.shape))

X train dataset shape: (60000, 169)
X test dataset shape: (60000,)


In [508]:
# Fit into MLP to do classification, used KFold to do the evulation
neural_network_classifier = MLPClassifier(random_state=0)
for train, test in KFold(n_splits=5, shuffle=True).split(X_train):
    X_po_train = X_train[train]
    X_po_test = X_train[test]
    y_po_train = y_train[train]
    y_po_test = y_train[test]
    
    neural_network_classifier.fit(X_po_train, y_po_train)
    print('X train dataset shape: {}'.format(X_po_train.shape))
    print('X test dataset shape: {}'.format(X_po_test.shape))
    print('y train dataset shape: {}'.format(y_po_train.shape))
    print('y test dataset shape: {}'.format(y_po_test.shape))
  
    print("Accuracy: %.1f%%" %(100*neural_network_classifier.score(X_po_test, y_po_test)))

X train dataset shape: (48000, 169)
X test dataset shape: (12000, 169)
y train dataset shape: (48000,)
y test dataset shape: (12000,)
Accuracy: 99.4%
X train dataset shape: (48000, 169)
X test dataset shape: (12000, 169)
y train dataset shape: (48000,)
y test dataset shape: (12000,)
Accuracy: 99.3%
X train dataset shape: (48000, 169)
X test dataset shape: (12000, 169)
y train dataset shape: (48000,)
y test dataset shape: (12000,)
Accuracy: 99.4%
X train dataset shape: (48000, 169)
X test dataset shape: (12000, 169)
y train dataset shape: (48000,)
y test dataset shape: (12000,)
Accuracy: 99.2%
X train dataset shape: (48000, 169)
X test dataset shape: (12000, 169)
y train dataset shape: (48000,)
y test dataset shape: (12000,)
Accuracy: 99.2%


In [509]:
# calculate precisions, recall and f1-score to add more evaluation

from sklearn.metrics import classification_report
X_train_po, X_test_po, y_train_po, y_test_po = train_test_split(X_train, y_train, train_size=.8)
neural_network_classifier.fit(X_po_train, y_po_train)

from sklearn.metrics import classification_report

c_matrix_report = classification_report(y_test_po, neural_network_classifier.predict(X_test_po), output_dict=True)

df = pandas.DataFrame(c_matrix_report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
neg,0.996194,0.99966,0.997924,11782.0
pos,0.977401,0.793578,0.875949,218.0
accuracy,0.995917,0.995917,0.995917,0.995917
macro avg,0.986797,0.896619,0.936937,12000.0
weighted avg,0.995852,0.995917,0.995708,12000.0


In [519]:
# function to deal with test data set
def extract_and_clean_data_for_predict(file):
    df = pd.read_csv(file)
    test = df.replace('na', np.nan)
    X_test = test.iloc[:, 2:]
    X_test = X_test.astype(float)
    X_test = X_test.fillna(X_test.mean())
    X_test = X_test.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    drop_colomn_with_all_na(X_test)
    
    df_result = test.iloc[:, :2]
    
    return X_test, df_result

In [520]:
# get test data set, clean and fit into the trained MLP model, save the predicted result into dataframe
X_test, df_result = extract_and_clean_data_for_predict('./data/ida_2016_test_set_update.csv')
y_predict = neural_network_classifier.predict(X_test)
df_result['class'] = y_predict

In [525]:
# A glance of the result
df_result

Unnamed: 0,id,class
0,1,neg
1,2,neg
2,3,neg
3,4,neg
4,5,neg
...,...,...
15995,15996,neg
15996,15997,neg
15997,15998,neg
15998,15999,neg


In [531]:
# write the result into text file
f = open("predict_result.txt", "w")
for index, row in df_result.iterrows():
    f.write("id_"+ str(row['id'])+", "+row['class'] + '\n')
f.close()
!ls


[1m[36mdata[m[m                           truck_failure_prediction.ipynb
predict_result.txt
