In [12]:
import warnings
warnings.filterwarnings("ignore")
import pickle
import scipy.misc
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

# import data visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
from pandas.plotting import scatter_matrix
import pickle
# import matplotlib
# matplotlib.use('TkAgg')
# import data preparing packages
import pandas as pd
import numpy as np


import numpy.random as rng
import random

# Setting seed for reproducability
np.random.seed(630)


import os, sys
import math
from math import sqrt
import itertools
from errno import EEXIST
from os import makedirs,path
import logging as logger
import time
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [2]:
def read_train_test_data_from_file(file_name):

    df_raw = pd.read_csv(str.format('{0}/{1}', _path_, file_name), sep=' ', header=None)
    df_raw.drop([26, 27], axis=1, inplace=True)

    col_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8',
                 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']

    df_raw.columns = col_names

    return df_raw

In [3]:
def read_label_data_from_file(file_name):

    df_raw = pd.read_csv(str.format('{0}/{1}', _path_, file_name), sep=' ', header=None)

    df_raw.drop([1], axis=1, inplace=True)
    df_raw.columns = [label_reg]

    return df_raw

In [4]:
def prepare_train_data(df_in, period):

    # make a dataframe to hold the last cycle for each enginge in the dataset
    df_max_cycle = pd.DataFrame(df_in.groupby('id')['cycle'].max())
    df_max_cycle.reset_index(level=0, inplace=True)
    df_max_cycle.columns = ['id', 'last_cycle']

    # add time-to-failure RUL as a new column - regression label
    df_in = pd.merge(df_in, df_max_cycle, on='id')
    df_in[label_reg] = df_in['last_cycle'] - df_in['cycle']
    df_in.drop(['last_cycle'], axis=1, inplace=True)

    # create binary classification label
    df_in[label_binc] = df_in[label_reg].apply(lambda x: 1 if x <= period else 0)

    # create multi-class classification label
    df_in[label_mcc] = df_in[label_reg].apply(lambda x: 2 if x <= period / 2 else 1 if x <= period else 0)

    return df_in

In [5]:
def prepare_test_data(df_test_in, df_truth_in, period):

    df_tst_last_cycle = pd.DataFrame(df_test_in.groupby('id')['cycle'].max())

    df_tst_last_cycle.reset_index(level=0, inplace=True)
    df_tst_last_cycle.columns = ['id', 'last_cycle']

    df_test_in = pd.merge(df_test_in, df_tst_last_cycle, on='id')

    df_test_in = df_test_in[df_test_in['cycle'] == df_test_in['last_cycle']]

    df_test_in.drop(['last_cycle'], axis=1, inplace=True)

    df_test_in.reset_index(drop=True, inplace=True)

    df_test_in = pd.concat([df_test_in, df_truth_in], axis=1)

    # create binary classification label
    df_test_in[label_binc] = df_test_in[label_reg].apply(lambda x: 1 if x <= period else 0)

    # create multi-class classification label
    df_test_in[label_mcc] = df_test_in[label_reg].apply(lambda x: 2 if x <= period / 2 else 1 if x <= period else 0)

    return df_test_in

In [6]:
def prepare_all_test_data(df_test_in, df_truth_in, period):

    df_tst_last_cycle = pd.DataFrame(df_test_in.groupby('id')['cycle'].max())

    df_tst_last_cycle.reset_index(level=0, inplace=True)
    df_tst_last_cycle.columns = ['id', 'last_cycle']

    df_tst_last_cycle['last_cycle'] = df_tst_last_cycle['last_cycle'].add(df_truth_in[label_reg])

    df_test_in = pd.merge(df_test_in, df_tst_last_cycle, on='id')

    df_test_in[label_reg] = df_test_in['last_cycle'] - df_test_in['cycle']
    df_test_in.drop(['last_cycle'], axis=1, inplace=True)

    df_test_in.reset_index(drop=True, inplace=True)

    # create binary classification label
    df_test_in[label_binc] = df_test_in[label_reg].apply(lambda x: 1 if x <= period else 0)

    # create multi-class classification label
    df_test_in[label_mcc] = df_test_in[label_reg].apply(lambda x: 2 if x <= period / 2 else 1 if x <= period else 0)

    return df_test_in

In [7]:
_path_ = 'turbofan datasets/dataset'

In [8]:
label_reg = 'RUL'
label_binc = 'BINC'
label_mcc = 'MCC'

In [9]:
data_set_number = 1
categoric_features = ['id']
numeric_features = ['setting1', 'setting2', 'setting3',
                    's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
                    's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21' ]
    
test_data_types = ['online', 'batch']

In [10]:
# read data from file
df_train_raw = read_train_test_data_from_file(file_name='train_FD00{0}.txt'.format(data_set_number))
df_test_raw = read_train_test_data_from_file(file_name='test_FD00{0}.txt'.format(data_set_number))
df_truth = read_label_data_from_file(file_name='RUL_FD00{0}.txt'.format(data_set_number))

In [13]:
df_train_raw.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [14]:
# add labels to training data using period of 30 cycles for classification
df_train = prepare_train_data(df_in=df_train_raw, period=30)
# save the training data to csv file for later use
df_train.to_csv('{0}/train{1}.csv'.format(_path_, data_set_number), index=False)

In [15]:
df_train.tail()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL,BINC,MCC
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,1.3,48.07,519.49,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735,4,1,2
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,21.61,550.86,2388.23,9065.11,1.3,48.04,519.68,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594,3,1,2
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.9,1.3,48.09,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2,1,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,1.3,48.39,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064,1,1,2
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,21.61,550.79,2388.26,9061.48,1.3,48.2,519.3,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522,0,1,2


In [16]:
# add labels to test data using period of 30 cycles for classification
df_test = prepare_all_test_data(df_test_in=df_test_raw, df_truth_in=df_truth, period=30)
# save the test data to csv file for later use
df_test.to_csv('{0}/test{1}.csv'.format(_path_, data_set_number), index=False)

df_test = prepare_test_data(df_test_in=df_test_raw, df_truth_in=df_truth, period=30)   # get for last-max- line
# save the test data to csv file for later use
df_test.to_csv('{0}/_test_{1}.csv'.format(_path_, data_set_number), index=False)

In [17]:
df_test.tail()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL,BINC,MCC
95,96,97,-0.0006,0.0003,100.0,518.67,642.3,1590.88,1397.94,14.62,21.61,553.99,2388.03,9062.41,1.3,47.14,522.3,2388.01,8148.24,8.411,0.03,391,2388,100.0,38.96,23.4606,137,0,0
96,97,134,0.0013,-0.0001,100.0,518.67,642.59,1582.96,1410.92,14.62,21.61,554.05,2388.06,9076.36,1.3,47.38,521.58,2388.06,8155.48,8.45,0.03,395,2388,100.0,38.61,23.2953,82,0,0
97,98,121,0.0017,0.0001,100.0,518.67,642.68,1599.51,1415.47,14.62,21.61,553.44,2388.13,9062.34,1.3,47.66,521.53,2388.09,8146.39,8.4235,0.03,394,2388,100.0,38.76,23.3608,59,0,0
98,99,97,0.0047,-0.0,100.0,518.67,642.0,1585.03,1397.98,14.62,21.61,554.75,2388.01,9067.16,1.3,47.26,521.82,2388.02,8150.38,8.4003,0.03,391,2388,100.0,38.95,23.3595,117,0,0
99,100,198,0.0013,0.0003,100.0,518.67,642.95,1601.62,1424.99,14.62,21.61,552.48,2388.06,9155.03,1.3,47.8,521.07,2388.05,8214.64,8.4903,0.03,396,2388,100.0,38.7,23.1855,20,1,1


In [19]:
import pandas_profiling

pandas_profiling.ProfileReport(df_train)

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [24]:
def explore_features(data):

    features = [ 
        'setting1', 'setting2', 'setting3',
        's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
        's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21'
    ]

    engine_num = 10

    """Plot 4 main graphs for a single feature.

        plot1: histogram
        plot2: boxplot
        plot3: line plot (time series over cycle)
        plot4: scatter plot vs. regression label RUL

    Args:
        feature (str): The column name of the feature to be plotted.
        engine_num (int): The number of random engines to be plotted for plot 3. 
        Range from 1 -100, 0:all engines, >100: all engines.

    Returns:
        plots

    """

    for feature in features:

        fig = plt.figure(figsize=(10, 8))

        sub1 = fig.add_subplot(221)
        sub1.set_title(feature + ' histogram')
        sub1.hist(data[feature])

        sub2 = fig.add_subplot(222)
        sub2.set_title(feature + ' boxplot')
        sub2.boxplot(data[feature])

        # np.random.seed(12345)

        if engine_num > 100 or engine_num <= 0:
            select_engines = list(pd.unique(data.id))
        else:
            select_engines = np.random.choice(range(1, 101), engine_num, replace=False)

        sub3 = fig.add_subplot(223)
        sub3.set_title('time series: ' + feature + ' / cycle')
        sub3.set_xlabel('cycle')
        for i in select_engines:
            df = data[['cycle', feature]][data.id == i]
            sub3.plot(df['cycle'], df[feature], label='engine ' + str(i))
            sub3.legend(loc="upper right")

        sub4 = fig.add_subplot(224)
        sub4.set_title("scatter: " + feature + " / RUL (regr label)")
        sub4.set_xlabel(label_reg)
        sub4.scatter(data[label_reg], data[feature])

        plt.tight_layout()
        
        plt.savefig(str.format('turbofan datasets/data_visualization/plot_hist_etc/{}', feature))

In [25]:
df_tr_lbl = pd.read_csv('{0}/train{1}.csv'.format(_path_, data_set_number))

In [26]:
explore_features(data=df_tr_lbl)

In [49]:
def plot_time_series(data):

    features = ['setting1', 'setting2', 'setting3', 
                's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
                's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']

    engine_num = 10

    """Plot time series of a single sensor for 10 random sample engines.

        Args:
        s (str): The column name of the sensor to be plotted.

    Returns:
        plots

    """

    for feature in features:

        fig, axes = plt.subplots(engine_num, 1, sharex=True, figsize=(25, 15))
        fig.suptitle(feature + ' time series / cycle', fontsize=15)

        # np.random.seed(12345)
        select_engines = np.random.choice(range(1, 101), engine_num, replace=False).tolist()

        for e_id in select_engines:
            df = data[['cycle', feature]][data.id == e_id]
            i = select_engines.index(e_id)
            axes[i].plot(df['cycle'], df[feature], color='lime')
            axes[i].set_ylabel('engine ' + str(e_id))
            axes[i].set_xlabel('cycle')
            axes[i].set_title('engine ' + str(e_id), loc='right')

        plt.tight_layout()
        plt.subplots_adjust(wspace=0, hspace=0)
        plt.savefig(str.format('turbofan datasets/data_visualization/time_series/{}', feature))

In [41]:
plot_time_series(data=df_tr_lbl)

In [44]:
def get_correlation_with_pandas_lib(data):

    # # plot and compare the standard deviation of input features:
    # data[features].std().plot(kind='bar', figsize=(8, 6), title="Features Standard Deviation")
    #
    # # plot and compare the log standard deviation of input features:
    # data[features].std().plot(kind='bar', figsize=(8, 6), logy=True, title="Features Standard Deviation (log)")

    features = ['setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
                's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']

    # get ordered list features correlation with regression label RUL
    corr_data = data[features].corrwith(data.RUL).sort_values(ascending=False).dropna()

    # get correlation features names
    correl_features = list(corr_data.index.values)

    correl_features_lbl = correl_features + [label_reg]

    corr_matrix = np.corrcoef(data[correl_features_lbl].values.T)

    sns.set(font_scale=1.0)
    fig = plt.figure(figsize=(10, 8))
    hm = sns.heatmap(corr_matrix, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 8},
                     yticklabels=correl_features_lbl, xticklabels=correl_features_lbl)
    plt.title('Features Correlation Heatmap')
    plt.savefig('turbofan datasets/data_visualization/features_correlation_heatmap')
    plt.clf()

    # reset matplotlib original theme
    sns.reset_orig()

    # create scatter matrix to disply relatiohships and distribution among features and regression label
    scatter_matrix(data[correl_features_lbl], alpha=0.2, figsize=(20, 20), diagonal='kde')  # diagonal='hist'
    plt.savefig('turbofan datasets/data_visualization/scatter_matrix')
    plt.clf()

    '''

        Most of the features have normal distribution which has positive effect on machine learning algorithms. 
        Most of the features have non-linear relationship with the regression label RUL, 
        so using polynomial models may lead to better results

        There is a very high correlation (> 0.8) between some features e.g.(s14 & s9), (s11 & s4), 
        (s11 & s7), (s11 & s12), (s4 & s12), (s8 & s13), (s7 & s12). 
        This multicollinearity may hurt the performance of some machine learning algorithms. 
        So, part of these features will be target for elimination 
        in feature selection during the modeling phase.  
        Most features have nonlinear relation with the RUL, hence adding their polynomial transforms 
        may enhance models performance.
        Most features exhibit normal distribution which is likely improves models performance.
        AUC ROC should be used for classification models evaluation instead of Accuracy due to 
        class’s imbalance in the training data.

    '''

In [45]:
get_correlation_with_pandas_lib(data=df_tr_lbl)

In [47]:
def print_stat_for_classification_labels(data):

    # BINC
    df_bin_analysis = data[[label_binc]]

    df_bin_analysis['freq'] = df_bin_analysis.groupby(label_binc)[label_binc].transform('count')
    df_bin_analysis.drop_duplicates(keep='first', inplace=True)
    df_bin_analysis.reset_index(drop=True, inplace=True)

    total = np.sum(df_bin_analysis.ix[:, 'freq':].values)
    df_bin_analysis['percent'] = df_bin_analysis.ix[:, 'freq':].sum(axis=1) / total * 100

    writer = pd.ExcelWriter('turbofan datasets/data_visualization/bin_statistics.xlsx')
    df_bin_analysis.to_excel(writer, 'Sheet1', index=False)
    writer.save()

    # MCC
    df_mcc_analysis = data[[label_mcc]]

    df_mcc_analysis['freq'] = df_mcc_analysis.groupby(label_mcc)[label_mcc].transform('count')
    df_mcc_analysis.drop_duplicates(keep='first', inplace=True)
    df_mcc_analysis.reset_index(drop=True, inplace=True)

    total = np.sum(df_mcc_analysis.ix[:, 'freq':].values)
    df_mcc_analysis['percent'] = df_mcc_analysis.ix[:, 'freq':].sum(axis=1) / total * 100

    writer = pd.ExcelWriter('turbofan datasets/data_visualization/mcc_statistics.xlsx')
    df_mcc_analysis.to_excel(writer, 'Sheet1', index=False)
    writer.save()

    print("\nRecord #/% for each class-binaryclassification- :\n", data[label_binc].value_counts())
    print('\nNegative samples =  {0:.0%}'.format(
        data[label_binc].value_counts()[0] / data[label_binc].count()))
    print('\nPositive samples =  {0:.0%}'.format(
        data[label_binc].value_counts()[1] / data[label_binc].count()))

    # print stat for multiclass classification label
    print("\n\nRecord #/% for each class-multiclassification- :\n", data[label_mcc].value_counts())
    print('\nClass 0 samples =  {0:.0%}'.format(
        data[label_mcc].value_counts()[0] / data[label_mcc].count()))
    print('\nClass 1 samples =  {0:.0%}'.format(
        data[label_mcc].value_counts()[1] / data[label_mcc].count()))
    print('\nClass 2 samples =  {0:.0%}'.format(
        data[label_mcc].value_counts()[2] / data[label_mcc].count()))

In [48]:
print_stat_for_classification_labels(data=df_tr_lbl)


Record #/% for each class-binaryclassification- :
 0    17531
1     3100
Name: BINC, dtype: int64

Negative samples =  85%

Positive samples =  15%


Record #/% for each class-multiclassification- :
 0    17531
2     1600
1     1500
Name: MCC, dtype: int64

Class 0 samples =  85%

Class 1 samples =  7%

Class 2 samples =  8%


In [52]:
df_ts_lbl = pd.read_csv('{0}/test{1}.csv'.format(_path_, data_set_number))

In [53]:
removed_columns = ['s1', 's5', 's10', 's16', 's18', 's19', 'setting3']
train_data = df_tr_lbl.drop(removed_columns, axis=1)
test_data = df_ts_lbl.drop(removed_columns, axis=1)

In [54]:
train_data.shape, test_data.shape

((20631, 22), (13096, 22))

In [58]:
feature_names = list(set(train_data.columns).difference(['BINC', 'MCC', 'RUL', 'id', 'cycle']))

In [164]:
# BINC CLASSIFICATION

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import OneClassSVM
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, fbeta_score
import seaborn as sns

In [62]:
y_train = train_data['BINC']
X_train = train_data[feature_names]

y_test = test_data['BINC']
X_test = test_data[feature_names]

In [63]:
X_train.head()

Unnamed: 0,s21,s17,setting2,s11,s7,s20,s2,s14,s15,s12,s6,s3,s9,s8,s4,s13,setting1
0,23.419,392,-0.0004,47.47,554.36,39.06,641.82,8138.62,8.4195,521.66,21.61,1589.7,9046.19,2388.06,1400.6,2388.02,-0.0007
1,23.4236,392,-0.0003,47.49,553.75,39.0,642.15,8131.49,8.4318,522.28,21.61,1591.82,9044.07,2388.04,1403.14,2388.07,0.0019
2,23.3442,390,0.0003,47.27,554.26,38.95,642.35,8133.23,8.4178,522.42,21.61,1587.99,9052.94,2388.08,1404.2,2388.03,-0.0043
3,23.3739,392,0.0,47.13,554.45,38.88,642.35,8133.83,8.3682,522.86,21.61,1582.79,9049.48,2388.11,1401.87,2388.08,0.0007
4,23.4044,393,-0.0002,47.28,554.0,38.9,642.37,8133.8,8.4294,522.19,21.61,1582.85,9055.15,2388.06,1406.22,2388.04,-0.0019


In [99]:
def plot_confusion_matrix(conf_matrix, title='', path='results/binc/'):
    
    # Plot confusion matrix in a beautiful manner
    ax= plt.subplot()
    sns.heatmap(conf_matrix, annot=True, ax = ax, fmt = 'g'); #annot=True to annotate cells
    # labels, title and ticks
    ax.set_xlabel('Predicted', fontsize=20)
    ax.xaxis.set_label_position('top') 
    ax.xaxis.set_ticklabels(['health', 'failure'], fontsize = 15)
    ax.xaxis.tick_top()

    ax.set_ylabel('Actual', fontsize=20)
    ax.yaxis.set_ticklabels(['health', 'failure'], fontsize = 15)
    plt.savefig(path + title)
    plt.clf()

In [135]:
def run_LogisticRegression_algorithm(X_train, X_test, y_train, y_test):

    clf = LogisticRegression(random_state=42, verbose=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)    
    
    return score, conf_matrix, f1_score_   

In [142]:
def run_DecisionTree_algorithm(X_train, X_test, y_train, y_test):

    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)    
    
    return score, conf_matrix, f1_score_

In [141]:
def run_RandomForest_algorithm(X_train, X_test, y_train, y_test):

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)    
    
    return score, conf_matrix, f1_score_


In [140]:
def run_GradientBoosting_algorithm(X_train, X_test, y_train, y_test):

    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)    
    
    return score, conf_matrix, f1_score_

In [139]:
def run_BaggingClassifier_algorithm(X_train, X_test, y_train, y_test):

    clf = BaggingClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)    
    
    return score, conf_matrix, f1_score_

In [137]:
def run_GaussianNB_algorithm(X_train, X_test, y_train, y_test):

    clf = GaussianNB()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred) 
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)    
    
    return score, conf_matrix, f1_score_

In [168]:
def run_SVMClassifier_algorithm(X_train, X_test, y_train, y_test):

    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred) 
    f1_score_ = f1_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred)   
    
    return score, conf_matrix, f1_score_

In [144]:
log_score, log_conf_matrix, log_f1_score = run_LogisticRegression_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(log_conf_matrix, title='LogisticRegression')
print('LogisticRegression accuracy is: ', log_score)  
print('LogisticRegression f1 score: ', log_f1_score)  

LogisticRegression accuracy is:  0.9846518020769701
LogisticRegression f1 score:  0.6479859894921192


In [145]:
dt_score, dt_conf_matrix, dt_f1_score = run_DecisionTree_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(dt_conf_matrix, title='DecisionTree')
print('DecisionTree accuracy is: ', dt_score) 
print('DecisionTree f1 score: ', dt_f1_score)

DecisionTree accuracy is:  0.976634086744044
DecisionTree f1 score:  0.5513196480938416


In [146]:
rf_score, rf_conf_matrix, rf_f1_score = run_RandomForest_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(rf_conf_matrix, title='RandomForest')
print('RandomForest accuracy is: ', rf_score) 
print('RandomForest f1 score: ', rf_f1_score)

RandomForest accuracy is:  0.9865607819181429
RandomForest f1 score:  0.6955017301038062


In [147]:
gb_score, gb_conf_matrix, gb_f1_score = run_GradientBoosting_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(gb_conf_matrix, title='GradientBoosting')
print('GradientBoosting accuracy is: ', gb_score) 
print('GradientBoosting f1 score: ', gb_f1_score)

GradientBoosting accuracy is:  0.9870189370800244
GradientBoosting f1 score:  0.7128378378378377


In [148]:
bc_score, bc_conf_matrix, bc_f1_score = run_BaggingClassifier_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(bc_conf_matrix, title='BaggingClassifier')
print('BaggingClassifier accuracy is: ', bc_score)
print('BaggingClassifier f1 score: ', bc_f1_score)

BaggingClassifier accuracy is:  0.9853390348197923
BaggingClassifier f1 score:  0.6595744680851063


In [149]:
nb_score, nb_conf_matrix, nb_f1_score = run_GaussianNB_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(nb_conf_matrix, title='GaussianNB')
print('GaussianNB accuracy is: ', nb_score)
print('GaussianNB f1 score: ', nb_f1_score)

GaussianNB accuracy is:  0.9741142333536957
GaussianNB f1 score:  0.6327193932827736


In [169]:
svm_score, svm_conf_matrix, svm_f1_score = run_SVMClassifier_algorithm(X_train, X_test, y_train, y_test)
plot_confusion_matrix(nb_conf_matrix, title='SVMClassifier')
print('SVMClassifier accuracy is: ', svm_score)
print('SVMClassifier f1 score: ', svm_f1_score)

SVMClassifier accuracy is:  0.9849572388515577
SVMClassifier f1 score:  0.6475849731663686


In [183]:
from sklearn import model_selection
from sklearn.metrics import classification_report

In [188]:
# Multi-class CLASSIFICATION

y_train = train_data['MCC']
X_train = train_data[feature_names]

y_test = test_data['MCC']
X_test = test_data[feature_names]

In [190]:
def run_LogisticRegression_algorithm_(X_train, X_test, y_train, y_test):

    clf = LogisticRegression(random_state=42, verbose=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    conf_matrix = confusion_matrix(y_test, y_pred)  
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro 

In [196]:
def run_DecisionTree_algorithm_(X_train, X_test, y_train, y_test):

    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred) 
    conf_matrix = confusion_matrix(y_test, y_pred) 
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro 

In [199]:
def run_RandomForest_algorithm_(X_train, X_test, y_train, y_test):

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    conf_matrix = confusion_matrix(y_test, y_pred) 
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro 

In [203]:
def run_GradientBoosting_algorithm_(X_train, X_test, y_train, y_test):

    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    conf_matrix = confusion_matrix(y_test, y_pred) 
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro 

In [205]:
def run_BaggingClassifier_algorithm_(X_train, X_test, y_train, y_test):

    clf = BaggingClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    conf_matrix = confusion_matrix(y_test, y_pred) 
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro 

In [207]:
def run_GaussianNB_algorithm_(X_train, X_test, y_train, y_test):

    clf = GaussianNB()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    conf_matrix = confusion_matrix(y_test, y_pred) 
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro

In [209]:
def run_SVMClassifier_algorithm_(X_train, X_test, y_train, y_test):

    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)  
    conf_matrix = confusion_matrix(y_test, y_pred) 
    f1_score_macro = f1_score(y_test, y_pred, average='macro') 
    f1_score_micro = f1_score(y_test, y_pred, average='micro')    
    
    return score, conf_matrix, f1_score_macro, f1_score_micro

In [193]:
log_score, log_conf_matrix, log_f1_score_macro, log_f1_score_micro = run_LogisticRegression_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(log_conf_matrix, title='LogisticRegression_')
print('LogisticRegression_ f1 score-macro-: ', log_f1_score_macro)
print('LogisticRegression_ f1 score-micro-: ', log_f1_score_micro)

LogisticRegression_ f1 score-macro-:  0.6028655633748056
LogisticRegression_ f1 score-micro-:  0.9793830177153329


In [198]:
dt_score, dt_conf_matrix, dt_f1_score_macro, dt_f1_score_micro = run_DecisionTree_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(dt_conf_matrix, title='DecisionTree_')
print('DecisionTree_ f1 score-macro-: ', dt_f1_score_macro)
print('DecisionTree_ f1 score-micro-: ', dt_f1_score_micro)

DecisionTree_ f1 score-macro-:  0.6304340755867722
DecisionTree_ f1 score-micro-:  0.974801466096518


In [201]:
rf_score, rf_conf_matrix, rf_f1_score_macro, rf_f1_score_micro = run_RandomForest_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(rf_conf_matrix, title='RandomForest_')
print('RandomForest_ f1 score-macro-: ', rf_f1_score_macro)
print('RandomForest_ f1 score-micro-: ', rf_f1_score_micro)

RandomForest_ f1 score-macro-:  0.7153792200690358
RandomForest_ f1 score-micro-:  0.9835064141722664


In [204]:
gb_score, gb_conf_matrix, gb_f1_score_macro,gb_f1_score_micro = run_GradientBoosting_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(gb_conf_matrix, title='GradientBoosting_')
print('GradientBoosting_ f1 score-macro-: ', gb_f1_score_macro)
print('GradientBoosting_ f1 score-micro-: ', gb_f1_score_micro)

GradientBoosting_ f1 score-macro-:  0.7227036514272202
GradientBoosting_ f1 score-micro-:  0.9835064141722664


In [206]:
bc_score, bc_conf_matrix, bc_f1_score_macro, bc_f1_score_micro = run_BaggingClassifier_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(bc_conf_matrix, title='BaggingClassifier_')
print('BaggingClassifier_ f1 score-macro-: ', bc_f1_score_macro)
print('BaggingClassifier_ f1 score-micro-: ', bc_f1_score_micro)

BaggingClassifier_ f1 score-macro-:  0.6556091307849957
BaggingClassifier_ f1 score-micro-:  0.9809102015882712


In [208]:
nb_score, nb_conf_matrix, nb_f1_score_macro, nb_f1_score_micro = run_GaussianNB_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(nb_conf_matrix, title='GaussianNB_')
print('GaussianNB_ f1 score-macro-: ', nb_f1_score_macro)
print('GaussianNB_ f1 score-micro-: ', nb_f1_score_micro)

GaussianNB_ f1 score-macro-:  0.6986124136766104
GaussianNB_ f1 score-micro-:  0.9580788026878436


In [210]:
svm_score, svm_conf_matrix, svm_f1_score_macro, svm_f1_score_micro = run_SVMClassifier_algorithm_(X_train, X_test, y_train, y_test)
plot_confusion_matrix(nb_conf_matrix, title='SVMClassifier_')
print('SVMClassifier_ f1 score-macro-: ', svm_f1_score_macro)
print('SVMClassifier_ f1 score-micro-: ', svm_f1_score_micro)

SVMClassifier_ f1 score-macro-:  0.6687152136123921
SVMClassifier_ f1 score-micro-:  0.9816737935247404


In [247]:
# REGRESSION

import math
from sklearn.metrics import mean_squared_error, r2_score

# BINC CLASSIFICATION

from sklearn.svm import SVR, LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [258]:
scaler = MinMaxScaler()

train_data_ = pd.DataFrame(scaler.fit_transform(train_data[feature_names]), index=train_data.index, columns=feature_names)
test_data_ = pd.DataFrame(scaler.transform(test_data[feature_names]), index=test_data.index, columns=feature_names)

In [259]:
y_train = train_data['RUL']
X_train = train_data_.copy()

y_test = test_data['RUL']
X_test = test_data_.copy()

In [219]:
def _run_LogisticRegression_algorithm_(X_train, X_test, y_train, y_test):

    clf = LogisticRegression(random_state=42, verbose=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))  
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_

In [225]:
def _run_DecisionTree_algorithm_(X_train, X_test, y_train, y_test):

    clf = DecisionTreeRegressor(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred)) 
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_ 

In [227]:
def _run_RandomForest_algorithm_(X_train, X_test, y_train, y_test):

    clf = RandomForestRegressor(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred)) 
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_ 

In [229]:
def _run_GradientBoosting_algorithm_(X_train, X_test, y_train, y_test):

    clf = GradientBoostingRegressor(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred)) 
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_

In [231]:
def _run_BaggingRegressor_algorithm_(X_train, X_test, y_train, y_test):

    clf = BaggingRegressor(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred)) 
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_

In [233]:
def _run_KNeighborsRegressor_algorithm_(X_train, X_test, y_train, y_test):

    clf = KNeighborsRegressor()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred)) 
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_

In [245]:
def _run_SVMRegressor_algorithm_(X_train, X_test, y_train, y_test):

    clf = LinearSVR()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred)) 
    r2_score_ = r2_score(y_test, y_pred)
    
    return rmse, r2_score_

In [260]:
log_rmse, log_r2_score_ = _run_LogisticRegression_algorithm_(X_train, X_test, y_train, y_test)
print('_LogisticRegression_ rmse: ', log_rmse)
print('_LogisticRegression_ r2_score: ', log_r2_score_)

_LogisticRegression_ rmse:  58.08243896345779
_LogisticRegression_ r2_score:  0.03013421936841454


In [261]:
dt_rmse, dt_r2_score_ = _run_DecisionTree_algorithm_(X_train, X_test, y_train, y_test)
print('_DecisionTree_ rmse: ', dt_rmse)
print('_DecisionTree_ r2_score: ', dt_r2_score_)

_DecisionTree_ rmse:  68.68230604095379
_DecisionTree_ r2_score:  -0.35616235201609747


In [262]:
rf_rmse, rf_r2_score_ = _run_RandomForest_algorithm_(X_train, X_test, y_train, y_test)
print('_RandomForest_ rmse: ', rf_rmse)
print('_RandomForest_ r2_score: ', rf_r2_score_)

_RandomForest_ rmse:  46.36436407688364
_RandomForest_ r2_score:  0.3819970301854727


In [263]:
gb_rmse, gb_r2_score_ = _run_GradientBoosting_algorithm_(X_train, X_test, y_train, y_test)
print('_GradientBoosting_ rmse: ', gb_rmse)
print('_GradientBoosting_ r2_score: ', gb_r2_score_)

_GradientBoosting_ rmse:  45.75235307806861
_GradientBoosting_ r2_score:  0.39820466439627


In [264]:
br_rmse, br_r2_score_ = _run_BaggingRegressor_algorithm_(X_train, X_test, y_train, y_test)
print('_BaggingRegressor_ rmse: ', br_rmse)
print('_BaggingRegressor_ r2_score: ', br_r2_score_)

_BaggingRegressor_ rmse:  48.8160272439566
_BaggingRegressor_ r2_score:  0.31491128696043746


In [265]:
kn_rmse, kn_r2_score_ = _run_KNeighborsRegressor_algorithm_(X_train, X_test, y_train, y_test)
print('_KNeighborsRegressor_ rmse: ', kn_rmse)
print('_KNeighborsRegressor_ r2_score: ', kn_r2_score_)

_KNeighborsRegressor_ rmse:  51.97555381973974
_KNeighborsRegressor_ r2_score:  0.22335920653209473


In [266]:
svm_rmse, svm_r2_score_ = _run_SVMRegressor_algorithm_(X_train, X_test, y_train, y_test)
print('_SVMRegressor_ rmse: ', svm_rmse)
print('_SVMRegressor_ r2_score: ', svm_r2_score_)

_SVMRegressor_ rmse:  51.243821713678855
_SVMRegressor_ r2_score:  0.24507297913396942
