In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8' and the old
# name was dropped in later releases, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, RocCurveDisplay, fbeta_score, accuracy_score, precision_score, recall_score, precision_recall_curve




import os
import sys
sys.path.append("..")
from src.data.preprocessing import load_preprocess_data
from src.features.feature_engineering import create_features, unwrap_smart_7

# Read Data

In [2]:
# read data
# The repository root is taken to be the parent of the current working directory.
repo_path = f"{os.getcwd()}/../"
df_source = load_preprocess_data(path=repo_path)

# Feature engineering first, then the smart_7 unwrapping on top of its result.
df = unwrap_smart_7(
    create_features(df_source, interval=30, trigger_percentage=0.05)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.smart_7_mod[temp_data.index] = temp_data.smart_7_raw
100%|██████████| 608/608 [00:13<00:00, 43.80it/s]
100%|██████████| 608/608 [00:13<00:00, 46.25it/s]


# Check Duplicates

In [3]:
# Serial numbers of every row flagged as a failure (one entry per such row).
serials_failed = df_source.query("failure == 1")["serial_number"].to_list()
serials_failed_unique = set(serials_failed)
print("Number of failed days:", len(serials_failed))
print("Number of unique serial number on the failed day:", len(serials_failed_unique))

# Collect every repeated occurrence (all but the first) of a serial number, in input
# order, in a single pass instead of removing items from a list copy one by one.
serials_seen = set()
serials_duplicated = []
for serial in serials_failed:
    if serial in serials_seen:
        serials_duplicated.append(serial)
    else:
        serials_seen.add(serial)

# NOTE(review): hard-coded row labels of df that are removed when duplicates are found;
# presumably the redundant failure records of the duplicated disks - TODO confirm and
# derive them from the data instead of pinning them to one particular data snapshot.
DUPLICATE_ROW_LABELS = [298155, 309081]

# reset_index() stores the previous row labels in an "index" column. It is applied in
# both branches so that df has the same columns whether or not duplicates were found.
# df is rebound only after the whole expression succeeded: re-running this cell by
# accident raises (the "index" column already exists) instead of silently dropping
# two further rows.
if not serials_duplicated:
    print("There is no duplicate")
    df = df.reset_index()
else:
    print(f"There are {len(serials_duplicated)} duplicates")
    print("   duplicated serial numbers", serials_duplicated)
    n_rows_before = len(df)  # count the frame that is actually modified
    df = df.drop(index=DUPLICATE_ROW_LABELS).reset_index()
    print("Number of rows before drop:", n_rows_before)
    print("Number of rows after drop:", len(df))

Number of failed days: 610
Number of unique serial number on the failed day: 608
There 2 duplicates
   duplicated serial numbers ['Z302SQFX', 'Z303VDR4']
Number of rows before drop: 462966
Number of rows after drop: 462964


# Create Target and Features

In [4]:
def get_smart_999_accumulated(df_in):
    """Add the per-disk running total of ``smart_999`` as ``smart_999_accumulated``.

    The rows of each serial number are sorted by ``date`` and ``smart_999`` is
    cumulatively summed within that disk. The per-disk pieces are then stacked in
    order of first appearance of the serial number, with a fresh 0..n-1 index.

    Args:
        df_in: DataFrame with at least the columns ``serial_number``, ``date`` and
            ``smart_999``. A helper column named ``index`` is dropped when present.
            The input frame itself is not modified.

    Returns:
        A new DataFrame with the input columns (minus ``index``) plus
        ``smart_999_accumulated``. An empty input yields an empty frame that
        still carries the new column.
    """
    # errors="ignore": the "index" column only exists if the caller ran reset_index().
    df_tmp = df_in.drop(columns=["index"], errors="ignore")

    pieces = []
    # sort=False keeps the disks in order of first appearance; grouping once avoids
    # re-scanning the whole frame for every serial number.
    for _, df_serial in df_tmp.groupby('serial_number', sort=False):
        df_serial = df_serial.sort_values(by='date').reset_index(drop=True)
        df_serial['smart_999_accumulated'] = df_serial['smart_999'].cumsum()
        pieces.append(df_serial)

    if not pieces:
        # pd.concat() refuses an empty list, so build the empty result directly.
        return df_tmp.assign(
            smart_999_accumulated=df_tmp['smart_999'].cumsum()
        ).reset_index(drop=True)

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(pieces, ignore_index=True)

In [5]:
# select features (currently disabled, kept for reference)
# cols = ['date', 'serial_number', 'model', 'failure', 'smart_1_raw',
#        'smart_4_raw', 'smart_5_raw', 'smart_7_mod', 'smart_9_raw',
#        'smart_12_raw', 'smart_183_raw', 'smart_184_raw', 'smart_187_raw',
#        'smart_188_raw', 'smart_189_raw', 'smart_190_raw', 'smart_192_raw',
#        'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_198_raw',
#        'smart_199_raw', 'smart_240_raw', 'smart_241_raw', 'smart_242_raw',
#        'date_failure', 'countdown','smart_999']
# df = df[pd.Index(cols)]

# create smart_999_accumulated
df = get_smart_999_accumulated(df)

# create target: 1 while the countdown is at most FAILURE_HORIZON_DAYS, otherwise 0
FAILURE_HORIZON_DAYS = 30
# Vectorised comparison instead of a per-row Python lambda; missing countdowns compare
# as False and therefore map to 0, exactly as with the previous row-wise version.
df['failure_in_30_days'] = (df['countdown'] <= FAILURE_HORIZON_DAYS).astype('int64')
df.head(5)

Unnamed: 0,date,serial_number,model,failure,smart_1_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_9_raw,smart_12_raw,...,smart_188_raw_trigger,smart_189_raw_trigger,smart_193_raw_trigger,smart_192_raw_trigger,smart_197_raw_trigger,smart_198_raw_trigger,smart_199_raw_trigger,smart_999,smart_999_accumulated,failure_in_30_days
0,2019-01-01,Z304JG86,ST4000DM000,0,168829568.0,10.0,0.0,257899631.0,27368.0,10.0,...,False,False,False,False,False,False,False,0,0,0
1,2019-01-02,Z304JG86,ST4000DM000,0,50913560.0,10.0,0.0,259688596.0,27392.0,10.0,...,False,False,False,False,False,False,False,0,0,0
2,2019-01-03,Z304JG86,ST4000DM000,0,176805800.0,10.0,0.0,261018343.0,27416.0,10.0,...,False,False,False,False,False,False,False,0,0,0
3,2019-01-04,Z304JG86,ST4000DM000,0,217666344.0,10.0,0.0,261751900.0,27440.0,10.0,...,False,False,False,False,False,False,False,0,0,0
4,2019-01-05,Z304JG86,ST4000DM000,0,218092400.0,10.0,0.0,263166092.0,27464.0,10.0,...,False,False,False,False,False,False,False,0,0,0


# Split Data

In [6]:
# split data
# The split is done on serial numbers, not on rows, so that all rows of one disk end up
# on the same side and no disk is shared between train and test.
RSEED = 200
serials_all = df['serial_number'].unique()
# A single array is enough: train_test_split returns one train/test pair per input
# array, so there is no need to pass the serials twice and discard the second pair.
serial_train, serial_test = train_test_split(serials_all, test_size=0.2, random_state=RSEED)
df_train = df[df['serial_number'].isin(serial_train)]
df_test = df[df['serial_number'].isin(serial_test)]

print("number of unique disk in data:", df["serial_number"].nunique())
print("number of unique disk in train:", df_train["serial_number"].nunique())
print("number of unique disk in test:", df_test["serial_number"].nunique())

number of unique disk in data: 608
number of unique disk in train: 486
number of unique disk in test: 122


In [7]:
# Display all column names of df as a reference for choosing feature columns.
df.columns

Index(['date', 'serial_number', 'model', 'failure', 'smart_1_raw',
       'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw',
       'smart_12_raw', 'smart_183_raw', 'smart_184_raw', 'smart_187_raw',
       'smart_188_raw', 'smart_189_raw', 'smart_190_raw', 'smart_192_raw',
       'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_198_raw',
       'smart_199_raw', 'smart_240_raw', 'smart_241_raw', 'smart_242_raw',
       'date_failure', 'countdown', 'smart_7_mod', 'smart_1_raw_ema',
       'smart_4_raw_ema', 'smart_5_raw_ema', 'smart_7_raw_ema',
       'smart_9_raw_ema', 'smart_12_raw_ema', 'smart_183_raw_ema',
       'smart_184_raw_ema', 'smart_187_raw_ema', 'smart_188_raw_ema',
       'smart_189_raw_ema', 'smart_190_raw_ema', 'smart_192_raw_ema',
       'smart_193_raw_ema', 'smart_194_raw_ema', 'smart_197_raw_ema',
       'smart_198_raw_ema', 'smart_199_raw_ema', 'smart_240_raw_ema',
       'smart_241_raw_ema', 'smart_242_raw_ema', 'smart_7_mod_ema',
       'smart_4_ra

# Set X and y

In [8]:
# set X and y

# Columns used as model input; their order here is the column order of X_train / X_test.
cols_feature = [
    'smart_1_raw',
    'smart_4_raw',
    'smart_5_raw',
    'smart_7_mod',
    'smart_9_raw',
    'smart_12_raw',
    'smart_183_raw',
    'smart_184_raw',
    'smart_187_raw',
    'smart_188_raw',
    'smart_189_raw',
    'smart_190_raw',
    'smart_192_raw',
    'smart_193_raw',
    'smart_194_raw',
    'smart_197_raw',
    'smart_198_raw',
    'smart_199_raw',
    'smart_240_raw',
    'smart_241_raw',
    'smart_242_raw',
    'smart_999',
    'smart_999_accumulated',
]

# The bare string below is an inactive alternative feature list (same columns plus the
# *_trigger flags); as a plain string expression it has no effect on cols_feature.
"""
cols_feature =['smart_1_raw',
       'smart_4_raw', 'smart_5_raw', 'smart_7_mod', 'smart_9_raw',
       'smart_12_raw', 'smart_183_raw', 'smart_184_raw', 'smart_187_raw',
       'smart_188_raw', 'smart_189_raw', 'smart_190_raw', 'smart_192_raw',
       'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_198_raw',
       'smart_199_raw', 'smart_240_raw', 'smart_241_raw', 'smart_242_raw',
       'smart_999', 'smart_999_accumulated',
       'smart_4_raw_trigger', 'smart_5_raw_trigger', 'smart_12_raw_trigger',
       'smart_183_raw_trigger', 'smart_184_raw_trigger',
       'smart_187_raw_trigger', 'smart_188_raw_trigger',
       'smart_189_raw_trigger', 'smart_193_raw_trigger',
       'smart_192_raw_trigger', 'smart_197_raw_trigger',
       'smart_198_raw_trigger', 'smart_199_raw_trigger']
"""

# Feature matrix and binary target for each side of the split.
X_train, y_train = df_train[cols_feature], df_train['failure_in_30_days']
X_test, y_test = df_test[cols_feature], df_test['failure_in_30_days']

# Log and Scale

In [9]:
# log: log10(x + 1), so that a raw value of 0 maps to 0 instead of -inf
X_train_log = np.log10(X_train + 1)
X_test_log = np.log10(X_test + 1)

# scale: the min/max are learned from the training data only and reused for the test data
scaler = MinMaxScaler().fit(X_train_log)

# NOTE(review): the frames built here get a fresh 0..n-1 index because they are created
# from the plain arrays returned by transform(); pair them with the targets by position,
# not by index label.
X_train_log_scaled = pd.DataFrame(
    scaler.transform(X_train_log),
    columns=X_train_log.columns,
)
X_test_log_scaled = pd.DataFrame(
    scaler.transform(X_test_log),
    columns=X_test_log.columns,
)
X_train_log_scaled.describe()

Unnamed: 0,smart_1_raw,smart_4_raw,smart_5_raw,smart_7_mod,smart_9_raw,smart_12_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_188_raw,...,smart_193_raw,smart_194_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_999,smart_999_accumulated
count,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,...,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0,373587.0
mean,0.94819,0.392936,0.022483,0.427393,0.718505,0.389219,0.078963,0.002497,0.086983,0.004014,...,0.582136,0.275498,0.039591,0.039591,0.006742,0.716863,0.804964,0.406216,0.062541,0.384067
std,0.051992,0.112721,0.109289,0.043015,0.120044,0.114837,0.143712,0.036929,0.177273,0.041515,...,0.136698,0.096927,0.108436,0.108436,0.066073,0.120699,0.101998,0.135322,0.172667,0.306584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.928428,0.313298,0.0,0.40352,0.632535,0.313724,0.0,0.0,0.0,0.0,...,0.521479,0.202956,0.0,0.0,0.0,0.63024,0.768417,0.298243,0.0,0.0
50%,0.964129,0.385267,0.0,0.428716,0.720894,0.385791,0.0,0.0,0.0,0.0,...,0.572584,0.270698,0.0,0.0,0.0,0.719115,0.822254,0.399058,0.0,0.454041
75%,0.985082,0.455361,0.0,0.449706,0.808877,0.45598,0.166946,0.0,0.0,0.0,...,0.618604,0.349349,0.0,0.0,0.0,0.807789,0.867832,0.49924,0.0,0.636346
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Function for Printing Metrics

In [10]:
def show_metrics(y_train, y_train_pred, y_test, y_test_pred, digit=20):
    """Print F2-score, precision, recall and accuracy (in percent) for train and test.

    Args:
        y_train: True labels of the training data.
        y_train_pred: Predicted labels for the training data.
        y_test: True labels of the test data.
        y_test_pred: Predicted labels for the test data.
        digit: Number of decimal places the percentages are rounded to.

    Returns:
        None. The metrics are only printed.

    Note:
        Presumably the labels are binary class labels (not probabilities), since the
        metrics are called with their default settings - TODO confirm at the call sites.
    """
    splits = (
        ("train", y_train, y_train_pred),
        ("test", y_test, y_test_pred),
    )
    for data, y_true, y_pred in splits:
        print("=======================================")
        print(f"# For {data} data:")
        # All four metrics are computed before anything is printed.
        scores = (
            ("  f2-score  :", fbeta_score(y_true, y_pred, beta=2)),
            ("  precision :", precision_score(y_true, y_pred)),
            ("  recall    :", recall_score(y_true, y_pred)),
            ("  accuracy  :", accuracy_score(y_true, y_pred)),
        )
        for label, score in scores:
            # Built-in round() accepts numpy scalars as well as plain Python floats;
            # the `.round()` method used before exists on numpy scalars only.
            print(label, round(score * 100, digit), "%")

# Neural Network