# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide constants: notebook tag, data directories, and column names."""

    # Notebook number; embedded in the file names of the saved pickles.
    NB = "108"

    # Key columns of the dataset: row identifier and prediction target.
    row_id, target = "id", "failure"

    # Seed and fold count (not referenced by the cells visible in this notebook section).
    random_seed, n_folds = 42, 5

    # Data directories, relative to the notebook's working directory.
    raw_data_dir = "../data/raw/"
    interim_dir = "../data/interim/"
    processed_data_dir = "../data/processed/"
    submission_dir = "../data/submission/"

## Import libraries

In [2]:
import gc
import warnings
# Silence every Python warning for the rest of the session.
# Note: this also hides pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits so long/wide frames are truncated later when rendered.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn theme: white style, notebook context, "deep" palette.
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared plotly layout: dark theme, fixed 1000x500 canvas, Franklin Gothic 12pt font.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}


# Named hex-colour lists: 'Bin' holds two colours, 'Cat5' holds five.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=[
        '#E876A3',
        '#E0A224',
        '#63B70D',
        '#6BCFF6',
        '#13399E',
    ],
)

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer

## Load data

In [7]:
# Read the raw train/test CSVs from the directory configured in Config
# (same location as before, '../data/raw/'), instead of repeating the path literal,
# so the input location is defined in one place like the output location is.
df_train = pd.read_csv(Config.raw_data_dir + "train.csv")
df_test = pd.read_csv(Config.raw_data_dir + "test.csv")

## 欠損値であるかどうかのBOOL列の追加

In [8]:
# Columns that get a boolean "value was missing" companion column.
col_list = [
    'loading',
    'measurement_3',
    'measurement_4',
    'measurement_5',
    'measurement_9',
]

# Add missing_<col> to both frames in place. This has to run before the
# imputation cell, because once the NaNs are filled every flag would be False.
for col, df in itertools.product(col_list, [df_train, df_test]):
    df[f'missing_{col}'] = df[col].isna()

In [9]:
# Display df_train to inspect the frame after the missing_* indicator columns were added.
df_train

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,missing_loading,missing_measurement_3,missing_measurement_4,missing_measurement_5,missing_measurement_9
0,0,A,80.10,material_7,material_8,9,5,7,8,4,18.040,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.100,0,False,False,False,False,False
1,1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,11.540,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0,False,False,False,False,False
2,2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,11.652,16.738,18.240,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0,False,False,False,False,False
3,3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.060,12.471,16.346,18.377,10.020,15.250,15.562,16.154,17.172,826.282,0,False,False,False,False,False
4,4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,12.950,16.990,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.760,13.153,16.412,579.885,0,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,26565,E,158.95,material_7,material_6,6,9,6,16,4,16.301,13.259,18.068,15.505,10.865,19.354,,12.177,17.942,10.112,15.795,18.572,16.144,,729.131,0,False,False,False,False,True
26566,26566,E,146.02,material_7,material_6,6,9,10,12,8,17.543,,17.984,19.078,11.139,19.563,11.242,14.179,20.564,10.234,14.450,14.322,13.146,16.471,853.924,0,False,False,True,False,False
26567,26567,E,115.62,material_7,material_6,6,9,1,10,1,15.670,11.535,16.778,18.385,11.630,19.279,11.407,16.437,17.476,8.668,15.069,16.599,15.590,14.065,750.364,0,False,False,False,False,False
26568,26568,E,106.38,material_7,material_6,6,9,2,9,4,18.059,,16.918,18.101,11.713,19.358,11.392,17.064,17.814,14.928,16.273,15.485,13.624,12.865,730.156,0,False,False,True,False,False


## 欠損補完
- trainとtestを混ぜて実施してはいけない処理（な気がする）

In [10]:
# Columns to impute: `loading` plus every column whose name starts with
# 'measurement' (the missing_* flags do not match), in df_train's column order.
col_list = list(filter(
    lambda name: name == 'loading' or name.startswith('measurement'),
    df_train.columns,
))

# Fit the 3-nearest-neighbour imputer on the training rows only, then use that
# same fitted imputer to fill both frames in place.
imputer = KNNImputer(n_neighbors=3).fit(df_train[col_list])
for df in (df_train, df_test):
    df[col_list] = imputer.transform(df[col_list])

In [11]:
# Per-column count of remaining missing values in the test frame after imputation.
df_test.isna().sum()

id                       0
product_code             0
loading                  0
attribute_0              0
attribute_1              0
attribute_2              0
attribute_3              0
measurement_0            0
measurement_1            0
measurement_2            0
measurement_3            0
measurement_4            0
measurement_5            0
measurement_6            0
measurement_7            0
measurement_8            0
measurement_9            0
measurement_10           0
measurement_11           0
measurement_12           0
measurement_13           0
measurement_14           0
measurement_15           0
measurement_16           0
measurement_17           0
missing_loading          0
missing_measurement_3    0
missing_measurement_4    0
missing_measurement_5    0
missing_measurement_9    0
dtype: int64

## TrainとTestの統合（処理を簡潔にするため）

In [12]:
# Stack train on top of test with a fresh 0..n-1 index; the training rows come first.
df_dataset = pd.concat([df_train, df_test], ignore_index=True)

## one-hotEncoding

In [18]:
# Categorical columns to encode and the names given to the resulting indicator columns.
# The names are hard-coded and must follow the encoder's output order
# (input columns in order, categories sorted within each column).
# NOTE(review): presumably attribute_0 takes materials 5/7 and attribute_1 takes 5/6/7/8 —
# confirm against ohe.categories_ if the data changes.
col_list = ['attribute_0', 'attribute_1']
output_col_list = ['ohe0_5', 'ohe0_7', 'ohe1_5', 'ohe1_6', 'ohe1_7', 'ohe1_8']

# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, where passing it raises TypeError. Using the default (sparse) output
# and densifying with .toarray() gives the same array on every scikit-learn version.
ohe = OneHotEncoder()
ohe.fit(df_dataset[col_list])

df_dataset[output_col_list] = ohe.transform(df_dataset[col_list]).toarray()
# Replace the original categorical columns with their one-hot encoding.
df_dataset = df_dataset.drop(columns=col_list)
df_dataset


Unnamed: 0,id,product_code,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,missing_loading,missing_measurement_3,missing_measurement_4,missing_measurement_5,missing_measurement_9,ohe0_5,ohe0_7,ohe1_5,ohe1_6,ohe1_7,ohe1_8
0,0,A,80.10,9,5,7.0,8.0,4.0,18.040,12.518000,15.748,19.292000,11.739,20.155,10.672,15.859000,17.594,15.193,15.029,14.537333,13.034,14.684000,764.100,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
1,1,A,84.89,9,5,14.0,3.0,3.0,18.213,11.540000,17.717,17.893000,12.748,17.889,12.448,17.947000,17.915,11.755,14.732,15.425000,14.395,15.631000,682.057,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
2,2,A,82.43,9,5,12.0,1.0,5.0,18.057,11.652000,16.738,18.240000,12.718,18.288,12.715,15.607000,19.391,13.798,16.711,18.631000,14.094,17.946000,663.376,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
3,3,A,101.07,9,5,13.0,2.0,6.0,17.295,11.188000,18.576,18.339000,12.583,19.060,12.471,16.346000,18.377,10.020,15.250,15.562000,16.154,17.172000,826.282,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
4,4,A,188.06,9,5,9.0,2.0,8.0,19.346,12.950000,16.990,15.746000,11.306,18.093,10.337,17.082000,19.932,12.428,16.182,12.760000,13.153,16.412000,579.885,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,I,144.74,9,5,0.0,4.0,9.0,18.465,12.570000,18.146,17.089000,11.204,18.573,11.691,15.664333,19.771,11.562,17.246,15.131000,15.209,16.027667,696.466,,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0
47341,47341,I,74.53,9,5,4.0,8.0,7.0,18.900,9.896000,18.288,18.713333,11.747,17.917,10.980,16.027000,15.694,13.564,15.494,15.296000,13.812,16.501000,613.249,,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0
47342,47342,I,67.73,9,5,10.0,11.0,2.0,18.656,12.439667,18.242,17.910000,12.081,19.630,10.436,16.137000,20.612,11.134,16.519,15.525000,14.175,17.728000,783.349,,False,False,True,False,False,0.0,1.0,1.0,0.0,0.0,0.0
47343,47343,I,126.15,9,5,8.0,16.0,11.0,16.536,11.226000,18.144,17.250000,12.692,19.575,12.672,15.422000,19.496,9.319,15.817,17.403000,16.437,15.179000,745.210,,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0


## 値のClip
- measurement_2 は値が11以上になるとFailureと正の相関関係がみられる
  - その関係を反映するため、下限を11としてClipする

In [19]:
# Floor measurement_2 at 11 (no upper bound), then summarise the clipped column.
df_dataset['measurement_2'] = df_dataset['measurement_2'].clip(lower=11)
df_dataset['measurement_2'].describe()

count    47345.000000
mean        11.224100
std          1.021155
min         11.000000
25%         11.000000
50%         11.000000
75%         11.000000
max         28.000000
Name: measurement_2, dtype: float64

In [20]:
# Display df_dataset again to inspect it after measurement_2 was clipped.
df_dataset

Unnamed: 0,id,product_code,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,missing_loading,missing_measurement_3,missing_measurement_4,missing_measurement_5,missing_measurement_9,ohe0_5,ohe0_7,ohe1_5,ohe1_6,ohe1_7,ohe1_8
0,0,A,80.10,9,5,7.0,8.0,11.0,18.040,12.518000,15.748,19.292000,11.739,20.155,10.672,15.859000,17.594,15.193,15.029,14.537333,13.034,14.684000,764.100,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
1,1,A,84.89,9,5,14.0,3.0,11.0,18.213,11.540000,17.717,17.893000,12.748,17.889,12.448,17.947000,17.915,11.755,14.732,15.425000,14.395,15.631000,682.057,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
2,2,A,82.43,9,5,12.0,1.0,11.0,18.057,11.652000,16.738,18.240000,12.718,18.288,12.715,15.607000,19.391,13.798,16.711,18.631000,14.094,17.946000,663.376,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
3,3,A,101.07,9,5,13.0,2.0,11.0,17.295,11.188000,18.576,18.339000,12.583,19.060,12.471,16.346000,18.377,10.020,15.250,15.562000,16.154,17.172000,826.282,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
4,4,A,188.06,9,5,9.0,2.0,11.0,19.346,12.950000,16.990,15.746000,11.306,18.093,10.337,17.082000,19.932,12.428,16.182,12.760000,13.153,16.412000,579.885,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,I,144.74,9,5,0.0,4.0,11.0,18.465,12.570000,18.146,17.089000,11.204,18.573,11.691,15.664333,19.771,11.562,17.246,15.131000,15.209,16.027667,696.466,,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0
47341,47341,I,74.53,9,5,4.0,8.0,11.0,18.900,9.896000,18.288,18.713333,11.747,17.917,10.980,16.027000,15.694,13.564,15.494,15.296000,13.812,16.501000,613.249,,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0
47342,47342,I,67.73,9,5,10.0,11.0,11.0,18.656,12.439667,18.242,17.910000,12.081,19.630,10.436,16.137000,20.612,11.134,16.519,15.525000,14.175,17.728000,783.349,,False,False,True,False,False,0.0,1.0,1.0,0.0,0.0,0.0
47343,47343,I,126.15,9,5,8.0,16.0,11.0,16.536,11.226000,18.144,17.250000,12.692,19.575,12.672,15.422000,19.496,9.319,15.817,17.403000,16.437,15.179000,745.210,,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0


## Save Dataset

In [21]:
## Separate train dataset and test dataset
# df_train was concatenated first, so the first len(df_train) rows are the training rows.
train_len = len(df_train)

# Take an explicit copy so `train` is an independent frame rather than a slice of df_dataset.
train = df_dataset.iloc[:train_len].copy()
# Drop the target with a non-inplace drop: the previous `drop(..., inplace=True)` mutated a
# slice of df_dataset, which can trigger SettingWithCopyWarning (hidden here by the global
# warnings filter). The test rows keep their original index labels (train_len onward).
test = df_dataset.iloc[train_len:].drop(columns=[Config.target])

In [22]:
# Persist both splits as zip-compressed pickles in the processed-data directory,
# with the notebook number embedded in each file name.
train.to_pickle(Config.processed_data_dir + f"nb{Config.NB}_train.pkl", compression='zip')
test.to_pickle(Config.processed_data_dir + f"nb{Config.NB}_test.pkl", compression='zip')

## 検証メモ