# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide settings: notebook ids, data directories and column names."""

    # Id of this notebook; it is embedded in the names of the files written at the end.
    NB = "105"
    # Id of the notebook whose processed train/test pickles are loaded as input.
    dataset_NB = "104"

    # Data directories, given relative to the notebook's working directory.
    raw_data_dir = "../data/raw/"
    processed_data_dir = "../data/processed/"
    interim_dir = "../data/interim/"
    submission_dir = "../data/submission/"

    # Seed and fold count (not referenced by the cells visible in this notebook).
    random_seed = 42
    n_folds = 5

    # Columns excluded from the feature list: the row identifier and the label.
    row_id = "id"
    target = "failure"

## Import libraries

In [2]:
import gc
import warnings
# Silence every warning category for the session. This runs before the remaining
# imports, so warnings emitted while those libraries load are suppressed as well.
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits so long and wide frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Set the seaborn defaults (white style, notebook context, deep palette) for later plots.
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared Plotly figure defaults, stored under the 'layout' key: dark theme,
# Franklin Gothic 12pt text and a 1000x500 canvas.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}


# Hex colour lists keyed by palette name: 'Bin' holds two colours, 'Cat5' holds five.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline

## Load data

In [5]:
# Load the train/test frames written by notebook `Config.dataset_NB`. The directory
# comes from Config so that loading and saving share one path setting.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced
# by this project.
df_train = pd.read_pickle(f"{Config.processed_data_dir}nb{Config.dataset_NB}_train.pkl", compression='zip')
df_test = pd.read_pickle(f"{Config.processed_data_dir}nb{Config.dataset_NB}_test.pkl", compression='zip')

# Stack train on top of test and renumber the index from 0, so the first
# len(df_train) rows of df_dataset are the training rows.
df_dataset = pd.concat(objs=[df_train, df_test], axis=0).reset_index(drop=True)
df_dataset

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7,attribute_1_material_8
0,0,0.249005,1.0,0.2,0.368421,0.40,0.285714,0.545796,0.647744,0.238275,0.842618,0.504651,0.717593,0.352201,0.463325,0.321003,0.984393,0.388937,0.491051,0.231624,0.280078,0.606211,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,0.274415,1.0,0.2,0.736842,0.15,0.214286,0.578266,0.463910,0.610628,0.572957,0.700194,0.280478,0.701258,0.752842,0.360849,0.492125,0.337836,0.404926,0.414849,0.392442,0.466937,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,0.261365,1.0,0.2,0.631579,0.05,0.357143,0.548986,0.484962,0.425492,0.639842,0.694380,0.357446,0.753734,0.428383,0.552632,0.784651,0.678338,0.863975,0.374327,0.667121,0.435225,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,0.360246,1.0,0.2,0.684211,0.10,0.428571,0.405968,0.397744,0.773071,0.658924,0.668217,0.506366,0.705778,0.530851,0.418198,0.243700,0.426961,0.424542,0.651656,0.575285,0.711770,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,0.821707,1.0,0.2,0.473684,0.10,0.571429,0.790916,0.728947,0.473147,0.159117,0.420736,0.319830,0.286360,0.632903,0.611221,0.588488,0.587319,0.023339,0.247644,0.485109,0.293493,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,0.591905,1.0,0.2,0.000000,0.20,0.642857,0.625563,0.657519,0.691755,0.417984,0.400969,0.412423,0.552476,0.497989,0.591236,0.464490,0.770389,0.362829,0.524435,0.515009,0.491398,,0.0,1.0,1.0,0.0,0.0,0.0
47341,47341,0.219458,1.0,0.2,0.210526,0.40,0.500000,0.707207,0.154887,0.718608,0.497976,0.506202,0.285880,0.412736,0.486620,0.085154,0.751145,0.468944,0.386455,0.336362,0.495669,0.350131,,0.0,1.0,1.0,0.0,0.0,0.0
47342,47342,0.183385,1.0,0.2,0.526316,0.55,0.142857,0.661411,0.500564,0.709909,0.576234,0.570930,0.616319,0.305818,0.501872,0.695631,0.403207,0.645303,0.419244,0.385232,0.641255,0.638888,,0.0,1.0,1.0,0.0,0.0,0.0
47343,47343,0.493289,1.0,0.2,0.421053,0.80,0.785714,0.263514,0.404887,0.691377,0.449017,0.689341,0.605710,0.745283,0.402732,0.557100,0.143328,0.524518,0.688144,0.689755,0.338811,0.574144,,0.0,1.0,1.0,0.0,0.0,0.0


## 正規化&標準化

In [6]:
def clip_outlier(df, col):
    '''Clip one column to the Tukey fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    The quartiles are computed while ignoring NaN entries, so missing values
    stay missing instead of turning the fences (and with them the whole
    column) into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : str
        Label of the numeric column to clip.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    '''
    q1, q3 = np.nanpercentile(df[col], [25, 75])
    outlier_step = 1.5 * (q3 - q1)

    # Series.clip leaves NaN entries untouched while bounding everything else.
    df.loc[:, col] = df[col].clip(lower=q1 - outlier_step, upper=q3 + outlier_step)

    return df


def clip_manual_outlier(df, col, percentile, upper=True):
    '''Clip one column at a caller-chosen quantile, on one side only.

    The threshold is computed while ignoring NaN entries, so missing values
    stay missing instead of turning the whole column into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : str
        Label of the numeric column to clip.
    percentile : float
        Quantile used as the threshold, given as a fraction in ``[0, 1]``
        (it is passed to ``np.nanquantile``, not to a 0-100 percentile function).
    upper : bool, default True
        If True, values above the threshold are lowered to it; if False,
        values below the threshold are raised to it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    '''
    threshold = np.nanquantile(df[col], percentile)

    # Series.clip leaves NaN entries untouched.
    if upper:
        df.loc[:, col] = df[col].clip(upper=threshold)
    else:
        df.loc[:, col] = df[col].clip(lower=threshold)

    return df


def preprocess_dataset(df):
    """Robust-scale every column of ``df`` and then center each column on its mean.

    Two scikit-learn steps are fitted on ``df`` and applied to it in sequence:

    * ``RobustScaler``: scaling that uses the interquartile range as denominator.
    * ``StandardScaler(with_std=False)``: subtracts the column mean only; no
      division by the standard deviation.

    A ``MinMaxScaler`` (rescale to the 0-1 range) was previously tried as the
    first step in place of ``RobustScaler``.

    Returns a new DataFrame with the same index and columns as ``df``.
    """
    steps = (RobustScaler(), StandardScaler(with_std=False))
    scaled_values = make_pipeline(*steps).fit_transform(df)

    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

def normalize_dataset(df):
    """Rescale every column of ``df`` to the 0-1 range with ``MinMaxScaler``.

    The scaler is fitted on ``df`` itself. Returns a new DataFrame with the
    same index and columns as ``df``.
    """
    rescaled_values = MinMaxScaler().fit_transform(df)

    return pd.DataFrame(rescaled_values, index=df.index, columns=df.columns)

def standardize_dataset(df, with_std=False):
    """Center every column of ``df`` on zero, optionally scaling to unit variance.

    With the default ``with_std=False`` only the column mean is subtracted and
    the spread of each column is left unchanged. Pass ``with_std=True`` for a
    full standardization (subtract the mean, then divide by the standard
    deviation).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to transform; the scaler is fitted on ``df`` itself.
    with_std : bool, default False
        Forwarded to ``StandardScaler(with_std=...)``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    scaler = StandardScaler(with_std=with_std)
    transformed = scaler.fit_transform(df)

    return pd.DataFrame(transformed, columns=df.columns, index=df.index)

In [7]:
# Every column except the row identifier and the target is treated as a feature.
feature_list = [
    name
    for name in df_dataset.columns
    if name not in (Config.row_id, Config.target)
]

In [8]:
# Mean-center the feature columns (fitted on train and test rows together) and
# write them back; the id and target columns are left as they are.
centered_features = standardize_dataset(df_dataset[feature_list])
df_dataset[feature_list] = centered_features
df_dataset.describe()

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7,attribute_1_material_8
count,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0,26570.0,47345.0,47345.0,47345.0,47345.0,47345.0,47345.0
mean,23672.0,4.352252e-18,1.056547e-16,1.200621e-16,4.802485e-18,5.883045e-17,5.522858e-17,6.633433000000001e-17,5.616657000000001e-17,-3.2004060000000005e-17,-7.888458e-18,-1.9660170000000002e-17,5.3127500000000006e-17,4.352252e-18,4.052097e-18,-3.3017090000000005e-17,-1.080559e-17,2.401243e-18,5.4027960000000004e-17,-1.650854e-17,2.5213050000000003e-17,-1.082435e-17,0.212608,5.762983e-17,-5.762983e-17,9.604971e-17,1.920994e-17,0.0,0.0
std,13667.468584,0.195252,0.3711066,0.3505433,0.2168308,0.2098766,0.2397496,0.1854004,0.1850241,0.1856493,0.1877915,0.1887152,0.1887331,0.1899869,0.1952933,0.1866626,0.1960642,0.2008793,0.1965434,0.197026,0.1896377,0.1998389,0.40916,0.4713896,0.4713896,0.4699586,0.4720797,0.0,0.0
min,0.0,-0.4979867,-0.546008,-0.5565023,-0.3900973,-0.4264452,-0.4381304,-0.4993271,-0.499631,-0.5000053,-0.4998123,-0.499878,-0.5000817,-0.5001936,-0.4996057,-0.4996486,-0.4981696,-0.5022421,-0.4991222,-0.5020886,-0.50006,-0.4996969,0.0,-0.333277,-0.666723,-0.329285,-0.3352413,0.0,0.0
25%,11836.0,-0.1433106,-0.296008,-0.3565023,-0.179571,-0.1764452,-0.1524162,-0.1243271,-0.124631,-0.1250053,-0.1248123,-0.124878,-0.1250817,-0.1251936,-0.1246057,-0.1246486,-0.1231696,-0.1272421,-0.1241222,-0.1270886,-0.12506,-0.1246969,0.0,-0.333277,-0.666723,-0.329285,-0.3352413,0.0,0.0
50%,23672.0,-0.02535929,-0.04600803,0.04349773,-0.02167627,-0.02644524,-0.009559012,-0.001391632,0.0001810498,0.000183833,-5.074791e-06,-0.001815948,0.0006898689,0.0004942705,-9.103326e-05,0.007302745,0.004980432,-0.01230746,0.0001618533,0.002017435,-0.0008312665,-0.0006008772,0.0,-0.333277,0.333277,-0.329285,-0.3352413,0.0,0.0
75%,35508.0,0.114819,0.453992,0.2434977,0.1362185,0.1235548,0.1332981,0.1256729,0.125369,0.1249947,0.1251877,0.125122,0.1249183,0.1248064,0.1253943,0.1253514,0.1268304,0.1227579,0.1258778,0.1229114,0.12494,0.1253031,0.0,0.666723,0.333277,0.670715,0.6647587,0.0,0.0
max,47344.0,0.5020133,0.453992,0.4434977,0.6099027,0.5735548,0.5618696,0.5006729,0.500369,0.4999947,0.5001877,0.500122,0.4999183,0.4998064,0.5003943,0.5003514,0.5018304,0.4977579,0.5008778,0.4979114,0.49994,0.5003031,1.0,0.666723,0.333277,0.670715,0.6647587,0.0,0.0


In [9]:
# Display the combined frame after the feature columns were centered.
df_dataset

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7,attribute_1_material_8
0,0,-0.248981,0.453992,-0.356502,-0.021676,-0.026445,-0.152416,0.046469,0.148113,-0.261730,0.342805,0.004773,0.217511,-0.147992,-0.036281,-0.178646,0.486223,-0.113305,-0.008071,-0.270465,-0.219982,0.106514,0.0,-0.333277,0.333277,-0.329285,-0.335241,0.0,0.0
1,1,-0.223572,0.453992,-0.356502,0.346745,-0.276445,-0.223845,0.078939,-0.035721,0.110623,0.073145,0.200316,-0.219603,0.201064,0.253237,-0.138800,-0.006045,-0.164407,-0.094197,-0.087239,-0.107618,-0.032760,0.0,-0.333277,0.333277,-0.329285,-0.335241,0.0,0.0
2,2,-0.236621,0.453992,-0.356502,0.241482,-0.376445,-0.080988,0.049659,-0.014669,-0.074514,0.140030,0.194502,-0.142636,0.253541,-0.071222,0.052983,0.286481,0.176096,0.364853,-0.127762,0.167061,-0.064472,0.0,-0.333277,0.333277,-0.329285,-0.335241,0.0,0.0
3,3,-0.137741,0.453992,-0.356502,0.294113,-0.326445,-0.009559,-0.093359,-0.101887,0.273066,0.159112,0.168339,0.006284,0.205585,0.031246,-0.081451,-0.254470,-0.075281,-0.074580,0.149567,0.075225,0.212073,0.0,-0.333277,0.333277,-0.329285,-0.335241,0.0,0.0
4,4,0.323720,0.453992,-0.356502,0.083587,-0.326445,0.133298,0.291589,0.229316,-0.026859,-0.340695,-0.079142,-0.180251,-0.213834,0.133298,0.111573,0.090318,0.085077,-0.475783,-0.254445,-0.014951,-0.206204,0.0,-0.333277,0.333277,-0.329285,-0.335241,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,0.093918,0.453992,-0.356502,-0.390097,-0.226445,0.204727,0.126236,0.157888,0.191750,-0.081829,-0.098909,-0.087659,0.052283,-0.001616,0.091588,-0.033679,0.268147,-0.136293,0.022346,0.014949,-0.008299,,-0.333277,0.333277,0.670715,-0.335241,0.0,0.0
47341,47341,-0.278529,0.453992,-0.356502,-0.179571,-0.026445,0.061870,0.207880,-0.344744,0.218603,-0.001836,0.006324,-0.214202,-0.087458,-0.012986,-0.414495,0.252976,-0.033299,-0.112667,-0.165726,-0.004391,-0.149566,,-0.333277,0.333277,0.670715,-0.335241,0.0,0.0
47342,47342,-0.314601,0.453992,-0.356502,0.136218,0.123555,-0.295273,0.162084,0.000933,0.209904,0.076421,0.071052,0.116238,-0.194376,0.002266,0.195982,-0.094962,0.143061,-0.079878,-0.116857,0.141195,0.139191,,-0.333277,0.333277,0.670715,-0.335241,0.0,0.0
47343,47343,-0.004697,0.453992,-0.356502,0.030955,0.373555,0.347584,-0.235814,-0.094744,0.191371,-0.050795,0.189463,0.105628,0.245089,-0.096874,0.057452,-0.354842,0.022276,0.189022,0.187666,-0.161249,0.074447,,-0.333277,0.333277,0.670715,-0.335241,0.0,0.0


## Save Dataset

In [10]:
## Separate train dataset and test dataset
# df_dataset was built as train rows followed by test rows with a fresh index,
# so a positional split at len(df_train) recovers the two parts.
train_len = len(df_train)

# Build independent frames instead of mutating a slice of df_dataset in place,
# which raises SettingWithCopyWarning (hidden here by the global warnings filter).
train = df_dataset.iloc[:train_len].copy()
test = df_dataset.iloc[train_len:].drop(columns=[Config.target])

In [11]:
# Write both splits as zip-compressed pickles named after this notebook's id.
train.to_pickle(Config.processed_data_dir + f"nb{Config.NB}_train.pkl", compression='zip')
test.to_pickle(Config.processed_data_dir + f"nb{Config.NB}_test.pkl", compression='zip')

## 検証メモ