# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide constants: notebook ids, data directories, CV settings, column names."""

    # Notebook ids used in artifact file names (nb<ID>_train.pkl / nb<ID>_test.pkl):
    # NB names the files this notebook writes, dataset_NB the files it reads.
    NB = "104"
    dataset_NB = "103"

    # Data directories, relative to the notebook location.
    raw_data_dir = "../data/raw/"
    processed_data_dir = "../data/processed/"
    interim_dir = "../data/interim/"
    submission_dir = "../data/submission/"

    # Reproducibility / cross-validation settings.
    random_seed = 42
    n_folds = 5

    # Column names of the row identifier and of the prediction target.
    row_id = "id"
    target = "failure"

## Import libraries

In [2]:
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared Plotly layout: dark theme, fixed font and a 1000x500 canvas.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}


# Hex colour lists keyed by palette name: 'Bin' has two colours, 'Cat5' has five.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline

## Load data

In [5]:
# Read the train/test frames written by notebook `Config.dataset_NB`.
# The directory comes from Config so that loading and saving share one setting.
# NOTE: unpickling can execute arbitrary code - only load artifacts you produced yourself.
df_train = pd.read_pickle(Config.processed_data_dir + f"nb{Config.dataset_NB}_train.pkl", compression='zip')
df_test = pd.read_pickle(Config.processed_data_dir + f"nb{Config.dataset_NB}_test.pkl", compression='zip')

# Stack train on top of test with a fresh 0..n-1 index; columns present in only
# one of the two frames are filled with NaN for the rows of the other frame.
df_dataset = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_dataset

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7,attribute_1_material_8
0,0,80.10,9,5,7,8,4,18.040,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,16.0265,13.034,14.684,764.100,0.0,0,1,0,0,0,1
1,1,84.89,9,5,14,3,3,18.213,11.540,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.4250,14.395,15.631,682.057,0.0,0,1,0,0,0,1
2,2,82.43,9,5,12,1,5,18.057,11.652,16.738,18.240,12.718,18.288,12.715,15.607,19.460,13.798,16.711,18.6310,14.094,17.946,663.376,0.0,0,1,0,0,0,1
3,3,101.07,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.060,12.471,16.346,18.377,10.020,15.250,15.5620,16.154,17.172,826.282,0.0,0,1,0,0,0,1
4,4,188.06,9,5,9,2,8,19.346,12.950,16.990,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.7600,13.153,16.412,579.885,0.0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,144.74,9,5,0,4,9,18.465,12.570,18.146,17.089,11.204,18.573,11.691,16.109,19.771,11.562,17.246,15.1310,15.209,16.664,696.466,,0,1,1,0,0,0
47341,47341,74.53,9,5,4,8,7,18.900,9.896,18.288,17.504,11.747,17.917,10.980,16.027,15.694,13.564,15.494,15.2960,13.812,16.501,613.249,,0,1,1,0,0,0
47342,47342,67.73,9,5,10,11,2,18.656,11.735,18.242,17.910,12.081,19.630,10.436,16.137,20.612,11.134,16.519,15.5250,14.175,17.728,783.349,,0,1,1,0,0,0
47343,47343,126.15,9,5,8,16,11,16.536,11.226,18.144,17.250,12.692,19.575,12.672,15.422,19.496,9.319,15.817,17.4030,16.437,15.179,745.210,,0,1,1,0,0,0


## 正規化&標準化

In [None]:
def clip_outlier(df, col):
    """Clip one column to its 1.5 * IQR fences, modifying ``df`` in place.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower fence and values
    above ``Q3 + 1.5 * IQR`` are lowered to the upper fence. Quartiles are
    computed while ignoring NaN, and NaN entries stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds ``col``; the column is overwritten in this frame.
    col : str
        Name of a numeric column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # nanpercentile instead of percentile: one NaN would make both quartiles
    # NaN, and clipping against NaN fences with np.maximum/np.minimum would
    # then turn every value of the column into NaN.
    q1, q3 = np.nanpercentile(df[col], [25, 75])
    outlier_step = 1.5 * (q3 - q1)

    # Replace the column (rather than writing through df.loc[:, col]) so an
    # integer column may become float when the fences are fractional.
    df[col] = df[col].clip(lower=q1 - outlier_step, upper=q3 + outlier_step)

    return df


def clip_manual_outlier(df, col, percentile, upper=True):
    """Clip one column at a chosen quantile, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds ``col``; the column is overwritten in this frame.
    col : str
        Name of a numeric column of ``df``.
    percentile : float
        Quantile in ``[0, 1]`` (passed to ``numpy.nanquantile``) whose value
        becomes the clipping threshold.
    upper : bool, default True
        If True, values above the threshold are lowered to it; if False,
        values below the threshold are raised to it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # nanquantile instead of quantile: a single NaN would otherwise yield a NaN
    # threshold and np.minimum/np.maximum would turn the whole column into NaN.
    threshold = np.nanquantile(df[col], percentile)

    # Replace the column (rather than writing through df.loc[:, col]) so an
    # integer column may become float when the threshold is fractional.
    if upper:
        df[col] = df[col].clip(upper=threshold)
    else:
        df[col] = df[col].clip(lower=threshold)

    return df


def preprocess_dataset(df):
    """Robust-scale every column of ``df`` and then centre it on its mean.

    Pipeline steps:
      * RobustScaler: scales using the interquartile range as the denominator.
      * StandardScaler(with_std=False): subtracts the column mean only; there
        is no division by the standard deviation.

    The scalers are fit on ``df`` itself. Returns a new DataFrame with the
    same index and column labels as ``df``.
    """
    # Alternative first step that was tried: MinMaxScaler() (rescale to 0..1)
    # in place of RobustScaler().
    scaler = make_pipeline(RobustScaler(), StandardScaler(with_std=False))
    scaled_values = scaler.fit_transform(df)

    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

def normalize_dataset(df):
    """Min-max normalise every column of ``df`` (rescale values to the 0..1 range).

    The MinMaxScaler is fit on ``df`` itself. Returns a new DataFrame with the
    same index and column labels as ``df``.
    """
    scaler = make_pipeline(MinMaxScaler())
    scaled_values = scaler.fit_transform(df)

    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

def standardize_dataset(df):
    """Centre every column of ``df`` on its mean.

    StandardScaler is created with ``with_std=False``, so the column mean is
    subtracted but values are NOT divided by the standard deviation.

    The scaler is fit on ``df`` itself. Returns a new DataFrame with the same
    index and column labels as ``df``.
    """
    scaler = make_pipeline(StandardScaler(with_std=False))
    centred_values = scaler.fit_transform(df)

    return pd.DataFrame(centred_values, index=df.index, columns=df.columns)

In [None]:
# Every column except the row id and the target is treated as a feature.
feature_list = [col for col in df_dataset.columns if col not in [Config.row_id, Config.target]]

# Only clip columns with more than two distinct values. On a 0/1 dummy column
# whose minority class covers fewer than 25% of the rows, Q1 == Q3, so the IQR
# is 0 and both fences coincide: clipping would overwrite the whole column
# with a single constant and destroy the feature.
clip_feature_list = [col for col in feature_list if df_dataset[col].nunique() > 2]

for feature in clip_feature_list:
    df_dataset = clip_outlier(df_dataset, feature)

df_dataset.describe()

In [None]:
# Min-max scale all feature columns; the scaler inside normalize_dataset is fit
# on the combined train + test rows held in df_dataset.
df_dataset[feature_list] = normalize_dataset(df_dataset[feature_list])
# Summary statistics of the scaled frame.
df_dataset.describe()

In [None]:
# Display the combined frame after outlier clipping and min-max scaling.
df_dataset

## Save Dataset

In [None]:
## Separate train dataset and test dataset
# df_dataset was built as concat([df_train, df_test]) with a fresh index, so
# the first len(df_train) rows are the train rows and the rest are test rows.
train_len = len(df_train)

# .copy() and a non-inplace drop produce independent frames. The previous
# `test.drop(..., inplace=True)` mutated a slice of df_dataset (chained
# assignment), which pandas flags with SettingWithCopyWarning.
train = df_dataset.iloc[:train_len].copy()
test = df_dataset.iloc[train_len:].drop(columns=[Config.target])

In [None]:
# Write both splits as zip-compressed pickles named after this notebook's id.
train_path = Config.processed_data_dir + f"nb{Config.NB}_train.pkl"
test_path = Config.processed_data_dir + f"nb{Config.NB}_test.pkl"

train.to_pickle(train_path, compression='zip')
test.to_pickle(test_path, compression='zip')

## 検証メモ

In [6]:
# (rows, columns) of the test frame as loaded from the processed pickle.
df_test.shape

(20775, 28)

In [9]:
# Load the sample submission from the raw data directory and display it.
submission = pd.read_csv(Config.raw_data_dir + 'sample_submission.csv')
submission

Unnamed: 0,id,failure
0,26570,0.0
1,26571,0.0
2,26572,0.0
3,26573,0.0
4,26574,0.0
...,...,...
20770,47340,0.0
20771,47341,0.0
20772,47342,0.0
20773,47343,0.0
