# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide constants: identifiers, data directories and column names."""

    # Notebook number; embedded in the names of the files this notebook writes.
    NB = "101"

    # Data directories, relative to the notebook's working directory.
    raw_data_dir = "../data/raw/"
    processed_data_dir = "../data/processed/"
    interim_dir = "../data/interim/"
    submission_dir = "../data/submission/"

    # Seed and fold count (not referenced in the visible cells of this notebook).
    random_seed = 42
    n_folds = 5

    # Names of the row-identifier column and the prediction-target column.
    row_id = "id"
    target = "failure"

## Import libraries

In [2]:
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared Plotly layout settings: dark theme, common font, fixed 1000x500 canvas.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}


# Hex colour lists keyed by palette name: a two-colour set and a five-colour set.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
)

In [4]:
from sklearn.preprocessing import LabelEncoder

## Load data

In [5]:
# Load the raw train and test CSVs from the configured raw-data directory
# (Config.raw_data_dir) instead of repeating the path as a literal.
df_train = pd.read_csv(Config.raw_data_dir + "train.csv")
df_test = pd.read_csv(Config.raw_data_dir + "test.csv")

# Stack train on top of test so the encodings below are applied to both splits
# at once; the index is renumbered from 0 across the combined frame.
df_dataset = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=True)
df_dataset

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.10,material_7,material_8,9,5,7,8,4,18.040,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.100,0.0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,11.540,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0.0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,11.652,16.738,18.240,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0.0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.060,12.471,16.346,18.377,10.020,15.250,15.562,16.154,17.172,826.282,0.0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,12.950,16.990,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.760,13.153,16.412,579.885,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,I,144.74,material_7,material_5,9,5,0,4,9,18.465,12.570,18.146,17.089,11.204,18.573,11.691,,19.771,11.562,17.246,15.131,15.209,,696.466,
47341,47341,I,74.53,material_7,material_5,9,5,4,8,7,18.900,9.896,18.288,,11.747,17.917,10.980,16.027,15.694,13.564,15.494,15.296,13.812,16.501,613.249,
47342,47342,I,67.73,material_7,material_5,9,5,10,11,2,18.656,,18.242,17.910,12.081,19.630,10.436,16.137,20.612,11.134,16.519,15.525,14.175,17.728,783.349,
47343,47343,I,126.15,material_7,material_5,9,5,8,16,11,16.536,11.226,18.144,17.250,12.692,19.575,12.672,15.422,19.496,9.319,15.817,17.403,16.437,15.179,745.210,


## product_codeをLabelEncodingする

In [6]:
# Columns that the next cell converts to integer codes with LabelEncoder.
category_cols = ["product_code"]

In [7]:
# Replace each listed column with integer codes. The encoder is fitted on the
# combined train+test frame, so both splits share one label-to-code mapping.
for col in category_cols:
    le = LabelEncoder()
    df_dataset[col] = le.fit_transform(df_dataset[col])


In [8]:
# Summary statistics for the numeric columns of the combined train+test frame
# (product_code is included because it was integer-encoded in the previous cell).
df_dataset.describe()

Unnamed: 0,id,product_code,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
count,47345.0,47345.0,46872.0,47345.0,47345.0,47345.0,47345.0,47345.0,46635.0,46398.0,46161.0,45925.0,45688.0,45451.0,45214.0,44978.0,44741.0,44504.0,44268.0,44031.0,43794.0,43557.0,43321.0,26570.0
mean,23672.0,3.974865,127.742337,7.184032,6.782511,7.432422,8.552793,6.199683,17.792378,11.729944,17.132315,17.51297,11.71412,19.027367,11.425098,16.12046,19.029202,11.795419,15.689501,16.081604,15.048413,16.537596,701.322119,0.212608
std,13667.468584,2.56568,39.08441,1.484426,1.752717,4.186657,4.274405,3.550439,1.001631,1.000809,1.001835,0.997767,1.000943,1.007187,0.999505,1.477943,1.559309,1.438893,1.248694,1.457224,1.548537,1.682775,126.381679,0.40916
min,0.0,0.0,33.16,5.0,4.0,0.0,0.0,0.0,13.565,7.384,12.073,12.715,7.853,14.885,7.537,9.167,12.461,5.167,9.209,8.415,8.417,9.701,1.671,0.0
25%,11836.0,2.0,99.7775,6.0,5.0,4.0,5.0,4.0,17.117,11.05,16.45,16.843,11.041,18.346,10.753,15.164,17.958,10.861,14.884,15.137,14.008,15.371,618.905,0.0
50%,23672.0,4.0,122.26,7.0,7.0,7.0,8.0,6.0,17.788,11.731,17.132,17.514,11.708,19.029,11.423,16.118,19.053,11.821,15.672,16.081,15.01,16.558,701.22,0.0
75%,35508.0,6.0,149.01,9.0,8.0,10.0,11.0,8.0,18.474,12.411,17.813,18.186,12.388,19.708,12.098,17.08175,20.107,12.751,16.467,17.048,16.056,17.698,784.373,0.0
max,47344.0,8.0,385.86,9.0,9.0,30.0,33.0,28.0,21.499,16.484,21.681,21.543,15.828,23.807,15.412,23.354,25.64,18.962,22.713,23.14,22.097,24.094,1312.794,1.0


## attribute_0/1をone-hotEncodingする

In [9]:
# Show the distinct labels in each attribute column before encoding them.
for attr_col in ('attribute_0', 'attribute_1'):
    display(df_dataset[attr_col].unique())

array(['material_7', 'material_5'], dtype=object)

array(['material_8', 'material_5', 'material_6', 'material_7'],
      dtype=object)

In [10]:
# With no `columns` argument, get_dummies one-hot encodes every object/string/
# category column and passes the rest through unchanged. At this point those
# are attribute_0 and attribute_1 (product_code was integer-encoded earlier).
df_dataset = pd.get_dummies(data=df_dataset)

## Save Dataset

In [11]:
## Separate train dataset and test dataset
# df_train was concatenated first, so the first len(df_train) rows of
# df_dataset are the training rows and the remainder are the test rows.
train_len = len(df_train)

# Build independent frames: .copy() and a non-inplace drop avoid mutating a
# slice of df_dataset in place, which can raise SettingWithCopyWarning.
# The original row index is kept (test rows continue from train_len).
train = df_dataset.iloc[:train_len].copy()
test = df_dataset.iloc[train_len:].drop(columns=[Config.target])

# Sanity checks: no rows lost in the split, and the target is gone from test.
assert len(train) + len(test) == len(df_dataset)
assert Config.target not in test.columns

In [12]:
# Write both processed splits as zip-compressed pickles named
# nb<NB>_<split>.pkl inside Config.processed_data_dir.
for split_name, split_df in (('train', train), ('test', test)):
    out_path = f"{Config.processed_data_dir}nb{Config.NB}_{split_name}.pkl"
    pd.to_pickle(split_df, out_path, compression='zip')

## 検証メモ