# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide settings: data directories, seed/fold values and key column names."""

    # Notebook identifier.
    NB = '002'

    # Data directories, written relative to the notebook's working directory.
    raw_data_dir = '../data/raw/'
    processed_data_dir = '../data/processed/'
    interim_dir = '../data/interim/'
    submission_dir = '../data/submission/'

    # Seed and fold count -- presumably for cross-validation in later notebooks/cells;
    # not used in the cells visible here (TODO confirm).
    random_seed = 42
    n_folds = 5

    # Name of the row-identifier column and of the prediction target column.
    row_id = 'id'
    target = 'failure'

## Import libraries

In [2]:
import gc
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including numpy divide-by-zero and pandas deprecation warnings.
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits (rows, columns, line width) for wide tables.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared figure layout: plotly's dark theme, a 12pt "Franklin Gothic" font
# and a default canvas of 1000 x 500.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}

# Named colour sets: 'Bin' holds two colours, 'Cat5' holds five.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
)

## Load and check data

### Load data

In [4]:
# Load the raw train and test sets.
# The directory comes from Config so the data location is defined in one place
# instead of being repeated as a string literal here.
df_train = pd.read_csv(Config.raw_data_dir + 'train.csv')
df_test = pd.read_csv(Config.raw_data_dir + 'test.csv')

In [5]:
# Summary statistics (count, mean, std, quartiles) of the training set's numeric columns.
df_train.describe()

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
count,26570.0,26320.0,26570.0,26570.0,26570.0,26570.0,26570.0,26189.0,26032.0,25894.0,25774.0,25633.0,25522.0,25343.0,25270.0,25102.0,24969.0,24796.0,24696.0,24561.0,24460.0,24286.0,26570.0
mean,13284.5,127.826233,6.754046,7.240459,7.415883,8.232518,6.256568,17.791528,11.731988,17.127804,17.510759,11.716624,19.024714,11.430725,16.117711,19.172085,11.702464,15.652904,16.048444,14.995554,16.460727,701.269059,0.212608
std,7670.242662,39.03002,1.471852,1.456493,4.11669,4.199401,3.309109,1.0012,0.996085,0.996414,0.99598,1.000836,1.008591,0.999137,1.405978,1.520785,1.488838,1.155247,1.491923,1.549226,1.708935,123.304161,0.40916
min,0.0,33.16,5.0,5.0,0.0,0.0,0.0,13.968,8.008,12.073,12.715,7.968,15.217,7.537,9.323,12.461,5.167,10.89,9.14,9.104,9.701,196.787,0.0
25%,6642.25,99.9875,6.0,6.0,4.0,5.0,4.0,17.117,11.051,16.443,16.839,11.045,18.34025,10.757,15.209,18.17,10.703,14.89,15.057,13.957,15.268,618.9615,0.0
50%,13284.5,122.39,6.0,8.0,7.0,8.0,6.0,17.787,11.733,17.132,17.516,11.712,19.021,11.43,16.127,19.2115,11.717,15.6285,16.04,14.969,16.436,701.0245,0.0
75%,19926.75,149.1525,8.0,8.0,10.0,11.0,8.0,18.469,12.41,17.805,18.178,12.391,19.708,12.102,17.025,20.207,12.709,16.374,17.082,16.018,17.628,784.09025,0.0
max,26569.0,385.86,9.0,9.0,29.0,29.0,24.0,21.499,16.484,21.425,21.543,15.419,23.807,15.412,22.479,25.64,17.663,22.713,22.303,21.626,24.094,1312.794,1.0


In [6]:
# Same summary for the test set, to compare ranges and missing counts with train.
df_test.describe()

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
count,20775.0,20552.0,20775.0,20775.0,20775.0,20775.0,20775.0,20446.0,20366.0,20267.0,20151.0,20055.0,19929.0,19871.0,19708.0,19639.0,19535.0,19472.0,19335.0,19233.0,19097.0,19035.0
mean,36957.0,127.634895,7.733959,6.196823,7.453574,8.962407,6.126931,17.793466,11.727331,17.13808,17.515797,11.710919,19.030765,11.417921,16.123986,18.846571,11.91423,15.736104,16.123958,15.115915,16.636052,701.389816
std,5997.370257,39.154642,1.308535,1.917478,4.274477,4.3342,3.835881,1.002206,1.006834,1.008714,1.000067,1.001096,1.005401,0.999953,1.565414,1.588642,1.363253,1.357019,1.410569,1.545069,1.643463,130.205829
min,26570.0,37.7,6.0,4.0,0.0,0.0,0.0,13.565,7.384,12.215,13.539,7.853,14.885,7.578,9.167,13.127,6.116,9.209,8.415,8.417,10.162,1.671
25%,31763.5,99.47,6.0,4.0,4.0,6.0,3.0,17.119,11.04825,16.457,16.847,11.035,18.351,10.744,15.095,17.714,11.069,14.871,15.238,14.082,15.512,618.7235
50%,36957.0,122.11,7.0,5.0,7.0,9.0,6.0,17.789,11.729,17.132,17.51,11.704,19.04,11.414,16.1095,18.81,11.941,15.734,16.119,15.062,16.706,701.379
75%,42150.5,148.84,9.0,7.0,10.0,12.0,8.0,18.478,12.411,17.8245,18.197,12.385,19.707,12.093,17.156,19.9675,12.791,16.605,17.0015,16.107,17.781,784.8725
max,47344.0,385.57,9.0,9.0,30.0,33.0,28.0,21.389,15.623,21.681,21.183,15.828,23.092,15.091,23.354,24.95,18.962,21.677,23.14,22.097,22.27,1242.786


In [7]:
# Feature columns = every training column except the row id, the target and product_code.
features_list = df_train.columns.drop([Config.row_id, Config.target, 'product_code']).tolist()
features_list

['loading',
 'attribute_0',
 'attribute_1',
 'attribute_2',
 'attribute_3',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17']

## 基本情報

In [8]:
# Non-null count of every remaining column per (product_code, attribute_0..attribute_3) combination in train.
df_train.groupby(['product_code','attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
product_code,attribute_0,attribute_1,attribute_2,attribute_3,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
A,material_7,material_8,9,5,5100,5051,5100,5100,5100,5032,4998,4949,4925,4924,4913,4875,4843,4804,4774,4770,4784,4723,4712,4674,5100
B,material_5,material_5,8,8,5250,5209,5250,5250,5250,5166,5155,5135,5117,5057,5038,4999,4983,4963,4952,4889,4909,4862,4791,4800,5250
C,material_7,material_8,5,8,5765,5716,5765,5765,5765,5689,5642,5603,5583,5573,5523,5495,5484,5469,5432,5404,5337,5327,5323,5296,5765
D,material_7,material_5,6,6,5112,5063,5112,5112,5112,5043,5016,5001,4956,4941,4917,4874,4871,4824,4798,4733,4720,4706,4710,4646,5112
E,material_7,material_6,6,9,5343,5281,5343,5343,5343,5259,5221,5206,5193,5138,5131,5100,5089,5042,5013,5000,4946,4943,4924,4870,5343


In [9]:
# Same non-null counts for the test set.
df_test.groupby(['product_code','attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
product_code,attribute_0,attribute_1,attribute_2,attribute_3,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
F,material_5,material_6,6,4,5422,5355,5422,5422,5422,5345,5312,5301,5258,5246,5201,5186,5183,5137,5081,5093,5042,5014,4966,4959
G,material_5,material_6,9,7,5107,5058,5107,5107,5107,5019,4990,4981,4945,4941,4889,4898,4815,4800,4795,4798,4766,4747,4669,4677
H,material_7,material_7,7,9,5018,4967,5018,5018,5018,4933,4930,4881,4868,4823,4830,4786,4749,4741,4708,4694,4664,4631,4615,4585
I,material_7,material_5,9,5,5228,5172,5228,5228,5228,5149,5134,5104,5080,5045,5009,5001,4961,4961,4951,4887,4863,4841,4847,4814


## 各特徴量とTargetの関係

In [13]:
# Failure rate per value bin for every numeric column of the test set
# (all of which also exist in train).
# select_dtypes is used instead of `dtype in [float, int]`, whose meaning of
# `int` depends on the platform. The name float_cols is kept although the
# list also contains integer columns such as the id.
float_cols = df_test.select_dtypes(include='number').columns.tolist()
bin_num = 20  # number of bin edges, i.e. bin_num - 1 bins per feature
df_target_prob = pd.DataFrame(data={'idx': range(bin_num - 1)})

# Grid size follows the number of columns (6 x 4 for the 22 numeric columns).
n_grid_cols = 4
n_grid_rows = max(1, -(-len(float_cols) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=float_cols,
    shared_xaxes=False,  # every panel keeps its own x axis
    shared_yaxes=False,  # every panel keeps its own y axis
)

is_failure = df_train[Config.target] == 1

for idx, f in enumerate(float_cols):
    # Common bin edges spanning both train and test values.
    min_val = min(df_train[f].min(), df_test[f].min())
    max_val = max(df_train[f].max(), df_test[f].max())
    bins = np.linspace(min_val, max_val, bin_num)

    # Missing values belong to no bin; drop them explicitly.
    total, _ = np.histogram(df_train[f].dropna(), bins=bins)
    failures, _ = np.histogram(df_train.loc[is_failure, f].dropna(), bins=bins)

    # Empty bins yield NaN without triggering a divide-by-zero.
    fail_prob = np.divide(
        failures,
        total,
        out=np.full(len(total), np.nan),
        where=total > 0,
    )

    df_target_prob[f'{f}_fail'] = failures
    df_target_prob[f'{f}_total'] = total
    df_target_prob[f'{f}_prob'] = fail_prob

    grid_row, grid_col = divmod(idx, n_grid_cols)
    fig.add_trace(
        go.Scatter(
            x=(bins[1:] + bins[:-1]) / 2,  # bin centres
            y=fail_prob,
            name='Failure probability',
            mode='markers',
            customdata=total,
            hovertemplate="<b>%{y:.2f}</b> <br>total: %{customdata:.2f}<extra></extra>",
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    width=1500,
    height=1500,
    showlegend=False,  # every trace has the same name; the panel titles identify them
)

fig.show()

In [14]:
# Build the correlation matrix from the numeric columns only.
# The frame also holds string columns (product_code, attribute_0, attribute_1),
# which DataFrame.corr() refuses to process on pandas >= 2.0.
df_corr = df_train.select_dtypes(include='number').corr()

fig = go.Figure(layout=plotly_template['layout'])

fig.add_trace(
    go.Heatmap(
        x=df_corr.columns,
        y=df_corr.index,
        z=df_corr.to_numpy(),
        name='Corr',
    ),
)

fig.update_layout(
    width=1500,
    height=1500,
)

fig.show()

### Check distribution

In [None]:
# Compare the train and test distributions of measurement_4.
fig = go.Figure(layout=plotly_template['layout'])

fig.add_trace(
    go.Histogram(
        x=df_train['measurement_4'],
        name='Train',
        histnorm='probability',
    )
)
fig.add_trace(
    go.Histogram(
        x=df_test['measurement_4'],
        name='Test',
        histnorm='probability',
        # Drawn on top in overlay mode: keep it translucent so the Train bars stay visible.
        opacity=0.5,
    )
)

fig.update_layout(barmode='overlay')
fig.show()

### Train と Test の分布

In [None]:
# One panel per feature on a 6 x 4 grid, with the train and test histograms overlaid.
fig = make_subplots(
    rows=6,
    cols=4,
    subplot_titles=features_list,
    shared_xaxes=False,  # independent x axis per panel
    shared_yaxes=False,  # independent y axis per panel
)

for position, feature in enumerate(features_list):
    panel_row, panel_col = divmod(position, 4)

    train_hist = go.Histogram(
        x=df_train[feature],
        name=f'{feature}(Train)',
        histnorm='probability',
        marker=dict(color=color_palette['Bin'][0]),
    )
    fig.add_trace(train_hist, row=panel_row + 1, col=panel_col + 1)

    # The test histogram is translucent so the train bars remain visible underneath.
    test_hist = go.Histogram(
        x=df_test[feature],
        name=f'{feature}(Test)',
        histnorm='probability',
        marker=dict(color=color_palette['Bin'][1]),
        opacity=0.5,
    )
    fig.add_trace(test_hist, row=panel_row + 1, col=panel_col + 1)

fig.update_layout(barmode='overlay', width=1500, height=1500)
fig.show()

### Targetの分布

In [None]:
# Distinct values taken by the target column.
df_train[Config.target].unique()

In [None]:
# Per-feature histograms of the training rows, split by target value (0 vs 1).
rows_target_0 = df_train[df_train[Config.target] == 0]
rows_target_1 = df_train[df_train[Config.target] == 1]

fig = make_subplots(
    rows=6,
    cols=4,
    subplot_titles=features_list,
    shared_xaxes=False,  # independent x axis per panel
    shared_yaxes=False,  # independent y axis per panel
)

for idx, col in enumerate(features_list):
    cell = dict(row=idx // 4 + 1, col=idx % 4 + 1)

    fig.add_trace(
        go.Histogram(
            x=rows_target_0[col],
            name=f'{col}(False)',
            histnorm='probability',
            marker=dict(color=color_palette['Bin'][0]),
        ),
        **cell,
    )
    fig.add_trace(
        go.Histogram(
            x=rows_target_1[col],
            name=f'{col}(True)',
            histnorm='probability',
            marker=dict(color=color_palette['Bin'][1]),
            opacity=0.5,  # translucent so the target-0 bars show through
        ),
        **cell,
    )

fig.update_layout(barmode='overlay', width=1500, height=1500)
fig.show()

In [None]:
# Per-feature box plots of the training rows, one box per target value.
fig = make_subplots(
    rows=6,
    cols=4,
    subplot_titles=features_list,
    shared_xaxes=False,  # independent x axis per panel
    shared_yaxes=False,  # independent y axis per panel
)

# (label used in the trace name, rows of that target value, box colour)
target_groups = [
    ('False', df_train[df_train[Config.target] == 0], color_palette['Bin'][0]),
    ('True', df_train[df_train[Config.target] == 1], color_palette['Bin'][1]),
]

for idx, col in enumerate(features_list):
    for label, subset, box_color in target_groups:
        fig.add_trace(
            go.Box(
                x=subset[col],
                name=f'{col}({label})',
                marker=dict(color=box_color),
            ),
            row=idx // 4 + 1,
            col=idx % 4 + 1,
        )

fig.update_layout(barmode='overlay', width=1500, height=1500)
fig.show()

### product_code ごとの分布

In [None]:
# Per-feature histograms for product_code 'A' only, split by target value.
product_a = df_train[df_train['product_code'] == 'A']
product_a_target_0 = product_a[product_a[Config.target] == 0]
product_a_target_1 = product_a[product_a[Config.target] == 1]

fig = make_subplots(
    rows=6,
    cols=4,
    subplot_titles=features_list,
    shared_xaxes=False,  # independent x axis per panel
    shared_yaxes=False,  # independent y axis per panel
)

for idx, col in enumerate(features_list):
    grid_row, grid_col = divmod(idx, 4)

    hist_target_0 = go.Histogram(
        x=product_a_target_0[col],
        name=f'{col}(False)',
        histnorm='probability',
        marker=dict(color=color_palette['Cat5'][0]),
    )
    fig.add_trace(hist_target_0, row=grid_row + 1, col=grid_col + 1)

    hist_target_1 = go.Histogram(
        x=product_a_target_1[col],
        name=f'{col}(True)',
        histnorm='probability',
        marker=dict(color=color_palette['Cat5'][1]),
        opacity=0.5,  # translucent so the target-0 bars show through
    )
    fig.add_trace(hist_target_1, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(barmode='overlay', width=1500, height=1500)
fig.show()

In [None]:
# Share of target == 1 ('Positive') and target == 0 ('Negative') rows per product_code, in percent.
target_rate_by_product = df_train.groupby('product_code')[Config.target].mean()
df_target_pivot = pd.DataFrame(
    data={
        'Positive': target_rate_by_product * 100,
        'Negative': (1 - target_rate_by_product) * 100,
    }
)
df_target_pivot

In [None]:
# Stacked bar chart: percentage of failing vs non-failing rows per product code.
fig = go.Figure(layout=plotly_template['layout'])

# Hover labels describe the failure target; the previous "Paid accounts" /
# "Default accounts" wording did not match this dataset.
fig.add_trace(go.Bar(x=df_target_pivot.index, y=df_target_pivot['Positive'], name='Positive',
                     text=df_target_pivot['Positive'], texttemplate='%{text:.0f}%',
                     textposition='inside', insidetextanchor="middle",
                     marker=dict(color=color_palette['Bin'][1]),
                     hovertemplate="<b>%{x}</b><br>Failure: %{y:.2f}%"))

fig.add_trace(go.Bar(x=df_target_pivot.index, y=df_target_pivot['Negative'], name='Negative',
                     text=df_target_pivot['Negative'], texttemplate='%{text:.0f}%',
                     textposition='inside', insidetextanchor="middle",
                     marker=dict(color=color_palette['Bin'][0]),
                     hovertemplate="<b>%{x}</b><br>No failure: %{y:.2f}%"))

fig.update_layout(title='Distribution of Failure',
                  barmode='relative', yaxis_ticksuffix='%', width=1400,
                  legend=dict(orientation="h", traceorder="reversed", y=1.1, x=0.8),
                  xaxis_title='product code', yaxis_title='Percentage of Positive')

fig.show()

## 欠損値

In [None]:
# Number of missing values per column in the training set.
df_train.isnull().sum()

In [None]:
# Number of missing values per column in the test set.
df_test.isnull().sum()

In [None]:
# Distinct attribute_0 values among the product_code 'A' rows.
df_train[df_train['product_code'] == 'A']['attribute_0'].unique()

In [None]:
# Preview the first rows instead of rendering the whole training frame.
df_train.head()

## 欠損を各プロダクトの中央値で補完する

In [None]:
# Replace missing values with the median of the same product_code.
# A bare groupby('product_code').transform(f) returns a frame without the
# product_code column and also applies median() to the string attribute
# columns. Only the numeric columns that actually contain NaN are imputed,
# and the result is written into a copy so all other columns are kept.
f = lambda x: x.fillna(x.median())


def _fill_by_product_median(df):
    """Return a copy of df with numeric NaNs replaced by the per-product_code median.

    Columns without missing values and non-numeric columns are left untouched,
    so calling this on an already imputed frame changes nothing.
    """
    cols_with_nan = [
        col for col in df.select_dtypes(include='number').columns
        if df[col].isna().any()
    ]
    filled = df.copy()
    if cols_with_nan:
        filled[cols_with_nan] = df.groupby('product_code')[cols_with_nan].transform(f)
    return filled


df_train = _fill_by_product_median(df_train)
df_test = _fill_by_product_median(df_test)

## loadingの検証

In [None]:
# Column means per (target, product_code) group.
# numeric_only=True skips the string attribute columns, which mean() cannot aggregate.
df_train.groupby([Config.target, 'product_code']).mean(numeric_only=True)