In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!ls /content/drive/MyDrive/Amex/

Models	Predictions	 test.parquet	   train_labels.csv
OOF	test_fe.parquet  train_fe.parquet  train.parquet


## データを見ていろいろ考える
- 日付ごとの特徴があるか
- 曜日ごとに特徴があるか
- testとtrainで分布が同じか

In [4]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

In [5]:
train = pd.read_parquet('/content/drive/MyDrive/Amex/train.parquet')

In [6]:
train_labels = pd.read_csv('/content/drive/MyDrive/Amex/train_labels.csv')

In [7]:
train = train.merge(train_labels, how = 'inner', on = 'customer_ID')

In [8]:
train['S_2'] = pd.to_datetime(train['S_2'])
train['year'] = train['S_2'].dt.year
train['month'] = train['S_2'].dt.month
train['day'] = train['S_2'].dt.day
train['dayofweek'] = train['S_2'].dt.dayofweek

In [9]:
train_0 = train.query('target == 0')
train_1 = train.query('target == 1')

In [10]:
date_cols = ['year', 'month', 'day', 'dayofweek', 'target']

In [11]:
#train_0 = train_0.drop(['customer_ID', 'target'] , axis=1)
#train_1 = train_1.drop(['customer_ID', 'target'] , axis=1)
train_0 = train_0[['S_2', 'year', 'month', 'day', 'day', 'dayofweek']]
train_1 = train_1[['S_2', 'year', 'month', 'day', 'day', 'dayofweek']]

In [12]:
train_0.head()

Unnamed: 0,S_2,year,month,day,day.1,dayofweek
0,2017-03-09,2017,3,9,9,3
1,2017-04-07,2017,4,7,7,4
2,2017-05-28,2017,5,28,28,6
3,2017-06-13,2017,6,13,13,1
4,2017-07-16,2017,7,16,16,6


In [13]:
# 月毎のカウント数
# 週ごとのカウント数
# 日ごとのカウント数
# 曜日ごとのカウント数

In [14]:
train_0

Unnamed: 0,S_2,year,month,day,day.1,dayofweek
0,2017-03-09,2017,3,9,9,3
1,2017-04-07,2017,4,7,7,4
2,2017-05-28,2017,5,28,28,6
3,2017-06-13,2017,6,13,13,1
4,2017-07-16,2017,7,16,16,6
...,...,...,...,...,...,...
5531446,2017-11-05,2017,11,5,5,6
5531447,2017-12-23,2017,12,23,23,5
5531448,2018-01-06,2018,1,6,6,5
5531449,2018-02-06,2018,2,6,6,1


In [15]:
weight = train_0.shape[0] / train_1.shape[0]

In [16]:
weight

3.0144970240276834

In [17]:
train_1.describe()

Unnamed: 0,year,month,day,day.1,dayofweek
count,1377869.0,1377869.0,1377869.0,1377869.0,1377869.0
mean,2017.254,6.244771,16.15215,16.15215,2.942635
std,0.4354552,3.519213,8.61214,8.61214,1.926155
min,2017.0,1.0,1.0,1.0,0.0
25%,2017.0,3.0,9.0,9.0,1.0
50%,2017.0,6.0,16.0,16.0,3.0
75%,2018.0,9.0,23.0,23.0,5.0
max,2018.0,12.0,31.0,31.0,6.0


In [18]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
pal, color=['#016CC9','#DEB078'], ['#8DBAE2','#EDD3B3']
rgb=['rgba'+str(matplotlib.colors.to_rgba(i,0.7)) for i in pal]
temp=dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), 
                           height=500, width=2000))


fig = make_subplots(rows=1, cols=4, 
                    subplot_titles=date_cols[:-1], # targetだけ除く
                    vertical_spacing=0.1)
row=0
c=[1,2,3,4]*5
plot_df=train[date_cols]
for i,col in enumerate(date_cols[:-1]):
    if i%4==0:
        row+=1
    plot_df[col]=plot_df[col].astype(object)
    df=plot_df.groupby(col)['target'].value_counts().rename('count').reset_index().replace('',np.nan)
    
    fig.add_trace(go.Bar(x=df[df.target==1][col], y=df[df.target==1]['count'] * weight, # scale揃える
                         marker_color=rgb[1], marker_line=dict(color=pal[1],width=2), 
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Default', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    fig.add_trace(go.Bar(x=df[df.target==0][col], y=df[df.target==0]['count'],
                         marker_color=rgb[0], marker_line=dict(color=pal[0],width=2),
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Paid', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    if i%4==0:
        fig.update_yaxes(title='Frequency',row=row,col=c[i])
fig.update_layout(template=temp,title="Distribution of Categorical Variables",
                  legend=dict(orientation="h",yanchor="bottom",y=1.03,xanchor="right",x=0.2),
                  barmode='group',height=400,width=1200)
fig.show()

In [19]:
train_sample1 = train.sample(2800000)

In [20]:
train_sample2 = train.drop(train_sample1.index)

In [21]:
fig = make_subplots(rows=1, cols=4, 
                    subplot_titles=date_cols[:-1], # targetだけ除く
                    vertical_spacing=0.1)
row=0
c=[1,2,3,4]*5
plot_df=train_sample1[date_cols]
for i,col in enumerate(date_cols[:-1]):
    if i%4==0:
        row+=1
    plot_df[col]=plot_df[col].astype(object)
    df=plot_df.groupby(col)['target'].value_counts().rename('count').reset_index().replace('',np.nan)
    
    fig.add_trace(go.Bar(x=df[df.target==1][col], y=df[df.target==1]['count'] * weight, # scale揃える
                         marker_color=rgb[1], marker_line=dict(color=pal[1],width=2), 
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Default', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    fig.add_trace(go.Bar(x=df[df.target==0][col], y=df[df.target==0]['count'],
                         marker_color=rgb[0], marker_line=dict(color=pal[0],width=2),
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Paid', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    if i%4==0:
        fig.update_yaxes(title='Frequency',row=row,col=c[i])
fig.update_layout(template=temp,title="Distribution of Categorical Variables",
                  legend=dict(orientation="h",yanchor="bottom",y=1.03,xanchor="right",x=0.2),
                  barmode='group',height=400,width=1200)
fig.show()

In [22]:
fig = make_subplots(rows=1, cols=4, 
                    subplot_titles=date_cols[:-1], # targetだけ除く
                    vertical_spacing=0.1)
row=0
c=[1,2,3,4]*5
plot_df=train_sample2[date_cols]
for i,col in enumerate(date_cols[:-1]):
    if i%4==0:
        row+=1
    plot_df[col]=plot_df[col].astype(object)
    df=plot_df.groupby(col)['target'].value_counts().rename('count').reset_index().replace('',np.nan)
    
    fig.add_trace(go.Bar(x=df[df.target==1][col], y=df[df.target==1]['count'] * weight, # scale揃える
                         marker_color=rgb[1], marker_line=dict(color=pal[1],width=2), 
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Default', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    fig.add_trace(go.Bar(x=df[df.target==0][col], y=df[df.target==0]['count'],
                         marker_color=rgb[0], marker_line=dict(color=pal[0],width=2),
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Paid', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    if i%4==0:
        fig.update_yaxes(title='Frequency',row=row,col=c[i])
fig.update_layout(template=temp,title="Distribution of Categorical Variables",
                  legend=dict(orientation="h",yanchor="bottom",y=1.03,xanchor="right",x=0.2),
                  barmode='group',height=400,width=1200)
fig.show()

In [23]:
# day of week / user count 7 feature
# 毎月何日に行動したか, mean, std 2 feature
# 最後の行動から何日前か, mean, std, max, 3 feature

# last / mean
# last - first
# last / first

# 分布確認する
# 学習し直し
# ensemble

## Test

In [24]:
test = pd.read_parquet('/content/drive/MyDrive/Amex/test.parquet')

In [25]:
test['S_2'] = pd.to_datetime(test['S_2'])
test['year'] = test['S_2'].dt.year
test['month'] = test['S_2'].dt.month
test['day'] = test['S_2'].dt.day
test['dayofweek'] = test['S_2'].dt.dayofweek

In [27]:
temp = test.groupby('customer_ID')['S_2'].max().reset_index()
temp['S_2_month'] = temp['S_2'].dt.month
temp['S_2_month'].value_counts()

4     467966
10    456655
Name: S_2_month, dtype: int64

In [28]:
private_custLst = temp[temp['S_2_month']==10]['customer_ID'].tolist()

In [29]:
test_pri = test[test['customer_ID'].isin(private_custLst)].copy()

In [30]:
test_pub = test.drop(test_pri.index)

In [32]:
print(len(test_pri), len(test_pub))

5644293 5719469


In [33]:
test_pub['target'] = 0
test_pri['target'] = 1

In [34]:
test = pd.concat([test_pub, test_pri])

In [35]:
date_cols = ['year', 'month', 'day', 'dayofweek', 'target']

In [36]:
test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,year,month,day,dayofweek,target
9,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,2018-04-22,0.894195,11,0.02097,1.001803,0.0,0.073243,0.0,0.001597,,,0,2,0.077769,0.034031,0,0.459191,0.340429,0.085902,-1,0.209445,0.029223,1.001762,,0,0.023032,0,0.150332,0.588535,0.295178,,0.031653,0.010088,0,1.0,0,0.044619,0.024187,2402,0.061875,,0.030294,0,0.002946,0.035853,0.009963,28,0.246552,0.075594,0.005224,9,0.328897,3,0,0,0,,1.0,0,-1,0,6,0.18439,0.005979,290,0.007812,0.003717,0,0,0.009871,0,4,0.013559,,0.0,0,0,,0.005298,0.0,0.300189,0.016686,0.002583,0,0,0,-1,0.002202,0,0,0,0.003154,0,-1,0.003865,1.0,0.022622,0,0,0.0,0,0,0,,0,0,0,-1,0,0,,1,0.006256,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.919222,0.140788,0.914135,0.973146,0.007747,0.001997,0,0.0,,-1,0,0.008936,0.023071,-1,1.006772,2,-1,0,,-1,,1.0,0.024934,0.473703,0,1,0.299809,0,4,0.288147,0.285856,0,0.832572,4,0,15,0,2,0,1.008954,0,0,,0.0,0.0,,0.002226,0,,-1,-1,-1,-1,0,0,0.0,,0,0.008436,0,2018,4,22,6,0
10,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,2018-05-28,0.913501,17,0.276014,1.005308,0.0,0.162684,0.0,0.007511,,,0,12,0.079254,0.01506,0,0.457494,0.345164,0.051944,-1,0.198607,0.088851,1.001018,,0,0.30314,0,0.15264,0.65401,0.294999,,0.049289,0.237034,0,1.0,0,0.111359,0.019633,3166,0.050986,,0.027126,0,0.004064,0.242635,0.116171,29,0.776301,0.050497,0.005353,9,0.332442,3,0,0,1,,1.0,0,-1,0,6,0.190978,0.000749,435,0.00244,0.00217,0,0,0.002156,0,4,0.070667,,0.0,0,0,,0.002995,0.0,0.307685,0.209185,0.006586,0,0,0,-1,0.005857,0,0,0,0.002688,0,-1,0.008406,1.0,0.076984,0,0,0.0,0,0,0,,0,0,0,-1,0,0,,1,0.003109,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.93699,0.132709,0.929066,0.969664,0.001641,0.000864,0,0.0,,-1,0,0.001287,0.271571,-1,1.007775,2,-1,0,,-1,,1.0,0.031882,0.522955,0,1,0.298768,0,4,0.287581,0.288523,0,0.827385,4,0,15,0,2,0,1.0033,0,0,,0.0,0.0,,0.005403,0,,-1,-1,-1,-1,0,0,0.0,,0,0.009261,0,2018,5,28,0,0
11,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,2018-06-27,0.892941,0,0.050353,1.002304,0.0,0.170599,0.0,0.002152,,,0,3,0.076948,0.355888,0,0.45686,0.349929,0.077195,-1,0.18832,0.02782,1.000562,,1,0.053909,0,0.147116,0.612161,0.298851,,0.023981,0.025124,0,1.0,0,0.109764,0.018395,2694,0.060048,0.060207,0.143482,0,0.003052,0.235553,0.024325,31,0.831362,0.069873,0.009936,11,0.332041,3,0,0,1,,1.0,0,-1,0,6,0.185684,0.002605,435,0.008879,0.006291,0,0,0.007821,0,4,0.02049,,0.0,0,0,,0.006026,0.0,0.300942,0.034657,0.001051,0,0,0,-1,0.008829,0,0,0,0.006118,0,-1,0.000101,1.0,0.018192,0,0,0.0,0,0,0,,0,0,0,-1,0,0,,1,0.009526,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0.907012,0.140462,0.811676,0.97776,0.005153,0.002937,0,0.0,,-1,0,0.006827,0.042421,-1,1.007974,2,-1,0,,-1,,1.0,0.020787,0.130432,0,1,0.298817,0,4,0.301724,0.300086,0,0.834077,4,0,15,0,2,0,1.004925,0,0,,0.0,0.0,,0.000696,0,,-1,-1,-1,-1,0,0,0.0,,0,0.004962,0,2018,6,27,2,0
12,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,2018-07-14,0.794469,0,0.027894,1.002515,0.0,0.172991,0.0,0.006869,,,0,4,0.076225,0.35628,0,0.479771,0.342168,0.116785,-1,0.197272,0.030637,1.002328,,1,0.054504,0,0.145399,0.385655,0.297881,,0.025716,0.018064,0,1.0,0,0.124283,0.044464,2402,0.064804,0.065393,0.140987,0,0.001204,0.544298,0.025354,32,0.377075,0.11312,0.005398,7,0.326891,3,0,0,0,,1.0,0,-1,0,6,0.192328,0.009734,290,0.005492,0.007609,0,0,0.009499,0,4,0.017435,,0.0,0,0,,0.005738,0.0,0.30597,0.033734,0.002159,0,0,0,-1,0.001569,0,0,0,0.009033,0,-1,0.004135,1.0,0.03102,0,0,0.0,0,0,0,,0,0,0,-1,0,0,,1,0.002603,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0.907259,0.132259,0.856068,0.974301,0.002393,0.003579,0,0.0,,-1,0,0.000363,0.025716,-1,1.003243,2,-1,0,,-1,,1.0,0.021141,0.135519,0,1,0.30154,0,4,0.298843,0.30018,0,0.841098,4,0,15,0,2,0,1.004899,0,0,,0.0,0.0,,0.003487,0,,-1,-1,-1,-1,0,0,0.0,,0,0.007293,0,2018,7,14,5,0
13,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,2018-08-15,0.874496,3,0.042944,1.005974,0.0,0.156462,0.0,0.00425,,,0,6,0.087716,0.065778,0,0.471366,0.349913,0.088597,-1,0.190287,0.039989,1.00057,,1,0.090654,0,0.151828,0.46254,0.29559,,0.01241,0.024865,0,1.0,0,0.106837,0.044878,2896,0.066502,0.061252,0.143099,0,0.009221,0.522643,0.036544,33,0.412195,0.087307,0.00388,9,0.328335,3,0,0,1,,1.0,0,-1,0,6,0.190233,0.001224,435,0.00347,0.008627,0,0,0.00628,0,4,0.029288,,0.0,0,0,,0.000318,0.0,0.306021,0.062109,0.002709,0,0,0,-1,0.005801,0,0,0,0.00894,0,-1,0.008955,1.0,0.037987,0,0,0.0,0,0,0,,0,0,0,-1,0,0,,1,0.000374,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0.929528,0.133968,0.934035,0.969459,0.005135,0.009631,0,0.0,,-1,0,0.001338,0.041389,-1,1.004382,2,-1,0,,-1,,1.0,0.040057,0.50298,0,1,0.30628,0,4,0.30271,0.300757,0,0.839806,4,0,15,0,2,0,1.00085,0,0,,0.0,0.0,,0.002761,0,,-1,-1,-1,-1,0,0,0.0,,0,0.002697,0,2018,8,15,2,0


In [37]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
pal, color=['#016CC9','#DEB078'], ['#8DBAE2','#EDD3B3']
rgb=['rgba'+str(matplotlib.colors.to_rgba(i,0.7)) for i in pal]
temp=dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), 
                           height=500, width=2000))


fig = make_subplots(rows=1, cols=4, 
                    subplot_titles=date_cols[:-1], # targetだけ除く
                    vertical_spacing=0.1)
row=0
c=[1,2,3,4]*5
plot_df=test[date_cols]
for i,col in enumerate(date_cols[:-1]):
    if i%4==0:
        row+=1
    plot_df[col]=plot_df[col].astype(object)
    df=plot_df.groupby(col)['target'].value_counts().rename('count').reset_index().replace('',np.nan)
    
    fig.add_trace(go.Bar(x=df[df.target==1][col], y=df[df.target==1]['count'], 
                         marker_color=rgb[1], marker_line=dict(color=pal[1],width=2), 
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Private', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    fig.add_trace(go.Bar(x=df[df.target==0][col], y=df[df.target==0]['count'],
                         marker_color=rgb[0], marker_line=dict(color=pal[0],width=2),
                         hovertemplate='Value %{x} Frequency = %{y}',
                         name='Public', showlegend=(True if i==0 else False)),
                  row=row, col=c[i])
    if i%4==0:
        fig.update_yaxes(title='Frequency',row=row,col=c[i])
fig.update_layout(template=temp,title="Distribution of Categorical Variables",
                  legend=dict(orientation="h",yanchor="bottom",y=1.03,xanchor="right",x=0.2),
                  barmode='group',height=400,width=1200)
fig.show()