In [172]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import datetime
import shutil

In [173]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample = pd.read_csv('./data/sample_submission.csv')

In [174]:
no_need_idx = train.nunique()[train.nunique() == 1].index
train.drop(no_need_idx,axis=1,inplace=True)
test.drop(no_need_idx,axis=1,inplace=True)

In [175]:
train

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


In [176]:
test

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0


In [177]:
sample

Unnamed: 0,ID,y
0,1,100.669318
1,2,100.669318
2,3,100.669318
3,4,100.669318
4,5,100.669318
...,...,...
4204,8410,100.669318
4205,8411,100.669318
4206,8413,100.669318
4207,8414,100.669318


In [178]:
int_idx = train.dtypes[train.dtypes == 'int64'].index.drop('ID')

In [179]:
obj_idx = train.dtypes[train.dtypes != 'int64'].index.drop('y')

In [180]:
train.loc[:,obj_idx]

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n
...,...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d,q
4205,j,o,t,d,d,aa,h,h
4206,ak,v,r,a,d,aa,g,e
4207,al,r,e,f,d,aa,l,u


In [181]:
test.loc[:,obj_idx]

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,az,v,n,f,d,t,a,w
1,t,b,ai,a,d,b,g,y
2,az,v,as,f,d,a,j,j
3,az,l,n,f,d,z,l,n
4,w,s,as,c,d,y,i,m
...,...,...,...,...,...,...,...,...
4204,aj,h,as,f,d,aa,j,e
4205,t,aa,ai,d,d,aa,j,y
4206,y,v,as,f,d,aa,d,w
4207,ak,v,as,a,d,aa,c,q


In [182]:
df_obj = pd.concat([train.loc[:,obj_idx],test.loc[:,obj_idx]])

In [183]:
train.loc[:,obj_idx].nunique().sum()

195

In [184]:
encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
X_encoded = encoder.fit_transform(train.loc[:,obj_idx])
X_encoded = pd.DataFrame(X_encoded,columns=encoder.get_feature_names_out())
X_test_encoded = encoder.transform(test.loc[:,obj_idx])
X_test_encoded = pd.DataFrame(X_test_encoded,columns=encoder.get_feature_names_out())

モデルの構築

In [185]:
X = pd.concat([train.loc[:,int_idx],X_encoded],axis=1)
y = train['y']
X_test = pd.concat([test.loc[:,int_idx],X_test_encoded],axis=1)

In [186]:
df_corr = pd.DataFrame(columns=['column1','column2','corr'])
for i in range(X.shape[1]):
    for j in range(X.shape[1]):
        if i>j:
            df_corr_tmp = pd.DataFrame(data={'column1':[X.iloc[:,i].name],'column2':[X.iloc[:,j].name],'corr':[X.iloc[:,i].corr(X.iloc[:,j])]})
            df_corr = pd.concat([df_corr,df_corr_tmp])
        else:
            pass

In [187]:
high_corr_idx = df_corr[abs(df_corr['corr'])>0.2]['column1'].unique()
high_corr_idx = high_corr_idx.tolist()

In [189]:
X = X.drop(high_corr_idx,axis=1)
X_test = X_test.drop(high_corr_idx,axis=1)

In [190]:
kf = KFold(n_splits=5)

In [191]:
model_list = []

for train_idx,valid_idx in kf.split(X,y):
    X_train,X_valid = X.iloc[train_idx,:],X.iloc[valid_idx,:]
    y_train,y_valid = y[train_idx],y[valid_idx]
    model = LinearRegression()
    model.fit(X_train,y_train)
    model_list.append(model)

In [192]:
train_r2_score_list = []
valid_r2_score_list = []

for model in model_list:
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_r2_score_list.append(r2_score(y_train,y_train_pred))
    valid_r2_score_list.append(r2_score(y_valid,y_valid_pred))

print('train_r2_score_list:',train_r2_score_list)
print('valid_r2_score_list:',valid_r2_score_list)

train_r2_score_list: [-4.474039766818962e+21, -9.187236744693352e+24, -5.190767230379853e+23, -9.815480934579715e+21, 0.18834464644042115]
valid_r2_score_list: [0.2214001564748801, 0.22746309060705816, 0.2285753233810952, 0.21837033017417584, -6.416572638946423e+22]


テストデータの予測

In [193]:
y_test_df = pd.DataFrame()

for i,model in enumerate(model_list):
    y_test = model.predict(X_test)
    y_test = pd.Series(y_test,name=f'model_{i}')
    y_test_df = pd.concat([y_test_df,y_test],axis=1)

y_test_df['mean'] = y_test_df.apply(np.mean,axis=1)
y_test_df

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,mean
0,8.013925e+01,78.312769,81.530184,79.735766,81.914609,8.032652e+01
1,9.965448e+01,99.280247,100.520529,100.539012,102.919275,1.005827e+02
2,1.007259e+02,98.442815,100.524092,100.461994,102.959875,1.006229e+02
3,-1.697235e+11,88.669018,91.287020,87.423754,87.418014,-3.394469e+10
4,9.261507e+01,92.434569,92.783614,92.739338,96.288759,9.337227e+01
...,...,...,...,...,...,...
4204,1.035221e+02,103.647281,105.740129,107.740316,109.948111,1.061196e+02
4205,1.001546e+02,99.510279,100.537520,100.902912,102.947345,1.008105e+02
4206,9.867933e+01,97.871279,98.852401,99.337547,103.681583,9.968443e+01
4207,1.020462e+02,105.832180,103.260176,104.832111,107.026619,1.045995e+02


In [194]:
submit = pd.concat([test['ID'],y_test_df['mean']],axis=1)
submit.columns = ['ID','y']
now = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
submit_filename = f'submit_{now}' + '.csv'
submit.to_csv(submit_filename,index=None)
shutil.move(submit_filename,'./submit_files')

'./submit_files/submit_2022-07-08-17:35:59.csv'