In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import datetime
import shutil

In [97]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample = pd.read_csv('./data/sample_submission.csv')

In [98]:
no_need_idx = train.nunique()[train.nunique() == 1].index
train.drop(no_need_idx,axis=1,inplace=True)
test.drop(no_need_idx,axis=1,inplace=True)

In [99]:
train

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


In [100]:
test

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0


In [101]:
sample

Unnamed: 0,ID,y
0,1,100.669318
1,2,100.669318
2,3,100.669318
3,4,100.669318
4,5,100.669318
...,...,...
4204,8410,100.669318
4205,8411,100.669318
4206,8413,100.669318
4207,8414,100.669318


In [102]:
int_idx = train.dtypes[train.dtypes == 'int64'].index.drop('ID')

モデルの構築

In [103]:
X = train.loc[:,int_idx]
y = train['y']
X_test = test.loc[:,int_idx]

In [104]:
kf = KFold(n_splits=5)

In [105]:
model_list = []

for train_idx,valid_idx in kf.split(X,y):
    X_train,X_valid = X.iloc[train_idx,:],X.iloc[valid_idx,:]
    y_train,y_valid = y[train_idx],y[valid_idx]
    model = LinearRegression()
    model.fit(X_train,y_train)
    model_list.append(model)

In [106]:
train_r2_score_list = []
valid_r2_score_list = []

for model in model_list:
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_r2_score_list.append(r2_score(y_train,y_train_pred))
    valid_r2_score_list.append(r2_score(y_valid,y_valid_pred))

print('train_r2_score_list:',train_r2_score_list)
print('valid_r2_score_list:',valid_r2_score_list)

train_r2_score_list: [-2.964648638821609e+23, -7.012294614129907e+22, -1.9468682434242276e+21, -4.306317626279367e+22, 0.5742747101640095]
valid_r2_score_list: [0.6488519582111922, 0.6563448332735622, 0.6559027313310417, 0.6517786126895657, -9.718218434344715e+22]


テストデータの予測

In [107]:
y_test_df = pd.DataFrame()

for i,model in enumerate(model_list):
    y_test = model.predict(X_test)
    y_test = pd.Series(y_test,name=f'model_{i}')
    y_test_df = pd.concat([y_test_df,y_test],axis=1)

y_test_df['mean'] = y_test_df.apply(np.mean,axis=1)
y_test_df

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,mean
0,-6.671803e+13,1.265803e+13,-5.559970e+13,-3.607473e+13,2.405908e+13,-2.433507e+13
1,1.131226e+13,-9.514847e+13,-1.867276e+13,-9.787629e+13,-6.698155e+13,-5.347336e+13
2,-3.089180e+14,9.468750e+01,1.045312e+02,1.094062e+02,1.034062e+02,-6.178359e+13
3,6.339895e+11,7.331250e+01,7.059375e+01,8.215625e+01,7.790625e+01,1.267979e+11
4,1.114688e+02,1.075625e+02,1.126562e+02,1.105312e+02,1.105312e+02,1.105500e+02
...,...,...,...,...,...,...
4204,1.040938e+02,1.036875e+02,1.025938e+02,1.031250e+02,1.030938e+02,1.033187e+02
4205,9.709375e+01,9.856250e+01,9.481250e+01,9.850000e+01,9.765625e+01,9.732500e+01
4206,9.228125e+01,9.125000e+01,9.078125e+01,9.112500e+01,9.190625e+01,9.146875e+01
4207,1.118438e+02,1.121875e+02,1.134688e+02,1.105938e+02,1.114062e+02,1.119000e+02


In [108]:
submit = pd.concat([test['ID'],y_test_df['mean']],axis=1)
submit.columns = ['ID','y']
now = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
submit_filename = f'submit_{now}' + '.csv'
submit.to_csv(submit_filename,index=None)
shutil.move(submit_filename,'./submit_files')

'./submit_files/submit_2022-07-08-15:26:05.csv'