In [41]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error

# Widen pandas' display limits: up to 1000 columns and 1000 rows are shown
# before the output is truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

# Suppress all warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Directory holding the input CSV files, relative to the notebook.
path = "../input/"

# Read the training table and separate the target column (winPlacePerc)
# from the feature columns. `train` itself keeps every column.
train = pd.read_csv(path + 'train.csv')
train_y = train['winPlacePerc']
train_x = train.drop(columns=['winPlacePerc'])

# The test table is used as features as-is.
test_x = pd.read_csv(path + 'test.csv')

In [24]:
# Stack the train and test feature rows (train first) into one frame so the
# preprocessing below can be computed over both at once.
df = pd.concat(objs=[train_x, test_x], axis=0)

In [25]:
# Report the row and column counts of the raw training table and of the
# test features.
train_rows, train_cols = train.shape
test_rows, test_cols = test_x.shape
print('訓練データのデータ数は{}、変数は{}種類です。'.format(train_rows, train_cols))
print('テストデータのデータ数は{}、変数は{}種類です'.format(test_rows, test_cols))

訓練データのデータ数は7134、変数は29種類です。
テストデータのデータ数は1794、変数は28種類です


In [26]:
# Number of rows in the training features.
train_x.shape[0]

7134

In [27]:
# Number of rows in the test features.
test_x.shape[0]

1794

### groupIdの前処理

In [28]:
# Remember how many leading rows of the stacked frame are training rows, so
# the split at the end of this cell does not depend on a hard-coded row count.
# This must be read before train_x is reassigned below.
n_train = len(train_x)

# TeamSize = number of rows (non-null Ids) that share a groupId, counted over
# train and test together.
group_count = (
    df.groupby("groupId", as_index=False)["Id"]
    .count()
    .rename(columns={"Id": "TeamSize"})
)

# Attach TeamSize to every row. how="left" keeps every row of df and the
# merged frame gets a fresh 0..n-1 index. groupId is dropped once TeamSize
# has been attached.
df = pd.merge(df, group_count, on="groupId", how="left").drop(columns="groupId")

# Split back into train / test by position. The slices are copied so later
# column edits operate on independent frames rather than on views of df.
train_x = df.iloc[:n_train].copy()
test_x = df.iloc[n_train:].copy()

### matchTypeの前処理
one-hot encodingで処理

In [29]:
# Distinct matchType values that occur in the training table.
train.loc[:, "matchType"].unique()

array(['duo', 'duo-fpp', 'squad-fpp', 'solo-fpp', 'squad', 'crashfpp',
       'solo'], dtype=object)

In [30]:
# Number of training rows; used to split the encoded frame back by position
# instead of relying on a hard-coded row count.
n_train = len(train_x)

# One-hot encode matchType over train and test together so both halves end up
# with the same set of dummy columns, even for values present in only one of them.
matchType = pd.concat([train_x['matchType'], test_x['matchType']])
matchType_ohe = pd.get_dummies(matchType)

# Positional split: the first n_train rows belong to train, the rest to test.
matchType_ohe_train = matchType_ohe.iloc[:n_train]
matchType_ohe_test = matchType_ohe.iloc[n_train:]

# Append the dummy columns (concat with axis=1 aligns on the row index) and
# drop the raw string column, which is no longer needed.
train_x = pd.concat([train_x, matchType_ohe_train], axis=1).drop(columns='matchType')
test_x = pd.concat([test_x, matchType_ohe_test], axis=1).drop(columns='matchType')

train_x.head()

Unnamed: 0,Id,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,TeamSize,crashfpp,duo,duo-fpp,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp
0,2860d3f9ee4d4b,25ade93a241f39,0,0,0.0,0,0,0,80,0,0,0,0.0,1413,48,48,1496,0,0.0,0,0.0,0,0,257.0,4,0,2,0,1,0,0,0,0,0,0
1,0b88c47e54cdcc,95125f5e05c653,0,2,359.0,4,0,0,5,1387,4,2,35.7,1386,48,47,-1,1,262.7,0,0.0,0,0,1025.0,5,1607,2,0,0,1,0,0,0,0,0
2,8f086d61515bc4,8a84da96d077c3,0,0,223.5,1,0,0,15,1673,2,1,27.39,1410,49,48,-1,0,0.0,0,0.0,0,0,712.8,2,1736,2,0,0,1,0,0,0,0,0
3,7c037ca62a38a4,a139eac809980a,0,0,128.1,0,0,0,50,1479,0,0,0.0,1357,31,28,-1,0,0.0,0,0.0,0,0,2738.0,4,1476,4,0,0,0,0,0,0,0,1
4,82f08cde238c60,e9d203a36b1d0a,0,0,200.0,2,0,0,36,0,1,1,25.75,1395,28,28,1479,0,0.0,0,0.0,0,0,129.4,2,0,4,0,0,0,0,0,0,0,1


### matchIdの前処理
削除

In [31]:
# Remove the matchId column from both feature frames.
train_x = train_x.drop(columns='matchId')
test_x = test_x.drop(columns='matchId')

### Idの前処理

In [33]:
# Remove the Id column from both feature frames.
train_x = train_x.drop(columns='Id')
test_x = test_x.drop(columns='Id')

## 特徴量

### バリデーション

In [34]:
# Split the training data into a training fold and a validation fold.
from sklearn.model_selection import KFold

# Shuffled 4-fold splitter with a fixed seed.
kf = KFold(n_splits=4, shuffle=True, random_state=71)

# Only the first (train, validation) index pair produced by the splitter is used.
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

In [36]:
# Wrap the features and target of each fold in LightGBM's Dataset structure.
lgb_train = lgb.Dataset(data=tr_x, label=tr_y)
lgb_eval = lgb.Dataset(data=va_x, label=va_y)

In [37]:
# Hyper-parameter settings passed to LightGBM, and the number of boosting rounds.
params = dict(
    objective='regression',
    seed=71,
    verbose=0,
    metrics='mae',
)
num_round = 100

In [38]:
# Run training for num_round boosting iterations.
# Both the training fold and the validation fold are passed as evaluation
# sets so the score on each can be monitored as training progresses.
# NOTE(review): no categorical_feature argument is passed in this call; all
# columns are handed to LightGBM as they are.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_round,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
)

[1]	train's l1: 0.228648	valid's l1: 0.234318
[2]	train's l1: 0.208971	valid's l1: 0.214825
[3]	train's l1: 0.191378	valid's l1: 0.197287
[4]	train's l1: 0.175705	valid's l1: 0.181434
[5]	train's l1: 0.161832	valid's l1: 0.167578
[6]	train's l1: 0.149439	valid's l1: 0.155318
[7]	train's l1: 0.138352	valid's l1: 0.144255
[8]	train's l1: 0.128326	valid's l1: 0.134132
[9]	train's l1: 0.119552	valid's l1: 0.125445
[10]	train's l1: 0.111837	valid's l1: 0.117878
[11]	train's l1: 0.104812	valid's l1: 0.111068
[12]	train's l1: 0.098808	valid's l1: 0.105147
[13]	train's l1: 0.0933683	valid's l1: 0.0998195
[14]	train's l1: 0.0886917	valid's l1: 0.0953065
[15]	train's l1: 0.084625	valid's l1: 0.0913591
[16]	train's l1: 0.0809304	valid's l1: 0.0878625
[17]	train's l1: 0.0777236	valid's l1: 0.0846357
[18]	train's l1: 0.0747638	valid's l1: 0.0817547
[19]	train's l1: 0.0722349	valid's l1: 0.0792848
[20]	train's l1: 0.0700399	valid's l1: 0.0770513
[21]	train's l1: 0.0677433	valid's l1: 0.0748326
[22]	

In [42]:
# Check the score on the validation fold: mean absolute error between the
# true targets and the model's predictions.
va_pred = model.predict(va_x)
score = mean_absolute_error(y_true=va_y, y_pred=va_pred)
score

0.05201629716527954

In [43]:
# Predict the target for the test features with the trained model.
pred = model.predict(data=test_x)

In [44]:
# Display the validation-fold predictions computed earlier.
va_pred

array([0.71176733, 0.88729702, 0.94466137, ..., 0.26165451, 0.68406469,
       0.10524452])

In [47]:
# Directory of the submission files. Note that `path` is rebound here and no
# longer points at the input directory.
path = "../submission/"

# Read the existing submission.csv and set its winPlacePerc column to the
# test predictions.
submission = pd.read_csv(path + 'submission.csv').assign(winPlacePerc=pred)
submission

Unnamed: 0,winPlacePerc
0,0.126132
1,0.224306
2,0.260975
3,0.267577
4,0.515079
5,0.201897
6,0.066514
7,0.012258
8,0.547470
9,0.393087


In [48]:
# Write the filled-in submission next to the file it was read from, without
# the row index.
submission_file = path + 'production_submission.csv'
submission.to_csv(submission_file, index=False)