In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
import stat
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import lightgbm as lgb
import xgboost as xgb
import torch.nn.functional as F
from torch import nn, optim, sigmoid
from skorch import NeuralNetRegressor
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

In [5]:
df_train = pd.read_csv('data/pubg-train.csv')
df_test = pd.read_csv('data/pubg-test.csv')

In [6]:
def clean_data_2(df):
    
    for point, thre in zip(['rankPoints', 'killPoints', 'winPoints'], [1250, 1000, 1400]):
        df[point+'_dummy'] = pd.Series(df['rankPoints']>=thre, dtype=np.int64)
        df[point] = np.clip(df[point], thre, 2000) - thre
    
    
    df['swim_dummy'] = pd.Series(df['swimDistance']>0, dtype=np.int64)
    df['kill_dummy'] = pd.Series(df['kills']!=0, dtype=np.int64)
    
    for dist in ['longestKill', 'walkDistance', 'damageDealt', 'rideDistance', 'swimDistance']:
        if dist not in ['longestKill', 'damageDealt']:
            df[dist] = df[dist] / df['matchDuration'] * 60.
        df[dist] = np.log1p(df[dist])
    
    for var in ['roadKills', 'vehicleDestroys', 'teamKills']:
        df.loc[df[var]>0, var] = 1

    return df


df_all = pd.concat((df_train.iloc[:, 3:-1], df_test.iloc[:, 3:]))
df_all = clean_data_2(df_all)

df_all.head(10)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,rankPoints_dummy,killPoints_dummy,winPoints_dummy,swim_dummy,kill_dummy
0,0,0,0.0,0,0,0,96,70,0,0,...,0,0,1.185833,2,89,0,0,0,0,0
1,0,6,5.514235,1,0,0,14,0,2,1,...,0,0,5.807122,7,0,1,0,0,0,1
2,0,0,0.0,0,0,0,59,0,0,0,...,0,0,2.883529,5,0,1,0,0,0,0
3,0,0,5.630495,3,0,1,17,0,2,1,...,0,0,3.375291,5,0,1,0,0,0,1
4,0,2,4.967728,1,1,4,13,169,2,1,...,0,0,4.567316,5,190,0,0,0,0,1
5,0,0,0.0,0,0,0,86,228,0,0,...,0,0,1.73674,2,13,0,0,0,0,0
6,1,5,3.36557,0,0,10,43,44,0,0,...,0,1,5.209182,5,121,0,0,0,0,0
7,0,0,5.70711,2,1,0,18,59,2,2,...,0,0,2.849205,2,100,0,0,0,0,1
8,0,0,5.303305,2,0,0,37,0,1,1,...,0,0,2.290843,3,0,1,0,0,0,1
9,0,2,0.0,0,0,1,51,0,0,0,...,0,0,3.884078,1,0,1,0,0,0,0


In [7]:
vec = DictVectorizer(sparse=False, dtype=np.float32)
mmscaler = MinMaxScaler()

X = vec.fit_transform(df_all.to_dict('record'))
X = mmscaler.fit_transform(X)

X.shape

(445094, 44)

In [8]:
feature_names = vec.get_feature_names()
X_train, X_test = X[:len(df_train)], X[len(df_train):]
del X

X_train.shape, X_test.shape

((356075, 44), (89019, 44))

In [9]:
Y_train = df_train['winPlacePerc']
Y_train = Y_train.astype(np.float32)
Y_train.shape

(356075,)

In [10]:
class simplenn(nn.Module):
    def __init__(self, layer_nodes, act=F.relu):
        super(simplenn, self).__init__()
        self.fc1 = nn.Linear(layer_nodes[0], layer_nodes[1])
        self.fc2 = nn.Linear(layer_nodes[1], layer_nodes[2])
        self.fc3 = nn.Linear(layer_nodes[2], layer_nodes[3])
        self.fc4 = nn.Linear(layer_nodes[3], 1)
        
        self.act = act

    def forward(self, x):
        x = self.act(self.fc1(x))
#       x = F.dropout(x, p=0.2)
        x = self.act(self.fc2(x))
        x = self.act(self.fc3(x))
        x = sigmoid(self.fc4(x))
        return x

In [19]:
def models_train(X, Y, params_lgb, params_xgb, network, norm=False):
    
    #input data
    X_1 = xgb.DMatrix(data=X, label=Y)
    
    #train and predict by XGBoost model No.1
    xgb1 = xgb.train(params_xgb, X_1, num_boost_round=100)
    pred_xgb1 = xgb1.predict(xgb.DMatrix(data=X))
    if norm:
        pred_xgb1 = np.clip(pred_xgb1, 0., 1.)
    
    del X_1
    
    #input for layer 2, tune parameters for two LightGBM models
    X_2 = np.hstack((X, pred_xgb1[:, np.newaxis]))
    X_2_ds = lgb.Dataset(X_2, Y)
    
    del pred_xgb1
    
    params_lgb_2 = params_lgb.copy()
    params_lgb_2['num_leaves'] = 356
    
    #train two models
    lgb1 = lgb.train(params_lgb, X_2_ds)
    lgb2 = lgb.train(params_lgb_2, X_2_ds)
    
    del X_2_ds
    
    pred_lgb1 = lgb1.predict(X_2)
    if norm:
        pred_lgb1 = np.clip(pred_lgb1, 0., 1.)
    
    del X_2
    
    #input for layer 3
    X_3 = np.hstack((X, pred_lgb1[:, np.newaxis]))
    X_3_dm =xgb.DMatrix(data=X_3, label=Y)
    
    del pred_lgb1, X
    
    #train xgb2
    xgb2 = xgb.train(params_xgb, X_3_dm, num_boost_round=100)
    
    del X_3_dm
    #train neural network
    
    net = NeuralNetRegressor(network,
                               module__act=F.softplus,
                               module__layer_nodes=[45, 120, 50, 15],
                               max_epochs=5,
                               lr = 0.00175,
                               criterion=nn.MSELoss,
                               optimizer=optim.Adam,
                               iterator_train__shuffle=True,
                               iterator_train__batch_size=64,
                               #optimizer__weight_decay=0.001
                               )
    
    net.fit(X_3.astype(np.float32), Y[:, np.newaxis])
    
    mods={
        'xgb1' : xgb1,
        'xgb2' : xgb2,
        'lgb1' : lgb1,
        'lgb2' : lgb2,
        'net' : net
    }
    
    return mods

In [26]:
def models_predict(mods, X):
    
    pred_xgb1 = mods['xgb1'].predict(xgb.DMatrix(data=X))
    
    X_2 = np.hstack((X, pred_xgb1[:, np.newaxis]))
    
    del pred_xgb1
    
    pred_lgb1 = mods['lgb1'].predict(X_2)
    pred_lgb2 = mods['lgb2'].predict(X_2)
    
    del X_2
    
    X_3 = np.hstack((X, pred_lgb1[:, np.newaxis]))
    
    del pred_lgb1, X
    
    pred_xgb2 = mods['xgb2'].predict(xgb.DMatrix(data=X_3))
    pred_net = mods['net'].predict(X_3.astype(np.float32))
    
    del X_3
    
    output = np.hstack((pred_lgb2[:, np.newaxis], pred_xgb2[:, np.newaxis], pred_net))
    output = np.dot(output, np.array([0.3,0.4,0.3])[:, np.newaxis])
    
    del pred_lgb2, pred_xgb2, pred_net
    
    return np.clip(output, 0., 1.)

In [27]:
params_lgb = {
        'boosting_type' : 'gbdt',
        'objective' : 'regression',
        'metric' : {'l2'},
        'num_leaves' : 386,
        'learning_rate' : 0.1,
        'feature_fraction' : 0.9,
        'bagging_fraction' : 0.8,
        'bagging_freq': 5,
        'verbose' : 0
}



params_xgb = {'objective' : 'reg:linear',
              'colsample_bytree': 0.8, 
              'learning_rate' : 0.1,
              'max_depth' : 8}

In [28]:
scores = []

kf = KFold(n_splits=5, shuffle=True)
for train, test in kf.split(X_train):
    X1, X2 = X_train[train], X_train[test]
    Y1, Y2 = Y_train[train], Y_train[test]
    
    mods = models_train(X1, Y1, params_lgb, params_xgb, network=simplenn)
    scores.append(mean_absolute_error(Y2, models_predict(mods, X2)))
    
    del mods

  if getattr(data, 'base', None) is not None and \


  epoch    train_loss    valid_loss      dur
-------  ------------  ------------  -------
      1        [36m0.0095[0m        [32m0.0066[0m  20.5952
      2        [36m0.0067[0m        0.0070  16.2048
      3        [36m0.0066[0m        0.0066  19.1756
      4        [36m0.0066[0m        [32m0.0064[0m  14.2798
      5        [36m0.0065[0m        0.0066  15.1537


  if getattr(data, 'base', None) is not None and \


  epoch    train_loss    valid_loss      dur
-------  ------------  ------------  -------
      1        [36m0.0097[0m        [32m0.0067[0m  17.9000
      2        [36m0.0067[0m        0.0069  20.3019
      3        [36m0.0066[0m        [32m0.0064[0m  15.7648
      4        [36m0.0065[0m        0.0065  18.6941
      5        [36m0.0065[0m        0.0064  15.6576


  if getattr(data, 'base', None) is not None and \


  epoch    train_loss    valid_loss      dur
-------  ------------  ------------  -------
      1        [36m0.0096[0m        [32m0.0069[0m  17.1573
      2        [36m0.0068[0m        [32m0.0067[0m  15.1667
      3        [36m0.0066[0m        [32m0.0065[0m  15.2434
      4        [36m0.0065[0m        0.0066  19.1017
      5        [36m0.0065[0m        [32m0.0065[0m  15.6627


  if getattr(data, 'base', None) is not None and \


  epoch    train_loss    valid_loss      dur
-------  ------------  ------------  -------
      1        [36m0.0094[0m        [32m0.0068[0m  14.4940
      2        [36m0.0067[0m        [32m0.0066[0m  16.1408
      3        [36m0.0066[0m        0.0067  17.8186
      4        [36m0.0065[0m        [32m0.0066[0m  17.7985
      5        [36m0.0065[0m        [32m0.0065[0m  15.0983


  if getattr(data, 'base', None) is not None and \


  epoch    train_loss    valid_loss      dur
-------  ------------  ------------  -------
      1        [36m0.0091[0m        [32m0.0071[0m  17.9573
      2        [36m0.0067[0m        [32m0.0066[0m  16.9274
      3        [36m0.0066[0m        [32m0.0066[0m  17.5334
      4        [36m0.0065[0m        [32m0.0065[0m  19.0369
      5        [36m0.0065[0m        [32m0.0065[0m  14.2615


In [29]:
scores

[0.07068115796516014,
 0.07054638755293813,
 0.07052503031783257,
 0.07060344858231998,
 0.07076660265784492]