In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from time import sleep

from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras.models import Model, load_model
from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb

import gc, sys
gc.enable()

In [3]:
# load dataset
train = pd.read_csv('train_V2.csv')
# drop the row with missing value
train.drop(train[train['winPlacePerc'].isnull()].index, inplace=True)

In [4]:
# function to reduce memory usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    #start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    #end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
# reduce memory usage of train_df
train = reduce_mem_usage(train)

In [6]:
# function to undergo feature engineering
def feature_engineering(df, is_train=True):
    # fix rank points
    df['rankPoints'] = np.where(df['rankPoints'] <= 0, 0, df['rankPoints'])
    
    print('adding new features...')
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['headshotrate'] = df['kills'] / df['headshotKills']
    df['killStreakrate'] = df['killStreaks'] / df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df["skill"] = df["headshotKills"]+df["roadKills"]
    
    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN
    
    print("Removing Na's From DF")
    df.fillna(0, inplace=True)
    
    print(df.isnull().any().any())
    
    target = 'winPlacePerc'
    features = list(df.columns)
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchType")
    
    y = None
    if is_train: 
        y = np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64)
        features.remove(target)
    
    print("adding group mean feature...")
    agg = df.groupby(['matchId','groupId'])[features].agg('mean')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()    
    if is_train: df_out = agg.reset_index()[['matchId','groupId']]
    else: df_out = df[['matchId','groupId']]
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
    # new
    # print("get group median feature")
    # agg = df.groupby(['matchId','groupId'])[features].agg('median')
    # agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    # df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    # df_out = df_out.merge(agg_rank, suffixes=["_median", "_median_rank"], how='left', on=['matchId', 'groupId'])
    
    print("adding group max feature...")
    agg = df.groupby(['matchId','groupId'])[features].agg('max')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    
    print("adding group min feature...")
    agg = df.groupby(['matchId','groupId'])[features].agg('min')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    
    # new
#     print("get group sum feature")
#     agg = df.groupby(['matchId','groupId'])[features].agg('sum')
#     agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
#     df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
#     df_out = df_out.merge(agg_rank, suffixes=["_sum", "_sum_rank"], how='left', on=['matchId', 'groupId'])
    
    print("adding group size feature...")
    agg = df.groupby(['matchId','groupId']).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])
    
    print("adding match mean feature...")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    # new
#     print("adding match sum feature")
#     agg = df.groupby(['matchId'])[features].agg('sum').reset_index()
#     df_out = df_out.merge(agg, suffixes=["", "_match_sum"], how='left', on=['matchId'])
    
    # new
#     print("adding match median feature")
#     agg = df.groupby(['matchId'])[features].agg('median').reset_index()
#     df_out = df_out.merge(agg, suffixes=["", "_match_median"], how='left', on=['matchId'])
    
    print("adding match size feature...")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])
    
    
    del df, agg, agg_rank
    gc.collect()
    sleep(30)
    
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    df_out = np.array(df_out, dtype=np.float64)

#     del df, df_out, agg, agg_rank
#     gc.collect()

    return df_out, y

In [8]:
# feature engineering to train_df
X_df, y_df = feature_engineering(train, True)

adding new features...
Removing Na's From DF
False
adding group mean feature...
adding group max feature...
adding group min feature...
adding group size feature...
adding match mean feature...
adding match size feature...


In [9]:
# scale train_df
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False).fit(X_df)
X_df = scaler.transform(X_df)

In [10]:
# custom function to run light gbm model
def run_lgb(X_train, y_train, X_val, y_val):
    params = {"objective" : "regression", "metric" : "mae", 'n_estimators':20000, 'early_stopping_rounds':200,
              "num_leaves" : 31, "learning_rate" : 0.05, "bagging_fraction" : 0.7,
               "bagging_seed" : 0, "num_threads" : 4,"colsample_bytree" : 0.7
             }
    
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_val, label=y_val)
    model = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval], early_stopping_rounds=200, verbose_eval=1000)
    
    # pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    # return pred_test_y, model
    return model

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_df, y_df, test_size=0.2)
# Training the model #
model = run_lgb(X_train, y_train, X_val, y_val)

del X_df, y_df, X_train, y_train, X_val, y_val
gc.collect()
sleep(30)



Training until validation scores don't improve for 200 rounds.
[1000]	training's l1: 0.0282865	valid_1's l1: 0.028793
[2000]	training's l1: 0.0270618	valid_1's l1: 0.0279507
[3000]	training's l1: 0.026262	valid_1's l1: 0.027516
[4000]	training's l1: 0.0256278	valid_1's l1: 0.0272223
[5000]	training's l1: 0.0250771	valid_1's l1: 0.0269997
[6000]	training's l1: 0.0245995	valid_1's l1: 0.0268379
[7000]	training's l1: 0.0241504	valid_1's l1: 0.0266885
[8000]	training's l1: 0.0237384	valid_1's l1: 0.0265655
[9000]	training's l1: 0.0233432	valid_1's l1: 0.0264533
[10000]	training's l1: 0.022967	valid_1's l1: 0.026343
[11000]	training's l1: 0.0226047	valid_1's l1: 0.026239
[12000]	training's l1: 0.0222584	valid_1's l1: 0.0261443
[13000]	training's l1: 0.0219242	valid_1's l1: 0.0260544
[14000]	training's l1: 0.0216077	valid_1's l1: 0.0259765
[15000]	training's l1: 0.0212985	valid_1's l1: 0.0258979
[16000]	training's l1: 0.0210054	valid_1's l1: 0.0258299
[17000]	training's l1: 0.0207269	valid_1

In [12]:
X_test = pd.read_csv('test_V2.csv')
X_test = reduce_mem_usage(X_test)
X_test, _ = feature_engineering(X_test, False)
X_test = scaler.transform(X_test)

pred = model.predict(X_test, num_iteration=model.best_iteration)

adding new features...
Removing Na's From DF
False
adding group mean feature...
adding group max feature...
adding group min feature...
adding group size feature...
adding match mean feature...
adding match size feature...


In [13]:
X_test = pd.read_csv('test_V2.csv')

for i in range(len(X_test)):
    winPlacePerc = pred[i]
    maxPlace = int(X_test.iloc[i]['maxPlace'])
    if maxPlace == 0:
        winPlacePerc = 0.0
    elif maxPlace == 1:
        winPlacePerc = 1.0
    else:
        gap = 1.0 / (maxPlace - 1)
        winPlacePerc = round(winPlacePerc / gap) * gap
    
    if winPlacePerc < 0: winPlacePerc = 0.0
    if winPlacePerc > 1: winPlacePerc = 1.0    
    pred[i] = winPlacePerc

    if (i + 1) % 100000 == 0:
        print(i, flush=True, end=" ")
        

X_test['winPlacePerc'] = pred

99999 199999 299999 399999 499999 599999 699999 799999 899999 999999 1099999 1199999 1299999 1399999 1499999 1599999 1699999 1799999 1899999 

In [None]:
submission = X_test[['Id', 'winPlacePerc']]
submission.to_csv('submission_NN_2.csv', index=False)