In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras.models import Model, load_model
from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import gc, sys
gc.enable()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv('train_V2.csv')

train.drop(train[train['winPlacePerc'].isnull()].index, inplace=True)

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    #start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    #end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
train = reduce_mem_usage(train)

In [3]:
def feature_engineering(df, is_train=True):
    # fix rank points
    df['rankPoints'] = np.where(df['rankPoints'] <= 0, 0, df['rankPoints'])
    
    print('adding new features...')
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['headshotrate'] = df['kills'] / df['headshotKills']
    df['killStreakrate'] = df['killStreaks'] / df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df["skill"] = df["headshotKills"]+df["roadKills"]
    
    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN
    
    print("Removing Na's From DF")
    df.fillna(0, inplace=True)
    
    print(df.isnull().any().any())
    
    target = 'winPlacePerc'
    features = list(df.columns)
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchType")
    
    y = None
    if is_train: 
        y = np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64)
        features.remove(target)
    
    print("adding group mean feature...")
    agg = df.groupby(['matchId','groupId'])[features].agg('mean')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()    
    if is_train: df_out = agg.reset_index()[['matchId','groupId']]
    else: df_out = df[['matchId','groupId']]
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
#     print("get group median feature")
#     agg = df.groupby(['matchId','groupId'])[features].agg('median')
#     agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
#     df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
#     df_out = df_out.merge(agg_rank, suffixes=["_median", "_median_rank"], how='left', on=['matchId', 'groupId'])
    
    print("adding group max feature...")
    agg = df.groupby(['matchId','groupId'])[features].agg('max')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    
    print("adding group min feature...")
    agg = df.groupby(['matchId','groupId'])[features].agg('min')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    
#     print("get group sum feature")
#     agg = df.groupby(['matchId','groupId'])[features].agg('sum')
#     agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
#     df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
#     df_out = df_out.merge(agg_rank, suffixes=["_sum", "_sum_rank"], how='left', on=['matchId', 'groupId'])
    
    print("adding group size feature...")
    agg = df.groupby(['matchId','groupId']).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])
    
    print("adding match mean feature...")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
#     print("get match sum feature")
#     agg = df.groupby(['matchId'])[features].agg('sum').reset_index()
#     df_out = df_out.merge(agg, suffixes=["", "_match_sum"], how='left', on=['matchId'])
    
#     print("get match median feature")
#     agg = df.groupby(['matchId'])[features].agg('median').reset_index()
#     df_out = df_out.merge(agg, suffixes=["", "_match_median"], how='left', on=['matchId'])
    
    print("adding match size feature...")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])
    
    
    del df, agg, agg_rank
    gc.collect()
    
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    df_out = np.array(df_out, dtype=np.float64)

#     del df, df_out, agg, agg_rank
#     gc.collect()

    return df_out, y

In [6]:
%%time
X_train, y_train = feature_engineering(train, True)

adding new features...
Removing Na's From DF
False
adding group mean feature...
adding group max feature...
adding group min feature...
adding group size feature...
adding match mean feature...
adding match size feature...
Wall time: 3min 21s


In [8]:
# pd.DataFrame(X_train).to_csv('X_train.csv', header=None)
# pd.DataFrame(y_train).to_csv('y_train.csv', header=None)

In [10]:
%%time
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False).fit(X_train)
# scaler = preprocessing.QuantileTransformer().fit(X_train)
# scaler = preprocessing.StandardScaler()
scaler.transform(X_train)

Wall time: 3.04 s


In [15]:
# from sklearn.externals import joblib
# scaler_filename = "scaler_MinMax.save"
# joblib.dump(scaler, scaler_filename)

['scaler_MinMax.save']

In [8]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2)

In [9]:
def get_model(input_size):
    input_X = Input((input_size, ))
    X = Dense(32, activation='relu')(input_X)
    X = BatchNormalization()(X)
    X = Dense(32, activation='relu')(X)
    X = BatchNormalization()(X)
    X = Dropout(0.5)(X)
    X = Dense(32, activation='relu')(X)
    output = Dense(1, activation='sigmoid')(X)
    
    model = Model(inputs=input_X, outputs=output)
    
    return model

In [10]:
input_size = X_train.shape[1]

In [11]:
m1 = get_model(input_size)

In [12]:
m1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 247)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                7936      
_________________________________________________________________
batch_normalization_1 (Batch (None, 32)                128       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
__________

In [13]:
opt = optimizers.Adam(lr=0.01, epsilon=1e-8, decay=1e-4, amsgrad=False)

In [15]:
m1.compile(loss='mean_absolute_error',
           optimizer=opt,
           metrics=['mae'])

In [17]:
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10, verbose=0):
    '''
    Wrapper function to create a LearningRateScheduler with step decay schedule.
    '''
    def schedule(epoch):
        return initial_lr * (decay_factor ** np.floor(epoch/step_size))
    
    return LearningRateScheduler(schedule, verbose)

lr_sched = step_decay_schedule(initial_lr=0.1, decay_factor=0.9, step_size=1, verbose=1)
early_stopping = EarlyStopping(patience=20, verbose=1)
model_checkpoint = ModelCheckpoint("NN_3.model", save_best_only=True, verbose=1)
# reduce_lr = ReduceLROnPlateau(factor=0.5, patience=5, verbose=1)

In [18]:
train_history = m1.fit(X_train, y_train, batch_size=2048,
                       epochs=50, validation_data=(X_dev, y_dev),
                       callbacks=[early_stopping, model_checkpoint, lr_sched],
                       verbose=1)

Train on 1621395 samples, validate on 405349 samples
Epoch 1/50

Epoch 00001: LearningRateScheduler setting learning rate to 0.1.

Epoch 00001: val_loss improved from inf to 0.07685, saving model to NN_3.model
Epoch 2/50

Epoch 00002: LearningRateScheduler setting learning rate to 0.09000000000000001.

Epoch 00002: val_loss did not improve from 0.07685
Epoch 3/50

Epoch 00003: LearningRateScheduler setting learning rate to 0.08100000000000002.

Epoch 00003: val_loss did not improve from 0.07685
Epoch 4/50

Epoch 00004: LearningRateScheduler setting learning rate to 0.0729.

Epoch 00004: val_loss did not improve from 0.07685
Epoch 5/50

Epoch 00005: LearningRateScheduler setting learning rate to 0.06561.

Epoch 00005: val_loss did not improve from 0.07685
Epoch 6/50

Epoch 00006: LearningRateScheduler setting learning rate to 0.05904900000000001.

Epoch 00006: val_loss did not improve from 0.07685
Epoch 7/50

Epoch 00007: LearningRateScheduler setting learning rate to 0.0531441000000000

In [4]:
X_test = pd.read_csv('test_V2.csv')
X_test = reduce_mem_usage(X_test)

In [5]:
X_test, _ = feature_engineering(X_test, False)

adding new features...
Removing Na's From DF
False
adding group mean feature...
adding group max feature...
adding group min feature...
adding group size feature...
adding match mean feature...
adding match size feature...


In [6]:
pd.DataFrame(X_test).to_csv('X_test.csv', header=None)

In [7]:
from sklearn.externals import joblib
scaler = joblib.load('scaler_MinMax.save')

In [8]:
scaler.transform(X_test)

array([[-0.97727273, -1.        , -0.99050597, ..., -0.9986185 ,
        -0.92129884,  0.83673469],
       [-0.95454545, -0.6969697 , -0.88263301, ..., -0.99670277,
        -0.94778481,  0.91836735],
       [-0.93181818, -0.86363636, -0.8873942 , ..., -0.99770283,
        -0.93186103,  0.87755102],
       ...,
       [-0.96969697, -0.93939394, -0.97226421, ..., -0.99786951,
        -0.93778616,  0.87755102],
       [-0.95454545, -0.81818182, -0.94358374, ..., -0.9972605 ,
        -0.94488397,  0.91836735],
       [-1.        , -1.        , -0.98838989, ..., -0.99770497,
        -0.92432581,  0.83673469]])

In [10]:
model = load_model('NN_2.model')

In [11]:
pred = model.predict(X_test)
pred = pred.reshape(-1)

In [13]:
X_test = pd.read_csv('test_V2.csv')

In [14]:
for i in range(len(X_test)):
    winPlacePerc = pred[i]
    maxPlace = int(X_test.iloc[i]['maxPlace'])
    if maxPlace == 0:
        winPlacePerc = 0.0
    elif maxPlace == 1:
        winPlacePerc = 1.0
    else:
        gap = 1.0 / (maxPlace - 1)
        winPlacePerc = round(winPlacePerc / gap) * gap
    
    if winPlacePerc < 0: winPlacePerc = 0.0
    if winPlacePerc > 1: winPlacePerc = 1.0    
    pred[i] = winPlacePerc

    if (i + 1) % 100000 == 0:
        print(i, flush=True, end=" ")
        

X_test['winPlacePerc'] = pred

99999 199999 299999 399999 499999 599999 699999 799999 899999 999999 1099999 1199999 1299999 1399999 1499999 1599999 1699999 1799999 1899999 

In [15]:
submission = X_test[['Id', 'winPlacePerc']]
submission.to_csv('submission_NN_2.csv', index=False)