In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from timeit import default_timer as timer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import preprocessing
from __future__ import division
import os
import gc, sys
gc.enable()
# Any results you write to the current directory are saved as output.


In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    For every column backed by a plain NumPy integer or float dtype, the
    column's min and max are compared against the limits of progressively
    wider types and the column is cast to the first type that can hold them.

    * Signed integers are tried against int8 / int16 / int32 / int64.
    * Unsigned integers are tried against uint8 / uint16 / uint32 / uint64.
    * Floats are tried against float16, then float32, else float64.

    Columns of any other dtype (object, bool, datetime, category, nullable
    extension dtypes, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are safe to downcast. Anything
        # else (bool, object, datetime, extension dtypes) is skipped so it is
        # neither converted to a float by accident nor compared against
        # numeric limits it cannot be compared with.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (e.g. an empty column whose min/max are NaN)
            # the column keeps its current dtype.
        else:
            # NOTE: float16 keeps only ~3 significant decimal digits, so this
            # trades precision for memory.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    return df

In [None]:
def pre_process(train, is_train=True):
    """Turn a player-level table into one feature row per (matchId, groupId).

    Steps, in order:

    1. (training only) keep rows with ``maxPlace > 1`` and compute the target
       as the mean ``winPlacePerc`` per (matchId, groupId).
    2. Add row-level engineered features (distance totals, ratios, counts);
       +/-inf and NaN produced by divisions by zero are replaced with 0.
    3. Aggregate every feature per (matchId, groupId) with mean / max / min /
       sum, producing columns named ``avg_*``, ``max_*``, ``min_*``, ``sum_*``.
    4. Add the within-match percentile rank of every aggregated feature as
       ``*_rank`` columns, plus ``matchSize`` (number of groups in the match).
    5. Downcast the result with ``reduce_mem_usage``.

    Parameters
    ----------
    train : pandas.DataFrame
        Player-level rows. Must contain the columns referenced below
        (``matchId``, ``groupId``, ``maxPlace``, the distance / kill / item
        columns, and ``winPlacePerc`` when ``is_train`` is true). The caller's
        frame is not modified.
    is_train : bool, default True
        When true the target is returned as well and the ``matchId`` /
        ``groupId`` key columns are dropped from the feature frame; when false
        the key columns are kept so predictions can be mapped back to groups.

    Returns
    -------
    (pandas.DataFrame, pandas.Series) or pandas.DataFrame
        ``(features, y)`` when ``is_train`` is true, otherwise ``features``.
        ``y`` is indexed by (matchId, groupId); both it and the feature rows
        come from a groupby on those same keys.
    """
    # Boolean filtering already yields a new frame; in the test path take an
    # explicit copy so the feature columns added below never leak into the
    # caller's DataFrame.
    train = train[train['maxPlace'] > 1] if is_train else train.copy()

    # float16 inputs overflow above 65504 in the sums below, and the resulting
    # inf would then be silently zeroed. Widen them to float32 first.
    half_cols = list(train.select_dtypes(include=['float16']).columns)
    if half_cols:
        train = train.astype({col: np.float32 for col in half_cols})

    if is_train:
        y = train.groupby(['matchId', 'groupId'])['winPlacePerc'].agg('mean')
        train = train.drop(['winPlacePerc'], axis=1)

    # ---- row-level engineered features ------------------------------------
    train['totalDistance'] = train['rideDistance'] + train['walkDistance'] + train['swimDistance']
    # NOTE(review): despite its name this is kills / headshotKills; the
    # headshotKills / kills ratio is 'headshotKills_over_kills' below.
    train['headshotrate'] = train['kills'] / train['headshotKills']
    train['killStreakrate'] = train['killStreaks'] / train['kills']
    train['healthitems'] = train['heals'] + train['boosts']
    train['killPlace_over_maxPlace'] = train['killPlace'] / train['maxPlace']
    train['headshotKills_over_kills'] = train['headshotKills'] / train['kills']
    train['distance_over_weapons'] = train['totalDistance'] / train['weaponsAcquired']
    train['walkDistance_over_heals'] = train['walkDistance'] / train['heals']
    train['walkDistance_over_kills'] = train['walkDistance'] / train['kills']
    train['killsPerWalkDistance'] = train['kills'] / train['walkDistance']
    train["skill"] = train["headshotKills"] + train["roadKills"]

    # Divisions by zero leave +/-inf (x/0) or NaN (0/0); treat all of them as 0.
    train = train.replace([np.inf, -np.inf], np.nan)
    print("Removing Na's From DF")
    train = train.fillna(0)

    # Number of player rows sharing each groupId.
    train['teamSize'] = train.groupby('groupId')['groupId'].transform('count')

    # Identifier / categorical columns are not aggregated.
    excluded = {"Id", "numGroups", "matchId", "groupId", "matchType"}
    features = [col for col in train.columns if col not in excluded]

    # ---- per-group aggregation --------------------------------------------
    # One aggregation per statistic, prefixed with its short name. This
    # replaces the renaming-dict form of .agg(), which pandas >= 1.0 rejects.
    grouped = train.groupby(['matchId', 'groupId'])[features]
    stat_funcs = (('avg', 'mean'), ('max', 'max'), ('min', 'min'), ('sum', 'sum'))
    train_out = pd.concat(
        [grouped.agg(func).add_prefix(prefix + '_') for prefix, func in stat_funcs],
        axis=1,
    )
    train_out = train_out.replace([np.inf, -np.inf], 0)
    # teamSize is constant within a group, so only its mean is kept.
    train_out = train_out.drop(['max_teamSize', 'min_teamSize', 'sum_teamSize'], axis=1)

    # ---- within-match percentile ranks ------------------------------------
    features = list(train_out.columns)
    train_rank = train_out.groupby(level='matchId')[features].rank(pct=True, na_option='top')

    # Both frames share the (matchId, groupId) index, so an index join lines
    # them up row for row; the rank columns get a '_rank' suffix.
    train_final = train_out.join(train_rank.add_suffix('_rank')).reset_index()
    train_final['matchSize'] = train_final.groupby('matchId')['matchId'].transform('count')

    if is_train:
        train_final = train_final.drop(['matchId', 'groupId'], axis=1)
    train_final = reduce_mem_usage(train_final)

    # Release the large intermediates before returning.
    del train_rank, train_out, train
    gc.collect()

    if is_train:
        return train_final, y
    return train_final

In [None]:
# Read the raw training and test tables from the Kaggle input directory.
train_main, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train_V2.csv", "../input/test_V2.csv")
)

In [None]:
# Downcast the numeric columns of both raw tables to cut memory use.
train_main, test = reduce_mem_usage(train_main), reduce_mem_usage(test)

In [None]:
# Split on matchId rather than on individual rows: all rows of a match must
# land on the same side, otherwise information about a match leaks from the
# training set into the validation set and the validation score is misleading.
# A fixed random_state keeps the split (and so the reported score) reproducible
# across Restart & Run All.
train_match, val_match = model_selection.train_test_split(
    train_main.matchId.unique(), random_state=0
)
X_train = train_main.loc[train_main['matchId'].isin(train_match)]
X_val = train_main.loc[train_main['matchId'].isin(val_match)]
# The full table is no longer needed once both subsets exist.
del train_main
print('Training data shape : ', X_train.shape)
print('Validation data shape: ', X_val.shape)
print('Test data shape: ', test.shape)

In [None]:
# Collapse each player-level table to one feature row per (matchId, groupId).
# The training and validation calls also return the per-group target.
X_train, y_train = pre_process(X_train)
X_val, y_val = pre_process(X_val)
test = pre_process(test, is_train=False)

In [None]:
def simple_regression(X, y):
    """Fit a linear regression on (X, y) and print its MAE on that same data.

    The error is measured on the data the model was fitted on, so it is a
    training error. Nothing is returned.
    """
    model = LinearRegression().fit(X, y)
    fitted = model.predict(X)
    print("Prediction error: ", mean_absolute_error(y, fitted))

The data consists of many matches, and each match contains several groups (identified by `groupId`). Because all members of a group share the same finish percentage, the target value for a group is obtained by averaging `winPlacePerc` over that group's members.

In [None]:
# Disabled XGBoost baseline. It is wrapped in a bare string literal, so none of
# it executes; the cell only evaluates to the string itself.
# NOTE(review): the snippet refers to `train_final` and `y`, which are not
# defined at notebook level in the cells visible here, so it would need to be
# pointed at X_train / y_train before being re-enabled.
"""
xgbreg = xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', booster='gbtree', n_jobs=-1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=9, seed=9, missing=None, importance_type='gain')
xgbreg.fit(train_final ,y,verbose = 50)
print("Prediction error: ", mean_absolute_error(y, xgbreg.predict(train_final)))
"""

In [None]:
# LightGBM configuration.
# * 'n_estimators' and 'early_stopping_rounds' are read from params by
#   lgb.train, so they are given once here and not repeated as keyword
#   arguments.
# * 'bagging_freq' must be non-zero for 'bagging_fraction' to take effect;
#   without it the 0.7 row subsampling is silently disabled.
params = {"objective" : "regression", "metric" : "mae", 'n_estimators':20000, 'early_stopping_rounds':200,
              "num_leaves" : 31, "learning_rate" : 0.05, "bagging_fraction" : 0.7, "bagging_freq" : 1,
               "bagging_seed" : 0, "num_threads" : 4,"colsample_bytree" : 0.7}
lgtrain = lgb.Dataset(X_train, label=y_train)
lgval = lgb.Dataset(X_val, label=y_val)

# Progress logging goes through a callback, because the `verbose_eval` keyword
# no longer exists in LightGBM 4. The callback is named `log_evaluation` in
# newer releases and `print_evaluation` in older ones; use whichever exists.
log_every = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
model = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval], callbacks=[log_every(1000)])

# Predict with the iteration selected by early stopping.
pred_train_y = model.predict(X_train, num_iteration=model.best_iteration)
pred_val_y = model.predict(X_val, num_iteration=model.best_iteration)

In [None]:
# Mean absolute error of the LightGBM predictions on the held-out matches.
print(
    "Prediction error: ",
    mean_absolute_error(y_true=y_val, y_pred=pred_val_y),
)

In [None]:
# Score the grouped test rows; the key columns are not model features.
pred_test_y = model.predict(
    test.drop(columns=['matchId', 'groupId']),
    num_iteration=model.best_iteration,
)
pred_y = pd.Series(pred_test_y, name='winPlacePerc').to_frame()

In [None]:
# Per-player key table for the submission. Only Id and groupId are needed, so
# only those two columns are read from the raw test file.
final_out = pd.read_csv("../input/test_V2.csv", usecols=['Id', 'groupId'])[['Id', 'groupId']]

# `test` is already the grouped frame that pred_test_y was computed from, so
# its groupId column is in the same row order as the predictions. Re-reading
# and re-running pre_process on the whole test set is not needed.
df = pd.DataFrame({'groupId': test['groupId'].values, 'winPlacePerc': pred_test_y})

# Broadcast each group's prediction to every player in that group.
final_out = final_out.merge(df, on='groupId', how='left')

In [None]:
# groupId was only needed for the merge; the submission keeps Id and winPlacePerc.
final_out = final_out.drop(columns=['groupId'])

In [None]:
# Write the per-player predictions to the working directory.
submission = final_out
submission.to_csv(path_or_buf='submission.csv', index=False)