In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from scipy.stats import mode
import string
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings("ignore")
import gc
import os
# List the files present in ../input so the paths used below can be checked.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns (signed or unsigned NumPy dtypes) are cast to the smallest
    of int8/int16/int32/int64 whose *inclusive* range contains the column's
    minimum and maximum. Float columns are cast to float32 when their range
    fits, otherwise to float64. Columns of any other dtype (object, bool,
    category, datetime, pandas extension dtypes, ...) and empty columns are
    left untouched.

    Args:
        df: DataFrame to compact. It is modified in place.

    Returns:
        The same DataFrame object, for convenient ``df = reduce_mem_usage(df)``.

    Side effects:
        Prints the memory footprint before and after, in MB.

    Note:
        Casting float64 to float32 keeps the value range but reduces precision.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy integer/float columns are safe to downcast; casting
        # bool/category/datetime columns to float would corrupt them.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        # min()/max() of an empty column are NaN; there is nothing to gain anyway.
        if df[col].size == 0:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind in 'iu':
            # Compare as Python ints so signed/unsigned mixes are exact.
            lo, hi = int(c_min), int(c_max)
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if bounds.min <= lo and hi <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits only for uint64 values above the int64 maximum;
            # such a column keeps its dtype.
        else:
            f32 = np.finfo(np.float32)
            # An all-NaN column makes both comparisons False and lands on float64.
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease))
    return df

In [None]:
# Read each competition split and compact its dtypes straight away.
train = reduce_mem_usage(pd.read_csv("../input/train_V2.csv"))
test = reduce_mem_usage(pd.read_csv("../input/test_V2.csv"))
# Transposed preview: one row per column, first five records.
train.head(5).T

In [None]:
#train['winPlacePerc'].describe()

In [None]:
#print(train["Id"].isin(test["Id"]).any(),train["matchId"].isin(test["matchId"]).any())
# Result of the check above: no Id or matchId appears in both train and test.

In [None]:
# Run a garbage-collection pass after loading, before the feature frames are built.
gc.collect()

# **Feature engineering**

In [None]:
# Missing values per column, smallest first; only columns with at least one are reported.
null_cnt = train.isna().sum().sort_values()
print('null count:', null_cnt.loc[null_cnt > 0])
# Rows with any missing value are removed from the training frame.
train.dropna(inplace=True)

In [None]:
# Stack the test rows under the training rows so every engineered feature is computed
# the same way for both; the frame is split again later on whether winPlacePerc is null.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0; ignore_index=True is equivalent to the former .reset_index(drop=True).
train = pd.concat([train, test], sort=False, ignore_index=True)

In [None]:
def fillInf(df, val):
    """Replace +/-inf with NaN in numeric columns, then fill NaN with ``val``.

    The ``winPlacePerc`` column has its infinities turned into NaN like every
    other numeric column, but its NaN values are *not* filled, so missing
    targets stay missing. Non-numeric columns are not touched.

    Args:
        df: DataFrame to clean. It is modified in place.
        val: Replacement for NaN (and former inf) values.

    Returns:
        None.
    """
    numcols = df.select_dtypes(include='number').columns
    for c in numcols:
        # np.inf / np.nan are used because the np.Inf, np.NINF and np.NaN
        # aliases were removed in NumPy 2.0.
        cleaned = df[c].replace([np.inf, -np.inf], np.nan)
        if c != 'winPlacePerc':
            cleaned = cleaned.fillna(val)
        # Assign the column back instead of calling fillna(inplace=True) on
        # df[c]: the in-place call on a selected column does not reliably
        # update df (it is a no-op under pandas copy-on-write).
        df[c] = cleaned

In [None]:

# Per-group handle on the combined train+test frame; a later cell reuses it for the
# group-level max/min/sum aggregates.
group = train.groupby('groupId')

# Row counts: rows sharing a matchId, and rows sharing a groupId.
train['playersJoined'] = train.groupby('matchId')['matchId'].transform('count')
train['groupplayers'] = group['groupId'].transform('count')
# Simple sums of related raw columns.
train['healsAndBoosts'] = train['heals']+train['boosts']
train['totalDistance'] = train['walkDistance']+train['rideDistance']+train['swimDistance']
# Group's share of the match's rows, scaled by the reported number of groups.
train['allin'] = train['groupplayers']/train['playersJoined']*train['numGroups']
# Ratio features. A zero denominator yields inf or NaN here; fillInf(train, 0) below
# resets those to 0 in every numeric column except winPlacePerc.
train['headshotrate'] = train['headshotKills']/train['kills']
train['boostrate'] = train['boosts']/train['healsAndBoosts']
train['killsrate'] = train['kills']/train['totalDistance']
train['healsrate'] = train['healsAndBoosts']/train['totalDistance']
# Disabled experiment, kept for reference:
#train['combine'] = train['kills']*train['totalDistance']
fillInf(train, 0)

In [None]:
# Hexbin joint plot of the target (winPlacePerc) against the 'allin' feature.
sns.jointplot(x=train["winPlacePerc"],y=train['allin'],kind='hex')

In [None]:
# Each distance is divided by a fixed constant (5, 27, 2 -- presumably typical walking,
# riding and swimming speeds, TODO confirm), the results are summed and then divided by
# matchDuration, presumably giving the share of the match spent moving.
# NOTE(review): this runs after fillInf, so a zero matchDuration would leave inf/NaN
# in this column -- confirm matchDuration is always positive.
train['NevigateTimeperc'] = (train['walkDistance']/5+train['rideDistance']/27+train['swimDistance']/2)/train['matchDuration']

In [None]:
# Target distribution for rows whose maxPlace equals 1.
train.loc[train["maxPlace"] == 1, "winPlacePerc"].hist()

In [None]:
# Columns that get group-level aggregates which are then ranked within each match.
ranklist = [
    'boosts', 'damageDealt', 'heals', 'killPlace', 'killPoints', 'kills',
    'killStreaks', 'winPoints', 'rankPoints', 'walkDistance', 'weaponsAcquired',
    'rideDistance', 'healsAndBoosts', 'totalDistance', 'NevigateTimeperc',
]

# Step 1: per-group max / min / sum of every listed column, broadcast back to each row.
# (A 'mean' aggregate was tried at some point and is deliberately not computed.)
for col in ranklist:
    for stat in ('max', 'min', 'sum'):
        train['Group' + stat + col] = group[col].transform(stat)

match = train.groupby('matchId')

# Step 2: turn each aggregate into its percentile rank inside the match, then discard
# the intermediate aggregate column so only the rank feature remains.
for col in ranklist:
    for stat in ('max', 'min', 'sum'):
        aggregate = 'Group' + stat + col
        train[stat + 'rank' + col] = match[aggregate].rank(pct=True).values
        train.drop([aggregate], axis=1, inplace=True)

train.head(5).T

In [None]:
# Remove the raw columns that were turned into rank features (exactly the entries of
# ranklist), plus longestKill, which is dropped without a rank counterpart.
train.drop(columns=ranklist + ['longestKill'], inplace=True)

In [None]:
# The groupby handles are no longer needed; release them before compacting dtypes again.
del match
del group
train = reduce_mem_usage(train)
gc.collect()

In [None]:
# Missing values per column after feature engineering: the full table first, then only
# the columns that actually contain missing values.
null_cnt = train.isna().sum().sort_values()
print(null_cnt)
print('null count:', null_cnt.loc[null_cnt > 0])

In [None]:
# Which target values occur when a match reports a single group.
train.loc[train['numGroups'] == 1, 'winPlacePerc'].value_counts()

In [None]:
# Distribution of the known (non-null) target values.
sns.distplot(train['winPlacePerc'].dropna())

In [None]:
# Row count for each raw matchType value.
print(train['matchType'].value_counts())

### Separate out event modes

In [None]:
def mapper(x):
    """Collapse a raw matchType string into one of five coarse categories.

    The first of 'solo', 'duo', 'squad', 'crash' found as a substring of ``x``
    is returned; anything else is labelled 'flare'.
    """
    for keyword in ('solo', 'duo', 'squad', 'crash'):
        if keyword in x:
            return keyword
    return 'flare'


train['matchTypeCat'] = train['matchType'].map(mapper)

In [None]:
# Row count for each coarse match category produced by mapper.
print(train['matchTypeCat'].value_counts())

In [None]:
#print(train['groupplayers'].value_counts())

In [None]:
# ablist=list(train[train['groupplayers']>4]['matchId'].unique())
# train.loc[train['matchId'].isin(ablist),'matchTypeCat']+='ab'

In [None]:
# print(train['matchTypeCat'].value_counts())

In [None]:
# abnormrows=train.query('allin>1').index
# abnormdata = train.loc[abnormrows]
# train=train.drop(abnormrows)
# print(train['groupplayer'].value_counts())
# print(abnormdata['groupplayer'].value_counts())

In [None]:
# Rows that carry a target form the training set; rows without one form the test set.
labelled = train['winPlacePerc'].notnull()
X_train = train.loc[labelled].reset_index(drop=True)
X_test = train.loc[~labelled].drop(columns=['winPlacePerc']).reset_index(drop=True)
del train, labelled
gc.collect()

# Separate the target from the training features.
Y_train = X_train.pop('winPlacePerc')

# Identifier columns are not model inputs. X_test keeps them here; they are dropped
# at prediction time instead.
X_train.drop(columns=['Id', 'matchType', 'matchId', 'groupId'], inplace=True)

print(X_train.shape, X_test.shape)

In [None]:
# Preview the test frame; it still contains the identifier columns at this point.
X_test.head()

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import minmax_scale
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# LightGBM hyper-parameters shared by every per-category model (MAE objective and metric).
# NOTE(review): LightGBM only applies bagging_fraction when bagging_freq is non-zero;
# no bagging_freq is set here, so bagging is presumably inactive -- confirm.
params={'learning_rate': 0.05,
        'objective':'mae',
        'metric':'mae',
        'num_leaves': 31,
        'random_state':42,
        'bagging_fraction': 0.7,
        'feature_fraction': 0.7,
       }
# Parallel lists: category name and the feature importances of the model fitted for it.
mts = list()
fis = list()
# One prediction slot per test row. Rows whose matchTypeCat never occurs in X_train
# are never written and keep the initial 0.
pred = np.zeros(X_test.shape[0])
# Fit one regressor per coarse match category and predict only that category's test rows.
for mt in X_train['matchTypeCat'].unique():
    idx = X_train[X_train['matchTypeCat'] == mt].index
    x_train = X_train.loc[idx].drop(['matchTypeCat'],axis=1)
    reg = lgb.LGBMRegressor(**params, n_estimators=20000,num_threads=5)
    # Disabled hold-out validation, kept for reference:
#     train_x, valid_x, train_y, valid_y = train_test_split(x_train, Y_train.loc[idx], test_size=0.25, random_state=1)
#     print(mt,train_x.shape,train_y.shape)
#     reg.fit(train_x,train_y)
    # Fit on every labelled row of this category; no validation set is passed.
    reg.fit(x_train,Y_train.loc[idx])
    # idx is rebound to the test rows of this category. X_test was built with a reset
    # index, so its labels double as positions into pred.
    idx = X_test[X_test['matchTypeCat'] == mt].index
    # NOTE(review): fit() gets no eval_set/early stopping, so best_iteration_ is
    # presumably None and all n_estimators trees are used -- confirm.
    pred[idx] = reg.predict(X_test.loc[idx].drop(['Id','matchType','matchId','groupId','matchTypeCat'],axis=1), num_iteration=reg.best_iteration_)
    #print ('mae=',mean_absolute_error(valid_y, reg.predict(valid_x,num_iteration=reg.best_iteration_)))
    mts.append(mt)
    fis.append(reg.feature_importances_)

In [None]:
# Feature names in the order the models saw them: the models are fitted on X_train
# without 'matchTypeCat', so labels must come from the same reduced column list rather
# than from X_train.columns (which only lines up while 'matchTypeCat' is the last column).
feature_names = X_train.columns.drop('matchTypeCat')
top_n = 30

for mt, feature_importance in zip(mts, fis):
    # Scale importances so the strongest feature of this model is 100.
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    # Indices of the top_n most important features, ascending so the largest bar is
    # drawn on top. Slicing with [-top_n:] stays correct with fewer than top_n
    # features, whereas `sorted_idx[len(...) - 30:]` turns into a negative start
    # and would silently keep only a handful of bars.
    sorted_idx = np.argsort(feature_importance)[-top_n:]
    pos = np.arange(sorted_idx.shape[0]) + .5

    plt.figure(figsize=(12,6))
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, feature_names[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance [matchTypeCat:' + str(mt) + ']')
    plt.show()

In [None]:
# Sanity check: pred should have one entry per X_test row.
print(pred.shape, X_test.shape)

In [None]:
# Build the submission: attach raw predictions, convert them to a within-match rank
# percentage per group, snap to the grid implied by maxPlace, and write the CSV.
# NOTE(review): assigning pred positionally assumes sample_submission_V2.csv lists rows
# in the same order as test_V2.csv -- confirm.
df_sub = pd.read_csv("../input/sample_submission_V2.csv")
df_test = pd.read_csv("../input/test_V2.csv")
df_sub['winPlacePerc'] = pred
# Restore some columns
df_sub = df_sub.merge(df_test[["Id", "matchId", "groupId", "maxPlace", "numGroups"]], on="Id", how="left")

# Sort, rank, and assign adjusted ratio
# One row per (matchId, groupId), taking the first member's values as the group's.
df_sub_group = df_sub.groupby(["matchId", "groupId"]).first().reset_index()
df_sub_group["rank"] = df_sub_group.groupby(["matchId"])["winPlacePerc"].rank()
# max_rank is merged in here but is not read again in the code below.
df_sub_group = df_sub_group.merge(
    df_sub_group.groupby("matchId")["rank"].max().to_frame("max_rank").reset_index(), 
    on="matchId", how="left")
# Lowest-ranked group maps to 0; a group ranked numGroups maps to 1. With
# numGroups == 1 the denominator is 0, and the edge-case rules below overwrite those rows.
df_sub_group["adjusted_perc"] = (df_sub_group["rank"] - 1) / (df_sub_group["numGroups"] - 1)

# Broadcast the group-level value back to every player of the group.
df_sub = df_sub.merge(df_sub_group[["adjusted_perc", "matchId", "groupId"]], on=["matchId", "groupId"], how="left")
df_sub["winPlacePerc"] = df_sub["adjusted_perc"]

# Deal with edge cases
df_sub.loc[df_sub.maxPlace == 0, "winPlacePerc"] = 0
df_sub.loc[df_sub.maxPlace == 1, "winPlacePerc"] = 1

# Align with maxPlace
# Credit: https://www.kaggle.com/anycode/simple-nn-baseline-4
# Round each value to the nearest multiple of 1 / (maxPlace - 1).
subset = df_sub.loc[df_sub.maxPlace > 1]
gap = 1.0 / (subset.maxPlace.values - 1)
new_perc = np.around(subset.winPlacePerc.values / gap) * gap
df_sub.loc[df_sub.maxPlace > 1, "winPlacePerc"] = new_perc

# Edge case
# maxPlace > 1 but only one group: forced to 0.
df_sub.loc[(df_sub.maxPlace > 1) & (df_sub.numGroups == 1), "winPlacePerc"] = 0
# Every row must have a value before the file is written.
assert df_sub["winPlacePerc"].isnull().sum() == 0

df_sub[["Id", "winPlacePerc"]].to_csv("submission_adjusted.csv", index=False)

In [None]:
# Distribution of the final, post-processed predictions written to the submission.
sns.distplot(df_sub['winPlacePerc'])