In [3]:
import os

import numpy as np
import pandas as pd
import tables
from fastai.imports import *
from fastai.structured import *
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error as mae

In [4]:
def get_sample(df,n):
    """Return a copy of up to `n` randomly chosen rows of `df`.

    Row positions are drawn from a random permutation of `range(len(df))`
    (NumPy's global random state) and then sorted, so the sampled rows keep
    their original relative order. If `n` exceeds `len(df)` every row is
    returned.
    """
    positions = np.sort(np.random.permutation(len(df))[:n])
    sampled = df.iloc[positions]
    return sampled.copy()

def imae(x,y):
    """Mean absolute difference between `x` and `y`.

    Both arguments must support subtraction, and the result of `abs(x - y)`
    must expose `.mean()` (e.g. NumPy arrays or pandas Series).
    """
    residual = x - y
    return abs(residual).mean()

def split_vals(a,n):
    """Split `a` at position `n` into two independent copies.

    Returns a tuple `(a[:n].copy(), a[n:].copy())`; `a` must support slicing
    and the slices must support `.copy()`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

def print_score(m):
    """Print a list of fit metrics for the fitted model `m`.

    Reads the notebook-level globals X_train, y_train, X_valid and y_valid.
    The printed list is, in order: imae on the training set, imae on the
    validation set, `m.score` on the training set, `m.score` on the
    validation set (R^2 for scikit-learn regressors), and `m.oob_score_`
    when the model has that attribute.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    scores = []
    for features, target in splits:
        scores.append(imae(m.predict(features), target))
    for features, target in splits:
        scores.append(m.score(features, target))
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric column of `df` with integer category codes.

    Parameters
    ----------
    df : DataFrame modified in place.
    col : the column (Series) to inspect.
    name : label under which the encoded values are stored in `df`.
    max_n_cat : if None, every non-numeric column is encoded; otherwise only
        columns with more than `max_n_cat` distinct categories are encoded and
        the rest are left untouched.

    Codes are shifted by +1, so missing values (categorical code -1) become 0.
    Numeric columns are never modified.
    """
    if pd.api.types.is_numeric_dtype(col):
        return
    # Build the categorical once and read the categories from it. The previous
    # `col.cat.categories` raised AttributeError for object/string columns,
    # which have no `.cat` accessor, whenever max_n_cat was given.
    # pd.Categorical keeps the existing categories of a category-dtype column.
    as_categorical = pd.Categorical(col)
    if max_n_cat is None or len(as_categorical.categories) > max_n_cat:
        df[name] = as_categorical.codes+1

def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column and record the fill value.

    Non-numeric columns are ignored. A numeric column is processed when it
    contains at least one null or when `name` is already a key of `na_dict`:
    a boolean `<name>_na` indicator column is added to `df`, nulls are filled
    with `na_dict[name]` if present (otherwise the column median), and the
    fill value is stored back into `na_dict`.

    `df` and `na_dict` are both modified in place; `na_dict` is returned.
    """
    if not pd.api.types.is_numeric_dtype(col):
        return na_dict

    null_mask = pd.isnull(col)
    if not null_mask.sum() and name not in na_dict:
        return na_dict

    df[name+'_na'] = null_mask
    if name in na_dict:
        fill_value = na_dict[name]
    else:
        fill_value = col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict

def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Turn a raw DataFrame into an all-numeric feature frame plus target.

    The input `df` is never modified: the function works on a copy (or on a
    random sample of `subset` rows obtained through get_sample).

    Parameters
    ----------
    df : source DataFrame.
    y_fld : name of the target column, or None for no target. A non-numeric
        target is replaced by its categorical codes.
    skip_flds : columns to drop before processing. The caller's list is not
        modified.
    ignore_flds : columns set aside untouched and concatenated back in front
        of the processed columns.
    do_scale : if true, scale_vars(df, mapper) is called and its result is
        appended to the return value. scale_vars is not defined in this file;
        presumably it comes from the fastai.structured star import.
    na_dict : mapping column name -> fill value reused by fix_missing. It is
        copied, so the caller's dict is not modified.
    preproc_fn : optional callable applied to the working frame (called for
        its side effects; the return value is ignored).
    max_n_cat : forwarded to numericalize; columns it leaves non-numeric are
        one-hot encoded by pd.get_dummies(dummy_na=True).
    subset : if truthy, number of rows to sample instead of using all rows.
    mapper : forwarded to scale_vars when do_scale is set.

    Returns
    -------
    list : [df, y, na_dict], plus the mapper as a fourth item when do_scale
    is set.
    """
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not pd.api.types.is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        # Build a new list: the previous `skip_flds += [y_fld]` appended to the
        # caller's own list, so the target name leaked into it after the call.
        skip_flds = skip_flds + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # When fill values were supplied, drop the `_na` indicators of columns that
    # were not part of the supplied dict, so no extra indicator columns appear.
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

def display_all(df):
    """Display `df` with the row and column display limits raised to 1000.

    The pandas options are changed only for the duration of the call.
    """
    wide_view = pd.option_context("display.max_rows", 1000, "display.max_columns", 1000)
    with wide_view:
        display(df)

In [5]:
# Directory holding the competition data. Set the PUBG_DATA_DIR environment
# variable to point elsewhere; the default is the original hard-coded location.
# os.path.join(..., '') guarantees a trailing separator, because the loading
# cell builds file names by plain concatenation (PATH + 'train.h5').
PATH = os.path.join(os.environ.get('PUBG_DATA_DIR', '/home/thiago.martinelli/data/kaggle/pubg/'), '')

In [6]:
%%time
data_store = pd.HDFStore(PATH+ 'train.h5')
df = data_store['sorted_df']
data_store.close()

CPU times: user 540 ms, sys: 2.06 s, total: 2.6 s
Wall time: 3min 35s


In [8]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,winPlacePerc_na
matchId,kills,killPlace,groupId,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,0,36,1527915,3556463,1527915,1,0,4,57.32,0,0,1,36,...,557.8,0,0.0,0,0,2256.0,2,1527,0.8889,False
1,0,37,810650,3168556,810650,1,1,2,155.5,0,0,2,37,...,385.8,0,0.0,0,0,3397.0,3,1454,0.7778,False
1,0,38,810650,2886511,810650,1,1,1,256.1,3,0,0,38,...,2202.0,0,0.0,0,0,2400.0,8,1559,0.7778,False
1,0,39,1467244,3443301,1467244,1,0,0,0.0,0,0,0,39,...,0.0,0,0.0,0,0,1402.0,5,1521,0.7407,False
1,0,40,982090,2124464,982090,1,2,1,211.9,0,0,0,40,...,204.1,0,2.489,0,0,2008.0,4,1467,0.6667,False


# l_____________________________________________________________l

In [53]:
# Keep a copy of the groupId column under the name 'gid' (the groupId column
# itself is dropped in the next cell). Note: this mutates df in place.
df['gid'] = df.loc[:, 'groupId']

In [54]:
# Remove the groupId column in place. The displayed frame also carries an
# index level named groupId, which is presumably what the following groupby
# resolves to once the column is gone.
df.drop(columns='groupId', inplace=True)

In [55]:
# One row per group: the column-wise maximum over every row of that groupId.
df_groups = df.groupby(by='groupId').max()

In [59]:
# Preview only: the re-indexed frame is displayed but not assigned, so
# df_groups itself is unchanged. drop=False keeps the key columns as columns.
df_groups.set_index(keys=['matchId', 'kills', 'killPlace', 'gid'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Id,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,...,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,winPlacePerc_na,gid
matchId,kills,killPlace,gid,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
19098,1,80,1,3047918,19098,0,1,298.00,2,1,1,80,0,...,0,0.000,0,0,350.80,3,0,0.2000,False,1
10211,1,40,2,1878751,10211,0,0,173.70,0,0,0,40,0,...,0,0.000,0,0,135.80,3,0,0.1154,False,2
23312,0,72,3,330903,23312,0,0,0.00,0,0,0,72,0,...,0,0.000,0,0,110.80,2,0,0.2000,False,3
22388,3,25,4,2849119,22388,1,5,579.10,4,1,5,25,0,...,0,77.270,0,0,2322.00,7,0,0.9615,False,4
17920,1,75,5,3037779,17920,1,0,65.70,1,1,0,75,1013,...,0,0.000,0,0,103.20,3,1501,0.1395,False,5
22576,2,39,6,1911925,22576,1,4,187.90,2,0,11,39,1000,...,0,19.670,0,0,2559.00,7,1500,0.8387,False,6
43738,0,69,7,1738115,43738,0,0,0.00,0,0,0,69,998,...,0,0.000,0,0,174.80,1,1494,0.3673,False,7
40360,0,63,8,689404,40360,0,0,0.00,0,0,0,63,0,...,0,0.000,0,0,61.08,3,0,0.3736,False,8
22402,1,54,9,3979264,22402,1,5,273.80,1,0,1,54,0,...,0,106.800,0,0,3017.00,10,0,0.7778,False,9
17015,0,48,10,1970887,17015,0,1,0.00,0,0,0,48,1000,...,0,5.088,0,0,1495.00,4,1500,0.8298,False,10


In [29]:
# Column subset used for the single-match experiment below.
# An earlier cell (when run top to bottom) copies the groupId column to 'gid'
# and drops the original, which made the plain selection of 'groupId' raise
# KeyError on Restart & Run All. Select whichever of the two is present and
# expose it under the name 'groupId' either way.
_group_col = 'groupId' if 'groupId' in df.columns else 'gid'
df_cut = (
    df[['matchId', 'kills', 'killPlace', _group_col, 'numGroups', 'maxPlace', 'winPlacePerc']]
    .rename(columns={'gid': 'groupId'})
)

In [30]:
# Inspect the first 50 rows of the column subset.
df_cut.head(n=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,matchId,kills,killPlace,groupId,numGroups,maxPlace,winPlacePerc
matchId,kills,killPlace,groupId,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,36,1527915,1,0,36,1527915,28,28,0.8889
1,0,37,810650,1,0,37,810650,28,28,0.7778
1,0,38,810650,1,0,38,810650,28,28,0.7778
1,0,39,1467244,1,0,39,1467244,28,28,0.7407
1,0,40,982090,1,0,40,982090,28,28,0.6667
1,0,41,982090,1,0,41,982090,28,28,0.6667
1,0,42,982090,1,0,42,982090,28,28,0.6667
1,0,43,982090,1,0,43,982090,28,28,0.6667
1,0,44,982090,1,0,44,982090,28,28,0.6667
1,0,45,1131961,1,0,45,1131961,28,28,0.6296


In [40]:
# (number of rows, number of columns) of the column subset.
df_cut.shape

(4446966, 7)

In [44]:
# First 100000 rows by position (not referenced again in the cells visible here).
df_cut_less = df_cut.iloc[:100000]
# Rows whose matchId column equals 1.
df_cut_match1 = df_cut.loc[df_cut['matchId'] == 1]

In [48]:
# Baseline experiment: fit a random forest on the rows of match 1 only.
df_tr = df_cut_match1.drop(['winPlacePerc'],axis=1)
y_tr = df_cut_match1['winPlacePerc']

# Hold out the last 10 rows of the match for validation. (The previous comment
# claimed this matched Kaggle's test set size, which is not the case for 10.)
n_valid = 10
n_tr = len(df_tr)-n_valid
X_train, X_valid = split_vals(df_tr, n_tr)
y_train, y_valid = split_vals(y_tr, n_tr)

# Fixed random_state so the printed scores are reproducible across kernel
# restarts; without it every run trains a different forest.
m = RandomForestRegressor(n_jobs=-1, n_estimators = 500, random_state=42)
m.fit(X_train, y_train)
print_score(m)

[0.020353402352940878, 0.15295897999999886, 0.9784830558696082, 0.31777076431409246]
