# Part 1

In [1]:
import numpy as np
import pandas as pd
from fastai.imports import *
from fastai.structured import *
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error as mae

In [2]:
def get_sample(df,n):
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()

In [3]:
def imae(x,y): return (abs(x-y)).mean()

In [4]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

In [5]:
def print_score(m):
    maer = [imae(m.predict(X_train), y_train), imae(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): maer.append(m.oob_score_)
    print(maer)

In [6]:
def numericalize(df, col, name, max_n_cat):
    if not  pd.api.types.is_numeric_dtype(col) and ( max_n_cat is None or len(col.cat.categories)>max_n_cat):
        df[name] = pd.Categorical(col).codes+1

In [7]:
def fix_missing(df, col, name, na_dict):
    if  pd.api.types.is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

In [8]:
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not pd.api.types.is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [9]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

# Part 2

In [10]:
TRAIN_FILE='/home/thiago.martinelli/code/main/data/pubg/train_V2.csv'
TEST_FILE='/home/thiago.martinelli/code/main/data/pubg/test_V2.csv'

In [12]:
df_raw = pd.read_csv(TRAIN_FILE, engine='python')

In [13]:
train_cats(df_raw)

In [14]:
df1 = df_raw.set_index('matchId',drop=False)

In [15]:
dfr=df_raw

In [16]:
df3 = df1.copy()

In [17]:
ind = df1.index.value_counts()
dind = dict(ind)

In [18]:
df2 = df1.sort_index()
df3 = df2.iloc[:55500,:]
matches=dict(df3.index.value_counts())

In [45]:
df29=df3.copy()

In [46]:
for match in set(df3.index):
    if match not in m:
        print(match)
        df3.drop(match,axis=0,inplace=True)

01cc8f9b15d3d6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


02cf1db07c3934
0195fbe135716e
001e7bc06b1611
0287a3001f1447
02f656fa780115
02ca9ace246f02
031c7b39718669
03044c944dde28
034a385dd060d4
0314fb2fb89340
02954a1ce681c0
02db72db1ecd2d
005aca992c9a78
011b0592429c85
02dba39585334b
013bead1c1736c
007edf38197cec
01ce13ad81c6f2
003c2b333747d9
0104eeb664494d
02d17b99f39a48
01ca563b808a48
009505123eaceb
011f1afe2db83d
030ce58f65df30
0101ab001bb2c2
00b217e69ba569
029e7db020600e
01aead02bb8901
02b4a6e639a609
01ee716430b23a
00930789f7075f
017563d39f54b4
01794e4f556206
01370c58128112
02fa6b06351447
02e566e5b8618f
00c0586a397e12
00f2dad24ae3a1
030eff2bfd4ad1
019e9035fb8c79
016d03d56d3dc1
0211af1f386a89
00a4afdb911815
01530800ed9120
020b0802aba439
01ad9ab6e931b8
02ceab5f102b39
0165d05e93e628
014f0eac673783
015c7dc3bfecf6
03445f606ff7f2
00f600051f84ac
01435d33376fd0
00df303460dfe6
016c4b1866bece
0249eb2549b198
001cd8e7e6b737
0027b1ffb2e346
00bf40fb9aa15c
023e7d5684fe09
01223b3a01f9dc
029b8d960efd17


In [47]:
def rank(a):
    c=list(a)
    b = np.asarray(list(set(np.sort(c))))
    for n,i in enumerate(a):
        a[n] = np.where(b==i)[0]
    return a
  
    

In [48]:
def norma(a):
    for n,i in enumerate(a):
        a[n] = float(a[n] / a.max())
    return a

In [49]:
def feat_eng(df,cols):
    newdf = pd.DataFrame()
    %time
    count=0
    for i, ndf in df.groupby(level=0):
        count+=1
        print(count, end='\r')
        for c in cols:
            serie=ndf[c]
            rank(np.asarray(serie))
            ndf[c + 'Pos'] = serie
        newdf = pd.concat( (newdf, ndf), axis=0)
    return newdf

In [50]:
def feat_eng2(df,cols):
    newdf = pd.DataFrame()
    %time
    count=0
    for mi in set(df['matchId']):
        ndf = df[df['matchId']==mi]
        count+=1
        print(count, end='\r')
        for c in cols:
            serie=ndf[c]
            serie=rank(np.asarray(serie))
            #serie=norma(serie)
            ndf[c + 'Pos'] = serie
        newdf = pd.concat( (newdf, ndf), axis=0)
    return newdf

In [56]:
def feat_eng3(df,cols):
    newdf = pd.DataFrame()
    %time
    count=0
    #for mi in set(df.index):
    for c in cols:
        df[c]=df.groupby('matchId').apply(rank)
    return newdf

In [52]:
%time dfa, _, _ = proc_df(df3)
mt = dfa.copy()

CPU times: user 212 ms, sys: 20 ms, total: 232 ms
Wall time: 239 ms


In [53]:
c1=list(['killStreaks'])

In [54]:
mt=mt.drop('matchId',axis=1)

In [57]:
df6=feat_eng3(mt,c1)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 21.9 µs


ValueError: Length of values does not match length of index

In [101]:
cols = list(['assists', 'boosts', 'damageDealt', 'DBNOs', \
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', \
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace', \
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills', \
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', \
       'weaponsAcquired', 'winPoints'])


In [120]:
c2=list(['headshotKills','maxPlace'])

In [122]:
mt2=mt.iloc[:1000,:]

In [123]:
newdf = feat_eng2(mt2,c2)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.6 µs
1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


11

In [124]:
display_all(newdf.head(100).T)

matchId,0000a43bce5eec,0000a43bce5eec.1,0000a43bce5eec.2,0000a43bce5eec.3,0000a43bce5eec.4,0000a43bce5eec.5,0000a43bce5eec.6,0000a43bce5eec.7,0000a43bce5eec.8,0000a43bce5eec.9,0000a43bce5eec.10,0000a43bce5eec.11,0000a43bce5eec.12,0000a43bce5eec.13,0000a43bce5eec.14,0000a43bce5eec.15,0000a43bce5eec.16,0000a43bce5eec.17,0000a43bce5eec.18,0000a43bce5eec.19,0000a43bce5eec.20,0000a43bce5eec.21,0000a43bce5eec.22,0000a43bce5eec.23,0000a43bce5eec.24,0000a43bce5eec.25,0000a43bce5eec.26,0000a43bce5eec.27,0000a43bce5eec.28,0000a43bce5eec.29,0000a43bce5eec.30,0000a43bce5eec.31,0000a43bce5eec.32,0000a43bce5eec.33,0000a43bce5eec.34,0000a43bce5eec.35,0000a43bce5eec.36,0000a43bce5eec.37,0000a43bce5eec.38,0000a43bce5eec.39,0000a43bce5eec.40,0000a43bce5eec.41,0000a43bce5eec.42,0000a43bce5eec.43,0000a43bce5eec.44,0000a43bce5eec.45,0000a43bce5eec.46,0000a43bce5eec.47,0000a43bce5eec.48,0000a43bce5eec.49,0000a43bce5eec.50,0000a43bce5eec.51,0000a43bce5eec.52,0000a43bce5eec.53,0000a43bce5eec.54,0000a43bce5eec.55,0000a43bce5eec.56,0000a43bce5eec.57,0000a43bce5eec.58,0000a43bce5eec.59,0000a43bce5eec.60,0000a43bce5eec.61,0000a43bce5eec.62,0000a43bce5eec.63,0000a43bce5eec.64,0000a43bce5eec.65,0000a43bce5eec.66,0000a43bce5eec.67,0000a43bce5eec.68,0000a43bce5eec.69,0000a43bce5eec.70,0000a43bce5eec.71,0000a43bce5eec.72,0000a43bce5eec.73,0000a43bce5eec.74,0000a43bce5eec.75,0000a43bce5eec.76,0000a43bce5eec.77,0000a43bce5eec.78,0000a43bce5eec.79,0000a43bce5eec.80,0000a43bce5eec.81,0000a43bce5eec.82,0000a43bce5eec.83,0000a43bce5eec.84,0000a43bce5eec.85,0000a43bce5eec.86,0000a43bce5eec.87,0000a43bce5eec.88,0000a43bce5eec.89,0000a43bce5eec.90,0000a43bce5eec.91,0000a43bce5eec.92,0000a43bce5eec.93,0000a43bce5eec.94,0000eb01ea6cdd,0000eb01ea6cdd.1,0000eb01ea6cdd.2,0000eb01ea6cdd.3,0000eb01ea6cdd.4
Id,4443029.0,331987.0,293764.0,415831.0,3860556.0,1068895.0,1616288.0,2449703.0,1257083.0,2665784.0,1742702.0,617164.0,4335817.0,4308171.0,2957133.0,421552.0,3642291.0,4384175.0,1909818.0,3571300.0,2067946.0,4043874.0,1602834.0,4264765.0,1209189.0,906929.0,1741871.0,293110.0,2133090.0,799636.0,91475.0,3996233.0,2017131.0,3168556.0,2624957.0,3549925.0,230214.0,2437963.0,2005776.0,3502335.0,3653795.0,3443301.0,3964037.0,35174.0,38839.0,4264347.0,1538612.0,2886511.0,1567446.0,2739580.0,923306.0,3124751.0,1393298.0,4024426.0,2427663.0,2092852.0,1982436.0,1500499.0,901205.0,913473.0,3624573.0,478151.0,2882225.0,2318280.0,2798432.0,3675110.0,1419566.0,685979.0,1422029.0,1872709.0,2562149.0,390180.0,2479088.0,2925295.0,3556463.0,2124464.0,3801216.0,4040552.0,1594177.0,3738702.0,2585121.0,3722148.0,728674.0,1716296.0,2992680.0,2471510.0,596448.0,1202292.0,1789834.0,4416604.0,1924356.0,670964.0,3799990.0,4396726.0,1999491.0,2090921.0,674968.0,4203934.0,1019291.0,3392996.0
groupId,711556.0,1199250.0,612351.0,280836.0,1244160.0,280836.0,1427215.0,711556.0,982090.0,1199250.0,859315.0,1599340.0,1095869.0,1147078.0,1844836.0,280836.0,1839290.0,463843.0,1599340.0,1147078.0,1527915.0,1641338.0,1916779.0,1147078.0,711556.0,1131961.0,1131961.0,1803227.0,463843.0,1244160.0,1244160.0,1595120.0,1310968.0,810650.0,1746278.0,1527915.0,1147078.0,939972.0,1427215.0,1147078.0,1095869.0,1467244.0,280836.0,982090.0,195603.0,1803227.0,1599340.0,810650.0,280836.0,612351.0,1199250.0,1310968.0,1199250.0,280836.0,939972.0,810650.0,1746278.0,1095869.0,1516323.0,1641338.0,1839290.0,1844836.0,939972.0,1746278.0,982090.0,1244160.0,859315.0,1244160.0,1839290.0,1839290.0,810650.0,1844836.0,1467244.0,982090.0,1527915.0,982090.0,1199250.0,603064.0,1803227.0,1147078.0,1599340.0,1427215.0,603064.0,1199250.0,1310968.0,1641338.0,1844836.0,939972.0,195603.0,1916779.0,1916779.0,859315.0,1527915.0,1916779.0,1131961.0,239209.0,221336.0,1841579.0,173815.0,1873688.0
matchId,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0
assists,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
boosts,3.0,0.0,3.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,1.0,0.0,3.0,2.0,3.0,3.0,0.0,0.0,0.0,4.0,2.0,2.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,1.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,7.0,2.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,1.0,0.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,1.0,2.0,1.0,3.0,3.0,3.0,2.0,0.0,1.0,0.0,0.0,3.0
damageDealt,19.35,0.0,151.8,0.0,74.88,78.69,121.6,80.48,246.3,23.22,629.9,983.7,0.0,49.5,0.0,121.3,471.2,0.0,810.3,61.88,129.5,171.6,54.03,0.0,19.35,396.2,168.4,428.2,0.0,45.82,29.4,77.34,661.6,155.5,0.0,324.8,100.0,0.0,0.0,185.3,70.12,0.0,21.29,0.0,200.0,368.9,375.6,256.1,66.65,530.1,0.0,107.0,0.0,0.0,81.9,82.2,0.0,200.0,155.8,90.67,100.0,0.0,0.0,0.0,0.0,54.18,289.7,2.191,233.7,32.87,276.8,0.0,242.4,0.0,57.32,211.9,91.65,23.4,319.8,178.4,196.6,101.0,0.0,0.0,432.4,272.7,0.0,18.1,19.35,78.0,25.04,333.7,281.2,189.9,0.0,11.28,0.0,300.0,0.0,78.72
DBNOs,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,4.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,4.0,3.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,4.0,2.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0
headshotKills,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
heals,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,2.0,2.0,0.0,0.0,2.0,4.0,2.0,0.0,5.0,0.0,0.0,2.0,2.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,3.0,0.0,2.0,5.0,2.0,4.0,2.0,5.0,0.0,0.0,1.0
killPlace,48.0,83.0,23.0,90.0,63.0,89.0,57.0,46.0,43.0,81.0,4.0,2.0,53.0,72.0,85.0,88.0,10.0,95.0,3.0,74.0,26.0,19.0,30.0,71.0,47.0,12.0,29.0,7.0,94.0,32.0,65.0,54.0,9.0,37.0,77.0,13.0,35.0,68.0,58.0,34.0,52.0,39.0,92.0,42.0,21.0,8.0,6.0,38.0,93.0,5.0,80.0,27.0,78.0,91.0,67.0,15.0,75.0,31.0,14.0,55.0,69.0,86.0,66.0,76.0,41.0,64.0,17.0,62.0,22.0,70.0,16.0,87.0,28.0,44.0,36.0,40.0,82.0,59.0,11.0,73.0,24.0,56.0,60.0,79.0,1.0,20.0,84.0,33.0,61.0,50.0,49.0,18.0,25.0,51.0,45.0,79.0,72.0,11.0,58.0,24.0


In [126]:
df_tr = newdf.drop(['winPlacePerc'],axis=1)
y_tr = newdf['winPlacePerc']

n_valid = 100  # same as Kaggle's test set size
n_tr = len(df_tr)-n_valid
#raw_train, raw_valid = split_vals(df_raw, n_tr)
X_train, X_valid = split_vals(df_tr, n_tr)
y_train, y_valid = split_vals(y_tr, n_tr)
X_train.shape, y_train.shape, X_valid.shape

m = RandomForestRegressor(n_jobs=-1, n_estimators = 50 )
m.fit(X_train, y_train)
print_score(m)

[0.02175403460670194, 0.05691404314285713, 0.9868445674673793, 0.9112186780436199]
