In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Collect the names found in the input directory, then echo them
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training CSV from the input directory into a DataFrame
train_path = '../input/train_V2.csv'
train = pd.read_csv(train_path)

In [None]:
# Display the first rows of the freshly loaded frame
train.head()

**Feature importance**

In [None]:
# Count the distinct values of the matchType column and report the total
n_match_types = train['matchType'].nunique()
print('Match types in the dataset: {}'.format(n_match_types))

In [None]:
# Replace matchType with one indicator column per match type
train = pd.get_dummies(data=train, columns=['matchType'])
# Select the generated indicator columns (their names contain 'matchType') for a quick look
matchType_onehot = train.filter(regex='matchType')
matchType_onehot.head()

In [None]:
# Encode groupId and matchId as integer category codes.
# Each source column is cast to 'category', its codes are stored in a new
# '<name>_c' column, and the original identifier columns are then removed.
for id_col, code_col in (('groupId', 'groupId_c'), ('matchId', 'matchId_c')):
    train[id_col] = train[id_col].astype('category')
    train[code_col] = train[id_col].cat.codes
train.drop(columns=['groupId', 'matchId'], inplace=True)
train[['groupId_c', 'matchId_c']].head()

In [None]:
# Drop Id variable and take sample
train.drop(columns = ['Id'], inplace=True)
# Number of rows drawn from the full training frame
sample = 500000
# A fixed seed makes the drawn rows identical on every run, so the
# train/validation split and all scores below are reproducible.
df_sample = train.sample(n=sample, random_state=42)

In [None]:
# Separate the sampled rows into the target series and the feature frame
target_col = 'winPlacePerc'
y = df_sample[target_col]                  # target only
df = df_sample.drop(columns=[target_col])  # every column except the target

In [None]:
# Helper used to cut data into a leading (training) and trailing (validation) part
def split_vals(a, n : int):
    """Return copies of the first ``n`` items of ``a`` and of the remainder.

    ``a`` only needs to support slicing and expose ``.copy()`` on the slices.
    The result is the tuple ``(a[:n].copy(), a[n:].copy())``.
    """
    head, tail = a[:n], a[n:]
    return head.copy(), tail.copy()
# Fraction of the rows held out for validation
perc_valid = 0.12
# Size the hold-out from the frame that is actually split, so n_valid and
# n_trn always add up to len(df) even if len(df) differs from `sample`.
n_valid = int(perc_valid * len(df))
n_trn = len(df)-n_valid
# Split data: the first n_trn rows are used for training, the rest for validation
raw_train, raw_valid = split_vals(df_sample, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

print('Sample train shape: ', X_train.shape, 
      'Sample target shape: ', y_train.shape, 
      'Sample validation shape: ', X_valid.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function to print the MAE (Mean Absolute Error) score
def print_score(m : RandomForestRegressor):
    """Print the training and validation MAE of a fitted model.

    The data is not passed in: the function reads the notebook-level
    ``X_train``, ``y_train``, ``X_valid`` and ``y_valid`` at call time, so it
    always scores against whatever those names currently refer to.

    If ``m`` has an ``oob_score_`` attribute, its value is appended to the
    printed list.
    """
    # mean_absolute_error is documented as (y_true, y_pred); keep that order so
    # the call stays correct if the metric is ever swapped for an asymmetric one.
    res = ['mae train: ', mean_absolute_error(y_train, m.predict(X_train)), 
           'mae val: ', mean_absolute_error(y_valid, m.predict(X_valid))]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [None]:
# Basic model
# random_state pins the bootstrap samples and per-split feature draws, so the
# scores and the feature importances derived from this model are reproducible.
model_1 = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features='sqrt',
                                n_jobs=-1, random_state=42)
model_1.fit(X_train, y_train)
print_score(model_1)

In [None]:
# Rank the features by the importance the basic model assigns to them
# NOTE(review): later cells use `plt` and `sns` without importing them;
# presumably they are supplied by these wildcard imports — confirm.
from fastai.imports import *
from fastai.structured import *
fi = rf_feat_importance(model_1, df)
# Show the ten highest-ranked rows
fi[:10]

In [None]:
# Horizontal bar chart of the first 20 rows of the importance table
top_features = fi[:20]
plot_1 = top_features.plot(
    x='cols',
    y='imp',
    kind='barh',
    figsize=(14,6),
    legend=False,
    color='#2976bb',
)
plot_1

In [None]:
# Select the names of features whose importance exceeds the 0.005 cut-off
is_significant = fi.imp > 0.005
to_keep = fi[is_significant].cols
print('Significant features: ', len(to_keep))
to_keep

In [None]:
# Restrict the feature frame to the significant columns and redo the split.
# X_train / X_valid are rebound here, so print_score now scores against the
# reduced feature set.
keep_cols = to_keep.tolist()
data_keep = df[keep_cols].copy()
X_train, X_valid = split_vals(data_keep, n_trn)

In [None]:
# Train model on top features
# random_state pins the bootstrap samples and per-split feature draws, so this
# model's scores and feature importances are reproducible across runs.
model_2 = RandomForestRegressor(n_estimators=80, min_samples_leaf=3, max_features='sqrt',
                                n_jobs=-1, random_state=42)
model_2.fit(X_train, y_train)
print_score(model_2)

In [None]:
# Importance table for the reduced model, drawn as a horizontal bar chart
fi_to_keep = rf_feat_importance(model_2, data_keep)
plot_2 = fi_to_keep.plot(
    x='cols',
    y='imp',
    kind='barh',
    figsize=(14,6),
    legend=False,
    color='#2976bb',
)
plot_2

**Correlations**

In [None]:
from scipy.cluster import hierarchy as hc
# Imported explicitly so that `scipy.stats` is guaranteed to be loaded for the
# call below, rather than relying on another import having pulled it in.
import scipy.stats

# Create a Dendrogram to view highly correlated features
# Spearman rank correlation between the kept features, rounded to 4 decimals
corr = np.round(scipy.stats.spearmanr(data_keep).correlation, 4)
# Convert correlation to a distance (1 - corr) in condensed form for linkage()
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(14,10))
dendrogram = hc.dendrogram(z, labels=data_keep.columns, orientation='left', leaf_font_size=16)
# plt.plot() with no data draws nothing and echoes an empty list; render the figure instead
plt.show()

In [None]:
# Heatmap of the pairwise correlations (DataFrame.corr defaults) of the kept features
corr = data_keep.corr()
f, ax = plt.subplots(figsize=(11, 9))
# Draw on the axes created above; each cell is annotated with one decimal place
heatmap = sns.heatmap(
    data=corr,
    annot=True,
    fmt= '.1f',
    linewidths=.5,
    cmap="YlGnBu",
    ax=ax,
)