In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer,  mean_absolute_error

from scipy.stats.distributions import uniform

# Define useful functions

In [2]:
def getCorrelationMatrix(df, method="pearson"):
    corr = df.corr(method=method)
    
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5},
               annot=True, fmt=".2f")

    
    return corr

# Read data

In [3]:
trainingDataPath = "D:/data/kaggle/PUBG/train.csv"

In [4]:
trainingData = pd.read_csv(trainingDataPath)

# Feature engineering

Kaggle page:
* https://www.kaggle.com/c/pubg-finish-placement-prediction

#### Problem description
In a PUBG game, up to 100 players start in each match (matchId). Players can be on teams (groupId) which get ranked at the end of the game (winPlacePerc) based on how many other teams are still alive when they are eliminated. In game, players can pick up different munitions, revive downed-but-not-out (knocked) teammates, drive vehicles, swim, run, shoot, and experience all of the consequences -- such as falling too far or running themselves over and eliminating themselves.

You are provided with a large number of anonymized PUBG game stats, formatted so that each row contains one player's post-game stats. The data comes from matches of all types: solos, duos, squads, and custom; there is no guarantee of there being 100 players per match, nor at most 4 player per group.

You must create a model which predicts players' finishing placement based on their final stats, on a scale from 1 (first place) to 0 (last place).

### Data
1. train.csv - set for model training and evaluation
1. test.csv - set for testing model predictions

#### Data dictionary
* DBNOs - Number of enemy players knocked.
* assists - Number of enemy players this player damaged that were killed by teammates.
* boosts - Number of boost items used.
* damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
* headshotKills - Number of enemy players killed with headshots.
* heals - Number of healing items used.
* killPlace - Ranking in match of number of enemy players killed.
* killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.)
* killStreaks - Max number of enemy players killed in a short amount of time.
* kills - Number of enemy players killed.
* longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
* matchId - Integer ID to identify match. There are no matches that are in both the training and testing set.
* revives - Number of times this player revived teammates.
* rideDistance - Total distance traveled in vehicles measured in meters.
* roadKills - Number of kills while in a vehicle.
* swimDistance - Total distance traveled by swimming measured in meters.
* teamKills - Number of times this player killed a teammate.
* vehicleDestroys - Number of vehicles destroyed.
* walkDistance - Total distance traveled on foot measured in meters.
* weaponsAcquired - Number of weapons picked up.
* winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.)
* groupId - Integer ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
* numGroups - Number of groups we have data for in the match.
* maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
* winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [5]:
trainingData.dtypes

Id                   int64
groupId              int64
matchId              int64
assists              int64
boosts               int64
damageDealt        float64
DBNOs                int64
headshotKills        int64
heals                int64
killPlace            int64
killPoints           int64
kills                int64
killStreaks          int64
longestKill        float64
maxPlace             int64
numGroups            int64
revives              int64
rideDistance       float64
roadKills            int64
swimDistance       float64
teamKills            int64
vehicleDestroys      int64
walkDistance       float64
weaponsAcquired      int64
winPoints            int64
winPlacePerc       float64
dtype: object

In [6]:
trainingData.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,0,24,0,0,5,247.3,2,0,4,17,...,1,591.3,0,0.0,0,0,782.4,4,1458,0.8571
1,1,440875,1,1,0,37.65,1,1,0,45,...,0,0.0,0,0.0,0,0,119.6,3,1511,0.04
2,2,878242,2,0,1,93.73,1,0,2,54,...,1,0.0,0,0.0,0,0,3248.0,5,1583,0.7407
3,3,1319841,3,0,0,95.88,0,0,0,86,...,0,0.0,0,0.0,0,0,21.49,1,1489,0.1146
4,4,1757883,4,0,1,0.0,0,0,1,58,...,0,0.0,0,0.0,0,0,640.8,4,1475,0.5217


In [7]:
trainingData.describe()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,...,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0
mean,3102678.0,1024777.0,23855.39,0.265632,0.9636856,132.9033,0.6901455,0.2385866,1.187169,47.0344,...,0.1649345,423.8835,0.002557985,4.136261,0.01388555,0.005146264,1055.122,3.457289,1500.504,0.4718663
std,1797477.0,696719.7,13782.27,0.634216,1.560643,169.9439,1.191514,0.6103033,2.366389,27.32772,...,0.4672004,1222.927,0.06346679,27.57015,0.1329266,0.07425362,1116.122,2.402109,42.53571,0.3079147
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,350.0,0.0
25%,1537746.0,474137.8,11914.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,133.4,2.0,1491.0,0.1979
50%,3110606.0,943032.0,23837.0,0.0,0.0,87.76,0.0,0.0,0.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,573.2,3.0,1500.0,0.4583
75%,4657295.0,1418544.0,35801.0,0.0,1.0,188.4,1.0,0.0,1.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1808.0,5.0,1510.0,0.7407
max,6224522.0,2700605.0,47733.0,20.0,18.0,6384.0,63.0,26.0,59.0,100.0,...,41.0,48390.0,42.0,5286.0,6.0,5.0,17300.0,76.0,1923.0,1.0


In [8]:
sampleTrainingData = trainingData[:300000]

In [9]:
sampleTrainingData.describe()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,...,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,204389.025837,1037695.0,23092.860627,0.332613,1.081897,158.625853,0.830127,0.295047,1.331657,43.444277,...,0.186507,471.597186,0.00305,4.278871,0.013797,0.005497,1121.249752,3.685047,1504.713457,0.495785
std,123510.767527,719083.5,13949.010083,0.760309,1.656428,205.733674,1.445004,0.717985,2.522066,27.664437,...,0.502118,1311.253915,0.062188,27.778589,0.132463,0.076767,1146.533271,2.865617,39.148813,0.31297
min,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,385.0,0.0
25%,95457.75,464856.2,10714.75,0.0,0.0,19.35,0.0,0.0,0.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,149.3,2.0,1493.0,0.2151
50%,211371.5,947969.0,22733.0,0.0,0.0,100.0,0.0,0.0,0.0,41.0,...,0.0,0.0,0.0,0.0,0.0,0.0,661.4,3.0,1500.0,0.4891
75%,306828.25,1437213.0,35233.25,0.0,2.0,213.7,1.0,0.0,2.0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1925.0,5.0,1514.0,0.7778
max,422742.0,2700605.0,47733.0,16.0,15.0,4981.0,34.0,20.0,55.0,100.0,...,18.0,48390.0,6.0,1956.0,6.0,3.0,13250.0,67.0,1917.0,1.0


# Ideas

When do I get beat down in games?
* When everyone else is better
* When my group is inexperienced
* When my team is not well balanced
* When my team is smaller than the competition
* playing an unfamiliar character, weapon, location

When do I win?
* When we have a strong combo of ultimate skills
* when a new feature or character is released and it is overpowered
* When everyone is clear on the optimal strategy

How to operationalize these?
* Group level
    * user level
        * scale and center features
    * group level aggregation of all features
    * match level aggregation of all features

In [24]:
def engineerFeatures(data):
    
    engineered = data.copy()
    
    ###########################
    # User level features
    
    ########################### 
    # Group level features
    # number of players in group
    # mean score of gropu
    groupLevelFeatures = (
        engineered.groupby(["matchId", "groupId"]).agg(
            {
                "Id": "count",
                "winPoints": "mean"
            }
        )
        .reset_index()
    )
    groupLevelFeatures.rename(
        columns={
            "Id": "numPlayersInGroup",
            "winPoints": "meanGroupWinPoints"
        }, 
        inplace=True
    )
    
    engineered = pd.merge(
        engineered,
        groupLevelFeatures,
        on=["matchId", "groupId"],
        how="left"
        
    )
    
    ###########################
    # Match level features
    
    matchLevelFeatures = (
        engineered.groupby(["matchId"]).agg(
            {
                "Id": "count",
                "winPoints": "mean"
            }
        )
        .reset_index()
    )
    matchLevelFeatures.rename(
        columns={
            "Id": "numPlayersInMatch",
            "winPoints": "meanMatchWinPoints"
        }, 
        inplace=True
    )
    
    engineered = pd.merge(
        engineered,
        matchLevelFeatures,
        on=["matchId"],
        how="left"
        
    )
    
    return engineered

In [25]:
engineered = engineerFeatures(sampleTrainingData)

In [26]:
engineered.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,meanGroupWinPoints,numPlayersInGroup,meanMatchWinPoints,numPlayersInMatch
0,0,24,0,0,5,247.3,2,0,4,17,...,0,0,782.4,4,1458,0.8571,1457.5,2,1479.0,7
1,1,440875,1,1,0,37.65,1,1,0,45,...,0,0,119.6,3,1511,0.04,1511.0,1,1543.857143,7
2,2,878242,2,0,1,93.73,1,0,2,54,...,0,0,3248.0,5,1583,0.7407,1574.333333,3,1519.857143,7
3,3,1319841,3,0,0,95.88,0,0,0,86,...,0,0,21.49,1,1489,0.1146,1489.0,1,1494.857143,7
4,4,1757883,4,0,1,0.0,0,0,1,58,...,0,0,640.8,4,1475,0.5217,1493.5,2,1507.714286,7
