In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib

%matplotlib inline

import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer,  mean_absolute_error

from scipy.stats.distributions import uniform

# Define runtime parameters

In [2]:
# Switches that gate the expensive or side-effecting cells further down.
# Fit the preprocessing and model pipelines in the training cell.
FIT_MODELS = True
# Dump the fitted pipelines to disk with joblib.
SAVE_FITTED_MODELS = False
# Load previously dumped pipelines from disk with joblib.
LOAD_FITTED_MODELS = False
# Write submission.csv from the group-level predictions.
CREATE_SUBMISSION_CSV = False

In [3]:
# Root folder holding the competition CSVs. It keeps its trailing slash because
# file names are appended to it by plain string concatenation.
WORKING_DIRECTORY = "D:/data/kaggle/PUBG/"
# WORKING_DIRECTORY already ends with "/", so append the bare folder name;
# a leading "/" here produced ".../PUBG//models".
MODEL_SAVE_DIRECTORY = WORKING_DIRECTORY + "models"

# Define useful functions

In [4]:
def createPreprocessingPipeline():
    """Return an unfitted pipeline whose single step standardises the features.

    The scaler is configured to both centre (with_mean) and scale (with_std).
    """
    standardiser = StandardScaler(with_mean=True, with_std=True)
    return make_pipeline(standardiser)

def createModelPipeline(metricsScorer = None, nIter = 10, cvFolds = 3, randomState = None):
    """Build a one-step pipeline that tunes an XGBoost regressor by random search.

    Parameters
    ----------
    metricsScorer : scorer or None
        Passed as ``scoring`` to RandomizedSearchCV.
    nIter : int, default 10
        Number of parameter settings sampled by the search.
    cvFolds : int, default 3
        Passed as ``cv`` to RandomizedSearchCV.
    randomState : int or None, default None
        Passed as ``random_state`` to RandomizedSearchCV; set it to make the
        sampled parameter settings repeatable. None keeps the previous
        unseeded behaviour.

    Returns
    -------
    Pipeline
        Unfitted pipeline whose only step is the RandomizedSearchCV wrapper
        (make_pipeline names that step "randomizedsearchcv").
    """
    # Both regularisation strengths are sampled from uniform(loc=0, scale=1).
    paramDistributions = {
        "reg_alpha": uniform(0.0, 1.0),
        "reg_lambda": uniform(0.0, 1.0)
    }

    # os.cpu_count() returns None when the count cannot be determined; fall
    # back to one worker so the "%i" format and n_jobs both stay valid.
    numCores = os.cpu_count() or 1

    print("number of CPU cores detected = %i" % numCores)

    # Starting point for the regressor; reg_alpha / reg_lambda are the two
    # values the random search overrides.
    initParams =  {
        "max_depth": 3,
        "learning_rate": 0.1,
        "n_estimators": 1000,
        "silent": True,
        "objective": 'reg:linear',
        "booster":'gbtree',
        "n_jobs": numCores,
        "gamma" : 0,
        "min_child_weight" : 1,
        "max_delta_step": 0,
        "subsample" : 1,
        "colsample_bytree" : 1,
        "colsample_bylevel":1,
        "reg_alpha": 0.6975947598968077,
        "reg_lambda": 0.14942377117732686,
        "scale_pos_weight":1,
        "base_score":0.5,
        # GPU training is left disabled:
        # "tree_method": "gpu_hist"
    }

    modelPipeline = make_pipeline(
        RandomizedSearchCV(
            estimator = xgb.XGBRegressor(**initParams),
            param_distributions = paramDistributions,
            n_iter = nIter,
            scoring = metricsScorer,
            cv = cvFolds,
            random_state = randomState,
            refit = True
        ),
    )

    return modelPipeline

# Read data

In [5]:
# Input CSV locations: the file name is appended directly to WORKING_DIRECTORY.
trainingDataPath = "".join([WORKING_DIRECTORY, "train.csv"])
scoringDataPath = "".join([WORKING_DIRECTORY, "test.csv"])

In [6]:
# Load the training and scoring CSVs into DataFrames (training first).
trainingData, scoringData = (
    pd.read_csv(csvPath) for csvPath in (trainingDataPath, scoringDataPath)
)

# Feature engineering

In [7]:
def engineerFeatures(data):
    """Return a copy of ``data`` with group-level and match-level aggregates added.

    Reads the columns "Id", "matchId", "groupId" and "winPoints". Four columns
    are joined back onto every row:

    * numPlayersInGroup / meanGroupWinPoints - row count and mean winPoints
      per (matchId, groupId)
    * numPlayersInMatch / meanMatchWinPoints - row count and mean winPoints
      per matchId

    The input frame is not modified.
    """

    def attachAggregates(frame, keys, countName, meanName):
        # Count rows and average winPoints per key, then left-join the result
        # back so every original row receives its aggregate values.
        aggregates = (
            frame.groupby(keys)
            .agg({"Id": "count", "winPoints": "mean"})
            .reset_index()
            .rename(columns={"Id": countName, "winPoints": meanName})
        )
        return frame.merge(aggregates, on=keys, how="left")

    engineered = data.copy()

    # User level features: none yet.

    # Group level features: number of players in the group, mean score of the group.
    engineered = attachAggregates(
        engineered,
        ["matchId", "groupId"],
        "numPlayersInGroup",
        "meanGroupWinPoints",
    )

    # Match level features: number of players in the match, mean score of the match.
    engineered = attachAggregates(
        engineered,
        ["matchId"],
        "numPlayersInMatch",
        "meanMatchWinPoints",
    )

    return engineered

In [8]:
# Add the group- and match-level aggregate columns to the training rows.
trainingData_engineered = engineerFeatures(trainingData)

# Train test split

Need to do something smarter here:
* split in a way that respects matchId and groupId

Select by matchIds instead

In [9]:
def trainTestSplitByMatchIds(df, matchIdFraction, randomState=None):
    """Split ``df`` into train/test so that every match lands wholly on one side.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns "matchId" and "winPlacePerc".
    matchIdFraction : float
        Fraction (0..1) of the unique match ids assigned to the training set;
        the count is truncated with int().
    randomState : int or None, default None
        Seed for the match-id sample. None draws from NumPy's global random
        state, as before.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames with "winPlacePerc" dropped, and the matching
        "winPlacePerc" Series.

    Raises
    ------
    ValueError
        If ``matchIdFraction`` is outside [0, 1].
    """
    if not 0.0 <= matchIdFraction <= 1.0:
        raise ValueError(
            "matchIdFraction must be between 0 and 1, got %r" % (matchIdFraction,)
        )

    # Work on the frame that was passed in, not on a notebook global.
    uniqueMatchIds = df.matchId.unique()
    numTrainingMatches = len(uniqueMatchIds)
    print("Number of unique matches %i " % numTrainingMatches)

    sampler = np.random if randomState is None else np.random.RandomState(randomState)
    trainMatchIds = sampler.choice(
        uniqueMatchIds,
        size = int(matchIdFraction * numTrainingMatches),
        replace=False
    )

    # One boolean mask drives all four selections so they cannot disagree.
    isTrainRow = df.matchId.isin(trainMatchIds)
    features = df.drop("winPlacePerc", axis="columns")
    target = df["winPlacePerc"]

    X_train = features.loc[isTrainRow]
    y_train = target.loc[isTrainRow]
    X_test = features.loc[~isTrainRow]
    y_test = target.loc[~isTrainRow]

    print("Train X, y sizes ", X_train.shape, y_train.shape)
    print("Test X, y sizes ", X_test.shape, y_test.shape)

    return X_train, X_test, y_train, y_test

In [10]:
# Match-level hold-out split of the engineered training data.
X_train, X_test, y_train, y_test = trainTestSplitByMatchIds(trainingData_engineered, matchIdFraction=0.7)

Number of unique matches 47734 
Train X, y sizes  (3049858, 29) (3049858,)
Test X, y sizes  (1307478, 29) (1307478,)


In [11]:
# Unfitted scaling pipeline; it is fitted later inside fitPipelines.
preprocessingPipeline = createPreprocessingPipeline()

In [12]:
# Score candidates by mean absolute error; greater_is_better=False marks it
# as a loss for the search.
maeScorer = make_scorer(score_func=mean_absolute_error, greater_is_better=False)
xgbPipeline = createModelPipeline(metricsScorer=maeScorer)

number of CPU cores detected = 8


In [13]:
# Show the configured (still unfitted) model pipeline.
xgbPipeline

Pipeline(memory=None,
     steps=[('randomizedsearchcv', RandomizedSearchCV(cv=3, error_score='raise',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimat...
          scoring=make_scorer(mean_absolute_error, greater_is_better=False),
          verbose=0))])

# Fit xgb

In [22]:
def fitPipelines(preprocessingPipeline, modelPipeline, selectedFeatures, X, y, Xtest, ytest):
    """Fit the preprocessing pipeline, then the model pipeline, and report results.

    Parameters
    ----------
    preprocessingPipeline : object with ``fit`` / ``transform``
        Fitted on the selected columns of ``X``.
    modelPipeline : pipeline with a "randomizedsearchcv" step
        Fitted on the preprocessed training data; the fit keyword arguments
        are routed to that step through the ``randomizedsearchcv__`` prefix.
    selectedFeatures : list of str
        Columns of ``X`` / ``Xtest`` used as model inputs.
    X, y : training features and target.
    Xtest, ytest : hold-out features and target, used only in ``eval_set``.

    Returns
    -------
    (fittedPreprocessingPipeline, fittedModelPipeline)
    """
    # Select the model inputs once and reuse them for fit and transform.
    X_selected = X.loc[:, selectedFeatures]
    Xtest_selected = Xtest.loc[:, selectedFeatures]

    # run preprocessing pipeline on raw data
    print("Fit preprocessing pipeline")
    fittedPreprocessingPipeline = preprocessingPipeline.fit(X_selected, y)

    print("Preprocess data")
    X_processed = fittedPreprocessingPipeline.transform(X_selected)
    Xtest_processed = fittedPreprocessingPipeline.transform(Xtest_selected)

    # NOTE(review): the full training set is part of eval_set, so during the
    # cross-validated search each fold presumably also evaluates on rows it
    # was validated against - confirm this is intended.
    eval_set = [
        (X_processed, y),
        (Xtest_processed, ytest)
    ]

    print("Fit model pipeline")
    # run the model fitting pipeline on the processed data
    fittedModelPipeline = modelPipeline.fit(
        X_processed,
        y,
        # xgb options
        randomizedsearchcv__verbose=0,
        randomizedsearchcv__eval_metric="mae",
        randomizedsearchcv__eval_set = eval_set,
        randomizedsearchcv__early_stopping_rounds=5
    )

    # Print out some common eval metrics
    cvResults = fittedModelPipeline.named_steps.get('randomizedsearchcv')
    bestEstimator = cvResults.best_estimator_

    print("best estimator")
    print(bestEstimator)

    print("best score")
    print(cvResults.best_score_)

    print("early stopping")
    print("num trees = %i" % bestEstimator.best_ntree_limit)
    # The early-stopping score is a float; "%i" truncated it to an integer.
    print("best score = %f" % bestEstimator.best_score)
    print("best iteration = %i" % bestEstimator.best_iteration)

    # return the fitted preprocessing and modeling pipelines
    return fittedPreprocessingPipeline, fittedModelPipeline


def evalModel(y_true, y_pred, metric_function):
    """Score predictions with ``metric_function`` and scatter them against the truth.

    Returns ``metric_function(y_true, y_pred)``. As a side effect a new figure
    is opened with predictions on the x axis and ground truth on the y axis.
    """
    score = metric_function(y_true, y_pred)

    # Very low alpha so dense regions of the scatter remain readable.
    _, axes = plt.subplots()
    axes.scatter(y_pred, y_true, alpha = 0.01)
    axes.set_xlabel("Prediction")
    axes.set_ylabel("Ground truth")

    return score

In [15]:
# List the training columns and their dtypes before choosing model inputs.
X_train.dtypes

Id                      int64
groupId                 int64
matchId                 int64
assists                 int64
boosts                  int64
damageDealt           float64
DBNOs                   int64
headshotKills           int64
heals                   int64
killPlace               int64
killPoints              int64
kills                   int64
killStreaks             int64
longestKill           float64
maxPlace                int64
numGroups               int64
revives                 int64
rideDistance          float64
roadKills               int64
swimDistance          float64
teamKills               int64
vehicleDestroys         int64
walkDistance          float64
weaponsAcquired         int64
winPoints               int64
meanGroupWinPoints    float64
numPlayersInGroup       int64
meanMatchWinPoints    float64
numPlayersInMatch       int64
dtype: object

In [16]:
# Columns fed to the model. All but the last come straight from the raw data;
# "numPlayersInGroup" is the group-size column added by engineerFeatures.
selectedFeatures = [
    'assists',
    'boosts',
    'damageDealt',
    'DBNOs',
    'headshotKills',
    'heals',
    'killPlace',
    'killPoints',
    'kills',
    'killStreaks',
    'longestKill',
    'maxPlace',
    'numGroups',
    'revives',
    'rideDistance',
    'roadKills',
    'swimDistance',
    'teamKills',
    'vehicleDestroys',
    'walkDistance',
    'weaponsAcquired',
    'winPoints',
    "numPlayersInGroup",
]

# Train Model

In [17]:
# Row counts of the two splits, for reference when choosing sample sizes.
nRows_train, nRows_test = X_train.shape[0], X_test.shape[0]

print("Train size = %i, Test size = %i" % (nRows_train, nRows_test))

Train size = 3049858, Test size = 1307478


In [18]:
# Number of leading rows of the train / test splits passed to fitPipelines.
# Larger setting, currently disabled:
# sampleTrainSize = 200000
# sampleTestSize = 100000
# Small setting currently in effect:
sampleTrainSize = 2000
sampleTestSize = 1000


In [None]:
%%time
if FIT_MODELS:
    # Keep the fitted pipelines: the inspection, saving, feature-importance
    # and prediction cells below all read these two names.
    fittedPreprocessing, fittedModelPipeline = fitPipelines(
        preprocessingPipeline=preprocessingPipeline,
        modelPipeline=xgbPipeline,
        selectedFeatures=selectedFeatures,
        # Only the first sampleTrainSize / sampleTestSize rows are used.
        X=X_train[:sampleTrainSize],
        y=y_train[:sampleTrainSize],
        Xtest=X_test[:sampleTestSize],
        ytest=y_test[:sampleTestSize]
    )

Fit preprocessing pipeline
Preprocess data
Fit model pipeline


In [None]:
# Show the fitted preprocessing pipeline.
fittedPreprocessing

In [None]:
# Show the fitted model pipeline.
fittedModelPipeline

# Save trained models

In [None]:
if SAVE_FITTED_MODELS:
    # Persist both fitted pipelines under ./models, relative to the current
    # working directory.
    preprocessingPicklePath = "./models/preprocessing.pkl"
    joblib.dump(fittedPreprocessing, preprocessingPicklePath)

    modelPicklePath = "./models/xgbModelPipeline.pkl"
    joblib.dump(fittedModelPipeline, modelPicklePath)

# Load models

In [None]:
if LOAD_FITTED_MODELS:
    # Restore the pipelines written by the save cell. joblib.load unpickles
    # the files, so only load files you created yourself.
    preprocessingPicklePath = "./models/preprocessing.pkl"
    fittedPreprocessing = joblib.load(preprocessingPicklePath)

    modelPicklePath = "./models/xgbModelPipeline.pkl"
    fittedModelPipeline = joblib.load(modelPicklePath)

# Feature importances

In [None]:
# Pull the fitted search object out of the pipeline; despite the name this is
# the RandomizedSearchCV step, and the regressor is its best_estimator_.
fittedXgb = fittedModelPipeline.named_steps.get("randomizedsearchcv")

In [None]:
# Importances of the refitted best estimator, labelled with the feature names.
featureImportances = pd.Series(fittedXgb.best_estimator_.feature_importances_, index=selectedFeatures)

In [None]:
# Most important features first.
featureImportances.sort_values(ascending=False)

# Predict and evaluate on test set

In [None]:
def predictScores(fittedPreprocessingPipeline, fittedModelPipeline, selectedFeatures, X):
    """Predict one score per row of ``X``.

    The ``selectedFeatures`` columns of ``X`` are passed through the fitted
    preprocessing pipeline's ``transform`` and the result is handed to the
    fitted model pipeline's ``predict``, whose output is returned unchanged.
    """
    featureFrame = X.loc[:, selectedFeatures]
    return fittedModelPipeline.predict(
        fittedPreprocessingPipeline.transform(featureFrame)
    )
    

In [None]:
# Per-player predictions for every row of the hold-out split.
y_user_preds = predictScores(fittedPreprocessing, fittedModelPipeline, selectedFeatures, X_test)

In [None]:
def estimateGroupScores(X, y_predictions):
    """Turn per-player predictions into one percentile-rank score per group.

    ``X`` must contain "matchId" and "groupId". The predictions are averaged
    within each (matchId, groupId); those group means are then ranked within
    their match with ``rank(pct=True)``.

    Returns a copy of ``X`` with two added columns: "user_prediction" (the
    input predictions) and "group_prediction" (the group's percentile rank,
    identical for every player of the group). ``X`` itself is not modified.
    """
    groupKeys = ["matchId", "groupId"]

    scored = X.copy()
    scored["user_prediction"] = y_predictions

    # Players of the same group share one score: the mean of their predictions.
    meanByGroup = scored.groupby(groupKeys)["user_prediction"].mean()

    # Rank the groups against each other inside each match.
    rankByGroup = (
        meanByGroup
        .groupby(["matchId"])
        .rank(pct=True)
        .rename("group_prediction")
        .reset_index()
    )

    # Attach the group rank to every player row.
    return scored.merge(rankByGroup, on=groupKeys, how="left")

In [None]:
# Hold-out rows with user-level and group-level prediction columns attached.
groupData = estimateGroupScores(X_test, y_user_preds)

In [None]:
# Sanity check: both prediction vectors should have the same length.
y_user_preds.shape, groupData.group_prediction.shape

#### user vs group predictions

In [None]:
# Scatter of the group-level rank (x) against the raw per-player prediction (y).
plt.scatter(groupData.group_prediction, y_user_preds, alpha =0.1)
plt.xlabel("group prediction")
plt.ylabel("user prediction")

In [None]:
# Summary statistics of the group-level predictions.
groupData.group_prediction.describe()

In [None]:
# Distribution of the group-level predictions.
groupData.group_prediction.hist()

In [None]:
# Clamp the raw per-player predictions into the interval [0, 1].
y_user_preds_clipped = np.clip(y_user_preds, 0.0, 1.0)

In [None]:
# Mean absolute error of the raw per-player predictions on the hold-out split.
evalModel(y_test, y_user_preds, mean_absolute_error)

In [None]:
# Same metric after clamping the predictions into [0, 1].
evalModel(y_test, y_user_preds_clipped, mean_absolute_error)

# Group scores look worse

something about our sampling method?

Are some of the groups arbitrarily split between train and test data?

yes...

need to do this for all training data to prevent group splits

Assumes scoring data will have complete groups

In [None]:
# Mean absolute error of the group-level percentile-rank predictions.
evalModel(y_test, groupData.group_prediction, mean_absolute_error)

# What is the group where we are predicting 1.0 when it is 0.0?

In [None]:
# groupData received a fresh 0..n-1 index from the merge inside
# estimateGroupScores, while y_test still carries the row labels of the frame
# it was sliced from. Assigning the Series would align on those labels and
# scatter the targets onto the wrong rows (or leave NaN), so assign by
# position instead; the left merge keeps the row order of X_test.
groupData["ground_truth"] = np.asarray(y_test)

In [None]:
# Rows whose true placement is 0 although the group prediction is above 0.99.
isConfidentMiss = (groupData.ground_truth == 0) & (groupData.group_prediction > 0.99)
outliers = groupData.loc[isConfidentMiss]

In [None]:
# Mean of each model input over the outlier rows.
outliers.loc[:, selectedFeatures].mean()

In [None]:
# Mean of each model input over all hold-out rows, for comparison.
groupData.loc[:, selectedFeatures].mean()

In [None]:
# Per-feature means of the outliers and of all rows. Both series are drawn at
# the same x positions, so the second set of bars is painted over the first.
featurePositions = range(len(selectedFeatures))
outlierFeatureMeans = outliers.loc[:, selectedFeatures].mean()
overallFeatureMeans = groupData.loc[:, selectedFeatures].mean()

plt.bar(featurePositions, outlierFeatureMeans, label="outliers")
plt.bar(featurePositions, overallFeatureMeans, label="all")
plt.xticks(featurePositions, selectedFeatures, rotation=90)
plt.legend()

In [None]:
# Per-feature means over all hold-out rows, on their own.
featurePositions = range(len(selectedFeatures))
overallFeatureMeans = groupData.loc[:, selectedFeatures].mean()

plt.bar(featurePositions, overallFeatureMeans)
plt.xticks(featurePositions, selectedFeatures, rotation=90)

# Submission

In [None]:
# Peek at the first rows of the scoring data.
scoringData.head()

In [None]:
# Apply the same feature engineering to the scoring rows as to the training rows.
scoringPreds = engineerFeatures(scoringData)

# user scores
scoringUserPredictions = predictScores(
    fittedPreprocessing,
    fittedModelPipeline,
    selectedFeatures,
    scoringPreds,
)
scoringPreds["userPredictions"] = scoringUserPredictions

In [None]:
# Convert the per-player predictions into per-group percentile ranks.
# NOTE(review): the predictions are passed as a Series, so they are matched to
# scoringData by index label - presumably both use the same default index; confirm.
scoringGroupPreds = estimateGroupScores(scoringData, scoringPreds["userPredictions"])

In [None]:
# Preview the id alongside both prediction columns.
scoringGroupPreds[["Id", "user_prediction", "group_prediction"]].head()

In [None]:
if CREATE_SUBMISSION_CSV:
    # The submission holds the player id and the group-level prediction,
    # published under the target column name.
    submissionColumns = scoringGroupPreds.loc[:, ["Id", "group_prediction"]]
    submission = submissionColumns.rename(columns={"group_prediction": "winPlacePerc"})

    submission.to_csv('submission.csv', index=False)

    print(submission.head())