# XGBoost

In [318]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [319]:
import pandas as pd
import numpy as np

# visual
import matplotlib.pyplot as plt

# sklearn
from sklearn.metrics import confusion_matrix, rand_score, accuracy_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# xgboost

import xgboost as xgb
from xgboost import XGBClassifier

## Data step

### Noise

In [320]:
lst_str_cols = ['move']
dict_dtypes = {x: 'str' for x in lst_str_cols}
data = pd.read_csv('data_noise.csv', dtype=dict_dtypes)
df_noise = pd.DataFrame(data = data)
df_noise = df_noise.drop(['Unnamed: 0'], axis = 1)

In [321]:
df_noise.head()

Unnamed: 0,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,goal,Class,move
0,0.522863,0.0,0.012304,0.498257,0.025869,0.453272,0.300277,0,Front Flick,3
1,0.525408,0.010311,0.01229,0.498257,0.01849,0.540155,0.309974,0,Front Flick,22
2,0.520195,0.012889,0.01229,0.498257,0.019278,0.540155,0.312201,0,Front Flick,18
3,0.536357,0.0232,0.012311,0.498257,0.010238,0.50102,0.342262,0,Front Flick,2
4,0.529825,0.0464,0.012387,0.498262,0.010521,0.543821,0.360175,0,Front Flick,3


In [322]:
df_noise.dtypes

BallAcceleration    float64
Time                float64
DistanceWall        float64
DistanceCeil        float64
DistanceBall        float64
PlayerSpeed         float64
BallSpeed           float64
goal                  int64
Class                object
move                 object
dtype: object

In [323]:
df_noise.dtypes

BallAcceleration    float64
Time                float64
DistanceWall        float64
DistanceCeil        float64
DistanceBall        float64
PlayerSpeed         float64
BallSpeed           float64
goal                  int64
Class                object
move                 object
dtype: object

In [324]:
# encoding the class labels

le1 = LabelEncoder()
df_noise['Class'] = le1.fit_transform(df_noise['Class']) 
print(df_noise['Class'].head())
print(le1.inverse_transform([0, 1, 2, 3, 4, 5, 6]))

label_dict = {
    0 : le1.inverse_transform([0]),
    1 : le1.inverse_transform([1]),
    2 : le1.inverse_transform([2]),
    3 : le1.inverse_transform([3]),
    4 : le1.inverse_transform([4]),
    5 : le1.inverse_transform([5]),
    6 : le1.inverse_transform([6]),
}
print(label_dict)


0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64
['Air Dribble' 'Ceiling Shot' 'Front Flick' 'Musty Flick' 'Noise'
 'Power Shot' 'Waving Dash']
{0: array(['Air Dribble'], dtype=object), 1: array(['Ceiling Shot'], dtype=object), 2: array(['Front Flick'], dtype=object), 3: array(['Musty Flick'], dtype=object), 4: array(['Noise'], dtype=object), 5: array(['Power Shot'], dtype=object), 6: array(['Waving Dash'], dtype=object)}


In [325]:
# converting Class and move to type(int)

df_noise['Class'] = df_noise['Class'].astype('int')
df_noise['move'] = df_noise['move'].astype('int')

### Clean

In [326]:
lst_str_cols = ['move']
dict_dtypes = {x: 'str' for x in lst_str_cols}
data = pd.read_csv('data_clean.csv', dtype=dict_dtypes)
df_clean= pd.DataFrame(data = data)
df_clean = df_clean.drop(['Unnamed: 0'], axis = 1)

In [327]:
df_clean.head()

Unnamed: 0,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,goal,Class,move
0,0.522863,0.0,0.012304,0.498257,0.025869,0.453272,0.300277,0,Front Flick,3
1,0.525408,0.010311,0.01229,0.498257,0.01849,0.540155,0.309974,0,Front Flick,22
2,0.520195,0.012889,0.01229,0.498257,0.019278,0.540155,0.312201,0,Front Flick,18
3,0.536357,0.0232,0.012311,0.498257,0.010238,0.50102,0.342262,0,Front Flick,2
4,0.529825,0.0464,0.012387,0.498262,0.010521,0.543821,0.360175,0,Front Flick,3


In [328]:
# encoding the class labels

le2 = LabelEncoder()
df_clean['Class'] = le2.fit_transform(df_clean['Class'])
print(df_clean['Class'].head())
print(le2.inverse_transform([0, 1, 2, 3, 4, 5]))

label_dict = {
    0 : le1.inverse_transform([0]),
    1 : le1.inverse_transform([1]),
    2 : le1.inverse_transform([2]),
    3 : le1.inverse_transform([3]),
    4 : le1.inverse_transform([4]),
    5 : le1.inverse_transform([5]),
}
print(label_dict)

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64
['Air Dribble' 'Ceiling Shot' 'Front Flick' 'Musty Flick' 'Power Shot'
 'Waving Dash']
{0: array(['Air Dribble'], dtype=object), 1: array(['Ceiling Shot'], dtype=object), 2: array(['Front Flick'], dtype=object), 3: array(['Musty Flick'], dtype=object), 4: array(['Noise'], dtype=object), 5: array(['Power Shot'], dtype=object)}


In [329]:
df_clean['Class'] = df_clean['Class'].astype('int')
df_clean['move'] = df_clean['move'].astype('int')

## Train test split

In [330]:
df_noise_no_target = df_noise.drop(columns = ['Class'])

In [331]:
df_noise_no_target.head()

Unnamed: 0,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,goal,move
0,0.522863,0.0,0.012304,0.498257,0.025869,0.453272,0.300277,0,3
1,0.525408,0.010311,0.01229,0.498257,0.01849,0.540155,0.309974,0,22
2,0.520195,0.012889,0.01229,0.498257,0.019278,0.540155,0.312201,0,18
3,0.536357,0.0232,0.012311,0.498257,0.010238,0.50102,0.342262,0,2
4,0.529825,0.0464,0.012387,0.498262,0.010521,0.543821,0.360175,0,3


In [332]:
# noisy data
X = df_noise_no_target
y = df_noise['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [333]:
df_clean_no_target = df_clean.drop(columns = ['Class'])

In [334]:
df_clean_no_target.head()

Unnamed: 0,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,goal,move
0,0.522863,0.0,0.012304,0.498257,0.025869,0.453272,0.300277,0,3
1,0.525408,0.010311,0.01229,0.498257,0.01849,0.540155,0.309974,0,22
2,0.520195,0.012889,0.01229,0.498257,0.019278,0.540155,0.312201,0,18
3,0.536357,0.0232,0.012311,0.498257,0.010238,0.50102,0.342262,0,2
4,0.529825,0.0464,0.012387,0.498262,0.010521,0.543821,0.360175,0,3


In [335]:
# clean data
X = df_clean_no_target
y = df_clean['Class']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=0)

## XGBoost Classifier

### Classification with noise





In [336]:
param_grid = {
    'min_child_weight': [1],
    'gamma': [0.5],
}

In [337]:
xg = XGBClassifier()

grid = HalvingGridSearchCV(xg, param_grid, refit=True, verbose=3, cv=3, scoring='accuracy', n_jobs = -1, error_score= 'raise')

%time grid.fit(X_train, y_train)

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 5513
max_resources_: 5513
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 5513
Fitting 3 folds for each of 1 candidates, totalling 3 fits


KeyboardInterrupt: 

In [343]:
xg = XGBClassifier(use_label_encoder=False).fit(X_train, y_train)

