# Import the dataset

In [1]:
# Import libraries and read the csv file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# Load the raw table and discard the two name columns in one chained step
df = pd.read_csv('pokemon.csv').drop(columns=['japanese_name', 'name'])

In [2]:
# Preview the first five rows of the table
df.head(n=5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [3]:
# Count the missing values in every column
missing_per_column = df.isna().sum()
missing_per_column

abilities              0
against_bug            0
against_dark           0
against_dragon         0
against_electric       0
against_fairy          0
against_fight          0
against_fire           0
against_flying         0
against_ghost          0
against_grass          0
against_ground         0
against_ice            0
against_normal         0
against_poison         0
against_psychic        0
against_rock           0
against_steel          0
against_water          0
attack                 0
base_egg_steps         0
base_happiness         0
base_total             0
capture_rate           0
classfication          0
defense                0
experience_growth      0
height_m              20
hp                     0
percentage_male       98
pokedex_number         0
sp_attack              0
sp_defense             0
speed                  0
type1                  0
type2                384
weight_kg             20
generation             0
is_legendary           0
dtype: int64

In [4]:
# Check the length and width of the table: (number of rows, number of columns)
df.shape

(801, 39)

In [5]:
# Show the distinct values of type1 and of three columns that contain missing data
for column in ('height_m', 'percentage_male', 'type1', 'type2'):
    print(df[column].unique())

[ 0.7  1.   2.   0.6  1.1  1.7  0.5  1.6  0.3  1.5  nan  1.2  3.5  0.4
  0.8  1.3  0.9  1.4  1.9  1.8  8.8  2.2  6.5  2.5  2.1  4.   2.3  0.2
  9.2  5.2  3.8 14.5  2.7  6.2  4.5  7.   2.4  5.4  4.2  3.7  3.2  3.3
  0.1  2.6  2.8  2.9  3.   5.8  5.   3.9  3.4  5.5]
[ 88.1  50.    0.  100.   24.6  75.4   nan  11.2]
['grass' 'fire' 'water' 'bug' 'normal' 'poison' 'electric' 'ground'
 'fairy' 'fighting' 'psychic' 'rock' 'ghost' 'ice' 'dragon' 'dark' 'steel'
 'flying']
['poison' nan 'flying' 'dark' 'electric' 'ice' 'ground' 'fairy' 'grass'
 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water' 'dragon' 'ghost' 'bug'
 'normal']


Pokémon are like humans: they can be uniracial, biracial, or more. A missing value in `type2` therefore indicates that the Pokémon has only one type (it is "uniracial"), so I'll fill the missing `type2` values with `normal`.


In [6]:
# Replace every missing type2 entry with the label 'normal'.
# NOTE(review): 'normal' is already a genuine type2 value in this dataset, so
# single-type pokemons become indistinguishable from real '<x>/normal' ones;
# a dedicated label (e.g. 'none') may be preferable -- confirm intent.
df['type2'] = df['type2'].fillna('normal')

In [7]:
# Split the column names into numeric and categorical (non-numeric) groups.
# select_dtypes is the public pandas API for this; 'bool' is listed too so the
# selection mirrors the private _get_numeric_data() helper it replaces.
num_features = df.select_dtypes(include=['number', 'bool']).columns
# Preserve the table's column order instead of relying on set iteration order,
# which is not stable between kernel sessions for strings.
cat_features = [column for column in df.columns if column not in num_features]

In [8]:
# Impute missing values: KNN for numeric columns, feature_engine for categorical ones
from sklearn.impute import KNNImputer
from feature_engine import missing_data_imputers as mdi
# NOTE(review): recent feature_engine releases appear to expose this imputer as
# feature_engine.imputation.CategoricalImputer -- confirm against the installed version.

# Leave the target out of the imputation: otherwise the label would help choose
# the nearest neighbours (leaking it into the features) and would come back
# from the imputer as float instead of the original integers.
numeric_inputs = num_features.drop('is_legendary', errors='ignore')

numeric_imputer = KNNImputer()
df[numeric_inputs] = numeric_imputer.fit_transform(df[numeric_inputs])
# NOTE(review): the imputer is fitted on the full table, before the train/test
# split made later in the notebook; fitting on the training rows only would
# avoid leaking test-set statistics.

cat_imputer = mdi.CategoricalVariableImputer()
df[cat_features] = cat_imputer.fit_transform(df[cat_features])

In [9]:
# Confirm that no missing values remain after imputation
df.isna().sum()

abilities            0
against_bug          0
against_dark         0
against_dragon       0
against_electric     0
against_fairy        0
against_fight        0
against_fire         0
against_flying       0
against_ghost        0
against_grass        0
against_ground       0
against_ice          0
against_normal       0
against_poison       0
against_psychic      0
against_rock         0
against_steel        0
against_water        0
attack               0
base_egg_steps       0
base_happiness       0
base_total           0
capture_rate         0
classfication        0
defense              0
experience_growth    0
height_m             0
hp                   0
percentage_male      0
pokedex_number       0
sp_attack            0
sp_defense           0
speed                0
type1                0
type2                0
weight_kg            0
generation           0
is_legendary         0
dtype: int64

In [10]:
# Encode categorical features into integer codes
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so that every code can still be mapped
# back to its original label (inverse_transform) or applied to new data.
# Re-fitting a single shared encoder would retain only the last column's mapping.
label_encoders = {}
for cat in cat_features:
    le = LabelEncoder()
    df[cat] = le.fit_transform(df[cat])
    label_encoders[cat] = le

In [11]:
# Hold out 20% of the rows as a test set, stratified on the target
from sklearn.model_selection import train_test_split

y = df['is_legendary']
X = df.drop(columns=['is_legendary'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [12]:
# Tune an XGBClassifier with an exhaustive grid search to predict is_legendary
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values explored by the search
param_grid = {
    'colsample_bytree': [1.0],
    'min_child_weight': [1.0, 1.2],
    'max_depth': [3, 4, 6],
    'n_estimators': [500, 1000],
}
gbm = XGBClassifier()
reg_cv = GridSearchCV(gbm, param_grid, verbose=1)
reg_cv.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   14.3s finished


GridSearchCV(estimator=XGBClassifier(),
             param_grid={'colsample_bytree': [1.0], 'max_depth': [3, 4, 6],
                         'min_child_weight': [1.0, 1.2],
                         'n_estimators': [500, 1000]},
             verbose=1)

In [13]:
# Print the best parameters and keep the model trained with them
print(reg_cv.best_params_)

# GridSearchCV was built with the default refit=True, so best_estimator_ is
# already an XGBClassifier with the best parameters fitted on all of
# (X_train, y_train); reuse it instead of training the same model again.
gbm = reg_cv.best_estimator_
gbm

{'colsample_bytree': 1.0, 'max_depth': 3, 'min_child_weight': 1.0, 'n_estimators': 500}


XGBClassifier(colsample_bytree=1.0, min_child_weight=1.0, n_estimators=500)

In [14]:
# Predict the column 'is_legendary' for the held-out test rows
y_pred = gbm.predict(X_test)

In [15]:
# Evaluate the accuracy of the prediction on the test set
from sklearn.metrics import accuracy_score

# accuracy_score expects (y_true, y_pred); accuracy itself is symmetric, but the
# documented order keeps this call a safe template for asymmetric metrics.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f%%' % (accuracy * 100.0))

Accuracy: 98.76%
