# Import the dataset

In [1]:
# Import libraries and read the csv file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import DataScience
import sidetable
%matplotlib inline

df = pd.read_csv('pokemon.csv')
df.drop(columns=['japanese_name', 'name'], inplace=True)

In [2]:
X = df.drop(columns='is_legendary')
y = df['is_legendary'].copy()

In [3]:
# Have a look at the first five rows
X.head()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,hp,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,45,88.1,1,65,65,45,grass,poison,6.9,1
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,60,88.1,2,80,80,60,grass,poison,13.0,1
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,80,88.1,3,122,120,80,grass,poison,100.0,1
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,39,88.1,4,60,50,65,fire,,8.5,1
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,58,88.1,5,80,65,80,fire,,19.0,1


In [4]:
# Check for the columns having missing data
X.stb.missing(style=True)

Unnamed: 0,missing,total,percent
type2,384,801,47.94%
percentage_male,98,801,12.23%
height_m,20,801,2.50%
weight_kg,20,801,2.50%
base_happiness,0,801,0.00%
base_total,0,801,0.00%
capture_rate,0,801,0.00%
classfication,0,801,0.00%
defense,0,801,0.00%
experience_growth,0,801,0.00%


In [5]:
# Check the length and width of the table
X.shape

(801, 38)

In [6]:
# Have a look at columns having missing data plus type1
for i in ['height_m', 'percentage_male', 'type2']:
    print(X[i].unique())

[ 0.7  1.   2.   0.6  1.1  1.7  0.5  1.6  0.3  1.5  nan  1.2  3.5  0.4
  0.8  1.3  0.9  1.4  1.9  1.8  8.8  2.2  6.5  2.5  2.1  4.   2.3  0.2
  9.2  5.2  3.8 14.5  2.7  6.2  4.5  7.   2.4  5.4  4.2  3.7  3.2  3.3
  0.1  2.6  2.8  2.9  3.   5.8  5.   3.9  3.4  5.5]
[ 88.1  50.    0.  100.   24.6  75.4   nan  11.2]
['poison' nan 'flying' 'dark' 'electric' 'ice' 'ground' 'fairy' 'grass'
 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water' 'dragon' 'ghost' 'bug'
 'normal']


In [7]:
# Retrieve numeric and categorical columns
num_cols = list(X._get_numeric_data().columns)
cat_cols = list(set(X.columns) - set(num_cols))

In [8]:
my_abilities = set()
d = {'[': '', ']': ''}
for abilities in X['abilities'].unique():
    abilities = abilities.replace('[', '').replace(']', '').replace('[', '').replace("'", '')
    abilities = abilities.split(', ')
    for sub_abi in abilities:
        if sub_abi not in my_abilities:
            my_abilities.add(sub_abi)
my_abilities = sorted(my_abilities)
print(my_abilities)



In [9]:
for ability in my_abilities:
    X['ability_' + ability] = 0

In [10]:
for index in range(len(df)):
    abilities = X.at[index, 'abilities'].replace('[', '').replace(']', '').replace('[', '').replace("'", '').split(', ')
    for sub_abi in abilities:
        X.at[index, 'ability_' + sub_abi] = 1
X.drop(columns=['abilities'], inplace=True)

In [11]:
# Impute column 'type2'
from feature_engine import missing_data_imputers as mdi
cat_imputer = mdi.CategoricalVariableImputer()
X[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [12]:
# Encode categorical features into numeric
X = pd.get_dummies(data=X, drop_first=True, columns=cat_cols)

In [13]:
X.head()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,capture_rate_35,capture_rate_45,capture_rate_50,capture_rate_55,capture_rate_60,capture_rate_65,capture_rate_70,capture_rate_75,capture_rate_80,capture_rate_90
0,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,0,1,0,0,0,0,0,0,0,0
1,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,0,1,0,0,0,0,0,0,0,0
2,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,0,1,0,0,0,0,0,0,0,0
3,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,0,1,0,0,0,0,0,0,0,0
4,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,0,1,0,0,0,0,0,0,0,0


In [14]:
# Split the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

In [15]:
# Test for the best imputation among KNN, Iterative, and Simpel Imputer
DataScience.test_imputations(X_train, y_train, num_cols)

Imputing row 1/640 with 0 missing, elapsed time: 0.109
Imputing row 101/640 with 0 missing, elapsed time: 0.110
Imputing row 201/640 with 0 missing, elapsed time: 0.111
Imputing row 301/640 with 0 missing, elapsed time: 0.112
Imputing row 401/640 with 0 missing, elapsed time: 0.113
Imputing row 501/640 with 0 missing, elapsed time: 0.114
Imputing row 601/640 with 0 missing, elapsed time: 0.115
The best imputation:  ('KNN', 0.8168265691907785)


In [16]:
# Test the best neighbors for KNN
DataScience.test_KNN_imputation(X_train, y_train, num_cols, range(2, 10))

Imputing row 1/640 with 0 missing, elapsed time: 0.102
Imputing row 101/640 with 0 missing, elapsed time: 0.103
Imputing row 201/640 with 0 missing, elapsed time: 0.104
Imputing row 301/640 with 0 missing, elapsed time: 0.105
Imputing row 401/640 with 0 missing, elapsed time: 0.106
Imputing row 501/640 with 0 missing, elapsed time: 0.107
Imputing row 601/640 with 0 missing, elapsed time: 0.108
Imputing row 1/640 with 0 missing, elapsed time: 0.095
Imputing row 101/640 with 0 missing, elapsed time: 0.096
Imputing row 201/640 with 0 missing, elapsed time: 0.097
Imputing row 301/640 with 0 missing, elapsed time: 0.098
Imputing row 401/640 with 0 missing, elapsed time: 0.099
Imputing row 501/640 with 0 missing, elapsed time: 0.101
Imputing row 601/640 with 0 missing, elapsed time: 0.102
Imputing row 1/640 with 0 missing, elapsed time: 0.102
Imputing row 101/640 with 0 missing, elapsed time: 0.104
Imputing row 201/640 with 0 missing, elapsed time: 0.105
Imputing row 301/640 with 0 missing, 

In [17]:
# Impute num_cols using the best KNN neighbors
from fancyimpute import KNN
num_imputer = KNN(2)
X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])

Imputing row 1/640 with 0 missing, elapsed time: 0.093
Imputing row 101/640 with 0 missing, elapsed time: 0.095
Imputing row 201/640 with 0 missing, elapsed time: 0.096
Imputing row 301/640 with 0 missing, elapsed time: 0.096
Imputing row 401/640 with 0 missing, elapsed time: 0.097
Imputing row 501/640 with 0 missing, elapsed time: 0.098
Imputing row 601/640 with 0 missing, elapsed time: 0.099


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [19]:
predictions = DataScience.xgb_classifier(X_train, y_train, X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 50 folds for each of 96 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed:  1.2min finished


ValueError: feature_names may not contain [, ] or <

In [None]:
# Get the index of the predicted table
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['is_legendary'], index=index)

In [None]:
# Evaluate the accuracy of the prediction
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predictions, y_test)
print('Accuracy: %.2f%%' % (accuracy * 100.0))