# Import the dataset

In [1]:
# Import libraries and read the csv file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import DataScience
import sidetable
%matplotlib inline

# Load the raw table and discard the two free-text name columns in one chain
df = (
    pd.read_csv('pokemon.csv')
    .drop(columns=['japanese_name', 'name'])
)

In [2]:
# Preview the top five records of the table
df.head(n=5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [3]:
# Summarise, per column, how many values are missing (sidetable `stb` accessor)
missing_summary = df.stb.missing(style=True)
missing_summary

Unnamed: 0,missing,total,percent
type2,384,801,47.94%
percentage_male,98,801,12.23%
weight_kg,20,801,2.50%
height_m,20,801,2.50%
abilities,0,801,0.00%
hp,0,801,0.00%
base_total,0,801,0.00%
capture_rate,0,801,0.00%
classfication,0,801,0.00%
defense,0,801,0.00%


In [4]:
# Report the table dimensions as (rows, columns)
n_rows, n_columns = df.shape
(n_rows, n_columns)

(801, 39)

In [5]:
# Print the distinct values of type1 and of three of the columns that have gaps
columns_to_inspect = ('height_m', 'percentage_male', 'type1', 'type2')
for column in columns_to_inspect:
    distinct_values = df[column].unique()
    print(distinct_values)

[ 0.7  1.   2.   0.6  1.1  1.7  0.5  1.6  0.3  1.5  nan  1.2  3.5  0.4
  0.8  1.3  0.9  1.4  1.9  1.8  8.8  2.2  6.5  2.5  2.1  4.   2.3  0.2
  9.2  5.2  3.8 14.5  2.7  6.2  4.5  7.   2.4  5.4  4.2  3.7  3.2  3.3
  0.1  2.6  2.8  2.9  3.   5.8  5.   3.9  3.4  5.5]
[ 88.1  50.    0.  100.   24.6  75.4   nan  11.2]
['grass' 'fire' 'water' 'bug' 'normal' 'poison' 'electric' 'ground'
 'fairy' 'fighting' 'psychic' 'rock' 'ghost' 'ice' 'dragon' 'dark' 'steel'
 'flying']
['poison' nan 'flying' 'dark' 'electric' 'ice' 'ground' 'fairy' 'grass'
 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water' 'dragon' 'ghost' 'bug'
 'normal']


In [6]:
# Split the column names into a numeric group and a categorical group.
# `select_dtypes` is the public way to pick columns by dtype and replaces the
# private `_get_numeric_data`; bool is listed explicitly so boolean columns
# stay in the numeric group.
num_cols = df.select_dtypes(include=['number', 'bool']).columns.tolist()

# Everything that is not numeric is treated as categorical. Walking df.columns
# keeps the original column order; a set difference has no defined order, so
# the order of the encoded columns could change between kernel sessions.
num_col_set = set(num_cols)
cat_cols = [col for col in df.columns if col not in num_col_set]

In [7]:
# Impute missing values: KNN for the numeric features, the categorical
# imputer's default strategy for the categorical columns.
from sklearn.impute import KNNImputer

try:
    # feature_engine >= 1.0 exposes the imputers under `feature_engine.imputation`
    from feature_engine import imputation as mdi
    categorical_imputer_cls = mdi.CategoricalImputer
except ImportError:
    # Older releases ship the same imputer under its previous module/class name
    from feature_engine import missing_data_imputers as mdi
    categorical_imputer_cls = mdi.CategoricalVariableImputer

# Keep the label out of the neighbour search so that imputed feature values do
# not depend on the value the model is later asked to predict.
# NOTE(review): both imputers are still fitted on the full table, i.e. before
# the train/test split; fitting on the training rows only would remove that
# remaining source of leakage.
TARGET = 'is_legendary'
impute_cols = [col for col in num_cols if col != TARGET]

numeric_imputer = KNNImputer()
df[impute_cols] = numeric_imputer.fit_transform(df[impute_cols])

cat_imputer = categorical_imputer_cls()
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [8]:
# Re-run the missing-value summary to confirm that no gaps remain
missing_after_imputation = df.stb.missing(style=True)
missing_after_imputation

Unnamed: 0,missing,total,percent
abilities,0,801,0.00%
percentage_male,0,801,0.00%
base_total,0,801,0.00%
capture_rate,0,801,0.00%
classfication,0,801,0.00%
defense,0,801,0.00%
experience_growth,0,801,0.00%
height_m,0,801,0.00%
hp,0,801,0.00%
pokedex_number,0,801,0.00%


In [9]:
# One-hot encode every categorical column, dropping the first level of each
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [10]:
# Hold out 20% of the rows for testing, stratified on the label
from sklearn.model_selection import train_test_split

y = df['is_legendary']
X = df.drop(columns=['is_legendary'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [11]:
# Compare Linear Regression models
# NOTE(review): `DataScience` is a helper module whose source is not shown here.
# Judging by the recorded output, the call yields one [score, model_name] pair
# for each of 'huber', 'linear', 'ransac' and 'theilsen'; what the score
# measures (and why 'ransac' is nan) cannot be told from this notebook -
# confirm in the helper.
# NOTE(review): `y_train` is the 0/1 `is_legendary` label, so regression models
# are being fitted to a classification target - verify this is intended.
DataScience.compare_linear_regression(X_train, y_train)

[[0.06002937837699217, 'huber'], [0.040933792694885555, 'linear'], [nan, 'ransac'], [0.041143992828539123, 'theilsen']]


In [None]:
# Get predictions for the test rows from the XGBoost helper.
# NOTE(review): `DataScience.xgb_classifier` is defined outside this notebook;
# presumably it fits on (X_train, y_train) and returns predictions for X_test -
# confirm against the helper's source.
predictions = DataScience.xgb_classifier(X_train, y_train, X_test)

In [None]:
# Wrap the predictions in a one-column frame labelled with the test-set index
# (assumes `predictions` is array-like in X_test row order - TODO confirm)
index = X_test.index
predictions = pd.DataFrame(
    data=predictions,
    index=index,
    columns=['is_legendary'],
)

In [None]:
# Evaluate the accuracy of the predictions against the held-out labels
from sklearn.metrics import accuracy_score

# scikit-learn metrics take the ground truth first and the predictions second.
# Accuracy itself is symmetric, but the conventional order keeps this cell
# correct if an asymmetric metric (precision, recall, ...) is swapped in.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: %.2f%%' % (accuracy * 100.0))