# Building Predictive Model

In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

## Import Data

In [2]:
# set the path of the processed data
processed_data_path = os.path.join(os.path.pardir,'Data_Projects', 'pokemon-processed.csv')
df = pd.read_csv(processed_data_path, index_col = 'Number')

In [3]:
# check info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 721 entries, 1 to 721
Data columns (total 99 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Total                        721 non-null    int64  
 1   Generation                   721 non-null    int64  
 2   isLegendary                  721 non-null    int64  
 3   hasGender                    721 non-null    int64  
 4   Pr_Male                      721 non-null    float64
 5   hasMegaEvolution             721 non-null    int64  
 6   Height_m                     721 non-null    float64
 7   Weight_kg                    721 non-null    float64
 8   Catch_Rate                   721 non-null    int64  
 9   Type_1_Bug                   721 non-null    int64  
 10  Type_1_Dark                  721 non-null    int64  
 11  Type_1_Dragon                721 non-null    int64  
 12  Type_1_Electric              721 non-null    int64  
 13  Type_1_Fairy        

## Preparing the Data

In [4]:
# properly formatting the data shape
X = df.loc[:,'Generation':].to_numpy().astype('float')
y = df['Total'].ravel()

print(X.shape, y.shape)

(721, 98) (721,)


In [5]:
# split the data into train and test data
import sklearn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 25)

In [6]:
# checking the number of entries for each data set
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(576, 98) (576,)
(145, 98) (145,)


In [7]:
# average total value in train and test
print('Mean total value in train data set : {0:.3f}'.format(np.mean(y_train)))
print('Mean total value in test data set : {0:.3f}'.format(np.mean(y_test)))

Mean total value in train data set : 417.658
Mean total value in test data set : 419.090


## Model based on Height and Weight

In [8]:
# function to speed up the model testing process
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def model_tester(X, y, k):
    model = LinearRegression()
    model.fit(X, y)
    predicted = model.predict(X)
    scored = r2_score(y, predicted)
    adjusted = 1 - ((1 - scored)*(576-1)/(576-k-1))
    
    return adjusted

In [9]:
# filter the data set
size_df = df[['Height_m', 'Weight_kg']]

In [10]:
# properly formatting the data
size_X = size_df.to_numpy().astype('float')
size_y = df['Total'].ravel()

size_X_train, size_X_test, size_y_train, size_y_test = train_test_split(size_X, 
                                                              size_y, 
                                                              test_size = 0.2, 
                                                              random_state = 25)

In [11]:
# checking the adjusted R-squared value
model_tester(size_X_train, size_y_train, 2)

0.371259227565206

## Model Based on Type

In [12]:
# filter the data set
type_df = df.loc[:, 'Type_1_Bug':'Type_2_none']

In [13]:
# properly formatting the data
type_X = type_df.to_numpy().astype('float')
type_y = df['Total'].ravel()

type_X_train, type_X_test, type_y_train, type_y_test = train_test_split(type_X, 
                                                                        type_y, 
                                                                        test_size = 0.2, 
                                                                        random_state = 25)

In [14]:
# checking the adjusted R-squared value
model_tester(type_X_train, type_y_train, 37)

0.08756330121958644

## Model Based on Type and Body Type 

In [15]:
# filter the data set
looks_df = df.loc[:, 'Color_Black':'Body_Style_with_fins']

In [16]:
# drop the egg group columns
looks_df[looks_df.columns.drop(list(looks_df.filter(regex='Egg')))]

Unnamed: 0_level_0,Color_Black,Color_Blue,Color_Brown,Color_Green,Color_Grey,Color_Pink,Color_Purple,Color_Red,Color_White,Color_Yellow,...,Body_Style_head_base,Body_Style_head_legs,Body_Style_head_only,Body_Style_insectoid,Body_Style_multiple_bodies,Body_Style_quadruped,Body_Style_serpentine_body,Body_Style_several_limbs,Body_Style_two_wings,Body_Style_with_fins
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
718,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
719,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
720,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [17]:
# properly formatting the data
looks_X = looks_df.to_numpy().astype('float')
looks_y = df['Total'].ravel()

looks_X_train, looks_X_test, looks_y_train, looks_y_test = train_test_split(looks_X, 
                                                                        looks_y, 
                                                                        test_size = 0.2, 
                                                                        random_state = 25)

In [18]:
# checking the adjusted R-squared value
model_tester(looks_X_train, looks_y_train, 24)

0.17498132496179108

## Using my Personal Knowledge of Pokemon to Create a Model

In [19]:
# Generally, large dragon type or legendary pokemon that are difficult to catch should have a high total stat value
# On the flip side, small bug type pokemon that are easy to catch should have a low stat value
personal_df = df[['Height_m', 'Weight_kg', 'Type_1_Dragon', 'Type_1_Bug', 'isLegendary', 'Catch_Rate']]

In [20]:
# properly formatting the data
personal_X = personal_df.to_numpy().astype('float')
personal_y = df['Total'].ravel()

personal_X_train, personal_X_test, personal_y_train, personal_y_test = train_test_split(personal_X,
                                                                   personal_y,
                                                                   test_size = 0.2,
                                                                   random_state = 25)

In [21]:
# checking the adjusted R-squared value
model_tester(personal_X_train, personal_y_train, 6)

0.6817829333103981

In [22]:
# testing my model with the test data set
model_tester(personal_X_test, personal_y_test, 6)

0.6322625434497096