# Building a Predictive Model

In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

## Import Data

In [2]:
# Locate the processed CSV: it sits in the 'Data_Projects' folder one level
# above the current working directory.
path_parts = (os.path.pardir, 'Data_Projects', 'pokemon-processed.csv')
processed_data_path = os.path.join(*path_parts)

# Load the file, using the 'Number' column as the row index.
df = pd.read_csv(
    processed_data_path,
    index_col='Number',
)

In [3]:
# Summarise the loaded frame (index range, column names, non-null counts and
# dtypes) to confirm the data was read as expected.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 721 entries, 1 to 721
Data columns (total 99 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Total                        721 non-null    int64  
 1   Generation                   721 non-null    int64  
 2   isLegendary                  721 non-null    int64  
 3   hasGender                    721 non-null    int64  
 4   Pr_Male                      721 non-null    float64
 5   hasMegaEvolution             721 non-null    int64  
 6   Height_m                     721 non-null    float64
 7   Weight_kg                    721 non-null    float64
 8   Catch_Rate                   721 non-null    int64  
 9   Type_1_Bug                   721 non-null    int64  
 10  Type_1_Dark                  721 non-null    int64  
 11  Type_1_Dragon                721 non-null    int64  
 12  Type_1_Electric              721 non-null    int64  
 13  Type_1_Fairy        

## Preparing the Data

In [4]:
# Build the model inputs as plain NumPy arrays.
#
# Features: every column from 'Generation' through the last one, as floats.
# NOTE: this label-based slice relies on 'Total' sitting before 'Generation'
# in the column order, so the target is not included among the features.
X = df.loc[:, 'Generation':].to_numpy(dtype=float)

# Target: the 'Total' column as a 1-D array. Series.to_numpy() is used instead
# of Series.ravel(), which is deprecated in recent pandas releases.
y = df['Total'].to_numpy()

# Sanity check: X should be (n_rows, n_features) and y should be (n_rows,).
print(X.shape, y.shape)

(721, 98) (721,)


In [5]:
# Partition the arrays into a training portion and a held-out test portion.
import sklearn
from sklearn.model_selection import train_test_split

# test_size=0.2 reserves 20% of the rows for testing; the fixed random_state
# makes the split repeatable between runs.
split_options = {'test_size': 0.2, 'random_state': 25}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# Show the shapes of the features/target pair for the training split, then
# for the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

(576, 98) (576,)
(145, 98) (145,)


In [7]:
# Compare the mean target value across the two splits, printed to three
# decimal places.
mean_total_train = np.mean(y_train)
mean_total_test = np.mean(y_test)

print('Mean total value in train data set : {0:.3f}'.format(mean_total_train))
print('Mean total value in test data set : {0:.3f}'.format(mean_total_test))

Mean total value in train data set : 417.658
Mean total value in test data set : 419.090


## Baseline Model

In [8]:
# import the baseline regressor class
from sklearn.dummy import DummyRegressor

In [9]:
# Create the baseline estimator. With strategy='mean', scikit-learn's
# DummyRegressor predicts the mean of the training targets for every sample,
# giving a reference point to compare real models against.
dummy_model = DummyRegressor(strategy = 'mean')

In [10]:
# Fit the baseline on the training split only; the test split stays unseen.
# As the last expression in the cell, the fitted estimator is displayed.
dummy_model.fit(X_train, y_train)

DummyRegressor()