In [31]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [38]:
# Read the raw table, then discard the "#" column before any further processing.
df = pd.read_csv("Pokemon.csv")
df = df.drop(columns="#")

In [40]:
# Check that every "Name" is unique: the recorded output below lists 800 names,
# each with a count of 1, so the inspected column is "Name" (not "Type 1").
df["Name"].value_counts()

Bulbasaur              1
Uxie                   1
GalladeMega Gallade    1
Probopass              1
Dusknoir               1
                      ..
Lugia                  1
Ho-oh                  1
Celebi                 1
Treecko                1
Volcanion              1
Name: Name, Length: 800, dtype: int64

In [8]:
# Tally how many rows carry each distinct "Type 1" value, most frequent first.
df["Type 1"].value_counts()

Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Electric     44
Rock         44
Dragon       32
Ground       32
Ghost        32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
Name: Type 1, dtype: int64

In [10]:
# Tally the distinct "Type 2" values; value_counts() leaves missing entries out
# of the tally by default, so these counts cover only the rows that have a value.
df["Type 2"].value_counts()

Flying      97
Ground      35
Poison      34
Psychic     33
Fighting    26
Grass       25
Fairy       23
Steel       22
Dark        20
Dragon      18
Water       14
Ghost       14
Ice         14
Rock        14
Fire        12
Electric     6
Normal       4
Bug          3
Name: Type 2, dtype: int64

In [11]:
# Count missing values per column.
df.isnull().sum()

Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [12]:
# Remove the "Type 2" column from the working frame.
df = df.drop(columns="Type 2")

In [15]:
# Tally how many rows fall in each "Generation" value.
df["Generation"].value_counts()

1    166
5    165
3    160
4    121
2    106
6     82
Name: Generation, dtype: int64

In [16]:
# Tally the two "Legendary" classes; this column is later used as the target y.
df["Legendary"].value_counts()

False    735
True      65
Name: Legendary, dtype: int64

In [18]:
# Fit an encoder on "Legendary", keep a class -> integer-code lookup for later
# inspection, and overwrite the column with the looked-up codes.
le1 = LabelEncoder()
le1.fit(df["Legendary"])
le_nm1 = {
    label: code
    for label, code in zip(le1.classes_, le1.transform(le1.classes_))
}
df["Legendary"] = df["Legendary"].map(lambda label: le_nm1[label])

In [22]:
# Display the class -> integer-code lookup built for "Legendary".
le_nm1

{False: 0, True: 1}

In [20]:
# Fit an encoder on "Type 1", keep a class -> integer-code lookup for later
# inspection, and overwrite the column with the looked-up codes.
le2 = LabelEncoder()
le2.fit(df["Type 1"])
le_nm2 = {
    label: code
    for label, code in zip(le2.classes_, le2.transform(le2.classes_))
}
df["Type 1"] = df["Type 1"].map(lambda label: le_nm2[label])

In [23]:
# Display the class -> integer-code lookup built for "Type 1".
le_nm2

{'Bug': 0,
 'Dark': 1,
 'Dragon': 2,
 'Electric': 3,
 'Fairy': 4,
 'Fighting': 5,
 'Fire': 6,
 'Flying': 7,
 'Ghost': 8,
 'Grass': 9,
 'Ground': 10,
 'Ice': 11,
 'Normal': 12,
 'Poison': 13,
 'Psychic': 14,
 'Rock': 15,
 'Steel': 16,
 'Water': 17}

In [25]:
# Inspect the column dtypes after the two label-encoding steps.
df.dtypes

Name          object
Type 1         int64
Total          int64
HP             int64
Attack         int64
Defense        int64
Sp. Atk        int64
Sp. Def        int64
Speed          int64
Generation     int64
Legendary      int64
dtype: object

In [26]:
# Build the modelling frame: everything from df except the "Name" column.
ndf = df.drop(columns="Name")

In [27]:
# Target vector is the "Legendary" column; the feature matrix is every other column.
y = ndf["Legendary"]
X = ndf.drop(columns="Legendary")

In [28]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4242,
)

In [29]:
# Show the (rows, columns) shape of the train and test feature matrices.
X_train.shape, X_test.shape

((640, 9), (160, 9))

In [32]:
# Baseline random forest with default hyper-parameters.
# random_state pins the forest's internal randomness so the reported accuracy
# is reproducible across kernel restarts (same seed as the train/test split).
rf_model = RandomForestClassifier(random_state=4242)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.9625


In [33]:
# Candidate forest sizes to be explored by the grid search.
rf_params = dict(n_estimators=[100, 250, 500, 1000])

In [35]:
# 10-fold cross-validated grid search over the candidate values in rf_params.
# The base estimator is seeded so that differences between candidates reflect
# the hyper-parameter itself rather than run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state=4242)
rf_cv_model = GridSearchCV(rf, rf_params, cv=10, n_jobs=-1).fit(X_train, y_train)
rf_cv_model.best_params_

{'n_estimators': 1000}

In [36]:
# Refit on the training split with the hyper-parameters selected by the grid
# search (rf_cv_model.best_params_) instead of a hand-copied value, so this cell
# cannot drift out of sync with the search result. random_state makes the fitted
# model, and therefore the reported accuracy, reproducible.
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_, random_state=4242)
rf_tuned.fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.9625


In [37]:
# Persist the tuned model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way through.
with open("RandomForest.pkl", "wb") as model_file:
    pickle.dump(rf_tuned, model_file)