In [1]:
#import dependencies
import pandas as pd
import sqlite3
from sklearn.preprocessing import LabelEncoder

#display all columns in dfs
pd.set_option('display.max_columns', None)

In [2]:
#connect to SQLite db
conn = sqlite3.connect('../Data-and-DBs/pokedex.db')

In [16]:
#call all entries from the gens_1_to_6 table in the db
training_df = pd.read_sql('SELECT * FROM gens_1_to_6', conn)

In [4]:
#encoding the training data
#F, A. (2017, March 11). convert text columns into numbers in sklearn [web log]. https://stackoverflow.com/questions/34915813/convert-text-columns-into-numbers-in-sklearn. 
le = LabelEncoder()
encoded_training = training_df[training_df.columns[:]].apply(le.fit_transform)

In [5]:
#pull necessary info from the gens_1_to_6 training df to create training data
X_train = encoded_training[['TYPE1', 'TYPE2', 'ABILITY1', 'ABILITY2', 'ABILITY_HIDDEN', 'HP', 'ATK', 'DEF',
                       'SP_ATK', 'SP_DEF', 'SPD', 'TOTAL', 'CAPTURE_RATE']]
y_train = encoded_training['LEGENDARY_FLAG']

#double-check shapes of X and y match
print(X_train.shape, y_train.shape)

(817, 13) (817,)


In [6]:
#call all entries from the gen7 and gen_8 table in the db
test_df_1 = pd.read_sql('SELECT * FROM gen_7', conn)
test_df_2 = pd.read_sql('SELECT * FROM gen_8', conn)

In [7]:
#encoding the testing data 
#F, A. (2017, March 11). convert text columns into numbers in sklearn [web log]. https://stackoverflow.com/questions/34915813/convert-text-columns-into-numbers-in-sklearn.
encoded_testing_1 = test_df_1[test_df_1.columns[:]].apply(le.fit_transform)
encoded_testing_2 = test_df_2[test_df_2.columns[:]].apply(le.fit_transform)

In [8]:
#pull necessary info from the gen_7 and gen_8 testing dfs to create test data
X_test_1 = encoded_testing_1[['TYPE1', 'TYPE2', 'ABILITY1', 'ABILITY2', 'ABILITY_HIDDEN', 'HP', 'ATK', 'DEF',
                      'SP_ATK', 'SP_DEF', 'SPD', 'TOTAL', 'CAPTURE_RATE']]                      
y_test_1 = encoded_testing_1['LEGENDARY_FLAG']

X_test_2 = encoded_testing_2[['TYPE1', 'TYPE2', 'ABILITY1', 'ABILITY2', 'ABILITY_HIDDEN', 'HP', 'ATK', 'DEF',
                      'SP_ATK', 'SP_DEF', 'SPD', 'TOTAL', 'CAPTURE_RATE']]                      
y_test_2 = encoded_testing_2['LEGENDARY_FLAG']

#double-check shapes of X and y match
print(X_test_1.shape, y_test_1.shape)
print(X_test_2.shape, y_test_2.shape)

(118, 13) (118,)
(117, 13) (117,)


In [9]:
#import random forest test
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)

#fit to the training data
rf = rf.fit(X_train, y_train)

In [10]:
#see the score for the test datasets
print(f'Gen 7 Score: {rf.score(X_test_1, y_test_1)}')
print(f'Gen 8 Score: {rf.score(X_test_2, y_test_2)}')

Gen 7 Score: 0.7372881355932204
Gen 8 Score: 0.8376068376068376


In [14]:
#determine the importance of each X variable
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)

[(0.40036244242248564, 'CAPTURE_RATE'),
 (0.19891759557263303, 'TOTAL'),
 (0.07599452153891625, 'SP_ATK'),
 (0.07400292301306971, 'SPD'),
 (0.05606024558390693, 'HP'),
 (0.04860815889935564, 'SP_DEF'),
 (0.042285710953519295, 'DEF'),
 (0.03759855094185577, 'ATK'),
 (0.0248547272344187, 'ABILITY1'),
 (0.014950543136795016, 'TYPE1'),
 (0.01252086195694642, 'TYPE2'),
 (0.011391232278041473, 'ABILITY_HIDDEN'),
 (0.002452486468056128, 'ABILITY2')]

In [24]:
# Make predictions with the model
predictions_1 = rf.predict(X_test_1)
predictions_2 = rf.predict(X_test_2)

In [25]:
#classification report
from sklearn.metrics import classification_report
print(f'Gen 7 Classification: {classification_report(y_test_1, predictions_1, target_names=["Non-Legendary", "Legendary"])}')
print(f'Gen 8 Classification: {classification_report(y_test_2, predictions_2, target_names=["Non-Legendary", "Legendary"])}')

Gen 7 Classification:                precision    recall  f1-score   support

Non-Legendary       0.74      1.00      0.85        87
    Legendary       0.00      0.00      0.00        31

     accuracy                           0.74       118
    macro avg       0.37      0.50      0.42       118
 weighted avg       0.54      0.74      0.63       118

Gen 8 Classification:                precision    recall  f1-score   support

Non-Legendary       0.84      1.00      0.91        98
    Legendary       0.00      0.00      0.00        19

     accuracy                           0.84       117
    macro avg       0.42      0.50      0.46       117
 weighted avg       0.70      0.84      0.76       117



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
