In [1]:
#import dependencies
import pandas as pd
import sqlite3
from sklearn.preprocessing import LabelEncoder

#display all columns in dfs
pd.set_option('display.max_columns', None)

In [2]:
#open a connection to the project's SQLite database file
db_path = '../Data-and-DBs/pokedex.db'
conn = sqlite3.connect(db_path)

In [3]:
#drop any random_forest_results table left behind by an earlier run of this notebook
drop_results_sql = 'DROP table IF EXISTS random_forest_results'
cursor = conn.cursor()
cursor.execute(drop_results_sql)

<sqlite3.Cursor at 0x253b839d030>

In [4]:
#load every row of the gens_1_to_6 table from the db into a dataframe
training_query = 'SELECT * FROM gens_1_to_6'
training_df = pd.read_sql(training_query, conn)

In [5]:
#encoding the training data 
le = LabelEncoder()
encoded_training = training_df[training_df.columns[:]].apply(le.fit_transform)

In [6]:
#pull necessary info from the gens_1_to_6 training df to create training data
X_train = encoded_training[['TYPE1', 'TYPE2', 'ABILITY1', 'ABILITY2', 'ABILITY_HIDDEN', 'HP', 'ATK', 'DEF',
                       'SP_ATK', 'SP_DEF', 'SPD', 'TOTAL', 'CAPTURE_RATE']]
y_train = encoded_training['LEGENDARY_FLAG']

#double-check shapes of X and y match
print(X_train.shape, y_train.shape)

(817, 13) (817,)


In [7]:
#call all entries from the gen7 and gen_8 table in the db
test_df_1 = pd.read_sql('SELECT * FROM gen_7', conn)
test_df_2 = pd.read_sql('SELECT * FROM gen_8', conn)

In [8]:
#encoding the testing data 
encoded_testing_1 = test_df_1[test_df_1.columns[:]].apply(le.fit_transform)
encoded_testing_2 = test_df_2[test_df_2.columns[:]].apply(le.fit_transform)

In [9]:
#pull necessary info from the gen_7 and gen_8 testing dfs to create test data
X_test_1 = encoded_testing_1[['TYPE1', 'TYPE2', 'ABILITY1', 'ABILITY2', 'ABILITY_HIDDEN', 'HP', 'ATK', 'DEF',
                      'SP_ATK', 'SP_DEF', 'SPD', 'TOTAL', 'CAPTURE_RATE']]                      
y_test_1 = encoded_testing_1['LEGENDARY_FLAG']

X_test_2 = encoded_testing_2[['TYPE1', 'TYPE2', 'ABILITY1', 'ABILITY2', 'ABILITY_HIDDEN', 'HP', 'ATK', 'DEF',
                      'SP_ATK', 'SP_DEF', 'SPD', 'TOTAL', 'CAPTURE_RATE']]                      
y_test_2 = encoded_testing_2['LEGENDARY_FLAG']

#double-check shapes of X and y match
print(X_test_1.shape, y_test_1.shape)
print(X_test_2.shape, y_test_2.shape)

(118, 13) (118,)
(117, 13) (117,)


In [10]:
#import the random forest classifier
from sklearn.ensemble import RandomForestClassifier

#a fixed seed makes the fitted forest (and the scores / importances derived from it)
#reproducible from one run of the notebook to the next
rf = RandomForestClassifier(n_estimators=200, random_state=42)

#fit to the training data
rf = rf.fit(X_train, y_train)

In [11]:
#mean accuracy of the fitted forest on each held-out generation
gen_7_score = rf.score(X_test_1, y_test_1)
gen_8_score = rf.score(X_test_2, y_test_2)
print(f'Gen 7 Score: {gen_7_score}')
print(f'Gen 8 Score: {gen_8_score}')

Gen 7 Score: 0.7372881355932204
Gen 8 Score: 0.8376068376068376


In [12]:
#pair each feature's importance with its column name, most important first
importance_pairs = zip(rf.feature_importances_, X_train.columns)
sorted(importance_pairs, reverse=True)

[(0.3710162967919019, 'CAPTURE_RATE'),
 (0.23586653419441175, 'TOTAL'),
 (0.08202623775868463, 'SP_ATK'),
 (0.06708207257261534, 'HP'),
 (0.058885909628883605, 'SPD'),
 (0.04274000732010812, 'SP_DEF'),
 (0.03691149427470383, 'ATK'),
 (0.03653685652730224, 'DEF'),
 (0.026529565357877146, 'ABILITY1'),
 (0.015844869534658867, 'TYPE1'),
 (0.012310734806069203, 'TYPE2'),
 (0.009717840808638993, 'ABILITY_HIDDEN'),
 (0.004531580424144431, 'ABILITY2')]

In [13]:
# Predict the class of every row in each held-out generation
predictions_1, predictions_2 = (rf.predict(features) for features in (X_test_1, X_test_2))

In [14]:
#classification report (as a dict) for each generation, built with identical settings
from sklearn.metrics import classification_report
report_options = {
    'target_names': ["Non-Legendary", "Legendary"],
    'output_dict': True,
    'zero_division': 0,
}
gen_7_classification = classification_report(y_test_1, predictions_1, **report_options)
gen_8_classification = classification_report(y_test_2, predictions_2, **report_options)

In [15]:
#show the raw report dictionaries, one generation per line
for report_line in (f'Gen 7 Classification: {gen_7_classification}',
                    f'Gen 8 Classification: {gen_8_classification}'):
    print(report_line)

Gen 7 Classification: {'Non-Legendary': {'precision': 0.7372881355932204, 'recall': 1.0, 'f1-score': 0.848780487804878, 'support': 87}, 'Legendary': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 31}, 'accuracy': 0.7372881355932204, 'macro avg': {'precision': 0.3686440677966102, 'recall': 0.5, 'f1-score': 0.424390243902439, 'support': 118}, 'weighted avg': {'precision': 0.5435937948865269, 'recall': 0.7372881355932204, 'f1-score': 0.6257957833815626, 'support': 118}}
Gen 8 Classification: {'Non-Legendary': {'precision': 0.8376068376068376, 'recall': 1.0, 'f1-score': 0.9116279069767441, 'support': 98}, 'Legendary': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 19}, 'accuracy': 0.8376068376068376, 'macro avg': {'precision': 0.4188034188034188, 'recall': 0.5, 'f1-score': 0.45581395348837206, 'support': 117}, 'weighted avg': {'precision': 0.7015852144057273, 'recall': 0.8376068376068376, 'f1-score': 0.763585768236931, 'support': 117}}


In [16]:
#convert results to dataframe
def report_to_frame(report, generation_label):
    """Turn a classification-report dict into a dataframe with generation-tagged rows.

    The report's metrics (precision, recall, f1-score, support) become the row index and
    each label is suffixed with ``generation_label`` (e.g. ``precision_gen_7``) so rows
    from different generations stay distinguishable once the frames are stacked.
    """
    return pd.DataFrame.from_dict(report).rename(index=lambda metric: f'{metric}_{generation_label}')


gen_7_classification_df = report_to_frame(gen_7_classification, 'gen_7')
gen_8_classification_df = report_to_frame(gen_8_classification, 'gen_8')

#DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way
random_forest_results = (
    pd.concat([gen_7_classification_df, gen_8_classification_df])
    .rename(columns={'macro avg': 'macro_avg', 'weighted avg': 'weighted_avg'})
)
random_forest_results

Unnamed: 0,Non-Legendary,Legendary,accuracy,macro_avg,weighted_avg
precision_gen_7,0.737288,0.0,0.737288,0.368644,0.543594
recall_gen_7,1.0,0.0,0.737288,0.5,0.737288
f1-score_gen_7,0.84878,0.0,0.737288,0.42439,0.625796
support_gen_7,87.0,31.0,0.737288,118.0,118.0
precision_gen_8,0.837607,0.0,0.837607,0.418803,0.701585
recall_gen_8,1.0,0.0,0.837607,0.5,0.837607
f1-score_gen_8,0.911628,0.0,0.837607,0.455814,0.763586
support_gen_8,98.0,19.0,0.837607,117.0,117.0


In [17]:
#convert results to sql table
#The dataframe index holds the only row identifiers (precision_gen_7, recall_gen_8, ...),
#so it is written out as a "metric" column instead of being discarded.
#if_exists='replace' lets this cell be re-run without first dropping the table by hand.
random_forest_results.to_sql('random_forest_results', conn, index=True, index_label='metric',
                             if_exists='replace')

In [18]:
#close the db connection opened at the top of the notebook; conn cannot be used after this
conn.close()