In [55]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [56]:
# Read in the data
df = pd.read_csv('../Resources/finalClean_games.csv')
df.head()

Unnamed: 0,Game Name,Franchise,Platform,Release Year,Years Since,First Release Year,Main Genre,Tier by Score,Critics,Critic_Score,Users,User Score,Developer,Global Sales,Other Info
0,#IDARB,#IDARB,Xbox,2015,0,2015,Action,Tier 3,31,77,88,6.9,OtherOceanInteractive,0.09,"Action,General"
1,007: Quantum of Solace,007: Quantum of Solace,PlayStation,2008,0,2008,Action,Tier 4,42,65,59,6.6,Treyarch,1.14,"Action,General,Shooter,Shooter,First-Person,Mo..."
2,007: Quantum of Solace,007: Quantum of Solace,PC,2008,0,2008,Action,Tier 3,18,70,64,6.1,"Treyarch,Beenox",0.03,"Action,General,Shooter,Shooter,First-Person,Mo..."
3,0RBITALIS,0RBITALIS,PC,2015,0,2015,General,Tier 3,9,73,4,7.3,AlanZucconi,,"Miscellaneous,Puzzle,Action"
4,10 Second Ninja,10 Second Ninja,PC,2014,0,2014,Action,Tier 3,13,72,16,7.1,GameDesignDan,0.02,"Action,Platformer,2D"


In [57]:
df = df.drop(columns=['Other Info','Game Name'])

In [58]:
df = df.dropna(axis='columns', how='all')

In [59]:
df = df.dropna()

# Split data into training and testing

In [60]:
df = df.reset_index(drop=True)
df.shape

(1287, 13)

In [61]:
# Create our features
X = df.drop(columns='Tier by Score')
# Create our target
y = df['Tier by Score']

In [62]:
encoded_X = pd.get_dummies(X)
encoded_X

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,...,Developer_WBGamesMontreal,Developer_Wargaming.net,Developer_WarnerBros.InteractiveEntertainment,"Developer_WarnerBros.InteractiveEntertainment,NetherRealmStudios",Developer_WestwoodStudios,Developer_YachtClubGames,Developer_Yager,Developer_Yuke's,Developer_ZeniMaxMedia,"Developer_idSoftware,RavenSoftware"
0,2015,0,2015,31,77,88,6.9,0.09,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2008,0,2008,42,65,59,6.6,1.14,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2008,0,2008,18,70,64,6.1,0.03,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2014,0,2014,13,72,16,7.1,0.02,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016,2,2014,10,82,8,4.5,0.11,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282,2011,11,2000,11,82,17,8.2,0.18,0,0,...,0,0,0,0,0,0,0,0,0,0
1283,2004,3,2001,34,72,63,7.9,0.06,0,0,...,0,0,0,0,0,0,0,0,0,0
1284,2008,0,2008,32,81,17,7.3,0.03,0,0,...,0,0,0,0,0,0,0,0,0,0
1285,2010,2,2008,28,79,11,6.5,0.13,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# Check the balance of our target values
y.value_counts()

Tier 2    595
Tier 3    499
Tier 1    119
Tier 4     74
Name: Tier by Score, dtype: int64

In [64]:
X.describe()

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score,Global Sales
count,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0
mean,2010.987568,4.689977,2006.297591,36.008547,80.340326,752.666667,7.264724,0.934584
std,4.146782,4.830832,5.086577,20.562519,6.689918,1543.12836,1.206095,1.668164
min,1996.0,0.0,1996.0,0.0,65.0,4.0,2.1,0.01
25%,2008.0,0.0,2002.0,20.0,76.0,70.0,6.7,0.08
50%,2011.0,3.0,2007.0,32.0,80.0,211.0,7.6,0.37
75%,2014.0,8.0,2010.0,48.0,85.0,750.0,8.1,1.06
max,2020.0,24.0,2020.0,113.0,98.0,17537.0,9.4,21.04


In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_X,
    y, random_state=1, stratify=y)

In [66]:
X_train

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,...,Developer_WBGamesMontreal,Developer_Wargaming.net,Developer_WarnerBros.InteractiveEntertainment,"Developer_WarnerBros.InteractiveEntertainment,NetherRealmStudios",Developer_WestwoodStudios,Developer_YachtClubGames,Developer_Yager,Developer_Yuke's,Developer_ZeniMaxMedia,"Developer_idSoftware,RavenSoftware"
326,2019,5,2014,10,73,41,6.9,1.93,0,0,...,0,0,0,0,0,0,0,0,0,0
63,2011,4,2007,20,80,1154,7.5,0.74,0,0,...,0,0,0,0,0,0,0,0,0,0
253,2004,0,2004,26,73,43,7.6,0.23,0,0,...,0,0,0,0,0,0,0,0,0,0
247,2009,3,2006,42,70,216,7.2,0.04,0,0,...,0,0,0,0,0,0,0,0,0,0
773,2015,7,2008,81,83,1058,7.8,2.78,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,2014,12,2002,14,97,1285,7.8,5.48,0,0,...,0,0,0,0,0,0,0,0,0,0
726,2009,8,2001,48,74,61,7.2,0.73,0,0,...,0,0,0,0,0,0,0,0,0,0
207,2010,0,2010,62,85,545,8.1,0.04,0,0,...,0,0,0,0,0,0,0,0,0,0
769,2011,3,2008,60,84,576,8.5,2.84,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
X_test 

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,...,Developer_WBGamesMontreal,Developer_Wargaming.net,Developer_WarnerBros.InteractiveEntertainment,"Developer_WarnerBros.InteractiveEntertainment,NetherRealmStudios",Developer_WestwoodStudios,Developer_YachtClubGames,Developer_Yager,Developer_Yuke's,Developer_ZeniMaxMedia,"Developer_idSoftware,RavenSoftware"
532,2015,2,2013,85,87,886,7.7,1.80,0,0,...,0,0,0,0,0,0,0,0,0,0
1073,2015,0,2015,70,81,293,8.0,0.26,0,0,...,0,0,0,0,0,0,0,0,0,0
1137,2011,4,2007,76,88,5504,8.5,0.96,0,0,...,0,0,0,0,0,0,0,0,0,0
817,2009,9,2000,35,88,44,8.1,0.57,0,0,...,0,0,0,0,0,0,0,0,0,0
332,2016,16,2000,18,83,251,7.5,0.17,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,2013,15,1998,13,82,901,7.5,0.19,0,0,...,0,0,0,0,0,0,0,0,0,0
432,2004,0,2004,51,75,125,8.3,0.02,0,0,...,0,0,0,0,0,0,0,0,0,0
363,2016,4,2012,62,88,1215,7.9,0.64,0,0,...,0,0,0,0,0,0,0,0,0,0
468,2016,16,2000,42,85,880,4.9,7.59,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
#new_games = some df in same format as x_test but made up data for possible future games 

SyntaxError: invalid syntax (1112549023.py, line 1)

# Balanced Random Forest Classifier

In [69]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100 , random_state=1)
model = brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
brf

BalancedRandomForestClassifier(random_state=1)

In [70]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
# predict new games 
#y_pred = model.predict(new_games)
confusion_matrix(y_test, y_pred)

array([[ 28,   2,   0,   0],
       [  9, 133,   7,   0],
       [  0,   4, 106,  15],
       [  0,   0,   0,  18]])

In [71]:
y_pred

array(['Tier 2', 'Tier 2', 'Tier 1', 'Tier 2', 'Tier 2', 'Tier 1',
       'Tier 3', 'Tier 2', 'Tier 2', 'Tier 3', 'Tier 2', 'Tier 1',
       'Tier 3', 'Tier 3', 'Tier 2', 'Tier 2', 'Tier 2', 'Tier 2',
       'Tier 2', 'Tier 3', 'Tier 2', 'Tier 2', 'Tier 3', 'Tier 2',
       'Tier 3', 'Tier 2', 'Tier 3', 'Tier 1', 'Tier 4', 'Tier 3',
       'Tier 2', 'Tier 3', 'Tier 1', 'Tier 2', 'Tier 2', 'Tier 1',
       'Tier 3', 'Tier 3', 'Tier 2', 'Tier 3', 'Tier 2', 'Tier 2',
       'Tier 4', 'Tier 1', 'Tier 2', 'Tier 2', 'Tier 3', 'Tier 2',
       'Tier 3', 'Tier 3', 'Tier 2', 'Tier 2', 'Tier 3', 'Tier 3',
       'Tier 1', 'Tier 1', 'Tier 2', 'Tier 3', 'Tier 2', 'Tier 1',
       'Tier 1', 'Tier 2', 'Tier 3', 'Tier 4', 'Tier 2', 'Tier 2',
       'Tier 3', 'Tier 3', 'Tier 3', 'Tier 2', 'Tier 1', 'Tier 3',
       'Tier 1', 'Tier 4', 'Tier 3', 'Tier 3', 'Tier 3', 'Tier 2',
       'Tier 2', 'Tier 2', 'Tier 4', 'Tier 1', 'Tier 3', 'Tier 2',
       'Tier 3', 'Tier 2', 'Tier 4', 'Tier 3', 'Tier 2', 'Tier

In [72]:
X_test

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,...,Developer_WBGamesMontreal,Developer_Wargaming.net,Developer_WarnerBros.InteractiveEntertainment,"Developer_WarnerBros.InteractiveEntertainment,NetherRealmStudios",Developer_WestwoodStudios,Developer_YachtClubGames,Developer_Yager,Developer_Yuke's,Developer_ZeniMaxMedia,"Developer_idSoftware,RavenSoftware"
532,2015,2,2013,85,87,886,7.7,1.80,0,0,...,0,0,0,0,0,0,0,0,0,0
1073,2015,0,2015,70,81,293,8.0,0.26,0,0,...,0,0,0,0,0,0,0,0,0,0
1137,2011,4,2007,76,88,5504,8.5,0.96,0,0,...,0,0,0,0,0,0,0,0,0,0
817,2009,9,2000,35,88,44,8.1,0.57,0,0,...,0,0,0,0,0,0,0,0,0,0
332,2016,16,2000,18,83,251,7.5,0.17,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,2013,15,1998,13,82,901,7.5,0.19,0,0,...,0,0,0,0,0,0,0,0,0,0
432,2004,0,2004,51,75,125,8.3,0.02,0,0,...,0,0,0,0,0,0,0,0,0,0
363,2016,4,2012,62,88,1215,7.9,0.64,0,0,...,0,0,0,0,0,0,0,0,0,0
468,2016,16,2000,42,85,880,4.9,7.59,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9184876957494407

In [74]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     Tier 1       0.76      0.93      0.97      0.84      0.95      0.90        30
     Tier 2       0.96      0.89      0.97      0.92      0.93      0.86       149
     Tier 3       0.94      0.85      0.96      0.89      0.90      0.81       125
     Tier 4       0.55      1.00      0.95      0.71      0.98      0.96        18

avg / total       0.91      0.89      0.96      0.89      0.92      0.85       322



In [75]:
# List the features sorted in descending order by feature importance

features = sorted(zip(X.columns, brf.feature_importances_), key = lambda x: x[1], reverse=True)
cols = [brf for f in features]

features

[('First Release Year', 0.22061688186887532),
 ('Critics', 0.08515970401066449),
 ('Main Genre', 0.07610193601175319),
 ('Critic_Score', 0.056014550491156154),
 ('Years Since', 0.055236007629484815),
 ('Release Year', 0.036742340127729714),
 ('Platform', 0.036381411030521255),
 ('Franchise', 0.03586788328536146),
 ('User Score', 0.0009044021311465497),
 ('Users', 0.0001535571366164501),
 ('Developer', 7.403057320306796e-05),
 ('Global Sales', 7.259671312632498e-05)]

# Easy Ensemble AdaBoost Classifier

In [76]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [77]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 30,   0,   0,   0],
       [  0, 149,   0,   0],
       [  0, 125,   0,   0],
       [  0,   0,   0,  18]])

In [78]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.75

In [79]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     Tier 1       1.00      1.00      1.00      1.00      1.00      1.00        30
     Tier 2       0.54      1.00      0.28      0.70      0.53      0.30       149
     Tier 3       0.00      0.00      1.00      0.00      0.00      0.00       125
     Tier 4       1.00      1.00      1.00      1.00      1.00      1.00        18

avg / total       0.40      0.61      0.67      0.48      0.39      0.29       322



  _warn_prf(average, modifier, msg_start, len(result))
