In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in the data
df = pd.read_csv('finalClean_games.csv')
df.head()

Unnamed: 0,Game Name,Franchise,Platform,Release Year,Years Since,First Release Year,Main Genre,Tier by Score,Critics,Critic_Score,Users,User Score,Developer,Global Sales,Other Info
0,#IDARB,#IDARB,Xbox,2015,0,2015,Action,Tier 3,31,77,88,6.9,OtherOceanInteractive,0.09,"Action,General"
1,007: Quantum of Solace,007: Quantum of Solace,PlayStation,2008,0,2008,Action,Tier 4,42,65,59,6.6,Treyarch,1.14,"Action,General,Shooter,Shooter,First-Person,Mo..."
2,007: Quantum of Solace,007: Quantum of Solace,PC,2008,0,2008,Action,Tier 3,18,70,64,6.1,"Treyarch,Beenox",0.03,"Action,General,Shooter,Shooter,First-Person,Mo..."
3,0RBITALIS,0RBITALIS,PC,2015,0,2015,General,Tier 3,9,73,4,7.3,AlanZucconi,,"Miscellaneous,Puzzle,Action"
4,10 Second Ninja,10 Second Ninja,PC,2014,0,2014,Action,Tier 3,13,72,16,7.1,GameDesignDan,0.02,"Action,Platformer,2D"


In [3]:
df = df.drop(columns=['Other Info','Game Name', 'Users'])

In [4]:
df = df.dropna(axis='columns', how='all')

In [5]:
df = df.dropna()

In [6]:
# Print out value counts for columns with many unique values
developer_counts = df.Developer.value_counts()
developer_counts

Codemasters                     33
UbisoftMontreal                 31
EASports                        26
EACanada                        24
Capcom                          22
                                ..
IRGurus                          1
IoInteractive,NixxesSoftware     1
ClapHanz                         1
1C:MaddoxGames                   1
SCE/WWS,SCEJapanStudio           1
Name: Developer, Length: 411, dtype: int64

In [7]:
# Determine which values to replace
replace_devs = list(developer_counts[developer_counts < 5].index)

# Replace in DataFrame
for devs in replace_devs:
    df.Developer = df.Developer.replace(devs,"Other")


# Check to make sure binning was successful
df.Developer.value_counts()

Other                       582
Codemasters                  33
UbisoftMontreal              31
EASports                     26
EACanada                     24
                           ... 
NamcoBandaiGames              5
ParadoxDevelopmentStudio      5
PlatinumGames                 5
Starbreeze                    5
EnsembleStudios               5
Name: Developer, Length: 70, dtype: int64

# Split data into training and testing

In [8]:
# Create our features
X = df.drop(columns='Tier by Score')
# Create our target
y = df['Tier by Score']

In [9]:
encoded_X = pd.get_dummies(X)
encoded_X

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,Franchise_10 Second Ninja,...,Developer_Treyarch,Developer_TriumphStudios,Developer_Ubisoft,Developer_UbisoftMontreal,Developer_UbisoftRomania,Developer_ValveSoftware,Developer_VisceralGames,Developer_VisualConcepts,Developer_VolitionInc.,Developer_Yuke's
0,2015,0,2015,31,77,6.9,0.09,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2008,0,2008,42,65,6.6,1.14,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,2008,0,2008,18,70,6.1,0.03,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,0,2014,13,72,7.1,0.02,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,2016,2,2014,10,82,4.5,0.11,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5437,2011,11,2000,11,82,8.2,0.18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5477,2004,3,2001,34,72,7.9,0.06,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5486,2008,0,2008,32,81,7.3,0.03,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5487,2010,2,2008,28,79,6.5,0.13,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Check the balance of our target values
y.value_counts()

Tier 2    595
Tier 3    499
Tier 1    119
Tier 4     74
Name: Tier by Score, dtype: int64

In [11]:
X.describe()

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,User Score,Global Sales
count,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0
mean,2010.987568,4.689977,2006.297591,36.008547,80.340326,7.264724,0.934584
std,4.146782,4.830832,5.086577,20.562519,6.689918,1.206095,1.668164
min,1996.0,0.0,1996.0,0.0,65.0,2.1,0.01
25%,2008.0,0.0,2002.0,20.0,76.0,6.7,0.08
50%,2011.0,3.0,2007.0,32.0,80.0,7.6,0.37
75%,2014.0,8.0,2010.0,48.0,85.0,8.1,1.06
max,2020.0,24.0,2020.0,113.0,98.0,9.4,21.04


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_X,
    y, random_state=1, stratify=y)

In [13]:
X_train

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,Franchise_10 Second Ninja,...,Developer_Treyarch,Developer_TriumphStudios,Developer_Ubisoft,Developer_UbisoftMontreal,Developer_UbisoftRomania,Developer_ValveSoftware,Developer_VisceralGames,Developer_VisualConcepts,Developer_VolitionInc.,Developer_Yuke's
1151,2019,5,2014,10,73,6.9,1.93,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250,2011,4,2007,20,80,7.5,0.74,0,0,0,...,0,0,0,1,0,0,0,0,0,0
930,2004,0,2004,26,73,7.6,0.23,0,0,0,...,0,0,0,0,0,0,0,0,0,0
873,2009,3,2006,42,70,7.2,0.04,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2887,2015,7,2008,81,83,7.8,2.78,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1935,2014,12,2002,14,97,7.8,5.48,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2664,2009,8,2001,48,74,7.2,0.73,0,0,0,...,0,0,0,0,0,0,0,0,0,0
753,2010,0,2010,62,85,8.1,0.04,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2876,2011,3,2008,60,84,8.5,2.84,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_test 

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,Franchise_10 Second Ninja,...,Developer_Treyarch,Developer_TriumphStudios,Developer_Ubisoft,Developer_UbisoftMontreal,Developer_UbisoftRomania,Developer_ValveSoftware,Developer_VisceralGames,Developer_VisualConcepts,Developer_VolitionInc.,Developer_Yuke's
1780,2015,2,2013,85,87,7.7,1.80,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4412,2015,0,2015,70,81,8.0,0.26,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4796,2011,4,2007,76,88,8.5,0.96,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3021,2009,9,2000,35,88,8.1,0.57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1170,2016,16,2000,18,83,7.5,0.17,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,2013,15,1998,13,82,7.5,0.19,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1483,2004,0,2004,51,75,8.3,0.02,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,2016,4,2012,62,88,7.9,0.64,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1541,2016,16,2000,42,85,4.9,7.59,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Balanced Random Forest Classifier

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100 , random_state=1)
model = brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
brf

BalancedRandomForestClassifier(random_state=1)

In [16]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
# predict new games 
#y_pred = model.predict(new_games)
confusion_matrix(y_test, y_pred)

array([[ 28,   2,   0,   0],
       [ 11, 128,  10,   0],
       [  0,   1, 116,   8],
       [  0,   0,   0,  18]], dtype=int64)

In [17]:
y_pred

array(['Tier 2', 'Tier 2', 'Tier 1', 'Tier 2', 'Tier 3', 'Tier 2',
       'Tier 3', 'Tier 2', 'Tier 1', 'Tier 3', 'Tier 2', 'Tier 1',
       'Tier 3', 'Tier 3', 'Tier 2', 'Tier 2', 'Tier 2', 'Tier 2',
       'Tier 3', 'Tier 3', 'Tier 3', 'Tier 3', 'Tier 3', 'Tier 3',
       'Tier 3', 'Tier 2', 'Tier 3', 'Tier 1', 'Tier 4', 'Tier 3',
       'Tier 2', 'Tier 3', 'Tier 1', 'Tier 2', 'Tier 2', 'Tier 1',
       'Tier 3', 'Tier 3', 'Tier 2', 'Tier 3', 'Tier 2', 'Tier 2',
       'Tier 3', 'Tier 1', 'Tier 2', 'Tier 2', 'Tier 3', 'Tier 2',
       'Tier 3', 'Tier 3', 'Tier 2', 'Tier 3', 'Tier 3', 'Tier 3',
       'Tier 1', 'Tier 1', 'Tier 2', 'Tier 3', 'Tier 3', 'Tier 1',
       'Tier 1', 'Tier 2', 'Tier 3', 'Tier 4', 'Tier 2', 'Tier 2',
       'Tier 3', 'Tier 3', 'Tier 3', 'Tier 2', 'Tier 1', 'Tier 3',
       'Tier 2', 'Tier 4', 'Tier 3', 'Tier 3', 'Tier 2', 'Tier 2',
       'Tier 2', 'Tier 2', 'Tier 4', 'Tier 1', 'Tier 3', 'Tier 2',
       'Tier 3', 'Tier 2', 'Tier 4', 'Tier 3', 'Tier 2', 'Tier

In [18]:
X_test

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,Franchise_10 Second Ninja,...,Developer_Treyarch,Developer_TriumphStudios,Developer_Ubisoft,Developer_UbisoftMontreal,Developer_UbisoftRomania,Developer_ValveSoftware,Developer_VisceralGames,Developer_VisualConcepts,Developer_VolitionInc.,Developer_Yuke's
1780,2015,2,2013,85,87,7.7,1.80,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4412,2015,0,2015,70,81,8.0,0.26,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4796,2011,4,2007,76,88,8.5,0.96,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3021,2009,9,2000,35,88,8.1,0.57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1170,2016,16,2000,18,83,7.5,0.17,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,2013,15,1998,13,82,7.5,0.19,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1483,2004,0,2004,51,75,8.3,0.02,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,2016,4,2012,62,88,7.9,0.64,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1541,2016,16,2000,42,85,4.9,7.59,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9300984340044742

In [20]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     Tier 1       0.72      0.93      0.96      0.81      0.95      0.90        30
     Tier 2       0.98      0.86      0.98      0.91      0.92      0.83       149
     Tier 3       0.92      0.93      0.95      0.92      0.94      0.88       125
     Tier 4       0.69      1.00      0.97      0.82      0.99      0.98        18

avg / total       0.92      0.90      0.97      0.90      0.93      0.87       322



In [21]:
# List the features sorted in descending order by feature importance

features = sorted(zip(X.columns, brf.feature_importances_), key = lambda x: x[1], reverse=True)
cols = [brf for f in features]

features

[('First Release Year', 0.26025213470532976),
 ('Main Genre', 0.09750696543732806),
 ('Years Since', 0.07351165643786887),
 ('Critics', 0.06598228886770126),
 ('Release Year', 0.04603051944376344),
 ('Franchise', 0.04589748939987766),
 ('Platform', 0.03473766917677299),
 ('User Score', 0.0010767378119059317),
 ('Developer', 0.00038969715308851153),
 ('Critic_Score', 0.00017189904464637992),
 ('Global Sales', 0.0)]

# Easy Ensemble AdaBoost Classifier

In [22]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [23]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 30,   0,   0,   0],
       [  0, 149,   0,   0],
       [  0, 125,   0,   0],
       [  0,   0,   0,  18]], dtype=int64)

In [24]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.75

In [25]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     Tier 1       1.00      1.00      1.00      1.00      1.00      1.00        30
     Tier 2       0.54      1.00      0.28      0.70      0.53      0.30       149
     Tier 3       0.00      0.00      1.00      0.00      0.00      0.00       125
     Tier 4       1.00      1.00      1.00      1.00      1.00      1.00        18

avg / total       0.40      0.61      0.67      0.48      0.39      0.29       322



  _warn_prf(average, modifier, msg_start, len(result))
