In [15]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
# Read in the data
df = pd.read_csv('../Resources/finalClean_games.csv')
df.head()

Unnamed: 0,Game Name,Franchise,Platform,Release Year,Years Since,First Release Year,Main Genre,Tier by Score,Critics,Critic_Score,Users,User Score,Developer,Global Sales,Other Info
0,#IDARB,#IDARB,Xbox,2015,0,2015,Action,Tier 3,31,77,88,6.9,OtherOceanInteractive,0.09,"Action,General"
1,007: Quantum of Solace,007: Quantum of Solace,PlayStation,2008,0,2008,Action,Tier 4,42,65,59,6.6,Treyarch,1.14,"Action,General,Shooter,Shooter,First-Person,Mo..."
2,007: Quantum of Solace,007: Quantum of Solace,PC,2008,0,2008,Action,Tier 3,18,70,64,6.1,"Treyarch,Beenox",0.03,"Action,General,Shooter,Shooter,First-Person,Mo..."
3,0RBITALIS,0RBITALIS,PC,2015,0,2015,General,Tier 3,9,73,4,7.3,AlanZucconi,,"Miscellaneous,Puzzle,Action"
4,10 Second Ninja,10 Second Ninja,PC,2014,0,2014,Action,Tier 3,13,72,16,7.1,GameDesignDan,0.02,"Action,Platformer,2D"


In [24]:
df = df.drop(columns=['Other Info','Game Name'])

In [26]:
df = df.dropna(axis='columns', how='all')

In [27]:
df.dropna()

Unnamed: 0,Franchise,Platform,Release Year,Years Since,First Release Year,Main Genre,Tier by Score,Critics,Critic_Score,Users,User Score,Developer,Global Sales
0,#IDARB,Xbox,2015,0,2015,Action,Tier 3,31,77,88,6.9,OtherOceanInteractive,0.09
1,007: Quantum of Solace,PlayStation,2008,0,2008,Action,Tier 4,42,65,59,6.6,Treyarch,1.14
2,007: Quantum of Solace,PC,2008,0,2008,Action,Tier 3,18,70,64,6.1,"Treyarch,Beenox",0.03
4,10 Second Ninja,PC,2014,0,2014,Action,Tier 3,13,72,16,7.1,GameDesignDan,0.02
5,10 Second Ninja,Xbox,2016,2,2014,Action,Tier 2,10,82,8,4.5,GameDesignDan,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5437,You Don't Know Jack,PlayStation,2011,11,2000,General,Tier 2,11,82,17,8.2,"Jellyvision,IronGalaxyStudios",0.18
5477,Zoo Tycoon,PC,2004,3,2001,Strategy,Tier 3,34,72,63,7.9,BlueFangGames,0.06
5486,echochrome,PlayStation,2008,0,2008,General,Tier 2,32,81,17,7.3,"SCE/WWS,SCEJapanStudio",0.03
5487,echochrome,PlayStation,2010,2,2008,General,Tier 3,28,79,11,6.5,SCEJapanStudio,0.13


# Split data into training and testing

In [28]:
# Create our features
X = df.drop(columns='Global Sales')
# Create our target
y = df['Global Sales']

In [29]:
encoded_X = pd.get_dummies(X)

In [30]:
# Check the balance of our target values
y.value_counts()

0.02    80
0.03    66
0.04    47
0.05    43
0.01    37
        ..
1.52     1
2.63     1
2.82     1
1.40     1
1.32     1
Name: Global Sales, Length: 304, dtype: int64

In [31]:
X.describe()

Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score
count,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0
mean,2013.422048,4.055121,2009.366927,25.326542,79.054393,390.055667,7.187084
std,5.629074,5.36547,6.755085,19.1994,6.044726,2227.400957,1.21743
min,1995.0,0.0,1995.0,0.0,65.0,0.0,0.4
25%,2010.0,0.0,2003.0,11.0,75.0,29.0,6.7
50%,2015.0,1.0,2010.0,19.0,79.0,78.0,7.5
75%,2018.0,7.0,2015.0,33.0,83.0,258.0,8.0
max,2020.0,24.0,2020.0,126.0,98.0,146262.0,9.7


In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_X,
    y, random_state=1, stratify=y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# Balanced Random Forest Classifier

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100 , random_state=1)
model = brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
brf

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance

features = sorted(zip(X.columns, brf.feature_importances_), key = lambda x: x[1], reverse=True)
cols = [brf for f in features]

features

# Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
eec

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))