In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [2]:
# load dataset from csv file
df = pd.read_csv("datasets/Video_Game_Sales_as_of_Jan_2017.csv", index_col="Name")
df.head()

Unnamed: 0_level_0,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Rating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.54,76.0,51.0,8.0,324.0,E
Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,
Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.8,3.79,3.29,35.57,82.0,73.0,8.3,712.0,E
Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.95,3.28,2.95,32.78,80.0,73.0,8.0,193.0,E
Pokemon Red/Pokemon Blue,G,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,


In [3]:
# drop columns that is useless
df.drop(columns=['Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Global_Sales', 'Year_of_Release'], inplace=True)
df.head()

Unnamed: 0_level_0,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Rating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wii Sports,Wii,Sports,Nintendo,41.36,28.96,3.77,8.45,E
Super Mario Bros.,NES,Platform,Nintendo,29.08,3.58,6.81,0.77,
Mario Kart Wii,Wii,Racing,Nintendo,15.68,12.8,3.79,3.29,E
Wii Sports Resort,Wii,Sports,Nintendo,15.61,10.95,3.28,2.95,E
Pokemon Red/Pokemon Blue,G,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,


In [4]:
# create label from 4 columns in dataset and drop them
label = df[['NA_Sales' ,'EU_Sales', 'JP_Sales', 'Other_Sales']].idxmax(axis=1)
df.drop(columns=['NA_Sales' ,'EU_Sales', 'JP_Sales', 'Other_Sales'], inplace=True)
df.head()

Unnamed: 0_level_0,Platform,Genre,Publisher,Rating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Wii Sports,Wii,Sports,Nintendo,E
Super Mario Bros.,NES,Platform,Nintendo,
Mario Kart Wii,Wii,Racing,Nintendo,E
Wii Sports Resort,Wii,Sports,Nintendo,E
Pokemon Red/Pokemon Blue,G,Role-Playing,Nintendo,


In [5]:
# use One Hot Encoding to create binary columns
num_df = pd.get_dummies(df, dtype='float64', dummy_na=True, drop_first=True, columns=list(df))
num_df.head()

Unnamed: 0_level_0,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_G,Platform_GBA,Platform_GC,Platform_GEN,Platform_GG,Platform_N64,...,Publisher_responDESIGN,Publisher_nan,Rating_E,Rating_E10+,Rating_EC,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_nan
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wii Sports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Super Mario Bros.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Mario Kart Wii,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wii Sports Resort,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pokemon Red/Pokemon Blue,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# split train test set from data input and label
X_train, X_test, y_train, y_test = train_test_split(num_df, label, shuffle=True, test_size=0.33, random_state=47)

In [7]:
# create decision tree object
dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
# use model to predict on test set and calculate accuracy
y_pred = dt_clf.predict(X_test)
print("Accuracy: {0}".format(accuracy_score(y_test, y_pred)))

Accuracy: 0.8041057759220599


In [9]:
# create a view of decision tree and store it
dot_data = tree.export_graphviz(dt_clf,
                                feature_names=list(num_df), 
                                class_names=['NA_Sales','EU_Sales','JP_Sales','Other_Sales'], 
                                filled=True, 
                                rounded=True, 
                                special_characters=True)

In [10]:
# test Bernoulli Naive Bayes model
nb_clf = BernoulliNB()
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
print("Accuracy: {0}".format(accuracy_score(y_test, y_pred)))

Accuracy: 0.8126304801670147
