In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the video-game sales CSV; later cells use its 'Name' and 'Genre' columns.
df = pd.read_csv('vgsales.csv')

In [3]:
# Second read of the same CSV: skips the first line of the file and supplies
# column names explicitly instead of taking them from the file.
# NOTE(review): `db` is not referenced by any other cell visible in this
# notebook — confirm whether this read is still needed.
db = pd.read_csv('vgsales.csv', 
                         skiprows=1, names = ['Rank', 'Name'])

In [7]:
# Keep only the game title and its genre label; everything below works on this frame.
genre = df[['Name','Genre']]

In [9]:
# Peek at 10 randomly chosen rows (no seed is passed, so the rows differ from run to run).
genre.sample(10)

Unnamed: 0,Name,Genre
12694,Burning Rangers,Platform
9431,Cabela's Dangerous Hunts 2009,Sports
2710,Deer Drive,Sports
3297,Just Dance 2014,Misc
8225,Sports Illustrated for Kids: Baseball,Sports
14673,Kaitou Tenshi Twin Angel: Toki to Sekai no Mei...,Adventure
14429,Far East of Eden Shinden,Fighting
8748,Knockout Kings 2003,Sports
16218,Family Jockey,Sports
7179,A Boy and His Blob,Platform


In [11]:
# (rows, columns) of the Name/Genre frame.
genre.shape

(16598, 2)

In [12]:
# Number of distinct genre labels, i.e. the number of target classes.
genre['Genre'].nunique()

12

In [13]:
# The distinct genre labels themselves.
genre['Genre'].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [14]:
# Features are the raw game titles; the target is the genre label.
X = genre['Name']
Y = genre['Genre']

In [15]:
# First 10 titles that will be vectorized.
X.head(10)

0                   Wii Sports
1            Super Mario Bros.
2               Mario Kart Wii
3            Wii Sports Resort
4     Pokemon Red/Pokemon Blue
5                       Tetris
6        New Super Mario Bros.
7                     Wii Play
8    New Super Mario Bros. Wii
9                    Duck Hunt
Name: Name, dtype: object

In [16]:
# Genre labels for the same first 10 rows.
Y.head(10)

0          Sports
1        Platform
2          Racing
3          Sports
4    Role-Playing
5          Puzzle
6        Platform
7            Misc
8        Platform
9         Shooter
Name: Genre, dtype: object

In [17]:
# Bag-of-bigrams: ngram_range=(2, 2) keeps only two-token sequences as
# features, so a title with fewer than two tokens gets no features at all.
# NOTE(review): the vectorizer is fit on every title in X, before the
# train/test split performed in a later cell, so the vocabulary is learned
# from the held-out titles too — confirm this is acceptable.
count_vectorizer = CountVectorizer(ngram_range=(2, 2))

transformed_vector = count_vectorizer.fit_transform(X)

In [18]:
# (number of titles, number of distinct bigrams in the vocabulary).
transformed_vector.shape

(16598, 20535)

In [19]:
# Re-weight the raw bigram counts with TF-IDF, using TfidfTransformer's default settings.
tfidf_transformer = TfidfTransformer()

tfidf_vector = tfidf_transformer.fit_transform(transformed_vector)

In [20]:
# Materialize the sparse TF-IDF matrix as a plain dense ndarray, since the
# GaussianNB classifier fitted below does not accept sparse input.
# `.toarray()` is used rather than `.todense()`: the latter returns an
# np.matrix, which recent scikit-learn releases reject with a TypeError.
# The dense copy holds rows x vocabulary floats, so this cell is memory-hungry.
X_dense = tfidf_vector.toarray()

In [21]:
# Sanity check: densifying must not change the (rows, features) shape.
X_dense.shape

(16598, 20535)

In [22]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split — and therefore every metric reported below — repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2, random_state=42)

In [23]:
# Feature-matrix shapes of the training and held-out splits.
x_train.shape, x_test.shape

((13278, 20535), (3320, 20535))

In [24]:
# Label-vector shapes; row counts should match the feature splits above.
y_train.shape, y_test.shape

((13278,), (3320,))

In [27]:
def summarize_classification(y_test,y_pred,avg_method='weighted'):
    """Print headline classification metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    avg_method : str, default 'weighted'
        Passed as ``average`` to ``precision_score`` and ``recall_score``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    # Compute every metric before printing anything, so a failure in any
    # scorer leaves no partial report behind.
    correct_fraction = accuracy_score(y_test, y_pred, normalize=True)
    correct_count = accuracy_score(y_test, y_pred, normalize=False)
    precision = precision_score(y_test, y_pred, average=avg_method)
    recall_value = recall_score(y_test, y_pred, average=avg_method)

    report_lines = (
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", correct_count),
        ("accuracy_score : ", correct_fraction),
        ("precision_score : ", precision),
        ("recall_score : ", recall_value),
    )
    for label, value in report_lines:
        print(label, value)

In [28]:
# Fit a Gaussian naive Bayes classifier on the training split.
clf = GaussianNB().fit(x_train, y_train)

In [29]:
# Predict a genre for every held-out title.
y_pred = clf.predict(x_test)

In [30]:
# Report accuracy plus precision/recall (default 'weighted' averaging) on the held-out split.
summarize_classification(y_test, y_pred)

Length of testing data:  3320
accuracy_count :  2165
accuracy_score :  0.6521084337349398
precision_score :  0.8264762106887507
recall_score :  0.6521084337349398


In [31]:
# y_test is still a pandas Series here and carries the row labels of the
# full dataset (shuffled by the split), not a 0..n-1 index.
y_test.head()

8365       Action
9997    Adventure
8757       Puzzle
133       Shooter
2433       Sports
Name: Genre, dtype: object

In [32]:
# Drop the Series index by converting to a plain array, so that the
# comparison frame built in a later cell pairs labels with predictions by
# position rather than by the shuffled row labels.
# NOTE(review): this rebinds y_test; cells above that call y_test.head()
# will fail if re-run after this one.
y_test = np.array(y_test)

In [35]:
# Pair each true label with its prediction by position. Both inputs are
# reduced to plain arrays first, so the frame is correct whether y_test is
# still an index-carrying Series or has already been converted to an array.
# Wrapping an indexed Series in pd.Series would keep its shuffled row labels
# and misalign the two columns.
pred_results = pd.DataFrame({'y_test': np.asarray(y_test),
                             'y_pred': np.asarray(y_pred)})

pred_results.sample(55)

Unnamed: 0,y_test,y_pred
2137,Action,Action
2639,Sports,Sports
2854,Racing,Racing
1366,Role-Playing,Role-Playing
896,Adventure,Puzzle
3130,Misc,Misc
1110,Puzzle,Puzzle
3047,Action,Action
1259,Shooter,Shooter
3082,Shooter,Shooter
