# Attempting to Fit Some Models on the Data

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
import graphviz

In [2]:
df = pd.read_csv("project_preprocess.csv")
df = df.drop("positive_ratings", axis=1)

In [3]:
df.head()

Unnamed: 0,day_elapse,achievements,average_playtime,median_playtime,price,positive_percentage,english_0,english_1,required_age_0,required_age_3,...,Web Publishing.1,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports,popularity
0,-7.58471,-0.128306,9.557829,0.072624,0.141186,1.110537,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Main Stream
1,-8.329745,-0.128306,0.069619,-0.03571,-0.265175,0.536451,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Main Stream
2,-6.414491,-0.128306,0.020358,-0.047606,-0.265175,0.775589,0,1,1,0,...,0,0,0,0,0,1,0,0,0,Main Stream
3,-7.312387,-0.128306,0.05922,0.01612,-0.265175,0.480095,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Main Stream
4,-8.054853,-0.128306,0.259548,0.114258,-0.265175,0.999691,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Main Stream


In [4]:
(df_train, df_test) = train_test_split(df, random_state=0)

In [5]:
X_train = df_train.drop("popularity", axis=1)
X_test = df_test.drop("popularity", axis=1)
y_train = df_train.popularity
y_test = df_test.popularity

## Simple Bias Classifier

In [6]:
y_train.value_counts()

Low popularity    13938
Obscure games      3588
Quite Popular      2355
Popular             369
Main Stream          56
Name: popularity, dtype: int64

In [7]:
y_test.value_counts()

Low popularity    4658
Obscure games     1166
Quite Popular      816
Popular            112
Main Stream         17
Name: popularity, dtype: int64

In [8]:
print('train error: ', 1 - 13938/ y_train.count())
print('test error: ', 1 - 4658 / y_test.count())

train error:  0.3136018910666798
test error:  0.311862904417196


## K Nearest Neighbors

In [11]:
grid = {'n_neighbors': np.arange(1, 1001, 100)}

In [15]:
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, param_grid=grid, return_train_score=True, n_jobs=2)
knnCV.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=2,
             param_grid={'n_neighbors': array([  1, 101, 201, 301, 401, 501, 601, 701, 801, 901])},
             return_train_score=True)

In [16]:
print('best params: ', knnCV.best_params_)
print('train error: ', 1 - knnCV.best_score_)
print('test error: ', 1 - knnCV.best_estimator_.score(X_test, y_test))

best params:  {'n_neighbors': 101}
train error:  0.28119795715050067
test error:  0.2786231348795982


### Finer Search

In [36]:
grid = {'n_neighbors': np.arange(50, 200, 5)}
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, param_grid=grid, return_train_score=True, n_jobs=2)
knnCV.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=2,
             param_grid={'n_neighbors': array([ 50,  55,  60,  65,  70,  75,  80,  85,  90,  95, 100, 105, 110,
       115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175,
       180, 185, 190, 195])},
             return_train_score=True)

In [37]:
print('best params: ', knnCV.best_params_)
print('train error: ', 1 - knnCV.best_score_)
print('test error: ', 1 - knnCV.best_estimator_.score(X_test, y_test))

best params:  {'n_neighbors': 55}
train error:  0.27381086874208205
test error:  0.2719751809720786


## Decision Tree

In [26]:
grid = {'max_depth': np.arange(1, 15, 1)}

In [27]:
dt = DecisionTreeClassifier()
dtCV = GridSearchCV(dt, param_grid=grid, return_train_score=True, n_jobs=-1)
dtCV.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])},
             return_train_score=True)

In [28]:
print('best params: ', dtCV.best_params_)
print('train error: ', 1 - dtCV.best_score_)
print('test error: ', 1 - dtCV.best_estimator_.score(X_test, y_test))

best params:  {'max_depth': 8}
train error:  0.20535810912147112
test error:  0.2006204756980352


In [30]:
# Too Big Can't Really Show
dot_data = export_graphviz(dtCV.best_estimator_,
                           out_file=None,
                           feature_names=X_train.columns,
                           proportion=True,
                           impurity=True,
                           filled=True,
                           rounded=True)
graph = graphviz.Source(dot_data)
# graph

## SVM

In [32]:
svc = SVC()

In [33]:
grid = {'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'poly', 'sigmoid']}

In [34]:
svcCV = GridSearchCV(svc, param_grid=grid, return_train_score=True, n_jobs=-1)
svcCV.fit(X_train, y_train)

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             return_train_score=True)

In [35]:
print('best params: ', svcCV.best_params_)
print('train error: ', 1 - svcCV.best_score_)
print('test error: ', 1 - svcCV.best_estimator_.score(X_test, y_test))

best params:  {'C': 10, 'kernel': 'rbf'}
train error:  0.20639231289550275
test error:  0.19456345102673955
