# Attempting to Fit Some Models on the Data

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import graphviz

In [5]:
df = pd.read_csv("project_preprocess.csv")
df = df.drop("positive_ratings", axis=1)

In [6]:
df.head()

Unnamed: 0,day_elapse,achievements,average_playtime,median_playtime,price,positive_percentage,english_0,english_1,required_age_0,required_age_3,...,tag_Web Publishing,tag_Werewolves,tag_Western,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_e-sports,popularity
0,-7.58471,-0.128306,9.557829,0.072624,0.141186,1.110537,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Popular
1,-8.329745,-0.128306,0.069619,-0.03571,-0.265175,0.536451,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Popular
2,-6.414491,-0.128306,0.020358,-0.047606,-0.265175,0.775589,0,1,1,0,...,0,0,0,0,0,1,0,0,0,Popular
3,-7.312387,-0.128306,0.05922,0.01612,-0.265175,0.480095,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Popular
4,-8.054853,-0.128306,0.259548,0.114258,-0.265175,0.999691,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Popular


In [7]:
(df_train, df_test) = train_test_split(df, random_state=0)

In [8]:
X_train = df_train.drop("popularity", axis=1)
X_test = df_test.drop("popularity", axis=1)
y_train = df_train.popularity
y_test = df_test.popularity

## Simple Bias Classifier

In [9]:
y_train.value_counts()

Low popularity    13938
Average            3588
Popular            2780
Name: popularity, dtype: int64

In [10]:
y_test.value_counts()

Low popularity    4658
Average           1166
Popular            945
Name: popularity, dtype: int64

In [11]:
print('train error: ', 1 - 13938/ y_train.count())
print('test error: ', 1 - 4658 / y_test.count())

train error:  0.3136018910666798
test error:  0.311862904417196


## K Nearest Neighbors

In [12]:
grid = {'n_neighbors': np.arange(1, 1001, 100)}

In [13]:
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, param_grid=grid, return_train_score=True, n_jobs=2)
knnCV.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=2,
             param_grid={'n_neighbors': array([  1, 101, 201, 301, 401, 501, 601, 701, 801, 901])},
             return_train_score=True)

In [14]:
print('best params: ', knnCV.best_params_)
print('train error: ', 1 - knnCV.best_score_)
print('test error: ', 1 - knnCV.best_estimator_.score(X_test, y_test))

best params:  {'n_neighbors': 101}
train error:  0.26145013313100274
test error:  0.2607475254838233


### Finer Search

In [15]:
grid = {'n_neighbors': np.arange(50, 200, 5)}
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, param_grid=grid, return_train_score=True, n_jobs=2)
knnCV.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print('best params: ', knnCV.best_params_)
print('train error: ', 1 - knnCV.best_score_)
print('test error: ', 1 - knnCV.best_estimator_.score(X_test, y_test))

## Decision Tree

In [19]:
grid = {'max_depth': np.arange(1, 15, 1)}

In [20]:
dt = DecisionTreeClassifier()
dtCV = GridSearchCV(dt, param_grid=grid, return_train_score=True, n_jobs=-1)
dtCV.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])},
             return_train_score=True)

In [21]:
print('best params: ', dtCV.best_params_)
print('train error: ', 1 - dtCV.best_score_)
print('test error: ', 1 - dtCV.best_estimator_.score(X_test, y_test))

best params:  {'max_depth': 6}
train error:  0.18886036442528165
test error:  0.1880632294282759


In [22]:
# Too Big Can't Really Show
dot_data = export_graphviz(dtCV.best_estimator_,
                           out_file=None,
                           feature_names=X_train.columns,
                           proportion=True,
                           impurity=True,
                           filled=True,
                           rounded=True)
graph = graphviz.Source(dot_data)
# graph

## SVM

In [None]:
svc = SVC(kernel='rbf')

In [None]:
grid = {'C': [0.1, 1, 10, 100]}

In [None]:
svcCV = GridSearchCV(svc, param_grid=grid, return_train_score=True, n_jobs=-1)
svcCV.fit(X_train, y_train)

In [None]:
print('best params: ', svcCV.best_params_)
print('train error: ', 1 - svcCV.best_score_)
print('test error: ', 1 - svcCV.best_estimator_.score(X_test, y_test))

In [23]:
y_test_pred = dtCV.best_estimator_.predict(X_test)

In [24]:
y_test_pred

array(['Popular', 'Low popularity', 'Low popularity', ..., 'Popular',
       'Low popularity', 'Low popularity'], dtype=object)

In [25]:
confusion_matrix(y_test, y_test_pred)

array([[ 497,  469,  200],
       [ 253, 4369,   36],
       [ 279,   36,  630]])