# Regression Models

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

In [19]:
df = pd.read_csv("project_preprocess.csv")
df.drop('positive_percentage', axis=1)
df.head()

Unnamed: 0,day_elapse,achievements,average_playtime,median_playtime,price,positive_percentage,english_0,english_1,required_age_0,required_age_3,...,tag_Western,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_e-sports,popularity,positive_ratings,negative_ratings
0,-7.58471,-0.128306,9.557829,0.072624,0.141186,1.110537,0,1,1,0,...,0,0,0,0,0,0,0,Popular,124534,3339
1,-8.329745,-0.128306,0.069619,-0.03571,-0.265175,0.536451,0,1,1,0,...,0,0,0,0,0,0,0,Popular,3318,633
2,-6.414491,-0.128306,0.020358,-0.047606,-0.265175,0.775589,0,1,1,0,...,0,0,0,1,0,0,0,Popular,3416,398
3,-7.312387,-0.128306,0.05922,0.01612,-0.265175,0.480095,0,1,1,0,...,0,0,0,0,0,0,0,Popular,1273,267
4,-8.054853,-0.128306,0.259548,0.114258,-0.265175,0.999691,0,1,1,0,...,0,0,0,0,0,0,0,Popular,5250,288


In [20]:
df.positive_percentage = df.positive_ratings.divide(df.positive_ratings + df.negative_ratings)

In [21]:
(df_train, df_test) = train_test_split(df, random_state=0)

In [22]:
X_train = df_train.drop(["positive_percentage", "positive_ratings", "negative_ratings", "popularity"], axis=1)
X_test = df_test.drop(["positive_percentage", "positive_ratings", "negative_ratings", "popularity"], axis=1)
y_train = df_train.positive_percentage
y_test = df_test.positive_percentage

## Linear Model

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Training R2: ", lr.score(X_train, y_train))
print("Test R2: ", lr.score(X_test, y_test))

Training R2:  0.13063861062903948
Test R2:  -40164027.075303175


In [21]:
coef = pd.Series(lr.coef_, index=X_train.columns)
coef.sort_values()

required_age_12   -1.665435e+09
required_age_16   -1.665435e+09
required_age_18   -1.665435e+09
required_age_7    -1.665435e+09
required_age_0    -1.665435e+09
                       ...     
tag_3D Vision      3.736369e-01
tag_Blood          1.221183e+05
Documentary        8.964884e+06
english_1          1.857707e+10
english_0          1.857707e+10
Length: 413, dtype: float64

## Ridge

In [16]:
grid = {'alpha': [0.0001,0.01,0.1,1,10,100,1000,10000,100000]}
ridge = Ridge()
ridgeCV = GridSearchCV(ridge, param_grid=grid, return_train_score=True)
ridgeCV.fit(X_train, y_train)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.0001, 0.01, 0.1, 1, 10, 100, 1000, 10000,
                                   100000]},
             return_train_score=True)

In [18]:
print('best params: ', ridgeCV.best_params_)
print('train R2: ', ridgeCV.best_score_)
print('test R2: ', ridgeCV.best_estimator_.score(X_test, y_test))

best params:  {'alpha': 10}
train R2:  0.107270963019565
test R2:  0.10244809238746344


In [20]:
ridge = ridgeCV.best_estimator_
coef = pd.Series(ridge.coef_, index=X_train.columns)
coef.sort_values()

tag_Basketball         -0.095402
tag_Comic Book         -0.085100
Software Training      -0.082750
tag_Mining             -0.057610
Violent                -0.057450
                          ...   
tag_Education           0.087521
tag_Visual Novel        0.105043
SteamVR Collectibles    0.111640
tag_Cute                0.127378
tag_Classic             0.148029
Length: 413, dtype: float64

## Lasso

In [30]:
grid = {'alpha': [0.00005, 0.0001,0.01,0.1,1,10,100,1000,10000,100000]}
lasso = Lasso()
lassoCV = GridSearchCV(lasso, param_grid=grid, return_train_score=True)
lassoCV.fit(X_train, y_train)

GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [5e-05, 0.0001, 0.01, 0.1, 1, 10, 100, 1000,
                                   10000, 100000]},
             return_train_score=True)

In [31]:
print('best params: ', lassoCV.best_params_)
print('train R2: ', lassoCV.best_score_)
print('test R2: ', lassoCV.best_estimator_.score(X_test, y_test))

best params:  {'alpha': 5e-05}
train R2:  0.10635674775028763
test R2:  0.10189801769083995


In [32]:
lasso = lassoCV.best_estimator_
coef = pd.Series(lasso.coef_, index=X_train.columns)
coef.sort_values()

tag_Basketball         -0.100052
Violent                -0.057733
Software Training      -0.057093
In-App Purchases       -0.056851
tag_Comic Book         -0.046748
                          ...   
tag_Education           0.079693
tag_Visual Novel        0.097388
SteamVR Collectibles    0.116532
tag_Cute                0.127730
tag_Classic             0.144534
Length: 413, dtype: float64

In [38]:
coef[coef.isin([0])]

english_1         -0.0
required_age_3     0.0
required_age_7    -0.0
required_age_18    0.0
windows           -0.0
                  ... 
tag_Werewolves    -0.0
tag_Word Game      0.0
tag_World War I    0.0
tag_Wrestling      0.0
tag_e-sports      -0.0
Length: 250, dtype: float64

In [39]:
coef[coef.isin([0])].count()

250

In [40]:
coef.count()

413

## Decision Tree

In [6]:
tree = DecisionTreeRegressor()

In [7]:
tree.fit(X_train, y_train)

DecisionTreeRegressor()

In [9]:
cv = cross_validate(tree, X_train, y_train, return_train_score=True)

In [11]:
training_score = cv['train_score'].mean()
validation_score = cv['test_score'].mean()
test_score = tree.score(X_test,y_test)

In [12]:
print('  training score:',training_score.round(3))
print('validation score:',validation_score.round(3))
print('test score:',test_score.round(3))

  training score: 0.996
validation score: -0.683
test score: -0.711


Score is worse than baseline

## Gradient Boosting Tree

In [23]:
tree = GradientBoostingRegressor()

In [24]:
tree.fit(X_train, y_train)

GradientBoostingRegressor()

In [25]:
tree.score(X_test, y_test).round(3)

0.136

Better than baseline, but not great

In [26]:
tr = GradientBoostingRegressor()
grid = {'n_estimators':[10,100,200], 'max_depth' : [1,5,10], 'learning_rate':[.1,.05]}
grid_search = GridSearchCV(tr, param_grid = grid, return_train_score = True, n_jobs=-1)

grid_search.fit(X_train,y_train)

ValueError: Length of values (18) does not match length of index (3)

In [28]:
print('         best B =',grid_search.best_params_)
print('  validation R2 =',grid_search.best_score_.round(3))

         best B = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
  validation R2 = 0.162


In [30]:
test_score = grid_search.best_estimator_.score(X_test, y_test).round(3)
test_score

0.159

.159 is a better result, still not great