In [6]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# read in csv file
game_info_df=pd.read_csv("../data/boardgames_07022021.csv")
# remove duplicate games
game_info_df.drop_duplicates(subset=['objectid'], inplace=True)
# =========================================================================================================
# choose the features based on correlations
game_info_df=game_info_df[['average', 'numwanting', 'siteviews', 'blogs', 'minage', 'news',
                           'podcast', 'totalvotes', 'numcomments', 'numgeeklists', 'weblink']].copy()
# =========================================================================================================
# remove the rows which have invalid values
game_info_df.drop(game_info_df[game_info_df['average'] ==0].index, inplace = True)
game_info_df.drop(game_info_df[game_info_df['totalvotes'] ==0].index, inplace = True)
# drop the null rows
game_info_df.dropna(inplace=True)
game_info_df.head()

Unnamed: 0,average,numwanting,siteviews,blogs,minage,news,podcast,totalvotes,numcomments,numgeeklists,weblink
0,8.77503,1374,10583753,690,14,8,147,1171,8021,4649,40
1,8.60721,850,3385064,653,13,10,170,668,6552,3978,70
2,8.66981,1477,1816252,208,14,4,28,521,3303,1630,23
3,8.4287,2085,6059031,1473,12,16,163,1800,10034,7276,71
4,8.68957,979,2267902,99,14,3,53,335,2206,1010,23


In [3]:
# Get the general idea about the dataset value range
game_info_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
average,20138.0,6.434485,0.938374,1.0,5.86667,6.472555,7.058972,9.5
numwanting,20138.0,43.792879,121.608511,0.0,3.0,9.0,30.0,2085.0
siteviews,20138.0,92644.494091,261624.524627,1431.0,14757.0,29878.5,72693.5,10583753.0
blogs,20138.0,12.260701,45.139607,0.0,0.0,2.0,8.0,1473.0
minage,20138.0,9.598073,3.710556,0.0,8.0,10.0,12.0,25.0
news,20138.0,0.580197,1.396812,0.0,0.0,0.0,1.0,37.0
podcast,20138.0,2.137899,8.391469,0.0,0.0,0.0,1.0,588.0
totalvotes,20138.0,16.454067,66.39387,1.0,2.0,3.0,9.0,2108.0
numcomments,20138.0,219.157364,676.344229,0.0,23.0,53.0,146.0,18840.0
numgeeklists,20138.0,259.978101,1183.245497,0.0,24.0,63.0,195.0,136869.0


## Select features, Create Train Test Split, Scale the data

In [7]:
# Set features (X) and target (y)
y=game_info_df['average']
X=game_info_df.drop(['average'],axis=1)

# Scale the data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X)
X_scaled = X_scaler.transform(X)

X_scaled.shape, y.shape

((20138, 10), (20138,))

## GaussianProcessRegressor

In [5]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel,random_state=0).fit(X_scaled, y)
print(f"Training Data Score: {gpr.score(X, y):.5f}")

Training Data Score: -57118714833.19407


## linear_model

In [8]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Create a random forest regressor

regr = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
regr=regr.fit(X, y)

print(f"Training Data Score: {regr.score(X, y):.5f}")

Training Data Score: -836475.15862
