In [9]:
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import string

In [10]:
# Read the four CSV inputs into DataFrames in one pass; the order of the
# file names below matches the order of the names being assigned.
games, critic_reviews, user_reviews, sales = (
    pd.read_csv(csv_path)
    for csv_path in (
        'switch_game_info_filtered.csv',
        'switch_critic_review.csv',
        'switch_user_review.csv',
        'sales_switch.csv',
    )
)

Merge the game info and sales tables

In [11]:
# Inner join on 'title': only titles present in both tables are kept.
games_sales = games.merge(sales, how='inner', on='title')

Engineer a developer-type feature (first, second, or third party) and keep only the required columns

In [12]:
# Exact 'developer' strings that are labelled as first party.
fir_party = [
    'Nintendo',
    'Nintendo EAD Tokyo , Nintendo Software Technology',
    'Retro Studios',
    'Intelligent Systems, Koei Tecmo Games',
    'HAL Labs',
    'Nintendo, Nd Cube',
    'Monolith Soft',
]
# Exact 'developer' strings that are labelled as second party.
sec_party = [
    'PlatinumGames',
    'Nintendo, PlatinumGames',
    'Next Level Games, Nintendo',
    'Asobo Studio, Engine Software',
    'Camelot Software Planning',
    'Nintendo, GREZZO',
    'Nintendo, HAL Labs, Bandai Namco Games, Sora Ltd.',
    'indieszero',
]

# Label every row by an exact match on its developer string; anything not
# listed above falls back to 'Third'. The second-party entries are merged in
# last, so they would take precedence if a name appeared in both lists.
games_sales['Developer_type'] = (
    games_sales['developer']
    .map({**dict.fromkeys(fir_party, 'First'), **dict.fromkeys(sec_party, 'Second')})
    .fillna('Third')
)

# Keep only the sales target and the columns used as model features.
# NOTE(review): this rebinds games_sales without the 'developer' column, so
# re-running this cell without re-running the merge raises a KeyError.
games_sales = games_sales.loc[:, [
    'total_sales_USDMM',
    'rating',
    'meta_overview',
    'user_overview',
    'genre_action',
    'genre_adventure',
    'genre_fighting',
    'genre_platform',
    'genre_puzzle',
    'genre_racing',
    'genre_roleplay',
    'genre_shooter',
    'genre_simulation',
    'genre_sports',
    'genre_strategy',
    'genre_misc',
    'Developer_type',
]]
games_sales

Unnamed: 0,total_sales_USDMM,rating,meta_overview,user_overview,genre_action,genre_adventure,genre_fighting,genre_platform,genre_puzzle,genre_racing,genre_roleplay,genre_shooter,genre_simulation,genre_sports,genre_strategy,genre_misc,Developer_type
0,180.60,E10+,Mixed or average reviews,Generally unfavorable reviews,0,0,0,0,0,0,0,0,0,0,0,1,First
1,804.60,E,Universal acclaim,Mixed or average reviews,0,0,0,0,0,0,0,0,1,0,0,0,First
2,132.60,E10+,Generally favorable reviews,Mixed or average reviews,1,0,1,0,0,0,0,0,0,1,0,0,First
3,64.80,T,Generally favorable reviews,Generally favorable reviews,1,1,0,0,0,0,0,0,0,0,0,0,Second
4,0.17,E10+,Generally favorable reviews,Generally favorable reviews,1,1,0,0,0,0,0,0,0,0,0,0,Third
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,103.80,T,Generally favorable reviews,Generally favorable reviews,1,0,0,0,0,0,1,0,0,0,0,0,First
101,0.16,T,Generally favorable reviews,Generally favorable reviews,1,0,0,0,0,0,1,0,0,0,0,0,First
102,0.01,E10+,Generally favorable reviews,Generally favorable reviews,1,0,0,0,0,0,0,0,0,0,0,0,Third
103,0.03,E,Mixed or average reviews,Mixed or average reviews,1,1,0,0,0,0,0,0,0,0,0,0,Third


First model with all features

In [20]:
import statsmodels.api as sm
from scipy import stats

# One-hot encode the categorical columns. dtype=int keeps the indicators
# numeric: pandas >= 2.0 returns bool indicators by default, and a frame that
# mixes bool with the integer 0/1 genre columns converts to an object array,
# which statsmodels OLS refuses to fit.
games_sales_dum = pd.get_dummies(
    games_sales,
    columns=['rating', 'meta_overview', 'user_overview', 'Developer_type'],
    dtype=int,
)

# Setup dependent and independent variables.
# The target is removed by name rather than by position, so the split does not
# depend on 'total_sales_USDMM' being the first column.
y = games_sales_dum['total_sales_USDMM']
X = games_sales_dum.drop(columns='total_sales_USDMM')

# NOTE(review): every level of each encoded column is kept and an intercept is
# added below, so each indicator group sums to the constant column (perfect
# collinearity). Consider drop_first=True if individual coefficients are to be
# interpreted; left unchanged here because the next model drops columns by name.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:      total_sales_USDMM   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     18.21
Date:                Sun, 21 Feb 2021   Prob (F-statistic):           5.76e-23
Time:                        20:37:46   Log-Likelihood:                -637.73
No. Observations:                 105   AIC:                             1321.
Df Residuals:                      82   BIC:                             1383.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

Final model with a reduced feature set

In [31]:
# Build the reduced design matrix. DataFrame.drop returns a new frame, so X
# itself is left untouched.
X_filt = X.drop(
    columns=[
        # genre indicators
        'genre_action', 'genre_adventure', 'genre_fighting', 'genre_platform',
        'genre_puzzle', 'genre_roleplay', 'genre_sports', 'genre_strategy',
        'genre_misc',
        # rating indicators
        'rating_E10+', 'rating_T',
        # developer-type indicators
        'Developer_type_Second', 'Developer_type_Third',
    ]
)

# Refit OLS with an intercept on the remaining predictors and print the summary.
X2 = sm.add_constant(X_filt)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:      total_sales_USDMM   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     36.10
Date:                Sun, 21 Feb 2021   Prob (F-statistic):           8.57e-29
Time:                        21:03:51   Log-Likelihood:                -643.53
No. Observations:                 105   AIC:                             1311.
Df Residuals:                      93   BIC:                             1343.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------