In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

df=pd.read_excel('../dataProcessing/outputOfJoinedTablexceptReleaseDate19862023.xlsx')

In [61]:
df.dropna(subset=['Genre_metacritic_game_info'], inplace=True)
df.info ()
#print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 826 entries, 21 to 8874
Data columns (total 25 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   title                                826 non-null    object 
 1   platform                             826 non-null    object 
 2   genre                                826 non-null    object 
 3   publisher                            826 non-null    object 
 4   vg_score                             826 non-null    float64
 5   critic_score                         826 non-null    float64
 6   user_score                           826 non-null    float64
 7   total_shipped                        826 non-null    float64
 8   aisan_sales                          826 non-null    float64
 9   north_american_sales                 826 non-null    float64
 10  japan_sales                          826 non-null    float64
 11  european_sales                

Data pre-processing

In [64]:
#Encoding Genre and Publisher
encoder = OneHotEncoder(sparse=False)

genre_encoded = pd.get_dummies(df['genre'], prefix='genre', drop_first=True)
data = pd.concat([df, genre_encoded], axis=1)
#data.drop('genre', axis=1, inplace=True) 


In [65]:
publisher_encoded = pd.get_dummies(data['publisher'], prefix='publisher', drop_first=True)
data = pd.concat([data, publisher_encoded], axis=1)

#data.to_excel('test_onehot.xlsx', index=False)

In [66]:
features = ['critic_score', 'user_score', 'Production Cost']

# Input variables redefine
X = pd.concat([genre_encoded, df[features]], axis=1)
y = data['global_sales']

X

Unnamed: 0,genre_Board Game,genre_Fighting,genre_Puzzle,genre_Racing,genre_Shooter,genre_Simulation,genre_Sports,critic_score,user_score,Production Cost
21,0,0,0,0,0,1,0,7.7,5.0,6.96
40,0,0,0,0,0,0,1,6.0,7.0,6.06
42,0,0,0,0,0,0,1,8.2,9.0,4.24
114,0,0,0,0,0,0,0,8.0,9.0,3.75
115,0,0,0,0,0,0,0,7.0,7.0,3.24
...,...,...,...,...,...,...,...,...,...,...
8847,0,0,0,0,1,0,0,7.0,2.0,4.02
8851,0,0,0,0,0,0,0,7.0,7.0,1.49
8852,0,0,0,0,0,0,0,10.0,10.0,5.31
8873,0,0,0,0,0,1,0,7.5,8.0,0.04


In [67]:
#Data splitting
# Split into training and test sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

Training set: 660 samples
Testing set: 166 samples


Model implementation

In [70]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, lr_predictions)

print(f"Linear Regression Mean Squared Error (MSE): {lr_mse}")
print(f"Linear Regression Mean Absolute Error (MAE): {lr_mae}")
print(f"Linear Regression Root-mean-square deviation (RMSE): {lr_rmse}")
print(f"Linear Regression R-squared (R2): {lr_r2}")

Linear Regression Mean Squared Error (MSE): 35.88675404196141
Linear Regression Mean Absolute Error (MAE): 4.924318800204469
Linear Regression Root-mean-square deviation (RMSE): 5.990555403463139
Linear Regression R-squared (R2): 0.044984966488463174


In [83]:
# Neural Network
model = keras.Sequential([
    keras.layers.Dense(10, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dense(1)  # Linear output for regression
])

model.compile(optimizer='sgd', loss='mean_squared_error', metrics = "accuracy")
model.fit(X_train, y_train, epochs=20, batch_size=100, verbose=1)

nn_predictions = model.predict(X_test)
nn_mse = mean_squared_error(y_test, nn_predictions)
nn_mae = mean_absolute_error(y_test, nn_predictions)
nn_r2 = r2_score(y_test, nn_predictions)

print(f"Neural Network Mean Squared Error (MSE): {nn_mse}")
print(f"Neural Network Mean Absolute Error (MAE): {nn_mae}")
print(f"Neural Network R-squared (R2): {nn_r2}")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Neural Network Mean Squared Error (MSE): 38.67992116287788
Neural Network Mean Absolute Error (MAE): 5.326299927263375
Neural Network R-squared (R2): -0.029346542805085862


In [84]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)
log_reg_predictions = log_reg_model.predict(X_test)
log_reg_mse = mean_squared_error(y_test, log_reg_predictions)
log_reg_mae = mean_absolute_error(y_test, log_reg_predictions)
log_reg_r2 = r2_score(y_test, log_reg_predictions)

print(f"Logistic Regression Mean Squared Error (MSE): {log_reg_mse}")
print(f"Logistic Regression Mean Absolute Error (MAE): {log_reg_mae}")
print(f"Logistic Regression R-squared (R2): {log_reg_r2}")

ValueError: Unknown label type: 'continuous'

In [85]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print(f"Random Forest Mean Squared Error (MSE): {rf_mse}")
print(f"Random Forest Mean Absolute Error (MAE): {rf_mae}")
print(f"Random Forest R-squared (R2): {rf_r2}")

Random Forest Mean Squared Error (MSE): 41.036904505783134
Random Forest Mean Absolute Error (MAE): 5.2607457831325295
Random Forest R-squared (R2): -0.09207036908312749
