# "Global sales" model (optimized)

In [1]:
import pandas as pd

from tensorflow import keras

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split



In [None]:
model_df = pd.read_csv("data/game_sales_ml_model.csv")

model_df.head()

Unnamed: 0,Publisher,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales,Platform_3DS,...,Genre_Sports,Genre_Strategy,Rating_AO,Rating_E,Rating_E10+,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,167,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53,False,...,True,False,False,True,False,False,False,False,False,False
1,167,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52,False,...,False,False,False,True,False,False,False,False,False,False
2,167,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77,False,...,True,False,False,True,False,False,False,False,False,False
3,167,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8,False,...,False,False,False,True,False,False,False,False,False,False
4,167,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92,False,...,False,False,False,True,False,False,False,False,False,False


## Modeling (optimized)

### Selecting the target (y) and features (X)

In [3]:
y = model_df["Global Sales"] # target
X = model_df.drop(columns=["Global Sales"]) # features

### Splitting the data: 80% training and 20% testing

In [4]:
X_training, X_testing, y_training, y_testing = train_test_split(X, y, test_size=.20, random_state=42)

In [5]:
scaler = StandardScaler() # mean=0, standard_deviation=1

X_training = scaler.fit_transform(X_training)
X_testing = scaler.fit_transform(X_testing)

### Building the regression model (optimized)

In [6]:
model = keras.Sequential(
    [
        # layer 1
        keras.layers.Dense(128, activation="relu", input_shape=(X_training.shape[1],)),
        # layer 2
        keras.layers.Dense(64, activation="relu"),
        # layer 3
        keras.layers.Dense(32, activation="relu"),
        # layer 4
        keras.layers.Dense(16, activation="relu"),
        # output layer
        keras.layers.Dense(1)
    ]
)

model.compile(optimizer="adam", loss="mean_squared_error", metrics=[keras.metrics.RootMeanSquaredError()])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Training and evaluating the regression model

In [7]:
model.fit(X_training, y_training, epochs=50, batch_size=32, validation_data=(X_testing, y_testing))

Epoch 1/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.9688 - root_mean_squared_error: 1.3824 - val_loss: 0.1624 - val_root_mean_squared_error: 0.4030
Epoch 2/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 901us/step - loss: 0.1043 - root_mean_squared_error: 0.3201 - val_loss: 0.0858 - val_root_mean_squared_error: 0.2929
Epoch 3/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 918us/step - loss: 0.0217 - root_mean_squared_error: 0.1472 - val_loss: 0.1197 - val_root_mean_squared_error: 0.3460
Epoch 4/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step - loss: 0.0160 - root_mean_squared_error: 0.1263 - val_loss: 0.0900 - val_root_mean_squared_error: 0.3000
Epoch 5/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 908us/step - loss: 0.0099 - root_mean_squared_error: 0.0995 - val_loss: 0.1875 - val_root_mean_squared_error: 0.4330
Epoch 6/50
[1m173/173[0m [32m

<keras.src.callbacks.history.History at 0x16df6f790>

In [8]:
model.evaluate(X_testing, y_testing)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step - loss: 0.0773 - root_mean_squared_error: 0.2746


[0.099199078977108, 0.3149588406085968]

### Predicting

In [9]:
y_prediction = model.predict(X_testing)

r2 = r2_score(y_testing, y_prediction)
print(f"r2: {r2}")

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 955us/step
r2: 0.9634942874587896
