# "Global sales" model

In [1]:
import pandas as pd

from tensorflow import keras

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split



In [2]:
game_sales_ml_df = pd.read_csv("data/game_sales_ml.csv")

game_sales_ml_df.head()

Unnamed: 0,Name,Platform,Publisher,Genre,Rating,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales
0,Wii Sports,Wii,Nintendo,Sports,E,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53
1,Mario Kart Wii,Wii,Nintendo,Racing,E,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52
2,Wii Sports Resort,Wii,Nintendo,Sports,E,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77
3,New Super Mario Bros.,DS,Nintendo,Platform,E,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8
4,Wii Play,Wii,Nintendo,Misc,E,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92


## Data preprocessing

In [3]:
model_df = game_sales_ml_df.copy()

model_df.head()

Unnamed: 0,Name,Platform,Publisher,Genre,Rating,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales
0,Wii Sports,Wii,Nintendo,Sports,E,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53
1,Mario Kart Wii,Wii,Nintendo,Racing,E,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52
2,Wii Sports Resort,Wii,Nintendo,Sports,E,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77
3,New Super Mario Bros.,DS,Nintendo,Platform,E,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8
4,Wii Play,Wii,Nintendo,Misc,E,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92


### Handling categorical data

In [4]:
# NEW DATA: model_df[["name", "platform", "Genre", "Publisher", "Developer", "Rating"]].nunique()
model_df[["Name", "Platform", "Publisher", "Genre", "Rating"]].nunique()

Name         4428
Platform       17
Publisher     271
Genre          12
Rating          8
dtype: int64

In [5]:
unnecessary_columns = ["Name"] # too many of them
onehot_encode_columns = ["Platform", "Genre", "Rating"] # <=50 of them
label_encode_columns = ["Publisher"] # >50 of them

In [6]:
# unnecessary columns
model_df = model_df.drop(columns=unnecessary_columns)

# one-hot encode columns
model_df = pd.get_dummies(model_df, columns=onehot_encode_columns)

# label encode columns
label_encoder = LabelEncoder()
for label_encode_column in label_encode_columns:
    model_df[label_encode_column] = label_encoder.fit_transform(model_df[label_encode_column])

In [7]:
model_df.head()

Unnamed: 0,Publisher,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales,Platform_3DS,...,Genre_Sports,Genre_Strategy,Rating_AO,Rating_E,Rating_E10+,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,167,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53,False,...,True,False,False,True,False,False,False,False,False,False
1,167,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52,False,...,False,False,False,True,False,False,False,False,False,False
2,167,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77,False,...,True,False,False,True,False,False,False,False,False,False
3,167,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8,False,...,False,False,False,True,False,False,False,False,False,False
4,167,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92,False,...,False,False,False,True,False,False,False,False,False,False


## Modeling

### Selecting the target (y) and features (X)

In [8]:
y = model_df["Global Sales"] # target
X = model_df.drop(columns=["Global Sales"]) # features

### Splitting the data: 80% training and 20% testing

In [9]:
X_training, X_testing, y_training, y_testing = train_test_split(X, y, test_size=.20, random_state=42)

In [10]:
scaler = StandardScaler() # mean=0, standard_deviation=1

X_training = scaler.fit_transform(X_training)
X_testing = scaler.fit_transform(X_testing)

### Building the regression model

In [11]:
model = keras.Sequential(
    [
        # layer 1
        keras.layers.Dense(128, activation="relu", input_shape=(X_training.shape[1],)),
        # layer 2
        keras.layers.Dense(64, activation="relu"),
        # layer 3
        keras.layers.Dense(32, activation="relu"),
        # output layer
        keras.layers.Dense(1)
    ]
)

model.compile(optimizer="adam", loss="mean_squared_error", metrics=[keras.metrics.RootMeanSquaredError()])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Training and evaluating the regression model

In [12]:
model.fit(X_training, y_training, epochs=50, batch_size=32, validation_data=(X_testing, y_testing))

Epoch 1/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 2.5809 - root_mean_squared_error: 1.5321 - val_loss: 0.1253 - val_root_mean_squared_error: 0.3540
Epoch 2/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - loss: 0.0836 - root_mean_squared_error: 0.2860 - val_loss: 0.0968 - val_root_mean_squared_error: 0.3110
Epoch 3/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - loss: 0.0672 - root_mean_squared_error: 0.2545 - val_loss: 0.1199 - val_root_mean_squared_error: 0.3463
Epoch 4/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 894us/step - loss: 0.0325 - root_mean_squared_error: 0.1776 - val_loss: 0.1396 - val_root_mean_squared_error: 0.3736
Epoch 5/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 866us/step - loss: 0.0245 - root_mean_squared_error: 0.1556 - val_loss: 0.0668 - val_root_mean_squared_error: 0.2584
Epoch 6/50
[1m173/173[0m [32m

<keras.src.callbacks.history.History at 0x3003090a0>

In [13]:
model.evaluate(X_testing, y_testing)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686us/step - loss: 0.0871 - root_mean_squared_error: 0.2923


[0.11394671350717545, 0.33755993843078613]

### Predicting

In [14]:
y_prediction = model.predict(X_testing)

r2 = r2_score(y_testing, y_prediction)
print(f"r2: {r2}")

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 863us/step
r2: 0.9580670853858161


## Saving

In [15]:
model_df.to_csv("data/output/game_sales_ml_model.csv", index=False)