# "Global sales" model

In [1]:
import pandas as pd

from tensorflow import keras

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split



In [2]:
games_df = pd.read_csv("../data/games.csv")

games_df.head()

Unnamed: 0,Name,Platform,Publisher,Genre,Rating,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales
0,Wii Sports,Wii,Nintendo,Sports,E,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53
1,Mario Kart Wii,Wii,Nintendo,Racing,E,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52
2,Wii Sports Resort,Wii,Nintendo,Sports,E,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77
3,New Super Mario Bros.,DS,Nintendo,Platform,E,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8
4,Wii Play,Wii,Nintendo,Misc,E,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92


## Data preprocessing

In [3]:
model_df = games_df.copy()

model_df.head()

Unnamed: 0,Name,Platform,Publisher,Genre,Rating,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales
0,Wii Sports,Wii,Nintendo,Sports,E,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53
1,Mario Kart Wii,Wii,Nintendo,Racing,E,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52
2,Wii Sports Resort,Wii,Nintendo,Sports,E,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77
3,New Super Mario Bros.,DS,Nintendo,Platform,E,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8
4,Wii Play,Wii,Nintendo,Misc,E,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92


### Handling categorical data

In [4]:
model_df[["Name", "Platform", "Publisher", "Genre", "Rating"]].nunique()

Name         4428
Platform       17
Publisher     271
Genre          12
Rating          8
dtype: int64

In [5]:
unnecessary_columns = ["Name"] # too many of them
onehot_encode_columns = ["Platform", "Genre", "Rating"] # <=50 of them
label_encode_columns = ["Publisher"] # >50 of them

In [6]:
# unnecessary columns
model_df = model_df.drop(columns=unnecessary_columns)

# one-hot encode columns
model_df = pd.get_dummies(model_df, columns=onehot_encode_columns)

# label encode columns
label_encoder = LabelEncoder()
for label_encode_column in label_encode_columns:
    model_df[label_encode_column] = label_encoder.fit_transform(model_df[label_encode_column])

In [7]:
model_df.head()

Unnamed: 0,Publisher,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales,Platform_3DS,...,Genre_Sports,Genre_Strategy,Rating_AO,Rating_E,Rating_E10+,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,167,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53,False,...,True,False,False,True,False,False,False,False,False,False
1,167,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52,False,...,False,False,False,True,False,False,False,False,False,False
2,167,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77,False,...,True,False,False,True,False,False,False,False,False,False
3,167,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8,False,...,False,False,False,True,False,False,False,False,False,False
4,167,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92,False,...,False,False,False,True,False,False,False,False,False,False


## Modeling

### Selecting the target (y) and features (X)

In [8]:
y = model_df["Global Sales"] # target
X = model_df.drop(columns=["Global Sales"]) # features

### Splitting the data: 80% training and 20% testing

In [9]:
X_training, X_testing, y_training, y_testing = train_test_split(X, y, test_size=.20, random_state=42)

In [10]:
scaler = StandardScaler() # mean=0, standard_deviation=1

X_training = scaler.fit_transform(X_training)
X_testing = scaler.fit_transform(X_testing)

### Building the regression model

In [11]:
model = keras.Sequential(
    [
        # layer 1
        keras.layers.Dense(128, activation="relu", input_shape=(X_training.shape[1],)),
        # layer 2
        keras.layers.Dense(64, activation="relu"),
        # layer 3
        keras.layers.Dense(32, activation="relu"),
        # output layer
        keras.layers.Dense(1)
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Compiling the regression model

In [12]:
model.compile(optimizer="adam", loss="mean_squared_error", metrics=[keras.metrics.RootMeanSquaredError()])

### Training the regression model

In [13]:
model.fit(X_training, y_training, epochs=50, batch_size=32, validation_data=(X_testing, y_testing))

Epoch 1/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.9255 - root_mean_squared_error: 0.9321 - val_loss: 0.1934 - val_root_mean_squared_error: 0.4398
Epoch 2/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 897us/step - loss: 0.0650 - root_mean_squared_error: 0.2525 - val_loss: 0.0514 - val_root_mean_squared_error: 0.2268
Epoch 3/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 886us/step - loss: 0.0710 - root_mean_squared_error: 0.2624 - val_loss: 0.0977 - val_root_mean_squared_error: 0.3126
Epoch 4/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/step - loss: 0.0267 - root_mean_squared_error: 0.1621 - val_loss: 0.0796 - val_root_mean_squared_error: 0.2822
Epoch 5/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 864us/step - loss: 0.0138 - root_mean_squared_error: 0.1174 - val_loss: 0.0553 - val_root_mean_squared_error: 0.2351
Epoch 6/50
[1m173/173[0m [32m

<keras.src.callbacks.history.History at 0x176d090a0>

### Evaluating the regression model

In [14]:
model.evaluate(X_testing, y_testing)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 709us/step - loss: 0.0258 - root_mean_squared_error: 0.1600


[0.031045323237776756, 0.17619682848453522]

### Prediction

In [15]:
y_prediction = model.predict(X_testing)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 877us/step


### Prediction evaluation

In [16]:
r2 = r2_score(y_testing, y_prediction)
print(f"r2: {r2}")

r2: 0.9885751793631635


## Saving

In [17]:
model_df.to_csv("output/model_data.csv", index=False)