# "Global sales" model

In [1]:
import pandas as pd

from tensorflow import keras

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split



In [2]:
games_df = pd.read_csv("../data/games.csv")

games_df.head()

Unnamed: 0,Name,Platform,Publisher,Genre,Rating,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales
0,Wii Sports,Wii,Nintendo,Sports,E,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53
1,Mario Kart Wii,Wii,Nintendo,Racing,E,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52
2,Wii Sports Resort,Wii,Nintendo,Sports,E,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77
3,New Super Mario Bros.,DS,Nintendo,Platform,E,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8
4,Wii Play,Wii,Nintendo,Misc,E,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92


## Data preprocessing

### Handling categorical data

In [3]:
games_df[["Name", "Platform", "Publisher", "Genre", "Rating"]].nunique()

Name         4428
Platform       17
Publisher     271
Genre          12
Rating          8
dtype: int64

In [4]:
unnecessary_columns = ["Name"] # too many of them
onehot_encode_columns = ["Platform", "Genre", "Rating"] # <=50 of them
label_encode_columns = ["Publisher"] # >50 of them

In [5]:
# unnecessary columns
games_df = games_df.drop(columns=unnecessary_columns)

# one-hot encode columns
games_df = pd.get_dummies(games_df, columns=onehot_encode_columns)

# label encode columns
label_encoder = LabelEncoder()
for label_encode_column in label_encode_columns:
    games_df[label_encode_column] = label_encoder.fit_transform(games_df[label_encode_column])

In [6]:
games_df.head()

Unnamed: 0,Publisher,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales,Platform_3DS,...,Genre_Sports,Genre_Strategy,Rating_AO,Rating_E,Rating_E10+,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,167,2006,76.0,8.0,41.36,28.96,3.77,8.45,82.53,False,...,True,False,False,True,False,False,False,False,False,False
1,167,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52,False,...,False,False,False,True,False,False,False,False,False,False
2,167,2009,80.0,8.0,15.61,10.93,3.28,2.95,32.77,False,...,True,False,False,True,False,False,False,False,False,False
3,167,2006,89.0,8.5,11.28,9.14,6.5,2.88,29.8,False,...,False,False,False,True,False,False,False,False,False,False
4,167,2006,58.0,6.6,13.96,9.18,2.93,2.84,28.92,False,...,False,False,False,True,False,False,False,False,False,False


## Modeling

### Selecting the target (y) and features (X)

In [7]:
y = games_df["Global Sales"] # target
X = games_df.drop(columns=["Global Sales"]) # features

### Splitting the data: 80% training and 20% testing

In [8]:
X_training, X_testing, y_training, y_testing = train_test_split(X, y, test_size=.20, random_state=42)

In [9]:
scaler = StandardScaler() # mean=0, standard_deviation=1

X_training = scaler.fit_transform(X_training)
X_testing = scaler.fit_transform(X_testing)

### Building the regression model

In [10]:
model = keras.Sequential(
    [
        # layer 1
        keras.layers.Dense(128, activation="relu", input_shape=(X_training.shape[1],)),
        # layer 2
        keras.layers.Dense(64, activation="relu"),
        # layer 3
        keras.layers.Dense(32, activation="relu"),
        # output layer
        keras.layers.Dense(1)
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Compiling the regression model

In [11]:
model.compile(optimizer="adam", loss="mean_squared_error", metrics=[keras.metrics.RootMeanSquaredError()])

### Training the regression model

In [12]:
model.fit(X_training, y_training, epochs=50, batch_size=32, validation_data=(X_testing, y_testing))

Epoch 1/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.4815 - root_mean_squared_error: 0.6828 - val_loss: 0.4570 - val_root_mean_squared_error: 0.6760
Epoch 2/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - loss: 0.1552 - root_mean_squared_error: 0.3846 - val_loss: 0.0854 - val_root_mean_squared_error: 0.2922
Epoch 3/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 870us/step - loss: 0.0229 - root_mean_squared_error: 0.1511 - val_loss: 0.1354 - val_root_mean_squared_error: 0.3680
Epoch 4/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 854us/step - loss: 0.0118 - root_mean_squared_error: 0.1085 - val_loss: 0.1978 - val_root_mean_squared_error: 0.4448
Epoch 5/50
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - loss: 0.0162 - root_mean_squared_error: 0.1260 - val_loss: 0.0930 - val_root_mean_squared_error: 0.3050
Epoch 6/50
[1m173/173[0m [32m

<keras.src.callbacks.history.History at 0x168580970>

### Evaluating the regression model

In [13]:
model.evaluate(X_testing, y_testing)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0684 - root_mean_squared_error: 0.2560


[0.08475751429796219, 0.2911314368247986]

### Prediction

In [14]:
y_prediction = model.predict(X_testing)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 959us/step


### Prediction evaluation

In [15]:
r2 = r2_score(y_testing, y_prediction)
print(f"r2: {r2}")

r2: 0.9688088514755955
