# "Global sales" model

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the games dataset from the data directory next to the notebook folder.
# NOTE(review): the preview below shows a leading "Unnamed: 0" column. If that
# is a row index saved into the CSV (rather than a display artefact), reading
# with index_col=0 would avoid carrying it as a feature -- confirm.
games_csv_path = "../data/games.csv"
games_df = pd.read_csv(games_csv_path)

# Preview the first five rows.
games_df.head(5)

Unnamed: 0,Name,Platform,Publisher,Genre,Rating,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales
0,Wii Sports,Wii,Nintendo,Sports,E,2006,76.0,8,41.36,28.96,3.77,8.45,82.53
1,Super Mario Bros.,NES,Nintendo,Platform,Unknown,1985,Unknown,Unknown,29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,Wii,Nintendo,Racing,E,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52
3,Wii Sports Resort,Wii,Nintendo,Sports,E,2009,80.0,8,15.61,10.93,3.28,2.95,32.77
4,Pokemon Red/Pokemon Blue,GB,Nintendo,Role-Playing,Unknown,1996,Unknown,Unknown,11.27,8.89,10.22,1.0,31.37


## Data preprocessing

### Handling categorical data

In [3]:
# Count distinct values in each text column; the counts drive the choice of
# encoding strategy made in the next cell.
categorical_columns = ["Name", "Platform", "Publisher", "Genre", "Rating"]
games_df.loc[:, categorical_columns].nunique()

Name         11563
Platform        31
Publisher      581
Genre           13
Rating           9
dtype: int64

In [4]:
# Encoding plan, based on the distinct-value counts shown above.

# Dropped entirely: far too many distinct values to encode usefully.
unnecessary_columns = [
    "Name",
]

# One-hot encoded: at most 50 distinct values each.
onehot_encode_columns = [
    "Platform",
    "Genre",
    "Rating",
]

# Label encoded: more than 50 distinct values.
label_encode_columns = [
    "Publisher",
]

In [5]:
# Apply the encoding plan to games_df: drop, one-hot encode, then label encode.
# This cell rebinds games_df, so it is not re-runnable on its own: a second run
# raises KeyError because the dropped / one-hot-encoded columns no longer exist.
# Re-run the load cell first.

# Drop the columns that are not used as features.
games_df = games_df.drop(columns=unnecessary_columns)

# One-hot encode the low-cardinality columns; each original column is replaced
# by one indicator column per distinct value, named "<column>_<value>".
games_df = pd.get_dummies(games_df, columns=onehot_encode_columns)

# Label encode the high-cardinality columns.
# A fresh LabelEncoder is fitted per column and kept in `label_encoders`, keyed
# by column name. Refitting one shared encoder would leave only the last
# column's `classes_` mapping available, making earlier columns impossible to
# inverse-transform or to encode consistently on new data.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels, and integer codes impose an arbitrary order on the values -- confirm
# this suits the downstream model.
label_encoders = {}
for label_encode_column in label_encode_columns:
    # `label_encoder` ends up bound to the encoder of the last column, which
    # matches what the previously shared instance held after the loop.
    label_encoder = LabelEncoder()
    games_df[label_encode_column] = label_encoder.fit_transform(games_df[label_encode_column])
    label_encoders[label_encode_column] = label_encoder

In [6]:
# Preview the first five rows of games_df as it currently stands.
games_df.head(5)

Unnamed: 0,Publisher,Year of Release,Critic Score,User Score,NA Sales,EU Sales,JP Sales,Other Sales,Global Sales,Platform_2600,...,Genre_Unknown,Rating_AO,Rating_E,Rating_E10+,Rating_EC,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,361,2006,76.0,8,41.36,28.96,3.77,8.45,82.53,False,...,False,False,True,False,False,False,False,False,False,False
1,361,1985,Unknown,Unknown,29.08,3.58,6.81,0.77,40.24,False,...,False,False,False,False,False,False,False,False,False,True
2,361,2008,82.0,8.3,15.68,12.76,3.79,3.29,35.52,False,...,False,False,True,False,False,False,False,False,False,False
3,361,2009,80.0,8,15.61,10.93,3.28,2.95,32.77,False,...,False,False,True,False,False,False,False,False,False,False
4,361,1996,Unknown,Unknown,11.27,8.89,10.22,1.0,31.37,False,...,False,False,False,False,False,False,False,False,False,True
