In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the video-game sales CSV (ISO-8859-1 encoded) and preview the first rows.
vgsales_path = "./data/vgsales.csv"
vg_df = pd.read_csv(vgsales_path, encoding = "ISO-8859-1")
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77


In [3]:
# (number of rows, number of columns) of the loaded frame.
vg_df.shape

(16324, 11)

In [4]:
# Per-column dtype and non-null count; a quick way to spot columns with missing values.
vg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16324 entries, 0 to 16323
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16324 non-null  int64  
 1   Name          16324 non-null  object 
 2   Platform      16324 non-null  object 
 3   Year          16324 non-null  int64  
 4   Genre         16324 non-null  object 
 5   Publisher     16288 non-null  object 
 6   NA_Sales      16324 non-null  float64
 7   EU_Sales      16324 non-null  float64
 8   JP_Sales      16324 non-null  float64
 9   Other_Sales   16324 non-null  float64
 10  Global_Sales  16324 non-null  float64
dtypes: float64(5), int64(2), object(4)
memory usage: 1.4+ MB


In [5]:
# First seven rows of the descriptive (non-sales) columns.
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].head(7)

Unnamed: 0,Name,Platform,Year,Genre,Publisher
0,Asteroids,2600,1980,Shooter,Atari
1,Missile Command,2600,1980,Shooter,Atari
2,Kaboom!,2600,1980,Misc,Activision
3,Defender,2600,1980,Misc,Atari
4,Boxing,2600,1980,Fighting,Activision
5,Ice Hockey,2600,1980,Sports,Activision
6,Freeway,2600,1980,Action,Activision


In [6]:
# Distinct game titles, in order of first appearance.
vg_df['Name'].unique()

array(['Asteroids', 'Missile Command', 'Kaboom!', ...,
       'Codename: Panzers Complete Collection',
       'Farming 2017 - The Simulation',
       'Chou Ezaru wa Akai Hana: Koi wa Tsuki ni Shirube Kareru'],
      dtype=object)

In [7]:
# Number of distinct game titles (dropna=False keeps the count equal to len(unique())).
vg_df['Name'].nunique(dropna = False)

11358

In [8]:
# Number of distinct publishers; dropna=False counts a missing publisher as its own
# value, exactly as len(unique()) would.
vg_df['Publisher'].nunique(dropna = False)

577

In [9]:
# Number of distinct platforms (dropna=False keeps the count equal to len(unique())).
vg_df['Platform'].nunique(dropna = False)

31

In [10]:
# Distinct platform codes, in order of first appearance.
vg_df['Platform'].unique()

array(['2600', 'NES', 'PC', 'DS', 'GB', 'SNES', 'GEN', 'GG', 'SCD', 'NG',
       'PS', 'SAT', '3DO', 'TG16', 'N64', 'PCFX', 'DC', 'WS', 'PS2', 'XB',
       'GBA', 'GC', 'PSP', 'X360', 'Wii', 'PS3', '3DS', 'PSV', 'WiiU',
       'PS4', 'XOne'], dtype=object)

In [11]:
# Number of distinct genres (dropna=False keeps the count equal to len(unique())).
vg_df['Genre'].nunique(dropna = False)

12

In [12]:
# Distinct genre names, in order of first appearance.
vg_df['Genre'].unique()

array(['Shooter', 'Misc', 'Fighting', 'Sports', 'Action', 'Platform',
       'Puzzle', 'Racing', 'Simulation', 'Adventure', 'Role-Playing',
       'Strategy'], dtype=object)

## LabelEncoder


In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Fit a LabelEncoder on the genre names and keep the resulting integer codes.
genre_le = LabelEncoder()
genre_labels = genre_le.fit_transform(vg_df['Genre'])

# Lookup table from integer code to genre name; a code is the position of the
# genre within the encoder's classes_ array.
genre_mappings = dict(enumerate(genre_le.classes_))
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [15]:
# Store the encoded genre next to the original text column and show both together.
vg_df['GenreLabel'] = genre_labels
vg_df.loc[:, ['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]

Unnamed: 0,Name,Platform,Year,Genre,GenreLabel
0,Asteroids,2600,1980,Shooter,8
1,Missile Command,2600,1980,Shooter,8
2,Kaboom!,2600,1980,Misc,3
3,Defender,2600,1980,Misc,3
4,Boxing,2600,1980,Fighting,2
...,...,...,...,...,...
16319,Mighty No. 9,XOne,2016,Platform,4
16320,Resident Evil 4 HD,XOne,2016,Shooter,8
16321,Farming 2017 - The Simulation,PS4,2016,Simulation,9
16322,Rugby Challenge 3,XOne,2016,Sports,10


## Map

In [16]:
# Preview the frame; 'GenreLabel' from the LabelEncoder section is the last column.
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31,8
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76,8
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15,3
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05,3
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77,2


In [17]:
# Remove the LabelEncoder column so this section can rebuild 'GenreLabel' with Series.map.
# A non-in-place drop with errors='ignore' keeps the cell re-runnable: running it a
# second time (column already gone) is a no-op instead of raising KeyError.
vg_df = vg_df.drop(columns = ['GenreLabel'], errors = 'ignore')

In [18]:
# Confirm that 'GenreLabel' is no longer among the columns.
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77


In [19]:
# List the genre names that the hand-written mapping has to cover.
pd.unique(vg_df['Genre'])

array(['Shooter', 'Misc', 'Fighting', 'Sports', 'Action', 'Platform',
       'Puzzle', 'Racing', 'Simulation', 'Adventure', 'Role-Playing',
       'Strategy'], dtype=object)

In [20]:
# Hand-written genre -> integer code table: genres in alphabetical order,
# numbered from 1 (unlike LabelEncoder, which starts at 0).
genre_map = {
    genre: code
    for code, genre in enumerate(
        ['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
         'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports', 'Strategy'],
        start = 1,
    )
}

In [21]:
# Translate each genre name through genre_map and store the codes as 'GenreLabel'.
genre_codes = vg_df['Genre'].map(genre_map)
vg_df['GenreLabel'] = genre_codes

In [22]:
# Show the genre text beside its mapped code for the first rows.
vg_df.loc[:, ['Rank', 'Name', 'Genre', 'GenreLabel']].head()

Unnamed: 0,Rank,Name,Genre,GenreLabel
0,259,Asteroids,Shooter,9
1,545,Missile Command,Shooter,9
2,1768,Kaboom!,Misc,4
3,1971,Defender,Misc,4
4,2671,Boxing,Fighting,3


In [23]:
# Full-width preview; 'GenreLabel' now holds the 1-based codes from genre_map.
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31,9
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76,9
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15,4
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05,4
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77,3


In [24]:
# Copy the column labels into a plain Python list so they can be rearranged.
col_name = list(vg_df.columns)
col_name

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales',
 'GenreLabel']

In [25]:
# Check that tolist() produced a plain Python list (so list methods such as insert apply).
type(col_name)

list

In [26]:
# Put 'GenreLabel' at position 5, directly after 'Genre'. The original trailing
# 'GenreLabel' entry is still in the list at this point and is removed in the next step.
col_name[5:5] = ['GenreLabel']
col_name

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'GenreLabel',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales',
 'GenreLabel']

In [27]:
# Drop the final entry, i.e. the now-duplicated trailing 'GenreLabel'.
del col_name[-1]

In [28]:
# Inspect the reordered label list ('GenreLabel' now follows 'Genre').
# NOTE(review): in the cells visible here this list is never applied to vg_df
# (e.g. via vg_df[col_name]), so the frame itself keeps its original column order.
col_name

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'GenreLabel',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales']

In [29]:
# Preview the frame; its columns are unchanged, with 'GenreLabel' still last.
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31,9
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76,9
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15,4
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05,4
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77,3


In [30]:
# Remove the mapped 'GenreLabel' column so the next section starts from the original columns.
# A non-in-place drop with errors='ignore' keeps the cell re-runnable: running it a
# second time (column already gone) is a no-op instead of raising KeyError.
vg_df = vg_df.drop(columns = ['GenreLabel'], errors = 'ignore')

In [31]:
# Confirm that 'GenreLabel' is no longer among the columns.
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,259,Asteroids,2600,1980,Shooter,Atari,4.0,0.26,0.0,0.05,4.31
1,545,Missile Command,2600,1980,Shooter,Atari,2.56,0.17,0.0,0.03,2.76
2,1768,Kaboom!,2600,1980,Misc,Activision,1.07,0.07,0.0,0.01,1.15
3,1971,Defender,2600,1980,Misc,Atari,0.99,0.05,0.0,0.01,1.05
4,2671,Boxing,2600,1980,Fighting,Activision,0.72,0.04,0.0,0.01,0.77


## One-hot Encoding

In [32]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [33]:
# Re-create the 0-based integer genre codes with a fresh LabelEncoder.
genre_le = LabelEncoder()
genre_labels = genre_le.fit_transform(vg_df['Genre'])
vg_df['GenreLabel'] = genre_labels

# Narrow working copy: identifying columns plus the genre in text and encoded form.
sub_columns = ['Rank', 'Name', 'Platform', 'Year', 'Genre', 'GenreLabel']
vg_df_sub = vg_df[sub_columns]
vg_df_sub.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,GenreLabel
0,259,Asteroids,2600,1980,Shooter,8
1,545,Missile Command,2600,1980,Shooter,8
2,1768,Kaboom!,2600,1980,Misc,3
3,1971,Defender,2600,1980,Misc,3
4,2671,Boxing,2600,1980,Fighting,2


In [34]:
# One-hot encode the genre. OneHotEncoder accepts the string column directly, so it is
# fitted on 'Genre' itself rather than on the intermediate LabelEncoder integers.
genre_ohe = OneHotEncoder()
genre_feature_arr = genre_ohe.fit_transform(vg_df[['Genre']]).toarray()

# Take the column names from the encoder that produced the array: categories_[0] holds
# the categories of the single encoded feature in output-column order, so names and
# columns cannot drift apart.
genre_feature_labels = list(genre_ohe.categories_[0])

# Reuse vg_df's index so a later column-wise concat lines up row for row.
genre_features = pd.DataFrame(genre_feature_arr, columns = genre_feature_labels,
                              index = vg_df.index)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [35]:
# Preview the one-hot frame: one column per genre, 1.0 marking each row's genre.
genre_features.head()

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Place the one-hot genre columns to the right of the identifying columns.
# NOTE(review): both inputs carry a column labelled 'Platform' (the console in
# vg_df_sub, the genre in genre_features), so the result has a duplicated label.
vg_df_ohe = pd.concat([vg_df_sub, genre_features], axis = 'columns')
vg_df_ohe.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,GenreLabel,Action,Adventure,Fighting,Misc,Platform.1,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,259,Asteroids,2600,1980,Shooter,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,545,Missile Command,2600,1980,Shooter,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1768,Kaboom!,2600,1980,Misc,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1971,Defender,2600,1980,Misc,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2671,Boxing,2600,1980,Fighting,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Column labels to keep: the identifying columns followed by one label per genre.
# The previous sum([...]) call raised TypeError because sum() starts from the integer 0
# and cannot add a list to it; plain list concatenation gives the intended flat list.
# NOTE(review): 'Platform' appears twice in the result (console column and genre column).
columns = ['Rank', 'Name', 'Platform', 'Year'] + list(genre_feature_labels)

# Pandas `get_dummies` is better than scikit-learn's `OneHotEncoder`

In [40]:
# Read the Pokemon dataset and preview the first rows.
pokemon_path = './data/pokemon_data.csv'
poke = pd.read_csv(pokemon_path)
poke.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


##### gen_onehot_features = pd.get_dummies(df['column_to_onehot'])
pd.concat([df[['col_1', 'col_2']], gen_onehot_features], axis = 1).head()
