In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
vg_df = pd.read_csv('vgsales.csv', encoding='utf-8')

In [3]:
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [5]:
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].head(7)

Unnamed: 0,Name,Platform,Year,Genre,Publisher
0,Wii Sports,Wii,2006.0,Sports,Nintendo
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


# Transforming Nominal Attributes

In [9]:
genre = np.unique(vg_df['Genre'])
genre

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [13]:
gle = LabelEncoder()
genre_label = gle.fit_transform(vg_df['Genre'])
genre_mappings = {index: label for index, label in 
                  enumerate(gle.classes_)}
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [15]:
vg_df['GenreLabel'] = genre_label

In [16]:
vg_df[['Name', 'Platform', 'Year', 'GenreLabel', 'Publisher']].head(7)

Unnamed: 0,Name,Platform,Year,GenreLabel,Publisher
0,Wii Sports,Wii,2006.0,10,Nintendo
1,Super Mario Bros.,NES,1985.0,4,Nintendo
2,Mario Kart Wii,Wii,2008.0,6,Nintendo
3,Wii Sports Resort,Wii,2009.0,10,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,7,Nintendo
5,Tetris,GB,1989.0,5,Nintendo
6,New Super Mario Bros.,DS,2006.0,4,Nintendo


# Transforming Ordinal Attributes

In [18]:
poke_df = pd.read_csv('Pokemon.csv', encoding='utf-8')

In [19]:
poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [20]:
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)

In [23]:
poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
1,460,Abomasnow,Grass,Ice,494,90,92,75,92,85,60,4,False
2,161,Sentret,Normal,,215,35,46,34,35,45,20,2,False
3,667,Litleo,Fire,Normal,369,62,50,58,73,54,72,6,False
4,224,Octillery,Water,,480,75,105,75,105,75,45,2,False


In [24]:
np.unique(poke_df['Generation'])

array([1, 2, 3, 4, 5, 6])

In [29]:
gen_ord_map = {1: 1, 2: 2, 3: 3, 
               4: 4, 5: 5, 6: 6}
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)

In [30]:
poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,GenerationLabel
0,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False,1
1,460,Abomasnow,Grass,Ice,494,90,92,75,92,85,60,4,False,4
2,161,Sentret,Normal,,215,35,46,34,35,45,20,2,False,2
3,667,Litleo,Fire,Normal,369,62,50,58,73,54,72,6,False,6
4,224,Octillery,Water,,480,75,105,75,105,75,45,2,False,2


# Encoding Categorical Attributes

In [31]:
#one hot
#dummy variables
#bit counting
#effect coding scheme

# Feature Hashing Scheme

In [32]:
from sklearn.feature_extraction import FeatureHasher

In [38]:
fh = FeatureHasher(n_features=6, input_type='string')
hashed_genre = fh.fit_transform(vg_df['Genre'])
hashed_genre = hashed_genre.toarray()
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_genre)], axis=1).head()

Unnamed: 0,Name,Genre,0,1,2,3,4,5
0,Wii Sports,Sports,-2.0,2.0,0.0,-2.0,0.0,0.0
1,Super Mario Bros.,Platform,0.0,2.0,2.0,-1.0,1.0,0.0
2,Mario Kart Wii,Racing,-1.0,0.0,0.0,0.0,0.0,-1.0
3,Wii Sports Resort,Sports,-2.0,2.0,0.0,-2.0,0.0,0.0
4,Pokemon Red/Pokemon Blue,Role-Playing,-1.0,1.0,2.0,0.0,1.0,-1.0
