# Import necessary dependencies and settings

In [1]:
import pandas as pd
import numpy as np

# Transforming Nominal Features

In [2]:
vg_df = pd.read_csv('/content/vgsales.csv', encoding='utf-8')
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


In [3]:
genres = np.unique(vg_df['Genre'])
genres

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [4]:
from sklearn.preprocessing import LabelEncoder

gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])
genre_mappings = {index: label for index, label in enumerate(gle.classes_)}
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [5]:
vg_df['GenreLabel'] = genre_labels
vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]

Unnamed: 0,Name,Platform,Year,Genre,GenreLabel
1,Super Mario Bros.,NES,1985.0,Platform,4
2,Mario Kart Wii,Wii,2008.0,Racing,6
3,Wii Sports Resort,Wii,2009.0,Sports,10
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,7
5,Tetris,GB,1989.0,Puzzle,5
6,New Super Mario Bros.,DS,2006.0,Platform,4


# Transforming Ordinal Features

In [7]:
poke_df = pd.read_csv('/content/Pokemon.csv', encoding='utf-8')
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)

np.unique(poke_df['Generation'])

array([1, 2, 3, 4, 5, 6])

In [8]:
gen_ord_map = {'Gen 1': 1, 'Gen 2': 2, 'Gen 3': 3,
               'Gen 4': 4, 'Gen 5': 5, 'Gen 6': 6}

poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
poke_df[['Name', 'Generation', 'GenerationLabel']].iloc[4:10]

Unnamed: 0,Name,Generation,GenerationLabel
4,Octillery,2,
5,Helioptile,6,
6,Dialga,4,
7,DeoxysDefense Forme,3,
8,Rapidash,1,
9,Swanna,5,


# Encoding Categorical Features

## One-hot Encoding Scheme

In [9]:
poke_df[['Name', 'Generation', 'Legendary']].iloc[4:10]

Unnamed: 0,Name,Generation,Legendary
4,Octillery,2,False
5,Helioptile,6,False
6,Dialga,4,True
7,DeoxysDefense Forme,3,True
8,Rapidash,1,False
9,Swanna,5,False


In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# transform and map pokemon generations
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['Generation'])
poke_df['Gen_Label'] = gen_labels

# transform and map pokemon legendary status
leg_le = LabelEncoder()
leg_labels = leg_le.fit_transform(poke_df['Legendary'])
poke_df['Lgnd_Label'] = leg_labels

poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]
poke_df_sub.iloc[4:10]

Unnamed: 0,Name,Generation,Gen_Label,Legendary,Lgnd_Label
4,Octillery,2,1,False,0
5,Helioptile,6,5,False,0
6,Dialga,4,3,True,1
7,DeoxysDefense Forme,3,2,True,1
8,Rapidash,1,0,False,0
9,Swanna,5,4,False,0


In [16]:
# encode generation labels using one-hot encoding scheme
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
gen_feature_labels = list(gen_le.classes_)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

# encode legendary status labels using one-hot encoding scheme
leg_ohe = OneHotEncoder()
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()
leg_feature_labels = ['Legendary_'+str(cls_label) for cls_label in leg_le.classes_]
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)

In [17]:
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)
columns = sum([['Name', 'Generation', 'Gen_Label'],gen_feature_labels,
              ['Legendary', 'Lgnd_Label'],leg_feature_labels], [])
poke_df_ohe[columns].iloc[4:10]

Unnamed: 0,Name,Generation,Gen_Label,1,2,3,4,5,6,Legendary,Lgnd_Label,Legendary_False,Legendary_True
4,Octillery,2,1,0.0,1.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
5,Helioptile,6,5,0.0,0.0,0.0,0.0,0.0,1.0,False,0,1.0,0.0
6,Dialga,4,3,0.0,0.0,0.0,1.0,0.0,0.0,True,1,0.0,1.0
7,DeoxysDefense Forme,3,2,0.0,0.0,1.0,0.0,0.0,0.0,True,1,0.0,1.0
8,Rapidash,1,0,1.0,0.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
9,Swanna,5,4,0.0,0.0,0.0,0.0,1.0,0.0,False,0,1.0,0.0


In [18]:
new_poke_df = pd.DataFrame([['PikaZoom', 'Gen 3', True],
                           ['CharMyToast', 'Gen 4', False]],
                           columns=['Name', 'Generation', 'Legendary'])
new_poke_df

Unnamed: 0,Name,Generation,Legendary
0,PikaZoom,Gen 3,True
1,CharMyToast,Gen 4,False


In [19]:
new_gen_labels = gen_le.transform(new_poke_df['Generation'])
new_poke_df['Gen_Label'] = new_gen_labels

new_leg_labels = leg_le.transform(new_poke_df['Legendary'])
new_poke_df['Lgnd_Label'] = new_leg_labels

new_poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]

ValueError: invalid literal for int() with base 10: 'Gen 3'

In [20]:
from sklearn.preprocessing import LabelEncoder

# Original data
original_generations = ['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4']
original_legendaries = [True, False]

# Initialize and fit the label encoders
gen_le = LabelEncoder()
gen_le.fit(original_generations)

leg_le = LabelEncoder()
leg_le.fit(original_legendaries)

# Transform the new data
new_gen_labels = gen_le.transform(new_poke_df['Generation'])
new_poke_df['Gen_Label'] = new_gen_labels

new_leg_labels = leg_le.transform(new_poke_df['Legendary'])
new_poke_df['Lgnd_Label'] = new_leg_labels

# Display the updated DataFrame
new_poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]


Unnamed: 0,Name,Generation,Gen_Label,Legendary,Lgnd_Label
0,PikaZoom,Gen 3,2,True,1
1,CharMyToast,Gen 4,3,False,0


In [21]:
new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()
new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)

new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()
new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=leg_feature_labels)

new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)
columns = sum([['Name', 'Generation', 'Gen_Label'], gen_feature_labels,
               ['Legendary', 'Lgnd_Label'], leg_feature_labels], [])
new_poke_ohe[columns]

Unnamed: 0,Name,Generation,Gen_Label,1,2,3,4,5,6,Legendary,Lgnd_Label,Legendary_False,Legendary_True
0,PikaZoom,Gen 3,2,0.0,0.0,1.0,0.0,0.0,0.0,True,1,0.0,1.0
1,CharMyToast,Gen 4,3,0.0,0.0,0.0,1.0,0.0,0.0,False,0,1.0,0.0


In [22]:
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Generation,1,2,3,4,5,6
4,Octillery,2,False,True,False,False,False,False
5,Helioptile,6,False,False,False,False,False,True
6,Dialga,4,False,False,False,True,False,False
7,DeoxysDefense Forme,3,False,False,True,False,False,False
8,Rapidash,1,True,False,False,False,False,False
9,Swanna,5,False,False,False,False,True,False


## Dummy Coding Scheme

In [23]:
gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)
pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Generation,2,3,4,5,6
4,Octillery,2,True,False,False,False,False
5,Helioptile,6,False,False,False,False,True
6,Dialga,4,False,False,True,False,False
7,DeoxysDefense Forme,3,False,True,False,False,False
8,Rapidash,1,False,False,False,False,False
9,Swanna,5,False,False,False,True,False


In [24]:
gen_onehot_features = pd.get_dummies(poke_df['Generation'])


In [27]:
gen_onehot_features.iloc[:,:]

Unnamed: 0,1,2,3,4,5,6
0,True,False,False,False,False,False
1,False,False,False,True,False,False
2,False,True,False,False,False,False
3,False,False,False,False,False,True
4,False,True,False,False,False,False
...,...,...,...,...,...,...
795,False,False,False,False,True,False
796,False,False,False,False,False,True
797,True,False,False,False,False,False
798,False,True,False,False,False,False


In [28]:
gen_dummy_features = gen_onehot_features.iloc[:,:-1]
pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Generation,1,2,3,4,5
4,Octillery,2,False,True,False,False,False
5,Helioptile,6,False,False,False,False,False
6,Dialga,4,False,False,False,True,False
7,DeoxysDefense Forme,3,False,False,True,False,False
8,Rapidash,1,True,False,False,False,False
9,Swanna,5,False,False,False,False,True


## Effect Coding Scheme

In [29]:
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_effect_features = gen_onehot_features.iloc[:,:-1]


In [34]:
gen_effect_features

Unnamed: 0,1,2,3,4,5
0,True,False,False,False,False
1,False,False,False,True,False
2,False,True,False,False,False
3,False,False,False,False,False
4,False,True,False,False,False
...,...,...,...,...,...
795,False,False,False,False,True
796,False,False,False,False,False
797,True,False,False,False,False
798,False,True,False,False,False


In [32]:
x=np.all(gen_effect_features == 0, axis=1)

In [37]:
x

0      False
1      False
2      False
3       True
4      False
       ...  
795    False
796     True
797    False
798    False
799    False
Length: 800, dtype: bool

In [38]:
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Generation,1,2,3,4,5
4,Octillery,2,False,True,False,False,False
5,Helioptile,6,-1.0,-1.0,-1.0,-1.0,-1.0
6,Dialga,4,False,False,False,True,False
7,DeoxysDefense Forme,3,False,False,True,False,False
8,Rapidash,1,True,False,False,False,False
9,Swanna,5,False,False,False,False,True


## Feature Hashing scheme

In [11]:
unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)

Total game genres: 12
['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']


In [15]:
from sklearn.feature_extraction import FeatureHasher
vg_df['Genre'] = vg_df['Genre'].apply(lambda x: [x])
fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]

TypeError: feature names must be strings

In [39]:
# Initialize the FeatureHasher
fh = FeatureHasher(n_features=6, input_type='string')

# Apply FeatureHasher to the Genre column
hashed_features = fh.fit_transform(vg_df['Genre'])

# Convert the hashed features to an array
hashed_features = hashed_features.toarray()

TypeError: feature names must be strings

In [None]:
fh.get_params()

{'dtype': numpy.float64,
 'input_type': 'string',
 'n_features': 6,
 'non_negative': False}