# Label Encoder Recoding Example

#### By Matthew Gerardino
---

### We will start by exploring our data

In [1]:
# Read the car dataset from disk and preview its first five rows
import pandas as pd

df = pd.read_csv("../Data/CarData.csv")
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [2]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(11914, 16)

In [3]:
# Multiply rows by columns to get the total number of entries (cells).
# The dimensions are read from df.shape instead of being hard-coded, so the
# count stays correct if the dataset changes.
n_rows, n_cols = df.shape
total = n_rows * n_cols
print(f'There are {total} total entries')

There are 190624 total entries


In [4]:
# List the column labels of the DataFrame
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [5]:
# Show the distinct values that occur in the Make column
df['Make'].unique()

array(['BMW', 'Audi', 'FIAT', 'Mercedes-Benz', 'Chrysler', 'Nissan',
       'Volvo', 'Mazda', 'Mitsubishi', 'Ferrari', 'Alfa Romeo', 'Toyota',
       'McLaren', 'Maybach', 'Pontiac', 'Porsche', 'Saab', 'GMC',
       'Hyundai', 'Plymouth', 'Honda', 'Oldsmobile', 'Suzuki', 'Ford',
       'Cadillac', 'Kia', 'Bentley', 'Chevrolet', 'Dodge', 'Lamborghini',
       'Lincoln', 'Subaru', 'Volkswagen', 'Spyker', 'Buick', 'Acura',
       'Rolls-Royce', 'Maserati', 'Lexus', 'Aston Martin', 'Land Rover',
       'Lotus', 'Infiniti', 'Scion', 'Genesis', 'HUMMER', 'Tesla',
       'Bugatti'], dtype=object)

In [6]:
# Count how many distinct makes appear in the dataset
df["Make"].nunique()

48

In [7]:
# Distinct dtypes present across the columns; 'O' (object) is the dtype
# selected as "strings" for recoding later in the notebook
df.dtypes.unique()

array([dtype('O'), dtype('int64'), dtype('float64')], dtype=object)

---
## After exploring your data, you may find it necessary to recode categorical (string) columns as numbers
### For this I will use `LabelEncoder` from `sklearn.preprocessing`

In [8]:
# LabelEncoder maps each distinct value it is fitted on to an integer code
from sklearn.preprocessing import LabelEncoder
# Encoder instance used by the recode cell below
le=LabelEncoder()

In [9]:
# Select every object-dtype ("string") column; these are the ones to recode
list1 = df.select_dtypes('O')

# Names of the columns to recode
list_categorical = list(list1.columns)

# Time for the recode.
# Fit a separate encoder per column and keep it in `encoders`, so each
# column's integer codes can be mapped back to the original labels later via
# encoders[column].inverse_transform(...). Re-fitting one shared encoder for
# every column would retain only the last column's mapping.
# NOTE: this overwrites the string columns of df in place. Once it has run,
# re-running this cell finds no object columns and resets `encoders`; re-run
# from the data-loading cell to regenerate the mappings.
# NOTE(review): sklearn documents LabelEncoder as intended for target labels;
# the integer codes imply an ordering that the categories may not have —
# confirm this is acceptable for the downstream use.
encoders = {}
for column in list_categorical:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

In [10]:
# Preview the first rows again to view the recoded data
# (the columns processed above now hold the encoder's output)
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,4,1,2011,8,335.0,6.0,3,3,2.0,38,0,8,26,19,3916,46135
1,4,0,2011,8,300.0,6.0,3,3,2.0,67,0,6,28,19,3916,40650
2,4,0,2011,8,300.0,6.0,3,3,2.0,64,0,8,28,20,3916,36350
3,4,0,2011,8,230.0,6.0,3,3,2.0,67,0,8,28,18,3916,29450
4,4,0,2011,8,230.0,6.0,3,3,2.0,63,0,6,28,18,3916,34500


In [11]:
# Confirm that no object dtype remains among the column dtypes
df.dtypes.unique()

array([dtype('int32'), dtype('int64'), dtype('float64')], dtype=object)

In [12]:
# Print per-column dtypes and non-null counts as a further check that
# no object-dtype columns remain
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  int32  
 1   Model              11914 non-null  int32  
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11914 non-null  int32  
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  int32  
 7   Driven_Wheels      11914 non-null  int32  
 8   Number of Doors    11908 non-null  float64
 9   Market Category    11914 non-null  int32  
 10  Vehicle Size       11914 non-null  int32  
 11  Vehicle Style      11914 non-null  int32  
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int32(8