# Convert Categorical values to numeric 
1. Pandas - Categorical (modifies dataframe)
2. Pandas - astype
3. Pandas - factorize
4. sklearn - LabelEncoder

#### There are three columns with categorical data - sex, smoker and region. We will look at different methods to convert them to numeric columns

## Import Library


In [54]:
# Libraries used throughout the notebook.
import pandas as pd
import numpy as np
# A bare `LabelEncoder` expression raises NameError on a fresh kernel;
# import it explicitly so the sklearn section can rely on it.
from sklearn.preprocessing import LabelEncoder
print(" Library imported")

 Library imported


## Load Data

In [55]:
# Read the insurance dataset from the working directory; the last
# expression previews its first five rows.
df_insurance = pd.read_csv("insurance.csv")
df_insurance.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Pandas - Categorical
1. Use the pandas Categorical function to change the data type to Categorical, and then
2. use cat.codes to retrieve the numeric values
3. pd.Categorical will sort categories alphabetically and then assign numeric values starting at 0

In [56]:
# Work on a copy so the original frame stays untouched. The copy's
# `smoker` column is converted to the Categorical dtype in place, and the
# integer code of each category is stored in a new `smoker_code` column.
df = df_insurance.copy()
df["smoker"] = pd.Categorical(df["smoker"])
df["smoker_code"] = df["smoker"].cat.codes
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,smoker_code
0,19,female,27.9,0,yes,southwest,16884.924,1
1,18,male,33.77,1,no,southeast,1725.5523,0
2,28,male,33.0,3,no,southeast,4449.462,0
3,33,male,22.705,0,no,northwest,21984.47061,0
4,32,male,28.88,0,no,northwest,3866.8552,0


### Pandas - Categorical - Lambda single Column


In [57]:
# Lambda applied to a single column: the one-column frame is passed through
# `apply`, which hands each column to pd.Categorical and keeps its codes.
df = df_insurance.copy()
smoker_codes = df[["smoker"]].apply(lambda column: pd.Categorical(column).codes)
df[["smoker_code"]] = smoker_codes
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,smoker_code
0,19,female,27.900,0,yes,southwest,16884.92400,1
1,18,male,33.770,1,no,southeast,1725.55230,0
2,28,male,33.000,3,no,southeast,4449.46200,0
3,33,male,22.705,0,no,northwest,21984.47061,0
4,32,male,28.880,0,no,northwest,3866.85520,0
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,0
1334,18,female,31.920,0,no,northeast,2205.98080,0
1335,18,female,36.850,0,no,southeast,1629.83350,0
1336,21,female,25.800,0,no,southwest,2007.94500,0


### Pandas - Categorical - Lambda Multiple Column


In [58]:
# Lambda applied to a list of columns: each source column is encoded
# independently and written to the matching "<name>_code" column.
df = df_insurance.copy()
source_columns = ["smoker", "region"]
code_columns = ["smoker_code", "region_code"]
df[code_columns] = df[source_columns].apply(
    lambda column: pd.Categorical(column).codes
)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,smoker_code,region_code
0,19,female,27.9,0,yes,southwest,16884.924,1,3
1,18,male,33.77,1,no,southeast,1725.5523,0,2
2,28,male,33.0,3,no,southeast,4449.462,0,2
3,33,male,22.705,0,no,northwest,21984.47061,0,1
4,32,male,28.88,0,no,northwest,3866.8552,0,1


### Pandas - Categorical - Lambda Retaining nan values


In [59]:
# Keep NaN values: pd.Categorical encodes missing entries as -1, so map
# that sentinel back to NaN after encoding.
df = df_insurance.copy()
region_codes = df[["region"]].apply(lambda column: pd.Categorical(column).codes)
df[["region_code"]] = region_codes.replace(-1, np.nan)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region_code
0,19,female,27.9,0,yes,southwest,16884.924,3
1,18,male,33.77,1,no,southeast,1725.5523,2
2,28,male,33.0,3,no,southeast,4449.462,2
3,33,male,22.705,0,no,northwest,21984.47061,1
4,32,male,28.88,0,no,northwest,3866.8552,1


## Pandas - astype

In [60]:
# Cast the column to the category dtype, then read the integer codes.
# Nothing is assigned back, so `df` itself is left unchanged.
df = df_insurance.copy()
smoker_as_category = df["smoker"].astype("category")
smoker_as_category.cat.codes


0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    1
Length: 1338, dtype: int8

## Pandas - factorize
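1. Use the pandas factorize function to convert category values to numeric codes
2. Optionally sort the category values first by passing sort=True
3. By default, pd.factorize will NOT sort categories alphabetically; codes follow the order of first appearance.
4. pd.factorize will start numbering at 0 by default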


In [61]:
# Default behaviour: codes are assigned in order of first appearance,
# starting at 0. factorize returns (codes, uniques); only codes are stored.
df = df_insurance.copy()
region_codes, region_labels = pd.factorize(df["region"])
df["regioncode"] = region_codes
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,regioncode
0,19,female,27.9,0,yes,southwest,16884.924,0
1,18,male,33.77,1,no,southeast,1725.5523,1
2,28,male,33.0,3,no,southeast,4449.462,1
3,33,male,22.705,0,no,northwest,21984.47061,2
4,32,male,28.88,0,no,northwest,3866.8552,2


In [62]:
# sort alphabetically - sort=True (the next cell also adds 1 so numbering starts at 1)

In [63]:
# sort=True orders the categories alphabetically before numbering them;
# adding 1 shifts the codes so they start at 1 instead of 0.
df = df_insurance.copy()
sorted_codes = pd.factorize(df["region"], sort=True)[0]
df["regioncode"] = sorted_codes + 1
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,regioncode
0,19,female,27.9,0,yes,southwest,16884.924,4
1,18,male,33.77,1,no,southeast,1725.5523,3
2,28,male,33.0,3,no,southeast,4449.462,3
3,33,male,22.705,0,no,northwest,21984.47061,2
4,32,male,28.88,0,no,northwest,3866.8552,2


In [64]:
# Start numbering at 1 instead of 0

In [65]:
# Codes follow order of first appearance (no sorting); adding 1 makes the
# numbering start at 1 instead of 0.
df = df_insurance.copy().assign(
    regioncode=lambda frame: pd.factorize(frame["region"])[0] + 1
)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,regioncode
0,19,female,27.9,0,yes,southwest,16884.924,1
1,18,male,33.77,1,no,southeast,1725.5523,2
2,28,male,33.0,3,no,southeast,4449.462,2
3,33,male,22.705,0,no,northwest,21984.47061,3
4,32,male,28.88,0,no,northwest,3866.8552,3


## sklearn - LabelEncoder

In [66]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of region labels first, then replace the text column with
# the encoded integers. This overwrites `region` on the copy.
df = df_insurance.copy()
labelencoder = LabelEncoder().fit(df["region"])
df["region"] = labelencoder.transform(df["region"])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,3,16884.924
1,18,male,33.77,1,no,2,1725.5523
2,28,male,33.0,3,no,2,4449.462
3,33,male,22.705,0,no,1,21984.47061
4,32,male,28.88,0,no,1,3866.8552
