<a href='https://ai.meng.duke.edu'><img align="left" style="padding-top:10px;" src="https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png"></a>

# Encoding categorical variables
In this example we are using data from a health insurance company regarding their customers and the annual medical expenses for each customer.  The objective is to develop a model which can predict the medical expenses of a customer based on demographic information about the customer.

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.formula.api import ols

import warnings
warnings.filterwarnings("ignore")

In [7]:
# Run this before any other code cell
# Fetches the csv data files into the current working directory
# (normally the folder that contains this notebook)

import urllib.request
from pathlib import Path
import os
path = Path()

# Map each local file name to the link it is downloaded from
files = {'insurance_modified.csv':'https://storage.googleapis.com/aipi_datasets/insurance_modified.csv'}

# Fetch every listed file that is not already on disk
for name, link in files.items():
    filename = path/name
    url = link
    if os.path.exists(filename):
        # Already present from an earlier run; skip the download
        continue
    urllib.request.urlretrieve(url, filename)

In [8]:
# Read the insurance data (downloaded by the cell above) into a DataFrame
data = pd.read_csv('insurance_modified.csv')
# Preview the first few rows
data.head()

Unnamed: 0,sex,age_group,bmi,children,smoker,region,charges
0,female,10-19,27.9,0,yes,southwest,16884.924
1,male,10-19,33.77,1,no,southeast,1725.5523
2,male,20-29,33.0,3,no,southeast,4449.462
3,male,30-39,22.705,0,no,northwest,21984.47061
4,male,30-39,28.88,0,no,northwest,3866.8552


In [9]:
# Separate the target column ('charges') from the predictor columns
y = data['charges']
X = data.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("Shape of X_train, y_train:",X_train.shape,y_train.shape)
print("Shape of X_test, y_test:",X_test.shape,y_test.shape)

Shape of X_train, y_train: (1070, 6) (1070,)
Shape of X_test, y_test: (268, 6) (268,)


## Encode training set
### Ordinal encode 'sex', 'smoker', 'age_group'

In [19]:
def ordinal_encode(X,cols):
    """Fit an ordinal encoder on selected columns of X and encode them.

    Args:
        X: DataFrame of training data. The columns listed in `cols` are
            overwritten in place with their encoded values.
        cols: list of column names to ordinal encode.

    Returns:
        Tuple `(X, encoder)`: the same DataFrame with `cols` encoded, and the
        fitted OrdinalEncoder, which can be reused to transform test data.
    """
    # Categories that were not seen during fitting are encoded as -1 at transform time
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    # Learn the category -> integer mapping from the training data and apply it
    encoded_values = encoder.fit_transform(X[cols])
    X[cols] = encoded_values
    return X, encoder

In [20]:
# The binary variables (sex, smoker) and the ordinal variable (age_group) are ordinal encoded
ordinal_cols = ['sex','smoker','age_group']

# Encode a copy, because ordinal_encode writes into the frame it is given
# and X_train should keep its raw values
X_train_encoded, ordinal_enc = ordinal_encode(X_train.copy(), ordinal_cols)
X_train_encoded.head()

Unnamed: 0,sex,age_group,bmi,children,smoker,region
621,1.0,2.0,34.1,4,1.0,southwest
194,1.0,0.0,34.43,0,0.0,southeast
240,0.0,1.0,36.67,2,1.0,northeast
1168,1.0,2.0,35.2,2,0.0,southwest
1192,0.0,4.0,32.395,1,0.0,northeast


### One-hot encode 'region' and 'children'

In [21]:
def onehot_encode(X,cols):
    """Fit a one-hot encoder on selected columns of X and replace them with indicator columns.

    Args:
        X: DataFrame of training data.
        cols: list of column names to one-hot encode.

    Returns:
        Tuple `(X, onehot_enc)`: a new DataFrame in which the columns in `cols`
        have been dropped and the one-hot columns appended, and the fitted
        OneHotEncoder, which can be reused to transform test data.
    """
    # Treat new categories as a new 'unknown' category (all onehot columns are 0)
    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    # Fit encoder on training data
    onehot_enc.fit(X[cols])
    # Get the names of the new columns created.  `get_feature_names` was removed
    # in scikit-learn 1.2, so prefer `get_feature_names_out` and only fall back
    # to the legacy method on old releases that lack it.
    if hasattr(onehot_enc, 'get_feature_names_out'):
        colnames = list(onehot_enc.get_feature_names_out(cols))
    else:
        colnames = list(onehot_enc.get_feature_names(input_features=cols))
    # Transform the data (the encoder returns a sparse matrix by default)
    onehot_vals = onehot_enc.transform(X[cols]).toarray()
    # Put transformed data into dataframe, keeping the original row index so concat aligns
    enc_df = pd.DataFrame(onehot_vals,columns=colnames,index=X.index)
    # Add onehot columns back onto original dataframe and drop the original columns
    X = pd.concat([X,enc_df],axis=1).drop(cols,axis=1)
    return X,onehot_enc

In [22]:
# Columns that are replaced by one indicator column per category
onehotcols = ['region', 'children']

# Fit the one-hot encoder on the training data and keep it for the test set
X_train_encoded, onehot_enc = onehot_encode(X_train_encoded, onehotcols)
X_train_encoded.head()

Unnamed: 0,sex,age_group,bmi,smoker,region_northeast,region_northwest,region_southeast,region_southwest,children_0,children_1,children_2,children_3,children_4,children_5
621,1.0,2.0,34.1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
194,1.0,0.0,34.43,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
240,0.0,1.0,36.67,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1168,1.0,2.0,35.2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1192,0.0,4.0,32.395,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Encode test set
### Your turn
Complete the function `encode_test_set()` below, which takes as input the test set, the list of ordinal columns `ordinal_cols`, the list of columns to one-hot encode `onehot_cols`, the fitted ordinal encoder and the fitted one-hot encoder.  The function should use the ordinal encoder to encode the data in the columns listed in `ordinal_cols` and use the one-hot encoder to encode the data in the columns listed in `onehot_cols`.  The function should then return the test dataset with the columns encoded.  Note: don't forget to drop the original columns listed in `onehot_cols` once you have one-hot encoded them.

In [23]:
def encode_test_set(X,ordinal_cols,onehot_cols,ordinal_encoder,onehot_encoder):
    """Encode a test set with encoders that were already fitted on the training set.

    Args:
        X: DataFrame of test data.  It is not modified; a copy is encoded.
        ordinal_cols: list of column names to ordinal encode.
        onehot_cols: list of column names to one-hot encode.
        ordinal_encoder: fitted encoder exposing `transform`.
        onehot_encoder: fitted encoder exposing `transform` and either
            `get_feature_names_out` or the legacy `get_feature_names`.

    Returns:
        A new DataFrame in which `ordinal_cols` hold their encoded values and
        `onehot_cols` have been replaced by the one-hot columns.
    """
    ### BEGIN SOLUTION ###

    # Work on a copy so the caller's dataframe is left untouched
    X = X.copy()

    # Apply ordinal encoder
    X[ordinal_cols] = ordinal_encoder.transform(X[ordinal_cols])

    # Apply onehot encoder.  `get_feature_names` was removed in scikit-learn 1.2,
    # so prefer `get_feature_names_out` and fall back only on old releases.
    if hasattr(onehot_encoder, 'get_feature_names_out'):
        colnames = list(onehot_encoder.get_feature_names_out(onehot_cols))
    else:
        colnames = list(onehot_encoder.get_feature_names(input_features=onehot_cols))
    onehot_vals = onehot_encoder.transform(X[onehot_cols])
    # Sparse results need converting; an encoder configured for dense output
    # already returns an array
    if hasattr(onehot_vals, 'toarray'):
        onehot_vals = onehot_vals.toarray()
    # Put transformed data into dataframe, keeping the original row index so concat aligns
    enc_df = pd.DataFrame(onehot_vals,columns=colnames,index=X.index)
    # Add onehot columns back onto original dataframe and drop the original columns
    X = pd.concat([X,enc_df],axis=1).drop(onehot_cols,axis=1)

    return X
    ### END SOLUTION ###

In [31]:
# Encode a copy of the test set with the encoders that were fitted on the training set
X_test_encoded = encode_test_set(X_test.copy(), ordinal_cols, onehotcols, ordinal_enc, onehot_enc)
display(X_test_encoded.head())

# Check the encoded test set has the expected number of rows and columns
assert X_test_encoded.shape==(268, 14)

Unnamed: 0,sex,age_group,bmi,smoker,region_northeast,region_northwest,region_southeast,region_southwest,children_0,children_1,children_2,children_3,children_4,children_5
578,1.0,4.0,30.2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
610,0.0,3.0,29.37,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
569,1.0,3.0,40.565,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1034,1.0,5.0,38.38,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
198,0.0,4.0,18.05,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Run model

In [33]:
# Fit a linear regression on the encoded training data
model = LinearRegression()
model.fit(X_train_encoded,y_train)
# Predict on the held-out test set and score those predictions
testpreds = model.predict(X_test_encoded)
r2 = r2_score(y_test,testpreds)
# r2 is computed from y_test, so report it as the test-set score
print("The model's R-squared value on the test set is {:.3f}".format(r2))

assert np.round(r2,2)==0.80

The model's R-squared value on the training set is 0.797
