# Intro to dummy encoding with Skoot

This notebook accompanies the **"An intro to dummy encoding with Skoot"** post. It was developed on a Python 3.5 kernel.

In [1]:
import pandas as pd
import sys
import skoot
# Record the environment this notebook was executed in, so readers can
# compare it against their own kernel.
v_inf = sys.version_info
py_triplet = (v_inf.major, v_inf.minor, v_inf.micro)

print("Pandas version: %r" % pd.__version__)
print("Python version: %r.%r.%r" % py_triplet)
print("Skoot version: %r" % skoot.__version__)

Pandas version: '0.19.2'
Python version: 3.5.3
Skoot version: '0.19.2-dev1'


## Read in our data

In [2]:
# The raw file has no header row, so the column labels are supplied here
# in the same order as the fields appear on disk.
column_names = [
    "age", "workclass", "fnlwgt", "education",
    "education-num", "marital-status", "occupation",
    "relationship", "race", "sex", "capital-gain",
    "capital-loss", "hours-per-week", "native-country", "target",
]

# NOTE(review): encoded column names further down (e.g. "workclass_ ?")
# suggest the string values keep a leading space from the file; passing
# skipinitialspace=True would strip it -- confirm before changing.
df = pd.read_csv("~/Downloads/adult.data.txt", header=None, names=column_names)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Split the label off from the features: ``pop`` removes the "target"
# column from ``df`` in place and returns it, so ``df`` holds only
# predictors from here on.
y = df.pop("target")

In [4]:
# Remove the "education-num" column. Rebind to the returned frame rather
# than using inplace=True, so the frame displayed earlier is not silently
# mutated.
df = df.drop("education-num", axis=1)

## Split the data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state is pinned so the
# same split is produced on every run.
split_parts = train_test_split(df, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Gather the names of the string/categorical columns -- these are the ones
# handed to the dummy encoder below.
categorical_frame = X_train.select_dtypes(include=["object", "category"])
object_cols = list(categorical_frame.columns)
object_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

## Dummy encode the categoricals


In [7]:
from skoot.preprocessing import DummyEncoder

# Fit the encoder on the training split only and dummy-encode the columns
# listed in ``object_cols``; the fitted ``encoder`` is reused on other
# frames later in the notebook.
# NOTE(review): drop_one_level=True presumably omits one dummy column per
# encoded feature -- confirm against the skoot DummyEncoder documentation.
encoder = DummyEncoder(cols=object_cols, drop_one_level=True)
encoded = encoder.fit_transform(X_train)
encoded.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam
5514,33,198183,0,0,50,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19777,36,86459,0,1887,50,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10781,58,203039,0,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
32240,21,180190,0,0,46,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9876,27,279872,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Apply to the test set

In [8]:
# Apply the encoder that was fitted on the training split; nothing is
# re-fitted on the test data.
encoded_test = encoder.transform(X_test)
encoded_test.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam
14160,27,160178,0,0,38,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
27048,45,50567,0,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
28868,29,185908,0,0,55,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5667,30,190040,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7827,29,189346,2202,0,50,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## What if there is a factor level in the test set we've never seen?

In [9]:
# Pull out the first test row as a Series. Take an explicit copy because
# the row is modified in the next cell; this keeps it independent of
# ``X_test`` and avoids chained-assignment warnings.
test_row = X_test.iloc[0].copy()
test_row

age                           27
workclass                Private
fnlwgt                    160178
education           Some-college
marital-status          Divorced
occupation          Adm-clerical
relationship       Not-in-family
race                       White
sex                       Female
capital-gain                   0
capital-loss                   0
hours-per-week                38
native-country     United-States
Name: 14160, dtype: object

In [10]:
# Set a fake country that we'd never have seen before.
# ``Series.set_value`` was deprecated in pandas 0.21 and removed in 1.0;
# label-based scalar assignment through ``.at`` is the supported equivalent
# and also works on the older pandas this notebook was written against.
test_row.at['native-country'] = "Atlantis"
test_row

age                           27
workclass                Private
fnlwgt                    160178
education           Some-college
marital-status          Divorced
occupation          Adm-clerical
relationship       Not-in-family
race                       White
sex                       Female
capital-gain                   0
capital-loss                   0
hours-per-week                38
native-country          Atlantis
Name: 14160, dtype: object

In [12]:
# With its default settings the encoder tolerates the unseen level; it is
# only rejected when the encoder is configured with handle_unknown='error'.
single_row_frame = pd.DataFrame([test_row])
trans2 = encoder.transform(single_row_frame)
trans2

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam
14160,27,160178,0,0,38,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prove that no `native-country` dummy column was set

In [16]:
# Select every dummy column derived from "native-country" and check that
# none of them is set for the row with the unseen level.
nc_mask = trans2.columns.str.contains("native-country")
native_country_dummies = trans2.loc[:, nc_mask]
assert native_country_dummies.sum().sum() == 0