# Intro to Skoot

This notebook accompanies the "Intro to Skoot" post. It was developed on a Python 3.5 kernel.

In [1]:
import pandas as pd
import sys
import skoot
# Record the interpreter and library versions this notebook was executed with.
v_inf = sys.version_info
python_version = (v_inf.major, v_inf.minor, v_inf.micro)

print("Pandas version: %r" % pd.__version__)
print("Python version: %r.%r.%r" % python_version)
print("Skoot version: %r" % skoot.__version__)

Pandas version: '0.19.2'
Python version: 3.5.3
Skoot version: '0.19.2-dev1'


## Read in our data

In [6]:
# The file is read with header=None, so the column names are supplied here,
# in file order.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target",
]

# NOTE(review): the encoded column names further down (e.g. "workclass_ ?")
# suggest the string fields keep a leading space after each comma. If that is
# unintended, `skipinitialspace=True` would strip it -- confirm before changing,
# since later cells may rely on the current values.
df = pd.read_csv("~/Downloads/adult.data.txt", header=None, names=column_names)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Split the label column off the feature frame. `pop` removes "target" from
# `df` in place, so an unguarded second run of this cell would raise KeyError.
# With the guard the cell can be executed repeatedly: on a re-run `df` is
# already label-free and `y` keeps the value from the first run.
if "target" in df.columns:
    y = df.pop("target")

In [14]:
# Remove "education-num" from the features.
# NOTE(review): it looks like a numeric code for "education" (every
# "Bachelors" row in the preview has 13) -- confirm that it is redundant.
# Rebinding `df` instead of using inplace=True, together with
# errors="ignore", keeps this cell idempotent: re-running it after the column
# is already gone is a no-op rather than a KeyError.
df = df.drop("education-num", axis=1, errors="ignore")

In [15]:
# Names of the string-like columns (object or category dtype), in frame order.
# These are the columns handed to the dummy encoder below.
categorical_frame = df.select_dtypes(include=["object", "category"])
object_cols = list(categorical_frame.columns)
object_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

## Dummy encode the categoricals


In [16]:
from skoot.preprocessing import DummyEncoder

# Dummy-encode only the string/categorical columns collected in `object_cols`;
# the numeric columns (age, fnlwgt, ...) are carried through, as the preview
# of `encoded` shows.
# NOTE(review): `drop_one_level=True` presumably omits one dummy column per
# encoded feature to avoid perfectly collinear columns -- confirm against the
# skoot documentation.
encoder = DummyEncoder(cols=object_cols, drop_one_level=True)
# Learn the encoding from `df` and apply it to `df` in a single call.
encoded = encoder.fit_transform(df)
# Preview the first rows of the encoded frame.
encoded.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam
0,39,77516,2174,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,50,83311,0,0,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,38,215646,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,53,234721,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,28,338409,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
def frame_to_markdown(frame, n_rows=5):
    """Render the first ``n_rows`` rows of ``frame`` as a Markdown table.

    Parameters
    ----------
    frame : pandas.DataFrame
        The frame to render. Column labels are converted with ``str``.
    n_rows : int, optional (default=5)
        Maximum number of data rows to include. Frames with fewer rows are
        rendered in full instead of raising ``IndexError``.

    Returns
    -------
    str
        The table: a header line, a left-aligned separator line, and one line
        per data row, joined by newlines.
    """
    header = [str(c) for c in frame.columns]

    # Left-aligned separator roughly as wide as each header cell. Always emit
    # at least one dash so that very short column names still produce a valid
    # separator cell.
    rule = [":" + "-" * max(len(h) - 1, 1) for h in header]

    rows = [header, rule]
    # Iterate with itertuples rather than `frame.iloc[i]`: a row Series has a
    # single dtype, so integer columns get upcast to float whenever float
    # columns are present (39 would print as "39.0"). itertuples keeps each
    # value in its own column's dtype, matching the DataFrame preview.
    for record in frame.head(n_rows).itertuples(index=False, name=None):
        rows.append([str(value) for value in record])

    return "\n".join("| " + " | ".join(cells) + " |" for cells in rows)


# Print the encoded preview as Markdown so it can be pasted into the post.
cols = encoded.columns.tolist()
print(frame_to_markdown(encoded, n_rows=5))

| age | fnlwgt | capital-gain | capital-loss | hours-per-week | workclass_ ? | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | education_ 10th | education_ 11th | education_ 12th | education_ 1st-4th | education_ 5th-6th | education_ 7th-8th | education_ 9th | education_ Assoc-acdm | education_ Assoc-voc | education_ Bachelors | education_ Doctorate | education_ HS-grad | education_ Masters | education_ Preschool | education_ Prof-school | marital-status_ Divorced | marital-status_ Married-AF-spouse | marital-status_ Married-civ-spouse | marital-status_ Married-spouse-absent | marital-status_ Never-married | marital-status_ Separated | occupation_ ? | occupation_ Adm-clerical | occupation_ Armed-Forces | occupation_ Craft-repair | occupation_ Exec-managerial | occupation_ Farming-fishing | occupation_ Handlers-cleaners | occupation_ Machine-op-inspct | occupatio