In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv")

In [16]:
import datetime as dt

def timeit(f):
    def wrapper(df, *args, **kwargs):
        tic = dt.datetime.now()
        result = f(df, *args, **kwargs)
        toc = dt.datetime.now()
        print(f'{f.__name__} took {toc-tic}')
        return result
    return wrapper

# Feature engineering 1, extract deck

In [17]:
@timeit
def extract_deck(df):
    df['deck'] = df['cabin'].str[0]
    df['deck'].fillna('Z', inplace=True)
    return df

# Feature engineering 2, family size

In [18]:
@timeit
def calc_family_size(df):
    df['family_size'] = df['sibsp'] + df['parch'] + 1
    
    bins = [0, 1, 4, 100]
    group_names = ['singleton', 'small', 'large']
    df['family_size_cat'] = pd.cut(df['family_size'], bins, labels=group_names)
    return df

# Feature engineering 3, length of name

In [19]:
@timeit
def calc_name_length(df):
    df['name_length'] = df['name'].apply(lambda x: len(x))
    
    bins = [0, 20, 40, 57, 85]
    group_names = ['short', 'ok', 'good', 'long']
    df['name_length_cat'] = pd.cut(df['name_length'], bins, labels=group_names)
    return df

# Feature engineering 4, fill missing embarked

In [20]:
@timeit
def fillna_embarked(df):
    df['embarked'].fillna('S', inplace=True)
    return df

# Feature engineering 5, LabelEncoder

In [21]:
@timeit
def label_encode(df):
    labelEnc = LabelEncoder()

    cat_vars = ['embarked', 'sex', 'family_size_cat', 'name_length_cat', 'deck']
    for col in cat_vars:
        df[col] = labelEnc.fit_transform(df[col])
    return df

In [55]:
df_end = (df
          .pipe(extract_deck)
          .pipe(calc_family_size)
          .pipe(calc_name_length)
          .pipe(fillna_embarked)
          .pipe(label_encode))

extract_deck took 0:00:00
calc_family_size took 0:00:00.009251
calc_name_length took 0:00:00.002024
fillna_embarked took 0:00:00
label_encode took 0:00:00.001006


In [14]:
df_end

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,deck,family_size,family_size_cat,name_length,name_length_cat
0,1,1,"Allen, Miss. Elisabeth Walton",0,29.00,0,0,24160,211.3375,B5,2,2,,"St Louis, MO",1,1,1,29,2
1,1,1,"Allison, Master. Hudson Trevor",1,0.92,1,2,113781,151.5500,C22 C26,2,11,,"Montreal, PQ / Chesterville, ON",2,4,2,30,2
2,1,0,"Allison, Miss. Helen Loraine",0,2.00,1,2,113781,151.5500,C22 C26,2,,,"Montreal, PQ / Chesterville, ON",2,4,2,28,2
3,1,0,"Allison, Mr. Hudson Joshua Creighton",1,30.00,1,2,113781,151.5500,C22 C26,2,,135.0,"Montreal, PQ / Chesterville, ON",2,4,2,36,2
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.00,1,2,113781,151.5500,C22 C26,2,,,"Montreal, PQ / Chesterville, ON",2,4,2,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",0,14.50,1,0,2665,14.4542,,0,,328.0,,8,2,2,20,3
1305,3,0,"Zabour, Miss. Thamine",0,,1,0,2665,14.4542,,0,,,,8,2,2,21,2
1306,3,0,"Zakarian, Mr. Mapriededer",1,26.50,0,0,2656,7.2250,,0,,304.0,,8,1,1,25,2
1307,3,0,"Zakarian, Mr. Ortin",1,27.00,0,0,2670,7.2250,,0,,,,8,1,1,19,3
