In [1]:
import pandas as pd
import sklearn as sklearn
import numpy as np
from sklearn.pipeline import Pipeline
from pandas.core.frame import DataFrame
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
class Pipe:
    def __init__(self, funcs, **kwargs):
        self.funcs = funcs
        self.kwargs = kwargs
    
    def transform(self, df:DataFrame) -> DataFrame:
        for f in self.funcs:
            df = f(df, **self.kwargs)
            
        return df.copy()

In [4]:
def split_test_train(df:DataFrame):
    test, train = df[df['ind'].eq('test')], df[df['ind'].eq('train')]
    test.drop(['ind'])
    train.drop(['ind'])
    return test, train
    
def combine_test_train(test:DataFrame, train:DataFrame):
    combine = pd.concat([test.assign(ind='test'), train.assign(ind='train')])
    return combine

In [5]:
df_train = pd.read_csv('Titanic/Train.csv')
df_test = pd.read_csv('Titanic/Test.csv')
df_combine = combine_test_train(df_test, df_train)

In [6]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
name_dict = {
    'Mr': 'ordinary',
    'Mrs': 'ordinary',
    'Miss': 'ordinary',
    'Master': 'child',
    'Don': 'aristocracy',
    'Dona': 'aristocracy',
    'Rev': 'aristocracy',
    'Dr': 'doctor',
    'Mme': 'ordinary',
    'Ms' : 'ordinary',
    'Major' : 'military',
    'Lady' : 'ordinary',
    'Sir' : 'ordinary',
    'Mlle' : 'ordinary',
    'Col': 'military',
    'Capt': 'military',
    'Countess' : 'aristocracy',
    'Jonkheer': 'military'
}

def name_to_appeal(name:str):
    parts = name.split(' ')
    for p in parts:
        if p.endswith('.'):
            return p[0:len(p)-1]
    
    return ''

def appeal_to_category(appeal:str):
    return name_dict[appeal]

def process_appeal(df:DataFrame, **kwargs) -> DataFrame:
    df['Appeal'] = df['Name'].apply(name_to_appeal).apply(appeal_to_category)
    df['Appeal'].astype('category')
    return pd.get_dummies(df, columns=['Appeal'])

In [8]:
def get_empty_cols(df:DataFrame):
    return list(df.isnull().sum()[lambda x: x > 0].index)

In [9]:
has_husband = {
    'Mr': 0,
    'Mrs': 1,
    'Miss': 0,
    'Master': 0,
    'Don': 0,
    'Dona': 1,
    'Rev': 0,
    'Dr': 0,
    'Mme': 0,
    'Ms' : 0,
    'Major' : 0,
    'Lady' : 0,
    'Sir' : 0,
    'Mlle' : 0,
    'Col': 0,
    'Capt': 0,
    'Countess' : 0,
    'Jonkheer': 0
}

def name_to_appeal(name:str):
    parts = name.split(' ')
    for p in parts:
        if p.endswith('.'):
            return p[0:len(p)-1]
    
    return ''

def appeal_to_husband(appeal:str):
    return has_husband[appeal]

def process_husband(df:DataFrame, **kwargs) -> DataFrame:
    df['HasHusband'] = df['Name'].apply(name_to_appeal).apply(appeal_to_husband)
    return df

In [10]:
def process_sex(df:DataFrame, **kwargs) -> DataFrame:
    df['Sex_bin'] = df['Sex'] == 'male'
    df['Sex_bin'] = df['Sex_bin'].replace({True: 1, False: 0})
    return df

In [11]:
def process_family_size(df:DataFrame, **kwargs) -> DataFrame:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = df['FamilySize'] == 1
    df['IsAlone'] = df['IsAlone'].replace({True:1, False: 0})
    return df

In [12]:
def process_deck(df:DataFrame, **kwargs) -> DataFrame:
    df['Deck'] = df['Cabin'].apply(lambda x: str(x)[0].capitalize())
    df = pd.get_dummies(df, columns = ['Deck'])
    return df

In [13]:
def process_deck_all(df:DataFrame, **kwargs) -> DataFrame:    
    
    # get all possible values from combined ds
    combined_ds = kwargs['combine']
    
    combined_ds['Deck'] = combined_ds['Cabin'].apply(lambda x: str(x)[0].capitalize())
    df['Deck'] = df['Cabin'].apply(lambda x: str(x)[0].capitalize())
    
    decks = list(combined_ds['Deck'].unique())
    decks_str = ['Deck_{0}'.format(d) for d in decks]

    for i in range(len(decks)):
        df[decks_str[i]] = df['Deck'] == decks[i]
        
    for i in range(len(decks)):
        df[decks_str[i]] = df[decks_str[i]].replace({True: 1, False: 0})
    
    return df

In [14]:
def process_missing(df:DataFrame, **kwargs) -> DataFrame:
    df['Age_missing'] = df['Age'].isnull().astype('int8')
    df['Cabin_missing'] = df['Cabin'].isnull().astype('int8')
    df['Fare_missing'] = df['Fare'].isnull().astype('int8')
    df['Embarked_missing'] = df['Embarked'].isnull().astype('int8')
    return df

In [15]:
def delete_not_needed(df:DataFrame, **kwargs) -> DataFrame:
    to_delete = ['Name', 'Sex', 'Ticket', 'Cabin', 'Deck', 'Embarked', 'Survived', 'PassengerId']
    for d in to_delete:
        if d in df.columns:
            df = df.drop(columns=[d])
    return df

In [16]:
def generate_math_attributes(df:DataFrame, **kwargs) -> DataFrame:
    numeric_cols = list(df._get_numeric_data().columns)
    numeric_cols = list(set(numeric_cols) - set(['Survived']))
    numerics = df[numeric_cols]
    for i in range(0, numerics.columns.size-1):
        for j in range(0, numerics.columns.size-1):
            col1 = str(numerics.columns.values[i])
            col2 = str(numerics.columns.values[j])
            # multiply fields together (we allow values to be squared)
            if i <= j:
                name = col1 + "*" + col2
                df = pd.concat([df, pd.Series(numerics.iloc[:,i] * numerics.iloc[:,j], name=name)], axis=1)
            # add fields together
            if i < j:
                name = col1 + "+" + col2
                df = pd.concat([df, pd.Series(numerics.iloc[:,i] + numerics.iloc[:,j], name=name)], axis=1)
            # divide and subtract fields from each other
            if not i == j:
                name = col1 + "/" + col2
                df = pd.concat([df, pd.Series(numerics.iloc[:,i] / numerics.iloc[:,j], name=name)], axis=1)
                name = col1 + "-" + col2
                df = pd.concat([df, pd.Series(numerics.iloc[:,i] - numerics.iloc[:,j], name=name)], axis=1)
                
    if (kwargs['verbose']): print('Math attrs generated. Shape: {0}'.format(df.shape))
    return df

In [17]:
def reduce_mem_usage(df:DataFrame, **kwargs) -> DataFrame:
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if kwargs['verbose']: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    
    return df

In [18]:
def remove_correlated_features(df:DataFrame, **kwargs) -> DataFrame:
    
    # correlation matrix with a given method
    df_corr = df.corr(method=kwargs['correlation'])
    
    mask = np.ones(df_corr.columns.size) - np.eye(df_corr.columns.size)
    df_corr = mask * df_corr
    
    drops = []
    # loop through each variable
    for col in df_corr.columns.values:
        # if we've already determined to drop the current variable, continue
        if np.in1d([col],drops):
            continue

    # find all the variables that are highly correlated with the current variable 
    # and add them to the drop list 
    corr = df_corr[abs(df_corr[col]) > kwargs['correlation_coef']].index
    drops = np.union1d(drops, corr)
    
    if kwargs['verbose']: print('Removing correlated columns {0}'.format(drops))
        
    df = df.drop(drops, axis=1)
    return df

In [19]:
def bin_ages(df:DataFrame, **kwargs) -> DataFrame:
    age_binned = pd.cut(df['Age'], bins=[0, 10, 22, 35, 50, 70, 100], labels = [0, 1, 2, 3, 4, 5])
    df['Age_binned'] = age_binned
    df['Age_binned'] = pd.factorize(df['Age_binned'])[0].astype('int8')
    return df

In [20]:
def bin_fares(df:DataFrame, **kwargs) -> DataFrame:
    fares_binned = pd.qcut(df['Fare'], q=4, labels=[1, 2, 3, 4])
    df['Fare_binned'] = fares_binned
    df['Fare_binned'] = pd.factorize(df['Fare_binned'])[0].astype('int8')
    return df

In [21]:
def impute_with_random_forest_many_col(df:DataFrame, **kwargs) -> DataFrame:
    
    col_names = kwargs['impute_col_names']
    initial_cols = list(df.columns)
    
    for col_name in col_names:
        
        empty_col_names = get_empty_cols(df)
        cols_names_to_impute = list(set(empty_col_names) - {col_name})
        if (kwargs['verbose']==True): print('Deleting: {0}. Imputing: {1}'.format(cols_names_to_impute, col_name))
        
        # save temp
        temp = df[cols_names_to_impute]

        # clear dataset 
        df = df.drop(columns=cols_names_to_impute)
        
        known = df.loc[ df[col_name].notnull() ]
        unknown = df.loc[ df[col_name].isnull() ]
        
        # nothing to predict
        if (len(unknown) == 0): continue
        
        column_index = list(df.columns).index(col_name)

        all_indices = [i for i in range(unknown.shape[1])]
        diff = list(set(all_indices) - {column_index})

        y = known.values[:, column_index]
        X = known.values[:, diff]

        # predict
        rtr = RandomForestRegressor(n_estimators=100, n_jobs=-1)
        rtr.fit(X, y)
        predicted = rtr.predict(unknown.values[:, diff])

        # fill missings
        df.loc[ (df[col_name].isnull()), col_name ] = predicted
        
        # restore dataset
        df[cols_names_to_impute] = temp
    
    # reorder columns back
    df = df.reindex(columns = initial_cols)
    
    return df

In [22]:
def impute_with_random_forest_single_col(df:DataFrame, **kwargs) -> DataFrame:
    
    col_name = kwargs['impute_col_name']   
   
    known = df.loc[ df[col_name].notnull() ]
    unknown = df.loc[ df[col_name].isnull() ]

    # no unknown values
    if (len(unknown) == 0): return df

    column_index = list(df.columns).index(col_name)

    all_indices = [i for i in range(unknown.shape[1])]
    diff = list(set(all_indices) - {column_index})

    y = known.values[:, column_index]
    X = known.values[:, diff]

    # predict
    rtr = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    rtr.fit(X, y)
    predicted = rtr.predict(unknown.values[:, diff])

    # fill missings
    df.loc[ (df[col_name].isnull()), col_name ] = predicted
    
    return df

In [23]:
def print_empty_values(df:DataFrame) -> DataFrame:
    col_names_with_na = list(df.isna().sum()[lambda x: x > 0].index)
    col_names_with_empty = list(df.isnull().sum()[lambda x: x > 0].index)
    print('Columns with NA: {0}'.format(col_names_with_na))
    print('Columns with empty {0}'.format(col_names_with_empty))
    return df

In [24]:
def final_processing_sklearn(df:DataFrame):
    
    p = Pipeline([
        ('appeal', FunctionTransformer(process_appeal)),
        ('husband', FunctionTransformer(process_husband)),
        ('sex', FunctionTransformer(process_sex)),
        ('family size', FunctionTransformer(process_family_size)),
        ('deck', FunctionTransformer(process_deck)),
        ('missings', FunctionTransformer(process_missing)),
        ('math', FunctionTransformer(generate_math_attributes))
    ])

    r = p.fit_transform(df)
    return r

In [25]:
def final_processing_manual(df:DataFrame)->DataFrame:
    
    p = Pipe([
        reduce_mem_usage,
        process_appeal, 
        process_husband, 
        process_sex, 
        process_family_size, 
        process_deck_all,
        process_missing,
        delete_not_needed,
        impute_with_random_forest_many_col,
        bin_ages,
        bin_fares        
        ],
        
        correlation = 'spearman',
        correlation_coef = 0.95, 
        impute_col_names = ['Age', 'Fare'],
        combine = df_combine,
        verbose = True
    )

    r = p.transform(df)
    return r

In [26]:
df_test = final_processing_manual(df_test)
print_empty_values(df_test)
df_test.head()

Mem. usage decreased to  0.02 Mb (44.2% reduction)
Deleting: ['Fare']. Imputing: Age
Deleting: []. Imputing: Fare
Columns with NA: []
Columns with empty []


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Appeal_aristocracy,Appeal_child,Appeal_doctor,Appeal_military,Appeal_ordinary,HasHusband,Sex_bin,FamilySize,IsAlone,Deck_N,Deck_B,Deck_E,Deck_A,Deck_C,Deck_D,Deck_F,Deck_G,Deck_T,Age_missing,Cabin_missing,Fare_missing,Embarked_missing,Age_binned,Fare_binned
0,3,34.5,0,0,7.828125,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,3,47.0,1,0,7.0,0,0,0,0,1,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2,2,62.0,0,0,9.6875,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,2,1
3,3,27.0,0,0,8.664062,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1
4,3,22.0,1,1,12.289062,0,0,0,0,1,1,0,3,0,1,0,0,0,0,0,0,0,0,0,1,0,0,3,1


In [27]:
df_train = final_processing_manual(df_train)
print_empty_values(df_train)
df_train.head()

Mem. usage decreased to  0.04 Mb (47.8% reduction)
Deleting: []. Imputing: Age
Deleting: []. Imputing: Fare
Columns with NA: []
Columns with empty []


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Appeal_aristocracy,Appeal_child,Appeal_doctor,Appeal_military,Appeal_ordinary,HasHusband,Sex_bin,FamilySize,IsAlone,Deck_N,Deck_B,Deck_E,Deck_A,Deck_C,Deck_D,Deck_F,Deck_G,Deck_T,Age_missing,Cabin_missing,Fare_missing,Embarked_missing,Age_binned,Fare_binned
0,3,22.0,1,0,7.25,0,0,0,0,1,0,1,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,1,38.0,1,0,71.3125,0,0,0,0,1,1,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
2,3,26.0,0,0,7.925781,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,2,2
3,1,35.0,1,0,53.09375,0,0,0,0,1,1,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,1
4,3,35.0,0,0,8.046875,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,2,2


In [28]:
list(df_train.columns) == list(df_test.columns)

True

In [29]:
set(list(df_train.columns)) == set(list(df_test.columns))

True

In [30]:
len(list(df_train.columns)) == len(list(df_test.columns))

True