# Used as the primary tool to preprocess the data for model training.

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sys
import yaml

In [5]:
#Path to project modules in src
sys.path.insert(1, '/home/jovyan/work/src/app')
from modules.data_pre import rand_age
from modules.data_pre import lin_model_age
from modules.data_pre import sto_lin_model_age
from modules.data_pre import most_common
from modules.data_pre import gender_data_transform

In [3]:
# Read the raw training data into a DataFrame.
# config_file is the YAML file that impune_dataset opens for writing when it
# saves the most common Embarked/Fare values.
data_path = '/home/jovyan/work/data/train.csv'
config_file = '/home/jovyan/work/src/app/assets/config.yaml'
df = pd.read_csv(data_path)

In [12]:
# Read the raw training data (same statements as the earlier load cell,
# repeated at the top of this cell - presumably so the cell can be re-run on
# its own; confirm before removing either copy).
# config_file is the YAML file that impune_dataset opens for writing.
data_path = '/home/jovyan/work/data/train.csv'
config_file = '/home/jovyan/work/src/app/assets/config.yaml'
df = pd.read_csv(data_path)


def preprocess_log(f):
    """Decorate a pipeline step so that its run time is printed after each call.

    Args:
        f: Callable taking the data frame as its first positional argument,
            followed by any extra positional/keyword arguments.

    Returns:
        A wrapper with the same call signature that invokes ``f``, prints
        ``"<name> took <elapsed>"`` and returns whatever ``f`` returned.
        The wrapper keeps ``f``'s ``__name__``/``__doc__`` metadata.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import cell.
    from functools import wraps

    @wraps(f)  # preserve the wrapped step's name and docstring
    def wrapper(dataf, *args, **kwargs):
        tic = dt.datetime.now()
        result = f(dataf, *args, **kwargs)
        toc = dt.datetime.now()
        # toc - tic is a timedelta, printed as H:MM:SS.ffffff
        print(f"{f.__name__} took {toc - tic}")
        return result
    return wrapper

@preprocess_log
def start_pipeline(dataf):
    """Begin the pipeline by returning a copy of ``dataf``.

    Later steps receive the copy, so the frame passed in here is not the
    object they modify.
    """
    working_copy = dataf.copy()
    return working_copy

@preprocess_log
def clean_dataset(dataf):
    """Drop unused columns and encode ``Sex`` and ``Embarked`` numerically.

    Args:
        dataf: DataFrame containing at least the columns ``Name``,
            ``Ticket``, ``Cabin``, ``Sex`` and ``Embarked``.

    Returns:
        A new DataFrame without ``Name``/``Ticket``/``Cabin``, with ``Sex``
        passed through ``gender_data_transform`` and ``Embarked`` mapped
        S -> 1, C -> 2, Q -> 3 (any other value, including missing ones,
        maps to ``None``).
    """
    # Drop non useful feature columns. ``columns=`` is used instead of a
    # positional axis argument, which pandas 2.x no longer accepts.
    drop_list = ['Name', 'Ticket', 'Cabin']
    dataf = dataf.drop(columns=drop_list)
    # Change gender to numeric (helper imported from modules.data_pre)
    dataf['Sex'] = dataf['Sex'].apply(gender_data_transform)
    # Change the embark to numeric; dict.get yields None for unknown codes
    embark = {'S': 1, 'C': 2, 'Q': 3}
    dataf['Embarked'] = dataf['Embarked'].apply(embark.get)
    return dataf
 
@preprocess_log
def remove_outliers(dataf):
    """Placeholder pipeline step: currently returns ``dataf`` unchanged."""
    # TODO: skipped for now, come back after the initial model build
    return dataf

@preprocess_log
def impune_dataset(dataf):
    """Add imputed variants of ``Embarked`` and ``Age`` and save the modes to YAML.

    Columns added to ``dataf`` (in this order, in place):
        ``Imp_Embarked_mf``  - ``Embarked`` passed through ``most_common``
                               with the most frequent ``Embarked`` value.
        ``Imp_Age_Mean``     - ``Age`` with missing values set to the mean.
        ``Imp_Age_Median``   - ``Age`` with missing values set to the median.
        ``Imp_Age_Rand``     - ``Age`` passed through ``rand_age`` with the
                               rounded minimum and the maximum age.
        ``Imp_Det_Lin``      - row-wise result of ``lin_model_age``.
        ``Imp_Sto_Lin``      - row-wise result of ``sto_lin_model_age``, given
                               the standard error of the mean of ``Age``.

    Side effect: the file at the module-level ``config_file`` path is opened
    in write mode and its content replaced with the most common ``Embarked``
    and ``Fare`` values.

    Args:
        dataf: DataFrame with ``Embarked``, ``Fare`` and ``Age`` columns.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    def _as_builtin(value):
        # NumPy scalars expose ``tolist`` (giving a plain Python value that
        # YAML can serialise); values without it are returned unchanged.
        return getattr(value, "tolist", lambda: value)()

    # Most common value for Embarked and Fare (computed once, reused below)
    most_common_embarked = _as_builtin(dataf['Embarked'].value_counts().idxmax())
    most_common_fare = _as_builtin(dataf['Fare'].value_counts().idxmax())

    # Impute the most common embark point
    dataf['Imp_Embarked_mf'] = dataf['Embarked'].apply(most_common, mf_value=most_common_embarked)

    # Assign the filled series directly: calling fillna(inplace=True) on a
    # column selection is chained assignment, which does not reliably write
    # back to the frame (and is a no-op under pandas Copy-on-Write).
    age = dataf['Age']
    dataf['Imp_Age_Mean'] = age.fillna(age.mean())
    dataf['Imp_Age_Median'] = age.fillna(age.median())

    dataf['Imp_Age_Rand'] = dataf['Age'].apply(rand_age, args=(round(dataf['Age'].min()), dataf['Age'].max()))
    # The row-wise helpers receive every column added so far, so the order of
    # the assignments above is kept as in the original.
    dataf['Imp_Det_Lin'] = dataf.apply(lin_model_age, axis=1)
    std_error = _as_builtin(dataf['Age'].sem(axis=0))
    # args must be a tuple; the single extra positional argument is std_error
    dataf['Imp_Sto_Lin'] = dataf.apply(sto_lin_model_age, args=(std_error,), axis=1)

    # Save YAML (mode 'w' replaces any existing content of config_file)
    config_dict = {'most_common_embarked': most_common_embarked, 'most_common_fare': most_common_fare}
    with open(config_file, 'w') as file:
        yaml.dump(config_dict, file)
    return dataf

# Run the preprocessing steps in order; start_pipeline copies df first, so
# the later steps work on that copy.
clean_df = (
    df.pipe(start_pipeline)
    .pipe(clean_dataset)
    .pipe(remove_outliers)
    .pipe(impune_dataset)
)
# Show the first rows of the result as the cell output.
clean_df.head()

start_pipeline took 0:00:00.000107
clean_dataset took 0:00:00.003993
remove_outliers took 0:00:00.000006
impune_dataset took 0:00:00.080690


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Imp_Embarked_mf,Imp_Age_Mean,Imp_Age_Median,Imp_Age_Rand,Imp_Det_Lin,Imp_Sto_Lin
0,1,0,3,1,22.0,1,0,7.25,1.0,1.0,22.0,22.0,22.0,22.0,22.0
1,2,1,1,0,38.0,1,0,71.2833,2.0,2.0,38.0,38.0,38.0,38.0,38.0
2,3,1,3,0,26.0,0,0,7.925,1.0,1.0,26.0,26.0,26.0,26.0,26.0
3,4,1,1,0,35.0,1,0,53.1,1.0,1.0,35.0,35.0,35.0,35.0,35.0
4,5,0,3,1,35.0,0,0,8.05,1.0,1.0,35.0,35.0,35.0,35.0,35.0


In [None]:
# Per column: True if at least one value is still missing after preprocessing.
clean_df.isnull().any()

In [None]:
# Write the preprocessed frame to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as an extra leading column without a header - confirm that whatever
# reads this file expects it (otherwise pass index=False).
save_data_path = '/home/jovyan/work/data/training_titanic_dataset.csv'
clean_df.to_csv(save_data_path)