# Preprocessing

Prototype the class and functions that will handle all the preprocessing.  
Prototyping interactively is very helpful when working out preprocessing steps.  
This code will then be saved and extended in the deployment folder as a proper `.py` module.  

In [2]:
import sklearn
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer, RobustScaler

% matplotlib inline
sns.set_style("darkgrid")
warnings.filterwarnings('ignore')
plt.rcParams["figure.figsize"] = (10, 5)

### 1. Loading Data

In [3]:
# Read the training CSV (relative path) and preview the first rows.
train_path = "../data/train.csv"
raw = pd.read_csv(train_path)
raw.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2. Prototyping

In [23]:
def _cabin_floor(df):
    return df["cabin"].str[0].fillna("")


def _names(df):
    return df["name"].str.split(",").apply(lambda x: x[0])


def _fare(df):
    return np.log10(df["fare"] + 1)


class Preprocessor:
    """
    Feature preprocessing with a scikit-learn style fit/transform interface.

    ``fit`` learns the label encodings, a per-name lookup table and the
    scaler parameters from one DataFrame; ``transform`` applies them to
    another.  The public methods accept frames with or without a "survived"
    column: when present it is set aside before processing and appended,
    unscaled, to any DataFrame result.

    Input columns read: cabin, sex, name, fare, embarked, pclass, age,
    sibsp, parch.

    NOTE(review): the cabin and sex encoders are LabelEncoders, which are
    expected to reject values not seen during fit -- confirm how unseen
    categories should be handled before deployment.
    """

    def __init__(self):
        self.cabin_floor = LabelEncoder()
        self.sex = LabelEncoder()
        self.embarked = LabelBinarizer()
        # Lookup table with columns (name, fare, count); built by _fit().
        self.name_feats_gb = None
        # Unused; kept so code that reads this attribute keeps working.
        self.name_feats = None
        self.scaler = RobustScaler()

    def _fit(self, df: pd.DataFrame):
        """Fit the encoders and build the per-name lookup table from df."""
        self.cabin_floor.fit(_cabin_floor(df))
        self.sex.fit(df["sex"])

        # Per name key: mean transformed fare and log of the number of rows.
        by_name = _fare(df).groupby(_names(df))
        feats = by_name.mean().to_frame()
        feats["count"] = np.log(by_name.size())
        self.name_feats_gb = feats.reset_index()

        # Missing embarkation values are treated as their own class "X".
        self.embarked = LabelBinarizer().fit(df["embarked"].fillna("X"))

        return self

    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the unscaled feature frame for df from the fitted state."""
        res = pd.DataFrame(index=df.index)
        # +1 keeps the encoded class 0 finite under the log.
        res["cabin_floor"] = np.log(self.cabin_floor.transform(_cabin_floor(df)) + 1)
        res["pclass"] = df["pclass"]
        res["sex"] = self.sex.transform(df["sex"])
        res["age"] = df["age"]
        res["fare"] = _fare(df)
        res[["sibsp", "parch"]] = np.log(df[["sibsp", "parch"]] + 1)
        # Flag missing ages before they are replaced by 0.
        res["missing_age"] = res["age"].isnull().astype(int)
        res["age"] = res["age"].fillna(0)

        # Left join keeps one row per input row, in input order; names that
        # are absent from the lookup table get 0 for both features.
        name_feats = pd.merge(_names(df).to_frame(), self.name_feats_gb,
                              on="name", how="left")
        name_feats = name_feats.set_index(df.index)
        res["name_mean_fare"] = name_feats["fare"].fillna(0)
        res["name_count"] = name_feats["count"].fillna(0)

        # Name the indicator columns after the encoder's actual output width
        # rather than assuming exactly four classes were present at fit time.
        onehot = self.embarked.transform(df["embarked"].fillna("X"))
        cols = ["embarked_" + str(i) for i in range(onehot.shape[1])]
        embarked = pd.DataFrame(onehot, columns=cols, index=df.index)
        return pd.concat([res, embarked], axis=1)

    def _label_preproc(member):
        """
        Decorator that deals with an optional "survived" column.

        The wrapped method receives the frame without "survived".  If the
        method returns a DataFrame, the original "survived" column (when there
        was one) is appended to it; any other return value is passed through.
        """
        import functools

        @functools.wraps(member)  # keep the wrapped method's name/docstring
        def wrapper(self, df):
            survived = None
            if "survived" in df.columns:
                survived = df[["survived"]]
                df = df.drop("survived", axis=1)
            res = member(self, df)
            if survived is not None and isinstance(res, pd.DataFrame):
                res = pd.concat([res, survived], axis=1)
            return res

        return wrapper

    @_label_preproc
    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the scaled feature frame for df using the fitted state."""
        res = self._transform(df)
        scaled = self.scaler.transform(res)
        return pd.DataFrame(scaled, columns=res.columns, index=df.index)

    @_label_preproc
    def fit(self, df: pd.DataFrame):
        """Fit the encoders, lookup table and scaler on df; return self."""
        self._fit(df)
        self.scaler.fit(self._transform(df))
        return self

    @_label_preproc
    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit on df and return its scaled feature frame."""
        self._fit(df)
        res = self._transform(df)
        scaled = self.scaler.fit_transform(res)
        return pd.DataFrame(scaled, columns=res.columns, index=df.index)
        
        

### 3. Testing
The idea is to make sure the preprocessor can be fit on one dataset (train) and then transform another (val/test/prod).

In [24]:
# Fit on the full frame, then transform that same frame (labels present).

pp = Preprocessor().fit(raw)
pp.transform(raw).tail()

Unnamed: 0,cabin_floor,pclass,sex,age,fare,sibsp,parch,missing_age,name_mean_fare,name_count,embarked_0,embarked_1,embarked_2,embarked_3,survived
886,0.0,-1.0,0.0,0.103448,-0.077295,0.0,0.0,0.0,-0.081532,0.0,0.0,0.0,0.0,0.0,0
887,1.098612,-2.0,-1.0,-0.172414,0.544463,0.0,0.0,0.0,1.412871,1.584963,0.0,0.0,0.0,0.0,1
888,0.0,0.0,-1.0,-0.827586,0.358813,1.0,1.098612,1.0,0.365107,1.0,0.0,0.0,0.0,0.0,0
889,1.386294,-2.0,0.0,0.068966,0.544463,0.0,0.0,0.0,0.55524,0.0,1.0,0.0,-1.0,0.0,1
890,0.0,0.0,0.0,0.275862,-0.444911,0.0,0.0,0.0,-0.458025,0.0,0.0,1.0,-1.0,0.0,0


In [25]:
# Fit on the full frame, then transform it with the "survived" column removed.

pp = Preprocessor().fit(raw)
unlabelled = raw.drop("survived", axis=1)
pp.transform(unlabelled).tail()

Unnamed: 0,cabin_floor,pclass,sex,age,fare,sibsp,parch,missing_age,name_mean_fare,name_count,embarked_0,embarked_1,embarked_2,embarked_3
886,0.0,-1.0,0.0,0.103448,-0.077295,0.0,0.0,0.0,-0.081532,0.0,0.0,0.0,0.0,0.0
887,1.098612,-2.0,-1.0,-0.172414,0.544463,0.0,0.0,0.0,1.412871,1.584963,0.0,0.0,0.0,0.0
888,0.0,0.0,-1.0,-0.827586,0.358813,1.0,1.098612,1.0,0.365107,1.0,0.0,0.0,0.0,0.0
889,1.386294,-2.0,0.0,0.068966,0.544463,0.0,0.0,0.0,0.55524,0.0,1.0,0.0,-1.0,0.0
890,0.0,0.0,0.0,0.275862,-0.444911,0.0,0.0,0.0,-0.458025,0.0,0.0,1.0,-1.0,0.0


In [26]:
# Single-step fit_transform on the full frame (labels present).

pp = Preprocessor()
transformed = pp.fit_transform(raw)
transformed.tail()

Unnamed: 0,cabin_floor,pclass,sex,age,fare,sibsp,parch,missing_age,name_mean_fare,name_count,embarked_0,embarked_1,embarked_2,embarked_3,survived
886,0.0,-1.0,0.0,0.103448,-0.077295,0.0,0.0,0.0,-0.081532,0.0,0.0,0.0,0.0,0.0,0
887,1.098612,-2.0,-1.0,-0.172414,0.544463,0.0,0.0,0.0,1.412871,1.584963,0.0,0.0,0.0,0.0,1
888,0.0,0.0,-1.0,-0.827586,0.358813,1.0,1.098612,1.0,0.365107,1.0,0.0,0.0,0.0,0.0,0
889,1.386294,-2.0,0.0,0.068966,0.544463,0.0,0.0,0.0,0.55524,0.0,1.0,0.0,-1.0,0.0,1
890,0.0,0.0,0.0,0.275862,-0.444911,0.0,0.0,0.0,-0.458025,0.0,0.0,1.0,-1.0,0.0,0


In [27]:
# Single-step fit_transform on the full frame with "survived" removed.

pp = Preprocessor()
unlabelled = raw.drop("survived", axis=1)
pp.fit_transform(unlabelled).tail()

Unnamed: 0,cabin_floor,pclass,sex,age,fare,sibsp,parch,missing_age,name_mean_fare,name_count,embarked_0,embarked_1,embarked_2,embarked_3
886,0.0,-1.0,0.0,0.103448,-0.077295,0.0,0.0,0.0,-0.081532,0.0,0.0,0.0,0.0,0.0
887,1.098612,-2.0,-1.0,-0.172414,0.544463,0.0,0.0,0.0,1.412871,1.584963,0.0,0.0,0.0,0.0
888,0.0,0.0,-1.0,-0.827586,0.358813,1.0,1.098612,1.0,0.365107,1.0,0.0,0.0,0.0,0.0
889,1.386294,-2.0,0.0,0.068966,0.544463,0.0,0.0,0.0,0.55524,0.0,1.0,0.0,-1.0,0.0
890,0.0,0.0,0.0,0.275862,-0.444911,0.0,0.0,0.0,-0.458025,0.0,0.0,1.0,-1.0,0.0


In [29]:
# Fit on the first half of the rows, then transform the second half.

midpoint = len(raw) // 2
pp = Preprocessor()
pp.fit(raw.iloc[:midpoint])
pp.transform(raw.iloc[midpoint:]).tail()

Unnamed: 0,cabin_floor,pclass,sex,age,fare,sibsp,parch,missing_age,name_mean_fare,name_count,embarked_0,embarked_1,embarked_2,embarked_3,survived
886,0.0,-1.0,0.0,0.103448,-0.079223,0.0,0.0,0.0,-2.246586,0.0,0.0,0.0,0.0,0.0,0
887,1.098612,-2.0,-1.0,-0.172414,0.55804,0.0,0.0,0.0,1.833004,1.0,0.0,0.0,0.0,0.0,1
888,0.0,0.0,-1.0,-0.827586,0.367761,1.0,1.098612,1.0,-2.246586,0.0,0.0,0.0,0.0,0.0,0
889,1.386294,-2.0,0.0,0.068966,0.55804,0.0,0.0,0.0,-2.246586,0.0,1.0,0.0,-1.0,0.0,1
890,0.0,0.0,0.0,0.275862,-0.456006,0.0,0.0,0.0,-2.246586,0.0,0.0,1.0,-1.0,0.0,0


In [28]:
# Fit on the first half of the rows, then transform the second half with
# the "survived" column removed.

midpoint = len(raw) // 2
pp = Preprocessor()
pp.fit(raw.iloc[:midpoint])
second_half = raw.iloc[midpoint:].drop("survived", axis=1)
pp.transform(second_half).tail()

Unnamed: 0,cabin_floor,pclass,sex,age,fare,sibsp,parch,missing_age,name_mean_fare,name_count,embarked_0,embarked_1,embarked_2,embarked_3
886,0.0,-1.0,0.0,0.103448,-0.079223,0.0,0.0,0.0,-2.246586,0.0,0.0,0.0,0.0,0.0
887,1.098612,-2.0,-1.0,-0.172414,0.55804,0.0,0.0,0.0,1.833004,1.0,0.0,0.0,0.0,0.0
888,0.0,0.0,-1.0,-0.827586,0.367761,1.0,1.098612,1.0,-2.246586,0.0,0.0,0.0,0.0,0.0
889,1.386294,-2.0,0.0,0.068966,0.55804,0.0,0.0,0.0,-2.246586,0.0,1.0,0.0,-1.0,0.0
890,0.0,0.0,0.0,0.275862,-0.456006,0.0,0.0,0.0,-2.246586,0.0,0.0,1.0,-1.0,0.0
