# Description of the project
This is my first practical Machine Learning project, based on Kaggle's Titanic competition (https://www.kaggle.com/c/titanic). The goal is to build a model that predicts the survival probability of each passenger.

Here I want to make better use of scikit-learn's `Pipeline` and `FeatureUnion` classes.

In [1]:
# Import the relevant packages
import pylab as py
import pandas as pd # Pandas handles statistical data
import sklearn # Machine learning package
import sklearn.linear_model
import sklearn.neural_network
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing
from sklearn.preprocessing import FunctionTransformer
import time,datetime
import numpy.random as random
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator,TransformerMixin
%matplotlib inline

# Load the training set and the test set from CSV into pandas DataFrames
data = pd.read_csv(filepath_or_buffer="data/train.csv")
data_test = pd.read_csv(filepath_or_buffer="data/test.csv")

In [2]:
# Replace missing 'Embarked' values with the most frequent value of that
# column (first entry of the mode) in the training data.
data['Embarked'] = data['Embarked'].fillna(
    value=data['Embarked'].mode().iloc[0]
)

In [3]:
# Print a summary of the training frame: per-column non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [None]:
"""
Encoding information about port of embarkment
"""
# Two-stage encoder: integer-code the labels, then binarize the codes.
processEmbarked = Pipeline(steps=[
    ('LabelEncoder', sklearn.preprocessing.LabelEncoder()),
    ('LabelBinarizer', sklearn.preprocessing.LabelBinarizer()),
])

# Fit every step by hand, chaining each step's output into the next one.
toFit = data['Embarked']
for label, process in processEmbarked.steps:
    toFit = process.fit_transform(toFit)

In [146]:
class chooseFeature(BaseEstimator, TransformerMixin):
    """Transformer that picks a single entry out of its input by label.

    ``transform`` returns ``X[self.label]``; ``fit`` learns nothing.
    """

    def __init__(self, label):
        # Key used to index the input handed to ``transform``.
        self.label = label

    def fit(self, *args, **kwargs):
        """Ignore all arguments and return ``self``; there is no state to learn."""
        return self

    def transform(self, X, *args, **kwargs):
        """Return ``X[self.label]``; any extra arguments are ignored."""
        selected = X[self.label]
        return selected

In [149]:
"""
Encoding information about port of embarkment
"""
# Column selection followed by the two-stage encoder (integer codes, then
# binarized codes).
processEmbarked = Pipeline(steps=[
    ('Extraction', chooseFeature(label='Embarked')),
    ('LabelEncoder', sklearn.preprocessing.LabelEncoder()),
    ('LabelBinarizer', sklearn.preprocessing.LabelBinarizer()),
])

# Start from the raw training column and fit only the steps after the
# extraction step, chaining each step's output into the next one.
toFit = data['Embarked']
for label, process in processEmbarked.steps[1:]:
    toFit = process.fit_transform(toFit)

In [152]:
"""
Encoding information about sex
"""
# Column selection followed by the two-stage encoder (integer codes, then
# binarized codes).
processSex = Pipeline(steps=[
    ('Extraction', chooseFeature(label='Sex')),
    ('LabelEncoder', sklearn.preprocessing.LabelEncoder()),
    ('LabelBinarizer', sklearn.preprocessing.LabelBinarizer()),
])

# Start from the raw training column and fit only the steps after the
# extraction step, chaining each step's output into the next one.
toFit = data['Sex']
for label, process in processSex.steps[1:]:
    toFit = process.fit_transform(toFit)

In [157]:
"""
Encoding information about class
"""
# Column selection followed by the two-stage encoder (integer codes, then
# binarized codes).
processClass = Pipeline(steps=[
    ('Extraction', chooseFeature(label='Pclass')),
    ('LabelEncoder', sklearn.preprocessing.LabelEncoder()),
    ('LabelBinarizer', sklearn.preprocessing.LabelBinarizer()),
])

# Start from the raw training column and fit only the steps after the
# extraction step, chaining each step's output into the next one.
toFit = data['Pclass']
for label, process in processClass.steps[1:]:
    toFit = process.fit_transform(toFit)

In [158]:
# Combine the three per-column encoding pipelines into one feature extractor.
features = FeatureUnion(transformer_list=[
    ('Sex', processSex),
    ('Class', processClass),
    ('Embarked', processEmbarked),
])

In [161]:
# Run the test set through the combined encoders and display the shape of
# the resulting feature matrix.
features.transform(data_test).shape

(418, 7)