# Description of the project
This is my first practical machine-learning project, based on Kaggle's Titanic competition (https://www.kaggle.com/c/titanic). The goal is to build a model that predicts each passenger's probability of survival.

In this notebook I want to make better use of scikit-learn's `Pipeline` and `FeatureUnion` classes.

In [1]:
# Import the relevant packages
import pylab as py
import pandas as pd # Pandas handles statistical data
import sklearn # Machine learning package
import sklearn.linear_model
import sklearn.neural_network
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing
import time,datetime
import numpy.random as random
from sklearn.pipeline import Pipeline,FeatureUnion,TransformerMixin
%matplotlib inline

# Load the training and test sets with pandas.
# The paths are relative to the directory the notebook is run from.
data = pd.read_csv("data/train.csv")
data_test = pd.read_csv("data/test.csv")

In [2]:
# Replace missing embarkation ports in the training set with the most
# frequent port (first entry of the mode).
mostCommonPort = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(mostCommonPort)

In [3]:
class addDummy(TransformerMixin):
    """Turn one categorical column into indicator (dummy) columns.

    The most frequent category of the data passed to ``transform`` is dropped
    so that it acts as the reference level.

    NOTE(review): the dropped category is recomputed on every ``transform``
    call, so two datasets with different modes yield different columns.

    Parameters
    ----------
    label : column name to read from ``X``; when None, ``X`` itself is
        treated as the categorical values.
    """
    def __init__(self,label=None):
        self.label = label

    def fit(self,*args):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self,X):
        """Return a DataFrame of dummy columns named ``<label>_<category>``.

        Without a label the columns are named after the category alone.
        """
        if self.label is not None:
            arr = pd.Series(X[self.label])
            prefix = str(self.label)+'_'
        else:
            arr = pd.Series(X)
            # No label to prefix with; the original concatenated None here
            # and raised TypeError.
            prefix = ''
        mode = arr.mode()
        dummies = pd.get_dummies(arr)
        # mode() is empty when there are no non-missing values; in that case
        # there is no reference category to drop.
        if len(mode) > 0:
            dummies = dummies.drop([mode[0]],axis=1)
        return dummies.rename(columns=lambda x:prefix+str(x))

    def get_params(self,*args,**kwargs):
        """Return the constructor parameters as a dict.

        scikit-learn (``clone``, ``Pipeline.get_params``) iterates over the
        items of this mapping, so it must be a dict rather than a bare value.
        Extra arguments such as ``deep`` are accepted and ignored.
        """
        return {'label': self.label}

    def fit_transform(self,X,*args):
        """Equivalent to ``transform(X)`` because ``fit`` learns nothing."""
        return self.transform(X)

class addFeature(TransformerMixin):
    """Select a single column of the input and pass it through unchanged.

    Parameters
    ----------
    label : name of the column to select.
    """
    def __init__(self,label):
        self.label = label

    def fit(self,*args):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self,X):
        """Return ``X[[label]]``, i.e. the column kept as a 2-D selection."""
        return X[[self.label]]

    def get_params(self,*args,**kwargs):
        """Return the constructor parameters as a dict.

        scikit-learn (``clone``, ``Pipeline.get_params``) iterates over the
        items of this mapping, so it must be a dict rather than a bare value.
        Extra arguments such as ``deep`` are accepted and ignored.
        """
        return {'label': self.label}

    def fit_transform(self,X,*args):
        """Equivalent to ``transform(X)`` because ``fit`` learns nothing."""
        return self.transform(X)

class applyFunction(TransformerMixin):
    """Derive one feature by applying a function to every row of the input.

    Parameters
    ----------
    function : callable receiving one row and returning the feature value.
    """
    def __init__(self,function):
        self.function = function

    def fit(self,*args):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self,X):
        """Apply ``function`` row-wise and return the results as a column.

        The result is reshaped to ``(number of rows, 1)``.
        """
        out = X.apply(self.function,axis=1)
        return out.values.reshape((len(out),1))

    def get_params(self,*args,**kwargs):
        """Return the constructor parameters as a dict.

        scikit-learn (``clone``, ``Pipeline.get_params``) iterates over the
        items of this mapping, so it must be a dict rather than a bare value.
        Extra arguments such as ``deep`` are accepted and ignored.
        """
        return {'function': self.function}

    def fit_transform(self,X,*args):
        """Equivalent to ``transform(X)`` because ``fit`` learns nothing."""
        return self.transform(X)
    

In [14]:
def SexClassClassifier(x):
    """Combine sex and passenger class into a code such as 'M1' or 'F3'.

    ``x`` is a row-like object indexed by 'Sex' and 'Pclass'. The code is the
    upper-cased first letter of the sex followed by the class number. None is
    returned when the sex is not 'male'/'female' or the class is not 1-3.
    """
    candidates = (
        known_sex[0].upper() + str(known_class)
        for known_sex in ('male', 'female')
        for known_class in (1, 2, 3)
        if x['Sex'] == known_sex and x['Pclass'] == known_class
    )
    # The first (and only possible) match wins; no match yields None.
    return next(candidates, None)
def AgeClassifier(x,childrenLimit=16,seniorLimit=48):
    """Bucket a passenger's age into 'child', 'adult', 'senior' or 'unknown'.

    Parameters
    ----------
    x : row-like object indexed by 'Age'.
    childrenLimit : ages strictly below this value are 'child'.
    seniorLimit : ages at or above this value are 'senior'.

    Returns
    -------
    'unknown' when the age is missing (NaN or None), otherwise the bucket name.
    """
    age = x['Age']
    # pd.isnull treats both NaN and None as missing; a plain isnan() raises
    # TypeError on None.
    if pd.isnull(age):
        return 'unknown'
    if age < childrenLimit:
        return 'child'
    if age >= seniorLimit:
        return 'senior'
    return 'adult'
def FamilyClassifierComplete(x):
    """Encode which kinds of relatives travel with the passenger.

    ``x`` is a row-like object indexed by 'SibSp' and 'Parch'.

    Returns 0 when both counts are zero, 1 when only 'Parch' is positive,
    2 when only 'SibSp' is positive and 3 when both are positive. Any other
    combination (e.g. a negative or NaN count) yields None.
    """
    siblings, parents = x['SibSp'], x['Parch']
    if siblings == 0:
        if parents == 0:
            return 0
        if parents > 0:
            return 1
    elif siblings > 0:
        if parents == 0:
            return 2
        if parents > 0:
            return 3
    return None
def FamilyClassifierSimplified(x):
    """Flag whether the passenger travels with any relatives.

    ``x`` is a row-like object indexed by 'SibSp' and 'Parch'.

    Returns 0 when both counts are zero and 1 when at least one is positive
    and neither is negative. Any other combination (e.g. a negative or NaN
    count) yields None.
    """
    siblings, parents = x['SibSp'], x['Parch']
    if siblings == 0 and parents == 0:
        return 0
    # Both counts must be zero or positive; "both zero" was handled above,
    # so at least one of them is positive here.
    counts_valid = (siblings == 0 or siblings > 0) and (parents == 0 or parents > 0)
    return 1 if counts_valid else None
    
# Step lists of the individual feature pipelines. Each one first derives a
# categorical value per passenger ('toLabel') and finally expands it into
# indicator columns ('toDummies'); string categories are mapped to integers
# in between ('toEncode').
# NOTE(review): LabelEncoder and LabelBinarizer are documented as target
# transformers whose fit/transform take a single array - confirm they behave
# as intended when used as pipeline steps.
sexClassSteps = [
    ('toLabel', applyFunction(SexClassClassifier)),
    ('toEncode', sklearn.preprocessing.LabelEncoder()),
    ('toDummies', sklearn.preprocessing.LabelBinarizer()),
]
ageGroupSteps = [
    ('toLabel', applyFunction(AgeClassifier)),
    ('toEncode', sklearn.preprocessing.LabelEncoder()),
    ('toDummies', sklearn.preprocessing.LabelBinarizer()),
]
embarkedSteps = [
    ('toLabel', addFeature('Embarked')),
    ('toEncode', sklearn.preprocessing.LabelEncoder()),
    ('toDummies', sklearn.preprocessing.LabelBinarizer()),
]
# The family code is already an integer, so no 'toEncode' step is needed.
familyCompleteSteps = [
    ('toLabel', applyFunction(FamilyClassifierComplete)),
    ('toDummies', sklearn.preprocessing.LabelBinarizer()),
]

# Concatenate the outputs of all feature pipelines side by side.
featureChoice = FeatureUnion([
    ('SexClass', Pipeline(sexClassSteps)),
    ('Agegroup', Pipeline(ageGroupSteps)),
    ('Embarked', Pipeline(embarkedSteps)),
    ('FamilyComplete', Pipeline(familyCompleteSteps)),
])

In [15]:
# Names of the feature pipelines currently present in the union.
transformLabels = [x[0] for x in featureChoice.transformer_list]

# Known categories per feature: (feature name, labels for the 'toEncode'
# step or None when the feature is already an integer code, number of classes
# for the 'toDummies' step or None to use len(labels)).
knownCategories = [
    ('SexClass', ['F1','F2','F3','M1','M2','M3'], None),
    ('Agegroup', ['child','adult','senior','unknown'], None),
    ('Embarked', data['Embarked'].unique(), None),
    ('FamilySimplified', None, 2),
    ('FamilyComplete', None, 4),
]

# Pre-fit the encoding steps of every feature that is part of the union so
# that the full set of categories is known regardless of which ones occur in
# a particular dataset. Features that are not in the union are skipped; errors
# raised by the fit calls themselves are no longer silently swallowed.
for name, labels, nClasses in knownCategories:
    if name not in transformLabels:
        continue
    index = transformLabels.index(name)
    steps = featureChoice.transformer_list[index][1].named_steps
    if labels is not None:
        steps['toEncode'].fit(labels)
        nClasses = len(labels)
    # The binarizer sees the integer codes 0 .. nClasses-1.
    steps['toDummies'].fit(range(nClasses))

In [16]:
# Fit the binarizer of the first feature pipeline (SexClass) directly on the
# string codes.
# NOTE(review): this replaces any earlier fit of the same step, e.g. one on
# integer codes - confirm which input the 'toDummies' step really receives.
sexClassDummies = featureChoice.transformer_list[0][1].named_steps['toDummies']
labels = ['F1','F2','F3','M1','M2','M3']
sexClassDummies.fit(labels)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [17]:
# Build the feature matrix for the training data. Only transform() is called
# here, so every step has to be fitted already.
d=featureChoice.transform(data)

  y = column_or_1d(y, warn=True)
  mask |= (ar1 == a)


In [None]:
# Inspect the dimensions of the transformed feature matrix.
d.shape

In [None]:
# Display the transformed feature matrix itself.
d

In [None]:
# Full model: feature extraction followed by a logistic-regression classifier.
classifier = sklearn.linear_model.LogisticRegression()
pipe = Pipeline([('features', featureChoice), ('model', classifier)])

In [None]:
# Train the pipeline. The final estimator is a supervised classifier, so the
# target must be passed alongside the features.
# NOTE(review): assumes the training CSV has the Kaggle 'Survived' column -
# confirm. Also verify that every intermediate step accepts the (X, y) call
# that Pipeline.fit makes.
pipe.fit(data, data['Survived'])

In [None]:
# Scratch check: shape of a flattened array built from range(10).
py.array(range(10)).ravel().shape