# 7 - Feature engineering - Titanic

Proceso previo al entrenamiento del modelo  el que se hace un análisis, limpieza y estructuración de los campos de los datos.
Puede incluir:
* Eliminación (registro entero, o feature entera) o sustitución de valores no conocidos 
* Escalado, centrado...
* Creación de características que agrupen varias, o extraerlas de datos no estructurados
* Análisis para eliminar las características menos importantes
* Eliminar _outliers_


In [1]:
%matplotlib inline  

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plots 

In [2]:
titanic = pd.read_csv("data/titanic.csv")

titanic.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



* Survived: 	Survived (1) or died (0)
* Pclass: 	Passenger’s class
* Name: 	Passenger’s name
* Sex: 	Passenger’s sex
* Age: 	Passenger’s age
* SibSp: 	Number of siblings/spouses aboard
* Parch: 	Number of parents/children aboard
* Ticket: 	Ticket number
* Fare: 	Fare
* Cabin: 	Cabin
* Embarked: 	Port of embarkation

# Feature engineering

In [4]:
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

In [5]:
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

In [6]:
titanic["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
titanic["Embarked"] = titanic["Embarked"].fillna("S")

titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2


In [8]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold


def lr_train (feat_predictor, p=True):
    # Initialize our algorithm class
    reg = LinearRegression()
    # Generate cross validation folds for the titanic dataset.  It return the row indices corresponding to train and test.
    # We set random_state to ensure we get the same splits every time we run this.
    kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
    predictions = []
    for train, test in kf:
        # The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
        train_predictors = (titanic[feat_predictor].iloc[train,:])
        # The target we're using to train the algorithm.
        train_target = titanic["Survived"].iloc[train]
        # Training the algorithm using the predictors and target.
        reg.fit(train_predictors, train_target)
        # We can now make predictions on the test fold
        test_predictions = reg.predict(titanic[feat_predictor].iloc[test,:])
        predictions.append(test_predictions)

    # The predictions are in three separate numpy arrays.  Concatenate them into one.  
    # We concatenate them on axis 0, as they only have one axis.
    predictions = np.concatenate(predictions, axis=0)
    # Map predictions to outcomes (only possible outcomes are 1 and 0)
    predictions[predictions > .5] = 1
    predictions[predictions <=.5] = 0
    aciertos = sum(predictions == titanic["Survived"])
    accuracy = sum(predictions == titanic["Survived"])  / len(predictions)
    if p:
        print ("aciertos %f  accuracy %f " %  (aciertos, accuracy) )
    else:
        return aciertos, accuracy

In [9]:
lr_train(["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"])

aciertos 698.000000  accuracy 0.783389 



## more features!

### extrayendo la categoría a partir de tratamiento 

In [10]:
import re

# A function to get the title from a name.
def get_title(name):
    # Use a regular expression to search for a title.  Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(pd.value_counts(titles))

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k,v in title_mapping.items():
    titles[titles == k] = v

# Verify that we converted everything.
pd.value_counts(titles)

# Add in the title column.
titanic["Title"] = titles

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Don           1
Capt          1
Sir           1
Jonkheer      1
Ms            1
Countess      1
Lady          1
Mme           1
Name: Name, dtype: int64


### Agrupando por familias

In [11]:
titanic["FamilySize"]= titanic.Parch + titanic.SibSp

import operator

# A dictionary mapping family name to id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family id
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            # Get the maximum id from the mapping and add one to it if we don't have an id
            current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]

# Get the family ids with the apply method
family_ids = titanic.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
family_ids[titanic["FamilySize"] < 3] = -1



titanic["FamilyId"] = family_ids

In [12]:
# Print the count of each unique id.
pd.value_counts(family_ids)

-1      800
 14       8
 149      7
 63       6
 50       6
 59       6
 17       5
 384      4
 27       4
 25       4
 162      4
 8        4
 84       4
 340      4
 43       3
 269      3
 58       3
 633      2
 167      2
 280      2
 510      2
 90       2
 83       1
 625      1
 376      1
 449      1
 498      1
 588      1
dtype: int64

In [18]:
lr_train( ["Pclass", "Sex", "Age", "FamilySize", "Title", "FamilyId", "Fare","Embarked"])
lr_train( ["Pclass", "Sex", "Age", "FamilySize", "Title", "FamilyId"])



aciertos 708.000000  accuracy 0.794613 
aciertos 714.000000  accuracy 0.801347 


In [14]:
titanic.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,FamilyId
PassengerId,1.0,-0.005007,-0.035144,0.034212,-0.057527,-0.001652,0.012658,-0.040143,0.089864
Survived,-0.005007,1.0,-0.338481,-0.06491,-0.035322,0.081629,0.257307,0.016639,0.082786
Pclass,-0.035144,-0.338481,1.0,-0.339898,0.083081,0.018443,-0.5495,0.065997,-0.010017
Age,0.034212,-0.06491,-0.339898,1.0,-0.233296,-0.172482,0.096688,-0.245619,-0.106827
SibSp,-0.057527,-0.035322,0.083081,-0.233296,1.0,0.414838,0.159651,0.890712,0.335019
Parch,-0.001652,0.081629,0.018443,-0.172482,0.414838,1.0,0.216225,0.783111,0.392454
Fare,0.012658,0.257307,-0.5495,0.096688,0.159651,0.216225,1.0,0.217138,0.124323
FamilySize,-0.040143,0.016639,0.065997,-0.245619,0.890712,0.783111,0.217138,1.0,0.425037
FamilyId,0.089864,0.082786,-0.010017,-0.106827,0.335019,0.392454,0.124323,0.425037,1.0


## Seleccionando características

In [15]:
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
from itertools import compress


full_feat=["Pclass", "Sex", "Age", "FamilySize", "Title", "FamilyId", "Fare","Embarked"]
# The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
X = titanic[full_feat]
# The target we're using to train the algorithm.
Y = titanic["Survived"]

# create a base classifier used to evaluate a subset of attributes
model = LinearRegression(normalize=True, copy_X=True)
#model = LogisticRegression()
# create the RFE model and select attributes
rfe = RFE(model)
rfe = rfe.fit(X, Y)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

select_feat = list(compress(full_feat, rfe.support_))
select_feat

[ True  True False False  True False False  True]
[1 1 3 2 1 5 4 1]


['Pclass', 'Sex', 'Title', 'Embarked']

In [16]:
lr_train( select_feat)

aciertos 698.000000  accuracy 0.783389 


In [17]:
import itertools
best_accuracy = 0
all_feat = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "FamilyId"]
for i in range(1,len(all_feat)+1):
    print (i)
    for combo in itertools.combinations(all_feat, i):
        
        _, accuracy = lr_train(list(combo), p=False)
        if (accuracy>best_accuracy):
            best_accuracy=accuracy
            best_feat=combo
print (best_feat, best_accuracy)

1
2
3
4
5
6
7
8
9
10
('Pclass', 'Sex', 'Age', 'SibSp', 'Title') 0.801346801347
