# Get the data

In [4]:
import pandas as pd
df = pd.read_csv('../datasets/titanic_data.csv').dropna()

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


## Final pipeline

* http://pandas.pydata.org/pandas-docs/stable/ 
* http://scikit-learn.org/stable/index.html
* https://github.com/scikit-learn-contrib/sklearn-pandas 

In [6]:
import warnings
import sklearn.preprocessing as pp
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")

target = 'Survived'
mapper = DataFrameMapper([
    (['Age','Fare','SibSp'],pp.Imputer()),
    (['Embarked'], [CategoricalImputer(), pp.LabelEncoder(),pp.LabelBinarizer()]),
    (['Age','Fare'],pp.StandardScaler()),
    ],default=False)

pipe = Pipeline([
    ('featurize', mapper),
    ('lm', RandomForestClassifier())])

### Train

In [7]:
pipe.fit(df,df[target])

Pipeline(memory=None,
     steps=[('featurize', DataFrameMapper(default=False, df_out=False,
        features=[(['Age', 'Fare', 'SibSp'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (['Embarked'], [CategoricalImputer(copy=True, missing_values='NaN'), LabelEncoder(), LabelBinarizer(neg_label=...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

#### Predict

In [8]:
pipe.predict(df.head(10))

array([1, 1, 0, 1, 1, 1, 1, 0, 1, 0])

In [11]:
pipe.steps[-1][1]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)