In [1]:
import pandas as pd

# Load the raw training file with pandas' default integer index and
# preview the first five rows as the cell's rich output.
df = pd.read_csv('train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Reload the training file, this time using PassengerId as the row index.
# This rebinds `df`, replacing the frame loaded in the previous cell.
df = pd.read_csv('train.csv', index_col='PassengerId')

# Split the label column (y) from the candidate feature columns (X).
y = df['Survived']
X = df.drop(columns=['Survived'])

# Keep only the features the model will be trained on, grouped by the
# kind of preprocessing each group needs later.
categorical_cols = ['Sex']
numerical_cols = ['Pclass', 'Age']
my_cols = [*categorical_cols, *numerical_cols]

# .copy() detaches X from df so later edits to X cannot touch df.
X = X.loc[:, my_cols].copy()
X.head()

Unnamed: 0_level_0,Sex,Pclass,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,male,3,22.0
2,female,1,38.0
3,female,3,26.0
4,female,1,35.0
5,male,3,35.0


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the literal 'unknown',
# then one-hot encode. handle_unknown='ignore' keeps transform() from
# raising on categories that were not present during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own transformer. Columns are
# selected by name using the lists defined in the previous cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Estimator: a 100-tree random forest regressor with a fixed seed so
# repeated runs give the same result.
# NOTE(review): Survived appears to be a 0/1 label in the data preview, so
# a classifier scored with accuracy/AUC may be a better fit than a
# regressor scored with MAE — confirm the continuous output is intended.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and the estimator so both are fitted together and
# the same transformations are applied again at prediction time.
titanic_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# 5-fold cross-validation. The 'neg_mean_absolute_error' scorer reports
# MAE with its sign flipped, so negate it to get ordinary MAE values.
scores = -cross_val_score(
    titanic_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

print("MAE scores:\n", scores)

# Fit the pipeline on the full training set.
titanic_pipeline.fit(X, y)

# Smoke-test the fitted pipeline on one hand-written passenger.
sample_passenger = pd.DataFrame(
    {'Sex': ['female'], 'Pclass': [3], 'Age': [22.0]}
)
prediction = titanic_pipeline.predict(sample_passenger)

print(prediction)

MAE scores:
 [0.27047696 0.2379243  0.24556605 0.24034027 0.23201374]
[0.63090657]


In [5]:
import pickle

# Serialize the fitted pipeline to disk; the context manager closes the
# file handle even if pickling raises.
with open("titanic_pipeline.pickle", mode="wb") as pickle_file:
    pickle.dump(titanic_pipeline, pickle_file)