# Building Predictive Model

In [1]:
import pandas as pd
import numpy as np
import os

## Import Data

In [2]:
# Processed CSVs live in ../data/processed relative to the notebook's working directory.
processed_data_path = os.path.join(os.pardir, "data", "processed")
# Derive both split paths from the same directory in one pass.
train_file_path, test_file_path = (
    os.path.join(processed_data_path, file_name)
    for file_name in ("train.csv", "test.csv")
)

In [3]:
# Read the train and test splits (in that order), indexing each frame by PassengerId.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (train_file_path, test_file_path)
)

In [4]:
# Structural summary of the training frame: index range, columns, non-null counts, dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null float64
Deck_B                891 non-null float64
Deck_C                891 non-null float64
Deck_D                891 non-null float64
Deck_E                891 non-null float64
Deck_F                891 non-null float64
Deck_G                891 non-null float64
Deck_Z                891 non-null float64
Pclass_1              891 non-null float64
Pclass_2              891 non-null float64
Pclass_3              891 non-null float64
Title_Lady            891 non-null float64
Title_Master          891 non-null float64
Title_Miss            891 non-null float64
Title_Mr              891 non-null float64
Title_Mrs             891 non-null float6

In [5]:
# Structural summary of the test frame: index range, columns, non-null counts, dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null float64
Deck_B                418 non-null float64
Deck_C                418 non-null float64
Deck_D                418 non-null float64
Deck_E                418 non-null float64
Deck_F                418 non-null float64
Deck_G                418 non-null float64
Deck_Z                418 non-null float64
Pclass_1              418 non-null float64
Pclass_2              418 non-null float64
Pclass_3              418 non-null float64
Title_Lady            418 non-null float64
Title_Master          418 non-null float64
Title_Miss            418 non-null float64
Title_Mr              418 non-null float64
Title_Mrs             418 non-null flo

## Data Preparation

In [6]:
# Feature matrix: every column from the first one through "AgeState_Child"
# (label-based .loc slices include the end label), cast to float.
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23 and
# removed in pandas 1.0; it yields the same NumPy array and exists in all pandas versions.
# NOTE(review): this assumes the target column is ordered after "AgeState_Child" in the
# processed CSV, so it is excluded from X -- confirm against the preprocessing step.
X = train_df.loc[:, :"AgeState_Child"].values.astype("float")
# Target vector as a 1-D NumPy array. A single column's `.values` is already flat, so
# the Series.ravel() call (deprecated in recent pandas releases) is not needed.
Y = train_df["Survived"].values

In [7]:
# Sanity check: the feature matrix and the target vector should have the same row count.
print(X.shape, Y.shape)

(891, 32) (891,)


In [12]:
# train_test_split lives in sklearn.model_selection, the module introduced in
# scikit-learn 0.18 as the replacement for sklearn.cross_validation.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Show the resulting split sizes: features first, then labels.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(712, 32) (179, 32)
(712,) (179,)


In [13]:
import sklearn

In [14]:
# Display the installed scikit-learn version, recorded here for reproducibility.
sklearn.__version__

'0.19.1'

## Building the baseline model

In [15]:
from sklearn.dummy import DummyClassifier

In [16]:
# Baseline classifier configured with the "most_frequent" strategy and a fixed seed of 0.
dummy_model = DummyClassifier(strategy="most_frequent", random_state=0)

In [18]:
# Fit the baseline on the training split only; the held-out split is kept for scoring.
dummy_model.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [19]:
# Report the baseline's .score() on the held-out split, formatted to two decimals.
print("Score for baseline model = {0:.2f}".format(dummy_model.score(X_test, y_test)))

Score for baseline model = 0.61


In [21]:
# Performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [22]:
# Accuracy of the baseline's held-out predictions, computed as accuracy_score(y_true, y_pred).
print("Accuracy for baseline model = {0:.2f}".format(accuracy_score(y_test, dummy_model.predict(X_test))))

Accuracy for baseline model = 0.61
