# Preprocess the Data

In [1]:
import pandas as pd

In [2]:
# Load the passenger table; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Encoder instance; it is fitted on the 'Sex' column in a later cell to turn
# the text labels into integer codes.
encoder = LabelEncoder()

In [6]:
# Work on an independent copy so the raw frame `df` is left untouched by the
# encoding applied to `df2`.
df2 = df.copy(deep=True)

In [7]:
# Replace the text labels in 'Sex' with integer codes (the preview that
# follows shows female -> 0, male -> 1).
sex_codes = encoder.fit_transform(df2['Sex'])
df2['Sex'] = sex_codes

In [8]:
# After encoding, 'Sex' holds integers: male = 1, female = 0.
df2.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,1,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,0,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,0,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,1,35.0,0,0,8.05


In [9]:
# Target vector: the 'Survived' column as a plain array.
y = df2.loc[:, 'Survived'].values

In [10]:
# Feature matrix: every column except the target ('Survived') and the
# free-text 'Name' column.
X = df2.drop(columns=['Survived', 'Name'])

In [11]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


In [12]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation. The split size is left at the
# library default; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [13]:
# Show the shape of each split, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(665, 6)
(222, 6)
(665,)
(222,)


In [14]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [15]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so no test-set statistics are used.
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

# Build a Model

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Baseline: a single decision tree trained on the scaled training split.
# random_state is pinned (same seed as the train/test split) because tree
# fitting is randomized; without it the metrics can change between runs.
model = DecisionTreeClassifier(random_state=78)
model.fit(X_train_scaled, y_train)
# Predictions on the held-out split, used for the evaluation report.
y_pred = model.predict(X_test_scaled)

In [18]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the predictions on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.88      0.82       127
           1       0.80      0.64      0.71        95

    accuracy                           0.78       222
   macro avg       0.78      0.76      0.77       222
weighted avg       0.78      0.78      0.77       222



# Compare Models

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [20]:
# Candidate classifiers, all evaluated on the same train/test split.
logreg = LogisticRegression(solver='lbfgs',
                            max_iter=200,
                            random_state=1)
svc = SVC(kernel='linear')
# Seeded like the random forest so the comparison is repeatable run to run;
# an unseeded tree can produce different metrics on every execution.
tree = DecisionTreeClassifier(random_state=78)
forest = RandomForestClassifier(n_estimators=128, random_state=78)

In [21]:
# Order here is the order in which the reports are printed.
ml_models = [
    logreg,
    svc,
    tree,
    forest,
]

In [22]:
# Fit every candidate on the scaled training data and print its report on the
# held-out test split.
for ml_model in ml_models:
    # Alias only: the estimator object stored in ml_models is the one fitted.
    temp_model = ml_model
    temp_y_pred = temp_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    temp_report = classification_report(y_test, temp_y_pred)
    print('----------------', ml_model, temp_report, sep='\n')

----------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

           0       0.79      0.91      0.84       127
           1       0.84      0.67      0.75        95

    accuracy                           0.81       222
   macro avg       0.81      0.79      0.80       222
weighted avg       0.81      0.81      0.80       222

----------------
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0    

# Hyperparameter Tuning

In [23]:
# Sweep the number of trees in the random forest and print a report for each
# setting.
# NOTE(review): the reports are computed on the test split, so choosing
# n_estimators from them reuses the evaluation data for model selection;
# consider cross-validating on the training split instead.
# NOTE(review): this loop rebinds `model`, `y_pred` and `report`, which were
# assigned by the earlier decision-tree cells.
estimators = [10, 20, 50, 100, 150, 200, 250, 500, 1000]
for estimator in estimators:
    model = RandomForestClassifier(n_estimators=estimator, random_state=78)
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    report = classification_report(y_test, y_pred)
    print('----------------', estimator, report, sep='\n')

----------------
10
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       127
           1       0.81      0.65      0.72        95

    accuracy                           0.78       222
   macro avg       0.79      0.77      0.77       222
weighted avg       0.79      0.78      0.78       222

----------------
20
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       127
           1       0.85      0.65      0.74        95

    accuracy                           0.80       222
   macro avg       0.81      0.78      0.79       222
weighted avg       0.81      0.80      0.80       222

----------------
50
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       127
           1       0.82      0.66      0.73        95

    accuracy                           0.79       222
   macro avg       0.80      0.78      0.78       222
weighted avg  