In [None]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Read the Titanic passenger spreadsheet into a DataFrame and preview it
df_train = pd.read_excel(io='Resources/titanic_full.xls')
df_train.head(5)

In [None]:
# Discard the columns that are not used as model inputs
df_train = df_train.drop(
    columns=["name", "ticket", "cabin", "home.dest", "body", "boat", "embarked"]
)
df_train.head(5)

In [None]:
# Relabel the numeric passenger class (1/2/3) with string labels
pclass_labels = {1: '1st', 2: '2nd', 3: '3rd'}
df_train['pclass'] = df_train['pclass'].replace(pclass_labels)

# Combine the two relative-count columns into a single 'family' total,
# then remove the originals since the total replaces them
df_train['family'] = df_train['sibsp'].add(df_train['parch'])
df_train = df_train.drop(columns=['sibsp', 'parch'])
df_train.head(5)

In [None]:
# Determine if any fare points are outside of the 1.5*IQR range (outliers)
# Missing fares are excluded first: np.quantile returns NaN when its input
# contains NaN, which would turn both bounds into NaN and flag nothing.
amounts = df_train['fare'].dropna().sort_values()
quartiles = np.quantile(amounts, [.25, .75])
iqr = quartiles[1] - quartiles[0]
lower_bound = quartiles[0] - (1.5 * iqr)
upper_bound = quartiles[1] + (1.5 * iqr)

# Keep the outlying fares themselves (as a list) rather than building a list
# purely for the side effect of printing
outlier_mask = (amounts < lower_bound) | (amounts > upper_bound)
potential_outliers = amounts[outlier_mask].tolist()
for amt in potential_outliers:
    print(amt)

In [None]:
# Remove every row that has at least one missing value
df_train = df_train.dropna(axis=0, how='any')

In [None]:
# One-hot encode the categorical columns so every feature is numeric
df_train = pd.get_dummies(data=df_train)

In [None]:
# Separate the feature matrix from the 'survived' target, then hold out a test split
X = df_train.drop(columns="survived")
y = df_train["survived"].values

# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.head(5)

In [None]:
# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression

In [None]:
# Fit a logistic regression model and print its accuracy on both splits.
# NOTE(review): this model is fitted and scored on the unscaled X_train / X_test
# even though X_train_scaled / X_test_scaled are available — confirm this is intended.
classifier = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

## KNN

In [None]:
# Loop through different k values to find which has the highest accuracy.
# Note: We use only odd numbers because we don't want any ties.
k_values = range(1, 20, 2)  # shared by the sweep and the plot's x-axis
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so label each line and add a legend
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Refit KNN at the chosen k and report its accuracy on the test split.
# NOTE(review): the previous comment cited k: 5 while the code used 11;
# 11 is kept here — confirm against the sweep plot which value is intended.
best_k = 11  # single definition so the model and the printed label cannot drift apart
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
print('k=%d Test Acc: %.3f' % (best_k, knn.score(X_test_scaled, y_test)))

## DNN

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes
# for each layer.
# The input width is read from the scaled training matrix rather than hard-coded,
# so the network stays in sync if the feature columns change.
number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=40, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="tanh"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Output layer: one sigmoid unit for the binary 'survived' target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure the network for binary classification, tracking accuracy
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 100 epochs; the returned
# object is kept in fit_model
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
)

In [None]:
# Score the trained network on the held-out scaled test data
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## Random Forest

In [None]:
# Rebuild the Titanic train/test split for the random forest.
# The features and target are taken from df_train rather than from synthetic
# make_classification data, so the forest and the scaler that is pickled
# afterwards are fitted on the passenger data instead of random samples.
y = df_train["survived"].values
X = df_train.drop("survived", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a 500-tree random forest on the scaled training data and print its
# accuracy on the training and test splits
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
# Print the forest's per-feature importances and draw them as a bar chart,
# one bar per feature index
features = clf.feature_importances_
print(features)
bar_positions = range(len(features))
plt.bar(bar_positions, features)
plt.show()

## Saving the Model and Scaler

In [None]:
import pickle

# Persist the fitted random forest; the `with` block closes the file handle
# once the dump has finished
with open('model_randomforrest_2022080848.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)