In [None]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Read the Titanic spreadsheet into a DataFrame and preview its first 50 rows
df_train = pd.read_excel(io='Resources/titanic_full.xls')
df_train.head(n=50)

In [None]:
# Work on an isolated copy of the cabin column so the edits below never touch df_train.
cabin_only = df_train[["cabin"]].copy()

# True where a cabin value is recorded. `notna()` is the vectorised equivalent of
# negating `isnull()` one element at a time with a Python lambda.
cabin_only["Cabin_Data"] = cabin_only["cabin"].notna()

In [None]:
cabin_code = cabin_only["cabin"]

# Deck: the first character of the cabin string.
cabin_only["Deck"] = cabin_code.str[:1]

# Room: the first run of digits found in characters 1-4 of the cabin string, stored as float
# so rows without a match can hold NaN.
room_digits = cabin_code.str[1:5].str.extract("([0-9]+)", expand=False)
cabin_only["Room"] = room_digits.astype(float)

# Display only the rows that have a cabin recorded.
cabin_only.loc[cabin_only["Cabin_Data"]]


In [None]:
# Remove the raw cabin string and the presence flag; columns that are already gone are ignored.
cabin_only = cabin_only.drop(columns=["cabin", "Cabin_Data"], errors="ignore")


In [None]:
# Fill the gaps: a missing deck becomes the placeholder "N",
# a missing room number becomes the mean of the known room numbers.
mean_room = cabin_only["Room"].mean()
cabin_only = cabin_only.fillna({"Deck": "N", "Room": mean_room})

In [None]:
# Preview the first 30 rows of the derived cabin features
cabin_only.head(n=30)

In [None]:
# Attach the derived cabin columns to the main frame, aligned on the shared row index.
# Any copies left over from an earlier run are dropped first: a plain join raises on
# overlapping column names, which made this cell fail when executed a second time.
df_train = df_train.drop(columns=cabin_only.columns, errors="ignore").join(cabin_only)


In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves
df_train['family'] = df_train['sibsp'].add(df_train['parch']).add(1)
df_train.head(n=50)

In [None]:
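# Disabled experiment, kept for reference: build a ticket -> boat lookup from the rows
# where both values are present.
# NOTE: the final line maps 'attempted_boat', a column that is never created in the cells above.
# attempt = df_train[['ticket','boat']].dropna().set_index('ticket')
# attempt_dict = attempt.to_dict()
# attempt_dict = list(attempt_dict.values())
# attempt_dict
# # df_train['attempted_boat'] = df_train['attempted_boat'].map(attempt_dict)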


In [None]:
# Derive the "Stair" feature from deck and room number:
#   1   -> deck A, or deck B/C with room < 70, or deck D/E with room < 50
#   "N" -> deck is the "N" placeholder
#   0   -> everything else
# NOTE(review): presumably this flags cabins close to a staircase, and the 70/50
# room-number cut-offs are hard-coded with no stated source - confirm both.
deck = df_train["Deck"]
room = df_train["Room"]

is_stair = (
    deck.eq("A")
    | (deck.isin(["B", "C"]) & room.lt(70))
    | (deck.isin(["D", "E"]) & room.lt(50))
)
deck_unknown = deck.eq("N")

# Boolean masks replace the row-by-row loop, which relied on chained label lookups
# (df["Deck"][x]) and therefore only worked on a default 0..n-1 index.
# dtype=object keeps the same mixed 1 / 0 / "N" values the loop produced, so the
# later one-hot encoding sees an identical column.
stair = pd.Series(0, index=df_train.index, dtype=object)
stair[deck_unknown] = "N"
stair[is_stair] = 1  # assigned last, mirroring the priority of the original if/elif chain

df_train["Stair"] = stair

# Preview instead of dumping the entire frame.
df_train.head()

In [None]:
# Columns removed from the frame before the modelling steps
unused_columns = [
    "name", "ticket", "cabin", "home.dest", "body",
    "boat", "embarked", "Room", "sibsp", "parch",
]
df_train = df_train.drop(columns=unused_columns)
df_train.head()

In [None]:
# Swap the numeric passenger class for text labels
pclass_labels = {1: '1st', 2: '2nd', 3: '3rd'}
df_train['pclass'] = df_train['pclass'].replace(pclass_labels)

In [None]:
# Keep only rows with no missing values, then one-hot encode the non-numeric columns.
complete_rows = df_train.dropna(axis=0, how='any')
df_train = pd.get_dummies(complete_rows)

In [None]:
# Target vector and feature matrix
target_column = "survived"
y = df_train[target_column].values
X = df_train.drop(columns=[target_column])

# Default train/test proportions with a fixed seed so the split is repeatable
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

X_train.head()

In [None]:
# Fit the scaler on the training features only, then apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Logistic regression baseline.
# Fit and score on the standardised features from the scaler cell: the original used the
# raw X_train/X_test, leaving this as the only model not on the scaled data and making
# lbfgs work on features with very different ranges.
classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
classifier.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
# Loop through different k values to find which has the highest accuracy.
# Note: We use only odd numbers because we don't want any ties.
k_values = range(1, 20, 2)  # defined once so the loop and both curves share the same x values
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves are drawn, so label each one and use a neutral y-axis title
# (the old title said "Testing" although the training curve is plotted too).
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes
# for each layer.
# The input width is read from the training matrix instead of being hard-coded to 20,
# so the model keeps working if the one-hot encoding yields a different column count.
n_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=40, activation="relu", input_dim=n_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="tanh"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 100 epochs on the scaled training split; the returned object is kept in fit_model
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

In [None]:
# Score the trained network on the held-out split (loss first, then accuracy)
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Prepare data for the random forest.
# The original cell called make_classification here, which overwrote X, y, the splits and
# the scaler with synthetic data, so the forest and its feature importances no longer
# described the Titanic passengers. Rebuild the same variables from df_train instead.
# NOTE(review): assumes the random forest is meant to model the Titanic data - confirm.
y = df_train["survived"].values
X = df_train.drop("survived", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest with 500 trees and a fixed seed, trained on the scaled training split
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

train_accuracy = clf.score(X_train_scaled, y_train)
test_accuracy = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

In [None]:
# Per-feature importances from the fitted forest, printed and drawn as one bar per feature
features = clf.feature_importances_
print(features)

bar_positions = range(len(features))
plt.bar(bar_positions, features)
plt.show()