## 1- Preprocessing 

In [None]:
# Importing Libraries
import pandas as pd
import matplotlib as mp
import seaborn as sb
import numpy as np

In [None]:
# Load the raw passenger table (the file is semicolon-delimited)
data = pd.read_csv("titanic-passengers.csv", sep=";")
# `data` stays untouched; every preprocessing step works on this independent copy
new_data = data.copy()

# Displaying the first five rows of the raw data
data.head()

In [None]:
# concise summary of a DataFrame.
data.info() 

In [None]:
# Showing descriptive statistics
data.describe(include="all")

In [None]:
# Encode the two binary categorical columns as integers.
#   Survived: "Yes" -> 1, "No" -> 0
#   Sex:      "male" -> 1, "female" -> 0
# Any value that is not a key of the mapping becomes NaN.
survived_codes = {"Yes":1, "No":0}
sex_codes = {"male":1, "female":0}
new_data["Survived"] = new_data["Survived"].map(survived_codes)
new_data["Sex"] = new_data["Sex"].map(sex_codes)

# One-hot encode Embarked: one indicator column per distinct value
new_data = pd.get_dummies(new_data, columns=["Embarked"])

new_data.head()

In [None]:
# Finding correlations between features and outcome
def dataCorr(data):
    """Draw a heatmap of the pairwise correlations between the columns of ``data``.

    Only numeric and boolean columns take part in the correlation; text
    columns are left out explicitly because ``DataFrame.corr`` raises on
    them in recent pandas versions instead of silently skipping them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric/boolean columns are correlated.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    numeric_data = data.select_dtypes(include=["number", "bool"])
    data_corr = numeric_data.corr()

    # Boolean mask that hides the diagonal and the upper triangle, which only
    # repeat the information of the lower triangle.
    mask = np.zeros(data_corr.shape, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    fig, ax = mp.pyplot.subplots(figsize=(15, 15))
    sb.heatmap(data_corr, annot=True, cmap="coolwarm", mask=mask, center=0, square=True, fmt=".2f", ax=ax)
    mp.pyplot.xticks(rotation = 45)
    mp.pyplot.show()
    
dataCorr(new_data)

In [None]:
# Finding missing values
new_data.isnull().sum().sort_values(ascending=False)

In [None]:
# Cabin and Ticket were judged not useful for the analysis and are removed.
# (An earlier idea of filling the missing cabins with "G6" was abandoned.)
new_data = new_data.drop(columns=["Cabin", "Ticket"])

In [None]:
# Finding missing values
new_data.isnull().sum().sort_values(ascending=False)

#### 1.1- Data Cleaning
##### Finding outliers:
- Finding the rows that are considered outliers and removing them from the dataset we have

In [None]:
from collections import Counter


def detectOutliers(data, n, features):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a column when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of that column (Tukey's
    rule). Missing values are ignored when the quartiles are computed and are
    never reported as outliers themselves.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named in ``features``.
    n : int
        A row is returned only if it is an outlier in strictly more than
        ``n`` of the inspected columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_rows = []
    for col in features:
        # nanpercentile skips missing values; plain np.percentile returns NaN
        # as soon as the column contains a NaN, which would make both
        # comparisons below False for every row of that column.
        Q1, Q3 = np.nanpercentile(data[col], [25, 75])
        outlierStep = (Q3 - Q1) * 1.5
        too_low = data[col] < Q1 - outlierStep
        too_high = data[col] > Q3 + outlierStep
        outlier_rows.extend(data[too_low | too_high].index)
    # Counts in how many columns each row index was flagged
    hits_per_row = Counter(outlier_rows)
    return [row for row, hits in hits_per_row.items() if hits > n]

# n = 2: a row is reported only when it is flagged in more than 2 of the 4 listed columns
outliers_to_drop = detectOutliers(new_data, 2, ["Age", "SibSp", "Fare", "Parch"])
print("these rows will be dropped due to their outlier values: {} ".format(outliers_to_drop))

In [None]:
print("Before removing the outliers we have {} rows".format(len(new_data)))
# Drop the flagged rows by index label, then renumber the index from 0 so that
# labels and positions coincide again.
new_data = new_data.drop(outliers_to_drop, axis=0).reset_index(drop=True)
print("After removing the outliers we have {} rows".format(len(new_data)))

## 2- Visualisation Phase

In [None]:
# Survival counts broken down by Sex, Fare (7 bins) and Pclass, drawn from the raw data.
# The 2x2 grid is created once and every plot is given its own Axes: calling
# subplots() and then subplot(2, 2, k) leaves an extra full-figure Axes
# underneath the panels on current Matplotlib versions.
fig, axes = mp.pyplot.subplots(2, 2, figsize=(10, 10))
sb.histplot(data = data, x = "Sex", hue = "Survived", multiple="dodge", ax=axes[0, 0])
sb.histplot(data = data, x = "Fare", hue = "Survived", multiple="dodge", bins = 7, ax=axes[0, 1])
sb.histplot(data = data, x = "Pclass", hue = "Survived", multiple="dodge", ax=axes[1, 0])
# Only three panels are used; remove the empty fourth slot
axes[1, 1].remove()
mp.pyplot.show()

In [None]:
# Age histograms, one panel per (Survived, Sex) combination of the raw data
grid = sb.FacetGrid(data, col="Sex", row="Survived")
grid.map(sb.histplot, "Age")

# Fare histograms with a KDE overlay, one panel per (Survived, Pclass) combination
grid = sb.FacetGrid(data, col="Pclass", row="Survived", legend_out=True)
grid.map(sb.histplot, "Fare", kde = True)



In [None]:
g = sb.histplot(data = new_data,x="Fare", kde=True, stat="density", label=("Skewness: {}".format(round(new_data["Fare"].skew(), 2))))
g.legend()

#### 2.2- Observation
##### from the FacetGrids and histplots:
- The Fare is positively skewed, making a lot of the points look like outliers

In [None]:
# Fixing the skewness of the fare values
def _log_or_zero(fare):
    """Natural log of a strictly positive fare; 0 for every other value (NaN included)."""
    if fare > 0:
        return np.log(fare)
    return 0

# Note: running this cell a second time would take the log of the already logged values.
new_data["Fare"] = new_data["Fare"].map(_log_or_zero)

In [None]:
g = sb.histplot(data = new_data,x="Fare", kde=True, stat="density", label=("Skewness: {}".format(round(new_data["Fare"].skew(), 2))))
g.legend()

#### 2.2- Observation
##### from the FacetGrids:
- The lower the fare, the more likely the person was on a lower Pclass
  - More people survived than died from Pclass 1
  - The survived-to-died ratio was very close to one in Pclass 2
  - Pclass 3 had the most casualties
- Most passengers were between the age 20 and 40
  - More Males died than Females
  - More Females survived than Males

In [None]:
dataCorr(new_data)

In [None]:
# Index labels of the rows whose Age is missing, and how many there are
age_is_missing = new_data["Age"].isnull()
missingAge = list(new_data[age_is_missing].index)
len(missingAge)

In [None]:
g = sb.catplot(data=new_data, x="Sex", y="Age", hue="Pclass", kind="box", col="Survived")
g = sb.catplot(data=new_data, x="Parch", y="Age", kind="box", col="Survived")
g = sb.catplot(data=new_data, x="SibSp", y="Age", kind="box", col="Survived")

In [None]:
dataCorr(new_data[["Age", "SibSp", "Parch", "Sex", "Fare", "Pclass"]])

#### 2.3- Observation
##### from the correlation graph:
- We can see that Age is not strongly correlated with Sex or Fare
- Age is correlated with Pclass, Parch, and SibSp

In [None]:
# Filling in each missing Age with the median Age of the passengers that share
# the same SibSp, Parch and Pclass values, or with the median of all known ages
# when that group has no known Age.
#
# The write goes through .loc in a single indexing step: the chained form
# new_data["Age"][i] = ... may write to a temporary copy and is a no-op when
# pandas Copy-on-Write is active.
# missingAge holds index labels, so rows are also read with .loc rather than
# the positional .iloc.

for i in missingAge:
    row = new_data.loc[i]
    same_group = (
        (new_data["SibSp"] == row["SibSp"])
        & (new_data["Parch"] == row["Parch"])
        & (new_data["Pclass"] == row["Pclass"])
    )
    # Both medians are recomputed on every pass, so ages imputed earlier in
    # the loop contribute to later ones (same behaviour as before).
    medianAge = new_data["Age"].median()
    AssumedAge = new_data.loc[same_group, "Age"].median()
    new_data.loc[i, "Age"] = medianAge if np.isnan(AssumedAge) else AssumedAge

In [None]:
new_data.isnull().sum().sort_values(ascending=False)

#### 2.4- Observation
##### from the correlation graph:
- The strongest relationships with survival are: Pclass, Sex, and Fare
  - Pclass 1 is the highest class and Pclass 3 is the lowest, so the lower the Pclass number (i.e. the higher the class), the higher the survival rate: the correlation with survival is about -0.33
  - For Sex, Male = 1 and Female = 0, so the correlation of about -0.55 means that more females survived the accident.
  - The higher the Fare, the more likely the passenger was in a higher class and therefore the higher the rate of survival: Fare has a correlation of about 0.26 with Survival

In [None]:
new_data[["Pclass", "Survived"]].groupby("Pclass", as_index=False).mean()

In [None]:
# Changing the names to their title group

# Maps the honorific found in a passenger name to a coarser title group
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Lady": "Royalty",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Miss",
    "Mlle": "Miss",
    "Miss" : "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs": "Mrs",
    "Master": "Master"
}

new_data = new_data.rename(columns={"Name": "Title"})

# Match an honorific only as a whole word directly followed by a period
# ("Mr." but not the "Mr" inside "Mrs." or inside a surname). Plain substring
# tests depend on the order of the dictionary and can match inside other words.
# The keys contain only letters and spaces, so they need no regex escaping.
# NOTE(review): assumes honorifics are written with a trailing period, as in
# "Surname, Mr. Given names" - confirm against the data.
title_pattern = r"\b(" + "|".join(Title_Dictionary) + r")\."
found_title = new_data["Title"].str.extract(title_pattern, expand=False)

# Rows without a recognised honorific keep their original value, as before.
new_data["Title"] = found_title.map(Title_Dictionary).fillna(new_data["Title"])


new_data.head()

In [None]:
new_data[["Title", "Survived"]].groupby("Title", as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# Adding the SibSp and Parch columns into one column called FamilySize

new_data["FamilySize"] = new_data["Parch"] + new_data["SibSp"] + 1

new_data.head()

In [None]:
new_data[["FamilySize", "Survived"]].groupby("FamilySize", as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# isAlone flags passengers travelling without family:
# 1 when FamilySize equals 1, otherwise 0

travels_alone = new_data["FamilySize"] == 1
new_data["isAlone"] = travels_alone.astype("int64")
new_data[["FamilySize", "isAlone"]].head()

In [None]:
new_data[["isAlone", "Survived"]].groupby("isAlone", as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# Displaying the distribution of FamilySize, then of isAlone, for every (Survived, Title) combination
grid = sb.FacetGrid(new_data, col="Title", row="Survived")
grid.map(sb.histplot, "FamilySize")

grid = sb.FacetGrid(new_data, col="Title", row="Survived")
grid.map(sb.histplot, "isAlone")
grid.add_legend()

In [None]:
new_data.drop(columns=["FamilySize", "SibSp", "Parch"], inplace=True)

# new_data.head()

In [None]:
new_data = pd.get_dummies(new_data, columns=["Title"])

new_data.head()

In [None]:
# Displaying the correlation of all the data after adding title and familysize to the equation
dataCorr(new_data)

#### 2.5- Observation
##### from the correlation and FacetGrid graph:
- Noticed a few features that don't have much of an effect on the survival rate of the passengers, such as:
  - Title_Royalty
  - Title_Officer
  - PassengerId
- Some of the key features that are very important:
  - Fare
  - Sex
  - Pclass
- Features that have an indirect effect:
  - isAlone
  - Title_Mrs
  - Title_Mr
  - Title_Miss
  - Age

##### Dropping unnecessary features

In [None]:
# Remove the columns that are not passed on to the models.
# NOTE(review): the observation above lists Title_Officer as uninformative, yet
# it is kept here while Embarked_Q and Title_Master are dropped - confirm the
# intended column list.
columns_to_remove = ["PassengerId", "Embarked_Q", "Title_Royalty", "Title_Master"]
new_data = new_data.drop(columns=columns_to_remove)

## 3- Model

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree,export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import graphviz

In [None]:
# Separate the target column from the predictors
y = new_data["Survived"]
X = new_data.drop(columns="Survived")

# Hold out 35% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

In [None]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run and the reported metrics are reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

##### Applying Confusion Matrix

In [None]:
# Cross-tabulate actual against predicted labels for the baseline tree.
# fmt="d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation (e.g. 1.2e+02) for counts of 100 or more.
confusion_matrix = pd.crosstab(y_test, dt_pred, rownames=["Actual"], colnames=["Predicted"])
sb.heatmap(confusion_matrix, annot = True, fmt="d", cmap="coolwarm")

##### Function to measure how well the model is doing

In [None]:
def metrics(ypred, classi, X_eval=None, y_true=None):
    """Plot the ROC curve of a fitted classifier, print its scores and return them.

    Parameters
    ----------
    ypred : array-like
        Class labels predicted by ``classi`` for the evaluation rows.
    classi : fitted classifier
        Must provide ``predict_proba``; used to obtain the scores for the ROC curve.
    X_eval : optional
        Feature rows to score. Defaults to the notebook-level ``X_test``.
    y_true : optional
        True labels matching ``X_eval``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    dict
        ``{"Accuracy", "Precision", "Recall", "AUC"}``, each rounded to two
        decimals, so the results of several models can be collected into one
        table.
    """
    # Fall back to the notebook-level hold-out split so existing two-argument
    # calls keep working unchanged.
    if X_eval is None:
        X_eval = X_test
    if y_true is None:
        y_true = y_test

    class_report = classification_report(y_true, ypred)
    # Column 1 of predict_proba is the probability of the second class (label 1 here)
    ypred_prob = classi.predict_proba(X_eval)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, ypred_prob)
    auc_score = auc(fpr, tpr)

    #create ROC curve
    mp.pyplot.plot(fpr,tpr)
    mp.pyplot.title("ROC Curve")
    mp.pyplot.ylabel('True Positive Rate')
    mp.pyplot.xlabel('False Positive Rate')
    mp.pyplot.show()

    # Displaying AUC Score
    print(f"The AUC Score is: {auc_score}")

    # Displaying Classification Score
    print(f"\nThe Classification Score is:\n{class_report}")

    # All three scores use the same two-decimal precision
    acc = round(accuracy_score(y_true, ypred), 2)
    prec = round(precision_score(y_true, ypred), 2)
    recall = round(recall_score(y_true, ypred), 2)

    print(f"Accuracy: {acc}\nPrecision: {prec}\nRecall: {recall}")

    return {"Accuracy": acc, "Precision": prec, "Recall": recall, "AUC": round(auc_score, 2)}

In [None]:
dt_metric = metrics(dt_pred, dt)
dt_metric

In [None]:
def graphTree(classi):
    """Draw the fitted tree ``classi`` with ``plot_tree`` on a new 10 x 10 inch figure."""
    canvas_size = (10, 10)
    mp.pyplot.figure(figsize=canvas_size)
    plot_tree(classi)

In [None]:
graphTree(dt)

In [None]:
dot_data = export_graphviz(dt, out_file=None)

# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render("decision_tree_graphivz")

In [None]:
dt2 = DecisionTreeClassifier( max_depth=100, splitter="random", random_state=40)
dt2.fit(X_train, y_train)
dt_pred2 = dt2.predict(X_test)

In [None]:
dt_metric2 = metrics(dt_pred2, dt2)
dt_metric2

In [None]:
rf = RandomForestClassifier(criterion="entropy", max_depth=50, n_estimators=100, random_state=40)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [None]:
rf_metric = metrics(rf_pred, rf)
rf_metric

In [None]:
# Collect what metrics() returned for each of the three models, one entry per classifier.
classifier_list = {"Plain Decision Tree": dt_metric,
                   "Adjusted Decision Tree": dt_metric2,
                   "Random Forest": rf_metric}
# The name is rebound from the dict to a DataFrame with one column per classifier
classifier_list = pd.DataFrame(classifier_list)

classifier_list