# Kaggle Competion - Titanic: Machine Learning from Disaster

## Run Configurations

In [35]:
MODEL_CODE = "GNB"                 # DTC or GNB
N_SPLIT = 10                       # k-Fold k value
REPORT_NAME = "REP_" + MODEL_CODE

## Preprocessing

In [36]:
import pandas as pd
import numpy as np

In [37]:
df = pd.read_csv("data/train.csv")

In [38]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [39]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [40]:
for col in df.columns:
    print(f"{col} - {df[df[col].isna()].shape}")

PassengerId - (0, 12)
Survived - (0, 12)
Pclass - (0, 12)
Name - (0, 12)
Sex - (0, 12)
Age - (177, 12)
SibSp - (0, 12)
Parch - (0, 12)
Ticket - (0, 12)
Fare - (0, 12)
Cabin - (687, 12)
Embarked - (2, 12)


In [41]:
c1 = df.drop(["Name", "Cabin", "Ticket"], axis=1)

In [42]:
c2 = c1.dropna()

In [43]:
for col in c2.columns:
    print(f"{col} - {c2[c2[col].isna()].shape}")

PassengerId - (0, 9)
Survived - (0, 9)
Pclass - (0, 9)
Sex - (0, 9)
Age - (0, 9)
SibSp - (0, 9)
Parch - (0, 9)
Fare - (0, 9)
Embarked - (0, 9)


In [44]:
c2

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
885,886,0,3,female,39.0,0,5,29.1250,Q
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [45]:
c3 = c2.drop("Survived", axis=1)
y = c2["Survived"]

In [46]:
c3

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.2500,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.9250,S
3,4,1,female,35.0,1,0,53.1000,S
4,5,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,886,3,female,39.0,0,5,29.1250,Q
886,887,2,male,27.0,0,0,13.0000,S
887,888,1,female,19.0,0,0,30.0000,S
889,890,1,male,26.0,0,0,30.0000,C


In [47]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

### TEST

In [48]:
ctf = ColumnTransformer([("encoder", OneHotEncoder(), [2,7])], remainder="passthrough")

In [54]:
c4a = np.array(ctf.fit_transform(c3), dtype=np.str)
c4 = pd.DataFrame(c4a)

In [60]:
X = c4
Xv = X.values
yv = y.values

### TEST

## Model Evaluation with k-Fold Cross Validation

In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

def create_model(code):
    if code == "DTC":
        return DecisionTreeClassifier()
    elif code == "GNB":
        return GaussianNB()
    return None

In [62]:
from sklearn.model_selection import KFold         # 10-Fold Cross Validation
from sklearn.preprocessing import StandardScaler  # Standardizing input data
from sklearn.metrics import confusion_matrix      # calculating confusion matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score              # calculating f1 score
import time                                       # to measure elapsed time
import os                                         # to create a folder for reporting

In [63]:
if not os.path.exists('reports'):
    os.makedirs('reports')

file = open(f"reports/{REPORT_NAME}_EVAL.txt", "w")

start = time.process_time()

# initializing variables for kFold run and F1_Score
current_fold = 0
total_acc = 0
total_prc = 0
total_rec = 0
total_f1s = 0

for train_index, test_index in KFold(N_SPLIT).split(Xv):
    current_fold += 1
    
    x_train, x_test = Xv[train_index], Xv[test_index]
    y_train, y_test = yv[train_index], yv[test_index]
        
    model=create_model(MODEL_CODE)  # model will be discarded after the fold is finished
    
    model.fit(x_train, y_train)
            
    y_pred = model.predict(x_test)
    y_pred = (y_pred > 0.5)
    
    cfm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    prc = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    
    f1s = f1_score(y_test, y_pred, average="macro")
    # average='macro' : Calculate metrics for each label, and find their unweighted mean. 
    # This does not take label imbalance into account
    
    ITR_EVAL_STR  = f"{str(current_fold).rjust(2)}-FOLD\t| Precision\t= {prc}\n"
    ITR_EVAL_STR += f"\t\t  Recall\t= {rec}\n"
    ITR_EVAL_STR += f"\t\t  F1 Score\t= {f1s}\n"
    
    print(ITR_EVAL_STR)
    file.write(ITR_EVAL_STR + "\n")
    
    total_acc += acc
    total_prc += prc
    total_rec += rec
    total_f1s += f1s


end = time.process_time()

# AVG_ACCURACY_STR = f"AVG_ACCURACY\t= {total_acc/N_SPLIT}"
AVG_PRECISION_STR = f"AVG_PRECISION\t= {total_prc/N_SPLIT}"
AVG_RECALL_STR = f"AVG_RECALL\t\t= {total_rec/N_SPLIT}"
AVG_F1_SCORE_STR = f"AVG_F1_SCORE\t= {total_f1s/N_SPLIT}\n"
EXEC_TIME_STR = f"EXEC_TIME\t= {end-start}s"

# print(AVG_ACCURACY_STR)
print(AVG_PRECISION_STR)
print(AVG_RECALL_STR)
print(AVG_F1_SCORE_STR)
print(EXEC_TIME_STR)

# file.write(AVG_ACCURACY_STR + "\n")
file.write(AVG_PRECISION_STR + "\n")
file.write(AVG_RECALL_STR + "\n")
file.write(AVG_F1_SCORE_STR + "\n")
file.write(EXEC_TIME_STR + "\n")
    
file.close()

 1-FOLD	| Precision	= 0.6428571428571429
		  Recall	= 0.6428571428571429
		  F1 Score	= 0.7077922077922079

 2-FOLD	| Precision	= 0.4230769230769231
		  Recall	= 0.6875
		  F1 Score	= 0.6638655462184873

 3-FOLD	| Precision	= 0.7727272727272727
		  Recall	= 0.5666666666666667
		  F1 Score	= 0.726923076923077

 4-FOLD	| Precision	= 0.7714285714285715
		  Recall	= 0.75
		  F1 Score	= 0.7605633802816902

 5-FOLD	| Precision	= 0.6666666666666666
		  Recall	= 0.8
		  F1 Score	= 0.7452153110047847

 6-FOLD	| Precision	= 0.7575757575757576
		  Recall	= 0.7575757575757576
		  F1 Score	= 0.773524720893142

 7-FOLD	| Precision	= 0.8148148148148148
		  Recall	= 0.6666666666666666
		  F1 Score	= 0.7691056910569105

 8-FOLD	| Precision	= 0.7241379310344828
		  Recall	= 0.7777777777777778
		  F1 Score	= 0.7936046511627908

 9-FOLD	| Precision	= 0.8
		  Recall	= 0.6896551724137931
		  F1 Score	= 0.7908249158249159

10-FOLD	| Precision	= 0.7777777777777778
		  Recall	= 0.8076923076923077
		  F1 Score	

## Model Training

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [66]:
start = time.process_time()

In [67]:
model = create_model(MODEL_CODE)
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [68]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)

In [69]:
end = time.process_time()

In [70]:
acc = accuracy_score(y_test, y_pred)
prc = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1s = f1_score(y_test, y_pred, average="macro")

In [71]:
if not os.path.exists('reports'):
    os.makedirs('reports')

file = open(f"reports/{REPORT_NAME}_TRAIN.txt", "w")

# ACCURACY_STR = f"ACCURACY\t= {acc}"
PRECISION_STR = f"PRECISION\t= {prc}"
RECALL_STR = f"RECALL\t\t= {rec}"
F1_SCORE_STR = f"F1_SCORE\t= {f1s}\n"
EXEC_TIME_STR = f"EXEC_TIME\t= {end-start}s"

# print(ACCURACY_STR)
print(PRECISION_STR)
print(RECALL_STR)
print(F1_SCORE_STR)
print(EXEC_TIME_STR)

# file.write(ACCURACY_STR + "\n")
file.write(PRECISION_STR + "\n")
file.write(RECALL_STR + "\n")
file.write(F1_SCORE_STR + "\n")
file.write(EXEC_TIME_STR + "\n")

file.close()

PRECISION	= 0.6911764705882353
RECALL		= 0.8392857142857143
F1_SCORE	= 0.7864396654719236

EXEC_TIME	= 0.03125s


## Predictions

In [72]:
td = pd.read_csv("data/test.csv")
td.shape

(418, 11)

In [73]:
for col in td.columns:
    print(f"{col} - {td[td[col].isna()].shape}")

PassengerId - (0, 11)
Pclass - (0, 11)
Name - (0, 11)
Sex - (0, 11)
Age - (86, 11)
SibSp - (0, 11)
Parch - (0, 11)
Ticket - (0, 11)
Fare - (1, 11)
Cabin - (327, 11)
Embarked - (0, 11)


In [74]:
td.loc[td["Fare"].isna(),:]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [75]:
td.replace(np.nan,0)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,0.0,0,0,A.5. 3236,8.0500,0,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,0,S
416,1308,3,"Ware, Mr. Frederick",male,0.0,0,0,359309,8.0500,0,S


In [1412]:
td2 = td["Fare"].replace(np.nan, 0)

In [1413]:
td2[td["Fare"].isna()]

152    0.0
Name: Fare, dtype: float64

In [1414]:
tc1 = td.drop(["Name", "Cabin", "Ticket"], axis=1)

In [1415]:
for col in tc1.columns:
    print(f"{col} - {tc1[tc1[col].isna()].shape}")

PassengerId - (0, 8)
Pclass - (0, 8)
Sex - (0, 8)
Age - (86, 8)
SibSp - (0, 8)
Parch - (0, 8)
Fare - (1, 8)
Embarked - (0, 8)


In [1416]:
tc2 = tc1.replace(np.nan, 0)

In [1417]:
for col in tc2.columns:
    print(f"{col} - {tc2[tc2[col].isna()].shape}")

PassengerId - (0, 8)
Pclass - (0, 8)
Sex - (0, 8)
Age - (0, 8)
SibSp - (0, 8)
Parch - (0, 8)
Fare - (0, 8)
Embarked - (0, 8)


In [1418]:
model.predict(tc2)

ValueError: could not convert string to float: 'male'