# TITANIC SURVIVAL PREDICTION
![titanic](./assets/titanic.gif)
## CREATING THE DATASETS

Firstly, we need to retrieve the datasets.

In [163]:
import pandas as pd
import numpy as np

test= pd.read_csv('titanic/test.csv')
train= pd.read_csv('titanic/train.csv')

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


On the preprocessing part, we see some patterns and missing values.
Missing values are mostly from ```Cabin``` and ```Age``` columns and also ```Age``` observations have float numbers _(e.g 34.5)_.
I disregarded the __Cabin__, __Ticket__ ,__Sibsp__ and __Parch__ columns but you are welcome to use them to your advantage.

# PREPROCESSING

In [164]:
from sklearn.preprocessing import LabelEncoder

# Rounding the observations to have better certainty
train.Age=train.Age.round()

# Separating features and the target
features=['Pclass',
 'Sex',
 'Age',
 'Fare',
 'Embarked']
target='Survived'



# Label encoding the Sex column to have 0 and 1 as values
lbl = LabelEncoder()

train['Sex'] = lbl.fit_transform(train[['Sex']].values.ravel())
test['Sex'] = lbl.fit_transform(test[['Sex']].values.ravel())

# Embarked column had a few missing values, filling them
train['Embarked'] = train['Embarked'].fillna(value=train['Embarked'].mode()[0])
test['Embarked'] = test['Embarked'].fillna(value=test['Embarked'].mode()[0])
train.Embarked.unique()

# After missing value filling, using dummies method to encode the observations
train_emb=pd.get_dummies(train['Embarked'])
train_new = train[features]
test_new = test[features]
train_new=train_new.join(train_emb)
train_new=train_new.drop('Embarked',axis=1)

# Age column had many missing observations, this time filling them with standard deviation
Age_stan_train = train['Age'].std()
train['Age'] = train['Age'].fillna(value = Age_stan_train)
Age_stan_test = test['Age'].std()
test['Age'] = test['Age'].fillna(value = Age_stan_test)
train.Age.isnull().sum()



0

# -----------------------------
# MODEL SELECTION
# -----------------------------
## ~TRAIN TEST SPLIT

In [165]:
from sklearn.model_selection import train_test_split
y_train = train[target]
X_train, X_test, y_train, y_test = train_test_split(train_new, y_train, test_size=0.30)

~ IMPORTANT LIBRARIES

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

from xgboost import XGBClassifier

from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics

```First Trial:
Random Forest```

In [149]:
rfc = RandomForestClassifier()

params = {'n_estimators': [200,500,800,1000,1200],
          'max_depth': [3,5,7],
          'criterion':['entropy', 'gini']
          }

rfc_cv = GridSearchCV(rfc, params, cv = 5, n_jobs=-1, verbose=2).fit(X_train, y_train)
pred=rfc_cv.predict(X_test)
print(accuracy_score(y_test, pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.7723880597014925


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.0min finished


```Second Trial:
XGBOOST```

In [159]:
rfc = XGBClassifier()

#setting the parameters
params = {'n_estimators': [200,500,800,1000,1200],
          'max_depth': [3,5,7],
          'objective' : ['binary:logistic'],
          'gamma': [0.5, 1, 1.5, 2, 5],
          }


rfc_cv=RandomizedSearchCV(rfc, params, cv = 10, n_jobs=-1, verbose=2).fit(X_train, y_train)
pred=rfc_cv.predict(X_test)
print(accuracy_score(y_test, pred))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
0.7649253731343284


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished


```Third Trial:
Gradient Boosting Classifier```

In [160]:
grad_clf=GradientBoostingClassifier(max_depth=4, max_features=0.3, min_samples_leaf=100,
                           n_estimators=300)
grad_clf.fit(X_train,y_train)
pred=grad_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

0.7723880597014925
Mean Absolute Error: 0.22761194029850745
Mean Squared Error: 0.22761194029850745
Root Mean Squared Error: 0.47708693159476445


```Fourth Trial: K Neighbors Classifier```

In [161]:
knn= KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
print(accuracy_score(y_test, pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

0.7723880597014925
Mean Absolute Error: 0.22761194029850745
Mean Squared Error: 0.22761194029850745
Root Mean Squared Error: 0.47708693159476445


# * BONUS
## MIX AND MATCH : CLASSIFIER SELECTION  _(ACCURACY CONTROL)_

In [162]:

models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(max_iter=10000),
          "Random Forest": RandomForestClassifier(),
          "SVC" : SVC(probability=True),
          "DecisionTreeClassifier" : DecisionTreeClassifier(),
          "AdaBoostClassifier" : AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(),
                   learning_rate=1.5, n_estimators=2, random_state=7),
          "GradientBoostingClassifier" : GradientBoostingClassifier(max_depth=4, max_features=0.3, min_samples_leaf=100,
                           n_estimators=300),
          "GaussianNB" : GaussianNB(),
          "LinearDiscriminantAnalysis" : LinearDiscriminantAnalysis(),
"QuadraticDiscriminantAnalysis" : QuadraticDiscriminantAnalysis()}
scores={}
cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3)
np.random.seed(42)
model_scores = {}
for name, model in models.items():
    score=cross_val_score(model,X_train,y_train,cv=cv,scoring='accuracy',n_jobs=-1)
    scores[name]=np.mean(score)
print(scores)

{'KNN': 0.7078511691414918, 'Logistic Regression': 0.7987967229902714, 'Random Forest': 0.8020481310803892, 'SVC': 0.6854753370882404, 'DecisionTreeClassifier': 0.7773596176821983, 'AdaBoostClassifier': 0.767400580303806, 'GradientBoostingClassifier': 0.8196705922512372, 'GaussianNB': 0.7998719918074758, 'LinearDiscriminantAnalysis': 0.7929595494111624, 'QuadraticDiscriminantAnalysis': 0.6584997439836149}


# ---------------------------------
## The strongest prediction within the options regarding the model belongs to:
* GradientBoostingClassifier: 0.8196705922512372
# ---------------------------------




