In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import tensorflow as tf


## Data

In [3]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [4]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
train_data["Family"] = train_data["SibSp"] + train_data["Parch"]
train_data = train_data.drop(["Name", "Ticket", "Cabin", "SibSp", "Parch"], axis=1)

In [6]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
Fare             0
Embarked         2
Family           0
dtype: int64

In [7]:
train_data = train_data[train_data["Embarked"].notna()]
X_train, y_train = train_data[train_data.columns[2:]], train_data["Survived"]

In [8]:
ct = ColumnTransformer([("fillMedian", SimpleImputer(strategy="median"), ["Age"]),
                       ("ohe", OneHotEncoder(), ["Sex", "Embarked"]),
                       ("ss", StandardScaler(), ["Fare"])],
                      remainder="passthrough")
p = Pipeline([("ct", ct)])

In [9]:
X_train = p.fit_transform(X_train)

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

## kNN

In [11]:
param_grid = {
    "n_neighbors": [1, 5, 10, 20, 50]
}
knn_grid = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1)
knn_grid.fit(X_train, y_train)
print(knn_grid.best_params_)
grid_prediction = knn_grid.predict(X_valid)
print(classification_report(y_valid, grid_prediction))

{'n_neighbors': 5}
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       110
           1       0.77      0.60      0.68        68

    accuracy                           0.78       178
   macro avg       0.78      0.75      0.76       178
weighted avg       0.78      0.78      0.77       178



## Naive Bayes

In [12]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       110
           1       0.74      0.72      0.73        68

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.80      0.80      0.80       178



## Logistical Regression

In [13]:
param_grid = {
    "penalty":[None,"l2","l1","elasticnet"],
    "C":[0.001, 0.01, 0.1, 1, 10, 100]
}
log_grid = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1)
log_grid.fit(X_train, y_train)
print(log_grid.best_params_)
grid_prediction = log_grid.predict(X_valid)
print(classification_report(y_valid, grid_prediction))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 1, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       110
           1       0.78      0.69      0.73        68

    accuracy                           0.81       178
   macro avg       0.80      0.79      0.79       178
weighted avg       0.81      0.81      0.81       178



## Support Vector Machines

In [14]:
param_grid = {
    "kernel":["linear", "rbf", "sigmoid"],
    "C":[0.1, 1, 10],
    "degree":list(range(3,11))
}
svm_grid = GridSearchCV(SVC(), param_grid, n_jobs=-1)
svm_grid.fit(X_train, y_train)
print(svm_grid.best_params_)
grid_prediction = svm_grid.predict(X_valid)
print(classification_report(y_valid, grid_prediction))

{'C': 10, 'degree': 3, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       110
           1       0.80      0.71      0.75        68

    accuracy                           0.82       178
   macro avg       0.82      0.80      0.80       178
weighted avg       0.82      0.82      0.82       178



## Random Forest Classifier

In [15]:
param_grid = {
    "n_estimators":[100, 200, 500],
    "criterion":["gini","entropy","log_loss"]
}   
rfc_grid = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1)
rfc_grid.fit(X_train, y_train)
print(rfc_grid.best_params_)
grid_prediction = rfc_grid.predict(X_valid)
print(classification_report(y_valid, grid_prediction))

{'criterion': 'entropy', 'n_estimators': 500}
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.72      0.75        68

    accuracy                           0.81       178
   macro avg       0.81      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178



## XGBoost

In [16]:
param_grid = {
    "max_depth": range(2, 10, 1),
    "n_estimators":range(40, 201, 40),
    "learning_rate":[0.5, 0.1,0.05,0.01]
}
boost_grid = GridSearchCV(XGBClassifier(), param_grid, scoring="roc_auc", cv=10, n_jobs=-1)
boost_grid.fit(X_train, y_train)
print(boost_grid.best_params_)
grid_prediction = boost_grid.predict(X_valid)
print(classification_report(y_valid, grid_prediction))

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 40}
              precision    recall  f1-score   support

           0       0.80      0.94      0.87       110
           1       0.86      0.63      0.73        68

    accuracy                           0.82       178
   macro avg       0.83      0.78      0.80       178
weighted avg       0.83      0.82      0.81       178



## Tensorflow

In [17]:
def train_model_split(X_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs):
    neuralnet = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation="relu", input_shape=(9,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    neuralnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                     loss="binary_crossentropy",
                     metrics=["accuracy"])
    history = neuralnet.fit(X_train,
                            y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_split=0.2,
                            verbose=0
                           )
    return neuralnet, history

In [18]:
def train_model_data(X_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs):
    neuralnet = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation="relu", input_shape=(9,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    neuralnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                     loss="binary_crossentropy",
                     metrics=["accuracy"])
    history = neuralnet.fit(X_train,
                            y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(X_valid, y_valid),
                            verbose=0
                           )
    return neuralnet, history

In [19]:
least_validation_loss_split = float("inf")
least_validation_loss_data = float("inf")
least_loss_model_split = None
least_loss_model_data = None

epochs = 100
for num_nodes in [16,32,64]:
  for dropout_prob in [0,0.2]:
    for learning_rate in [0.01,0.005,0.001]:
      for batch_size in [32,64,128]:
        print(f"nodes: {num_nodes}, dropout: {dropout_prob}, learning rate: {learning_rate}, batch size: {batch_size}")
        print("validation_split = 0.2")
        model,history = train_model_split(X_train,y_train,num_nodes,dropout_prob,learning_rate,batch_size,epochs)
        val_loss, val_accuracy = model.evaluate(X_valid,y_valid)
        if val_loss < least_validation_loss_split:
          least_validation_loss_split = val_loss
          least_loss_model_split = model

        print("validation_data = (X_valid,y_valid)")
        model,history = train_model_data(X_train,y_train,num_nodes,dropout_prob,learning_rate,batch_size,epochs)
        val_loss, val_accuracy = model.evaluate(X_valid,y_valid)
        if val_loss < least_validation_loss_data:
          least_validation_loss_data = val_loss
          least_loss_model_data = model

nodes: 16, dropout: 0, learning rate: 0.01, batch size: 32
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.01, batch size: 64
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.01, batch size: 128
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.005, batch size: 32
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.005, batch size: 64
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.005, batch size: 128
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.001, batch size: 32
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.001, batch size: 64
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.001, batc

## Choosing Model

In [20]:
model = VotingClassifier([
    ("1",knn_grid),
    ("2",nb),
    ("3",log_grid),
    ("4",svm_grid),
    ("5",rfc_grid),
    ("6",boost_grid)
])
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
roc_auc_1 = roc_auc_score(y_valid, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [21]:
def tensor_binarizer(lst):
    return [1 if x[0] > 0.5 else 0 for x in lst]

In [22]:
tf_pred1 = least_loss_model_split.predict(X_valid)
tf_pred2 = least_loss_model_data.predict(X_valid)
tf_pred1 = tensor_binarizer(tf_pred1)
tf_pred2 = tensor_binarizer(tf_pred2)
roc_auc_2 = roc_auc_score(y_valid, tf_pred1)
roc_auc_3 = roc_auc_score(y_valid, tf_pred2)



In [23]:
d = {roc_auc_1:model, roc_auc_2:least_loss_model_split, roc_auc_3:least_loss_model_data}
top_score = max(d.keys())
model = d[top_score]

## Test Data

In [24]:
test_data["Family"] = test_data["SibSp"] + test_data["Parch"]
passengers = test_data["PassengerId"]
test_data = test_data.drop(["PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch"], axis=1)

In [25]:
test_data.isna().sum()

Pclass       0
Sex          0
Age         86
Fare         1
Embarked     0
Family       0
dtype: int64

In [26]:
avgfare = test_data["Fare"].dropna().mean()
test_data["Fare"] = test_data["Fare"].fillna(avgfare)
print(test_data)

     Pclass     Sex   Age      Fare Embarked  Family
0         3    male  34.5    7.8292        Q       0
1         3  female  47.0    7.0000        S       1
2         2    male  62.0    9.6875        Q       0
3         3    male  27.0    8.6625        S       0
4         3  female  22.0   12.2875        S       2
..      ...     ...   ...       ...      ...     ...
413       3    male   NaN    8.0500        S       0
414       1  female  39.0  108.9000        C       0
415       3    male  38.5    7.2500        S       0
416       3    male   NaN    8.0500        S       0
417       3    male   NaN   22.3583        C       2

[418 rows x 6 columns]


In [27]:
test_data = pd.DataFrame(p.fit_transform(test_data))
print(test_data)

        0    1    2    3    4    5         6    7    8
0    34.5  0.0  1.0  0.0  1.0  0.0 -0.498407  3.0  0.0
1    47.0  1.0  0.0  0.0  0.0  1.0 -0.513274  3.0  1.0
2    62.0  0.0  1.0  0.0  1.0  0.0 -0.465088  2.0  0.0
3    27.0  0.0  1.0  0.0  0.0  1.0 -0.483466  3.0  0.0
4    22.0  1.0  0.0  0.0  0.0  1.0 -0.418471  3.0  2.0
..    ...  ...  ...  ...  ...  ...       ...  ...  ...
413  27.0  0.0  1.0  0.0  0.0  1.0 -0.494448  3.0  0.0
414  39.0  1.0  0.0  1.0  0.0  0.0  1.313753  1.0  0.0
415  38.5  0.0  1.0  0.0  0.0  1.0 -0.508792  3.0  0.0
416  27.0  0.0  1.0  0.0  0.0  1.0 -0.494448  3.0  0.0
417  27.0  0.0  1.0  1.0  0.0  0.0 -0.237906  3.0  2.0

[418 rows x 9 columns]


In [28]:
test_pred = model.predict(test_data)
test_pred = tensor_binarizer(test_pred)



In [29]:
final = pd.DataFrame(test_pred, index=passengers)
final.columns=["Survived"]
print(final)

             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
...               ...
1305                0
1306                1
1307                0
1308                0
1309                0

[418 rows x 1 columns]


In [30]:
final.to_csv("submission.csv")