In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
training = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

training["train_test"] = 1
test["train_test"] = 0
test["Survived"] = np.NaN
all_data = pd.concat([training, test])

%matplotlib inline
all_data.columns

In [None]:
# Print the column dtypes and non-null counts of the training frame.
training.info()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
training.describe().T

In [None]:
# Two column subsets of the training frame used by the exploration cells:
# df_num is plotted as histograms / correlated, df_cat is plotted as bar charts.
# Survived and Pclass are stored as numbers but are treated as categories here.
df_num = training[["Age", "SibSp","Parch", "Fare"]]
df_cat = training[["Survived", "Pclass", "Sex", "Ticket", "Cabin", "Embarked"]]

In [None]:
# One histogram per numeric column, each shown as its own figure.
for column in df_num.columns:
    plt.hist(df_num[column])
    plt.title(column)
    plt.show()

In [None]:
# Pairwise correlations of the numeric columns: printed as a table first,
# then drawn as a heatmap.
num_corr = df_num.corr()
print(num_corr)
sns.heatmap(num_corr)

In [None]:
# Aggregate each numeric column per Survived value (pivot_table's default
# aggregation is the mean). `values` receives the column labels explicitly.
pd.pivot_table(training, index="Survived", values=list(df_num.columns))

In [None]:
import warnings 
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Bar chart of value frequencies for every column in df_cat.
for column in df_cat.columns:
    # Count once and reuse the result for both axes.
    counts = df_cat[column].value_counts()
    # seaborn >= 0.12 accepts x/y only as keyword arguments; passing them
    # positionally raises a TypeError. Keywords also work on older releases.
    sns.barplot(x=counts.index, y=counts.values).set_title(column)
    plt.show()

In [None]:
# Survivor counts broken down by class, sex and port of embarkation.
# "Ticket" only serves as the column being counted; a blank line separates
# consecutive tables.
for position, breakdown in enumerate(("Pclass", "Sex", "Embarked")):
    if position > 0:
        print()
    print(pd.pivot_table(training, index="Survived", columns=breakdown,
                         values="Ticket", aggfunc="count"))

In [None]:
# Number of cabins listed per passenger: 0 when Cabin is missing, otherwise
# the number of space-separated entries in the Cabin string.
training["cabin_multiple"] = training["Cabin"].apply(
    lambda cabin: len(cabin.split(" ")) if not pd.isna(cabin) else 0
)
training["cabin_multiple"].value_counts()

In [None]:
# Passenger counts per Survived value for each cabin_multiple value
# ("Ticket" only serves as the column being counted).
pd.pivot_table(training, index="Survived", columns="cabin_multiple", values="Ticket", aggfunc="count")

In [None]:
# First character of the Cabin value converted to a string. Missing cabins
# (NaN) become "n" because str(NaN) is "nan".
training["cabin_adv"] = training.Cabin.apply(lambda x: str(x)[0])

In [None]:
# Frequency of each cabin_adv value, followed by passenger counts per Survived
# value for each cabin_adv value ("Name" only serves as the column being counted).
print(training.cabin_adv.value_counts())
pd.pivot_table(training, index="Survived", columns="cabin_adv", values="Name", aggfunc="count")

In [None]:
def _ticket_prefix(ticket):
    """Return the normalized text before the last space-separated token, or 0 if there is none."""
    leading_parts = ticket.split(" ")[:-1]
    if not leading_parts:
        return 0
    return "".join(leading_parts).replace(".", "").replace("/", "").lower()


# 1 when the whole ticket string is numeric, 0 otherwise.
training["numeric_tickets"] = training.Ticket.apply(lambda ticket: 1 if ticket.isnumeric() else 0)
# Lower-cased ticket prefix with "." and "/" removed; 0 when the ticket has no prefix.
training["ticket_letters"] = training.Ticket.apply(_ticket_prefix)

In [None]:
# How many tickets are purely numeric (1) versus not (0).
training["numeric_tickets"].value_counts()

In [None]:
# Show every row of the value_counts output below. The fully-qualified option
# name is required: the bare pattern "max_rows" matches more than one pandas
# option in recent releases and raises an OptionError.
pd.set_option("display.max_rows", None)
training["ticket_letters"].value_counts()

In [None]:
# Passenger counts per Survived value, split by the numeric_tickets flag.
pd.pivot_table(training, index="Survived", columns="numeric_tickets", values="Ticket", aggfunc="count")

In [None]:
# Passenger counts per Survived value, split by ticket prefix.
pd.pivot_table(training, index="Survived", columns="ticket_letters", values="Ticket", aggfunc="count")

In [None]:
# NOTE(review): the head(50) expression is not the last statement of the cell,
# so its result is not displayed.
training.Name.head(50)
# Extract the text between the first "," and the following "." of each name,
# with surrounding whitespace stripped.
training["name_title"] = training.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())

In [None]:
# Frequency of each extracted title.
training["name_title"].value_counts()

### Data Preprocessing

In [None]:
# Build the engineered features on the combined train+test frame so that both
# splits end up with the same columns.
all_data["cabin_multiple"] = all_data.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(" ")))
all_data["cabin_adv"] = all_data.Cabin.apply(lambda x: str(x)[0])
all_data["numeric_tickets"] = all_data.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
all_data["ticket_letters"] = all_data.Ticket.apply(lambda x: "".join(x.split(" ")[:-1]).replace(".","").replace("/","").lower() if len(x.split(" ")[:-1]) > 0 else 0)
all_data["name_title"] = all_data.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())

# Fill missing Age / Fare values with the training-set means.
all_data.Age = all_data.Age.fillna(training.Age.mean())
all_data.Fare = all_data.Fare.fillna(training.Fare.mean())

# Drop rows that have no Embarked value.
all_data.dropna(subset=["Embarked"], inplace=True)

# log(x + 1) transforms. Each histogram is titled and shown as its own figure;
# without the explicit show() both would be drawn onto the same axes.
all_data["norm_sibsp"] = np.log(all_data.SibSp+1)
all_data["norm_sibsp"].hist()
plt.title("norm_sibsp")
plt.show()

all_data["norm_fare"] = np.log(all_data.Fare+1)
all_data["norm_fare"].hist()
plt.title("norm_fare")
plt.show()

# Cast Pclass to string so get_dummies one-hot encodes it instead of keeping
# it as a number.
all_data.Pclass = all_data.Pclass.astype(str)

all_dummies = pd.get_dummies(all_data[["Pclass", "Sex", "Age", "SibSp","Parch", "norm_fare", 
"Embarked", "cabin_adv", "cabin_multiple", "numeric_tickets", "name_title", "train_test"]])

# Split back into train / test using the origin flag, then drop the flag.
X_train = all_dummies[all_dummies.train_test == 1].drop(["train_test"], axis=1)
X_test = all_dummies[all_dummies.train_test == 0].drop(["train_test"], axis=1)

# Survived became float64 when the NaN-filled test rows were concatenated.
# Cast the training labels back to integers so the classifiers predict 0/1
# rather than 0.0/1.0, which would otherwise end up in the submission files.
y_train = all_data[all_data.train_test == 1].Survived.astype(int)
y_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that are standardized; the remaining columns are left unchanged.
scale_cols = ["Age", "SibSp", "Parch", "norm_fare"]

scaler = StandardScaler()
all_dummies_scaled = all_dummies.copy()

# Learn the mean / standard deviation from the training rows only, then apply
# the same transform to every row. Fitting on the combined frame would let
# test-set statistics leak into the features used for training.
scaler.fit(all_dummies_scaled.loc[all_dummies_scaled.train_test == 1, scale_cols])
all_dummies_scaled[scale_cols] = scaler.transform(all_dummies_scaled[scale_cols])

# Split back into train / test using the origin flag, then drop the flag.
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(
    ["train_test"], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(
    ["train_test"], axis=1)

### Model Building

##### NaiveBayes = 72.21%
##### LogisticRegression = 82.11%
##### DecisionTreeClassifier = 77.39%
##### KNeighborsClassifier = 81.55%
##### RandomForestClassifier = 80.54%
##### SVC = 83.24%
##### XGBClassifier = 81.77%
##### VotingClassifier = 82.79%

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Gaussian Naive Bayes baseline: 5-fold cross-validation scores on the scaled
# features, followed by their mean.
gnb = GaussianNB()
cv = cross_val_score(gnb, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Logistic regression baseline (iteration cap raised to 2000): 5-fold
# cross-validation scores and their mean.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(lr, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Decision tree baseline with a fixed seed: 5-fold cross-validation scores
# and their mean.
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(dt, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# k-nearest-neighbours baseline with default settings: 5-fold cross-validation
# scores and their mean.
knn = KNeighborsClassifier()
cv = cross_val_score(knn, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Random forest baseline with a fixed seed: 5-fold cross-validation scores
# and their mean.
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Support vector classifier baseline. probability=True enables predict_proba,
# which the soft-voting ensemble built later relies on.
svc = SVC(probability=True)
cv = cross_val_score(svc, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees baseline with a fixed seed: 5-fold cross-validation
# scores and their mean.
xgb = XGBClassifier(random_state=1)
cv = cross_val_score(xgb, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over six of the baseline models defined above
# (the decision tree is not included).
baseline_estimators = [
    ("lr", lr),
    ("knn", knn),
    ("rf", rf),
    ("gnb", gnb),
    ("svc", svc),
    ("xgb", xgb),
]
vc = VotingClassifier(estimators=baseline_estimators, voting="soft")

In [None]:
# 5-fold cross-validation scores of the soft-voting ensemble and their mean.
cv = cross_val_score(vc, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Fit the soft-voting ensemble on the whole training set, predict the test
# rows and write the baseline submission file.
vc.fit(X_train_scaled, y_train)
y_pred_vc = vc.predict(X_test_scaled)
basic_submission = dict(PassengerId=test.PassengerId, Survived=y_pred_vc)
base_submission = pd.DataFrame(basic_submission)
base_submission.to_csv("base_submission.csv", index=False)

### Model Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

def clf_performance(classifier, model_name):
    """Print a three-line report for a fitted hyper-parameter search.

    Parameters
    ----------
    classifier : object exposing ``best_score_`` and ``best_params_``
        (for example a fitted ``GridSearchCV``).
    model_name : label printed as the first line of the report.
    """
    print(model_name)
    print(f"Best Score: {classifier.best_score_!s}")
    print(f"Best Parameters: {classifier.best_params_!s}")

In [None]:
# Grid search for logistic regression: L1 vs L2 penalty across 20
# log-spaced values of C, all with the liblinear solver.
lr = LogisticRegression()
param = dict(
    max_iter=[2000],
    penalty=["l1", "l2"],
    C=np.logspace(-4, 4, 20),
    solver=["liblinear"],
)

clf_lr = GridSearchCV(lr, param, cv=5, n_jobs=-1, verbose=2)
best_lr = clf_lr.fit(X_train_scaled, y_train)

clf_performance(best_lr, "Logistic Regression")

In [None]:
# Grid search for k-nearest neighbours over neighbour count, vote weighting,
# search algorithm and Minkowski power p.
knn = KNeighborsClassifier()
params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree"],
    p=[1, 2],
)

clf_knn = GridSearchCV(knn, params, cv=5, n_jobs=-1, verbose=2)
best_knn = clf_knn.fit(X_train_scaled, y_train)

clf_performance(best_knn, "KNN")

In [None]:
# Grid search for the support vector classifier. Each kernel has its own
# sub-grid, so only the parameters relevant to that kernel are varied.
svc = SVC(probability=True)
rbf_grid = {"kernel": ["rbf"], "gamma": [.1, .5], "C": [.1, 1]}
linear_grid = {"kernel": ["linear"], "C": [.1, 1]}
poly_grid = {"kernel": ["poly"], "degree": [2, 3], "C": [.1, 1]}
params = [rbf_grid, linear_grid, poly_grid]

clf_svc = GridSearchCV(svc, params, cv=5, n_jobs=-1, verbose=True)
best_svc = clf_svc.fit(X_train_scaled, y_train)

clf_performance(best_svc, "SVC")

In [None]:
# Grid search for the random forest. Only max_features is actually varied;
# every other parameter has a single candidate value.
rf = RandomForestClassifier(random_state=1)
params = {"n_estimators": [500],
         "criterion": ["gini"],
         "bootstrap": [True], "max_depth": [20],
         # "auto" was removed from RandomForestClassifier in scikit-learn 1.3;
         # "sqrt" is the value it used to stand for in classifiers.
         "max_features": ["sqrt", 10],
         "min_samples_leaf": [2], "min_samples_split": [2]}

clf_rf = GridSearchCV(rf, params, cv=5, n_jobs=-1, verbose=True)
best_rf = clf_rf.fit(X_train_scaled, y_train)

clf_performance(best_rf, "Random Forest")

In [None]:
# GridSearchCV refits the best parameter combination on the full training set
# (refit=True is the default), so best_estimator_ is already fitted.
# It is bound to its own name rather than overwriting best_rf, because a later
# cell still reads best_rf.best_estimator_, which a bare RandomForestClassifier
# does not have.
best_rf_model = best_rf.best_estimator_
feat_importances = pd.Series(best_rf_model.feature_importances_, index=X_train_scaled.columns)
# Horizontal bar chart of the 20 most important features.
feat_importances.nlargest(20).plot(kind="barh")

In [None]:
# Grid search for the XGBoost classifier over a small grid of tree-count,
# column/row subsampling, regularization and gamma values.
xgb = XGBClassifier(random_state=1)

params = {"n_estimators": [500, 550], "colsample_bytree": [0.75, 0.8],
         "max_depth": [None], "reg_alpha": [1], "reg_lambda": [2, 5],
         "subsample": [0.6, 0.65], "learning_rate": [0.5], "gamma": [0.5, 1],
         "min_child_weight": [0.01], "sampling_method": ["uniform"]}

clf_xgb = GridSearchCV(xgb, params, cv=5, n_jobs=-1, verbose=True)
best_xgb = clf_xgb.fit(X_train_scaled, y_train)

# Label the report with the model that was actually tuned (the original label
# was copied from the random forest cell).
clf_performance(best_xgb, "XGB")

In [None]:
# Predict the test rows with the tuned XGBoost model and write its submission file.
y_pred = best_xgb.best_estimator_.predict(X_test_scaled)
xgb_submission = dict(PassengerId=test.PassengerId, Survived=y_pred)
submission_xgb = pd.DataFrame(xgb_submission)
submission_xgb.to_csv("xgb_submission.csv", index=False)

In [None]:
# Pull the tuned estimators out of their grid searches.
bestLR = best_lr.best_estimator_
bestKNN = best_knn.best_estimator_
bestSVC = best_svc.best_estimator_
# best_rf may be the fitted GridSearchCV or an already-unwrapped estimator;
# accept either so this cell does not fail with an AttributeError.
bestRF = getattr(best_rf, "best_estimator_", best_rf)
bestXGB = best_xgb.best_estimator_

# Four ensembles of increasing size built from the tuned models.
voting_hard = VotingClassifier(estimators = [("knn", bestKNN), ("svc", bestSVC), ("rf", bestRF)], voting="hard")
voting_soft = VotingClassifier(estimators = [("knn", bestKNN), ("svc", bestSVC), ("rf", bestRF)], voting="soft")
voting_all = VotingClassifier(estimators = [("knn", bestKNN), ("svc", bestSVC), ("rf", bestRF), ("lr", bestLR)], voting="soft")
voting_xgb = VotingClassifier(estimators = [("knn", bestKNN), ("svc", bestSVC), ("rf", bestRF), ("lr", bestLR), ("xgb", bestXGB)], voting="soft")

# Cross-validate on the scaled features: the models were tuned on
# X_train_scaled and are later fitted on it, so scores on the unscaled
# X_train would not describe the models that are actually submitted.
# Each ensemble is scored once and the mean is taken from those same folds,
# rather than running a second, equally expensive cross-validation.
for label, ensemble in (("voting_hard", voting_hard),
                        ("voting_soft", voting_soft),
                        ("voting_all", voting_all),
                        ("voting_xgb", voting_xgb)):
    fold_scores = cross_val_score(ensemble, X_train_scaled, y_train, cv=5)
    print(f"{label}: ", fold_scores)
    print(f"{label} mean: ", fold_scores.mean())

In [None]:
# Grid-search the per-estimator vote weights of the hard-voting ensemble.
# Each weight list follows the ensemble's estimator order (knn, svc, rf).
params = {"weights": [[1,1,1], [1,2,1], [1,1,2]]}

vote_weight = GridSearchCV(voting_hard, params, cv=5, n_jobs=-1, verbose=1)
best_weight = vote_weight.fit(X_train_scaled, y_train)
clf_performance(best_weight, "VC Weights")
# NOTE(review): voting_pred is not referenced again in the visible cells.
voting_pred = best_weight.best_estimator_.predict(X_test_scaled)

In [None]:
# Fit every tuned ensemble, plus the stand-alone random forest, on the full
# scaled training set.
for ensemble in (voting_hard, voting_soft, voting_all, voting_xgb):
    ensemble.fit(X_train_scaled, y_train)
bestRF.fit(X_train_scaled, y_train)

# Test-set predictions of each fitted model.
y_pred_hard = voting_hard.predict(X_test_scaled)
y_pred_rf = bestRF.predict(X_test_scaled)
y_pred_soft = voting_soft.predict(X_test_scaled)
y_pred_all = voting_all.predict(X_test_scaled)
y_pred_xgb = voting_xgb.predict(X_test_scaled)

In [None]:
# One submission frame (PassengerId + Survived) per model.
final_rf = {"PassengerId": test.PassengerId, "Survived": y_pred_rf}
submission_rf = pd.DataFrame(final_rf)

final_hard = {"PassengerId": test.PassengerId, "Survived": y_pred_hard}
submission_hard = pd.DataFrame(final_hard)

final_soft = {"PassengerId": test.PassengerId, "Survived": y_pred_soft}
submission_soft = pd.DataFrame(final_soft)

final_all = {"PassengerId": test.PassengerId, "Survived": y_pred_all}
submission_all = pd.DataFrame(final_all)

final_xgb = {"PassengerId": test.PassengerId, "Survived": y_pred_xgb}
submission_xgb = pd.DataFrame(final_xgb)

# Side-by-side comparison of all model predictions. A dict literal keeps only
# the last value for a repeated key, so each model gets its own column name;
# reusing "Survived" for all five would leave only the last model's column.
final_datas = {"PassengerId": test.PassengerId,
               "Survived_rf": y_pred_rf,
               "Survived_hard": y_pred_hard,
               "Survived_soft": y_pred_soft,
               "Survived_all": y_pred_all,
               "Survived_xgb": y_pred_xgb}
comparison = pd.DataFrame(final_datas)


In [None]:
# Write each submission frame to its own CSV file, without the index column.
for frame, csv_name in (
    (submission_rf, "submission_rf.csv"),
    (submission_hard, "submission_hard.csv"),
    (submission_soft, "submission_soft.csv"),
    (submission_all, "submission_all.csv"),
    (submission_xgb, "submission_xgb.csv"),
):
    frame.to_csv(csv_name, index=False)