In [1]:
# scikit-learn >= 0.20 is required
import re

import sklearn

# Compare the numeric (major, minor) components. Comparing the raw version
# strings is lexicographic, so e.g. "0.9" would wrongly satisfy >= "0.20".
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert (_sk_major, _sk_minor) >= (0, 20), "scikit-learn >= 0.20 is required"

import numpy as np
import pandas as pd

# to plot figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for every figure: 14 for axis labels, 12 for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
# Directory containing the input CSV files; default `path` of load_housing_data.
PATH = '/kaggle/input/titanic/'

In [None]:
# function to read the input
import os  # required by os.path.join below; it was missing, causing a NameError

import pandas as pd

def load_housing_data(filename, path = PATH):
    """Read one CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, e.g. "train.csv".
    path : str, optional
        Directory that contains the file. Defaults to the module-level PATH.

    Returns
    -------
    pandas.DataFrame
        The result of pd.read_csv on the joined path.
    """
    csv_path = os.path.join(path, filename)
    return pd.read_csv(csv_path)

In [None]:
# Read the three input files, in order: training set, test set, sample submission.
train_data, test_data, gen_data = (
    load_housing_data(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

In [None]:
# (rows, columns) of each loaded frame, one per line
print(train_data.shape, test_data.shape, gen_data.shape, sep="\n")

In [None]:
# Show the first row of the training frame.
train_data.head(1)

In [None]:
# Show the first row of the test frame.
test_data.head(1)

In [None]:
# # print the shape of the transformed train data
# print(train_labels.shape)
# print(train_data.shape)

In [None]:
# Print the DataFrame.info() summary of the training frame.
train_data.info()

In [None]:
# Frequency of each distinct value in the Pclass column.
train_data["Pclass"].value_counts()

In [None]:
# Frequency of each distinct value in the Embarked column.
train_data["Embarked"].value_counts()

In [None]:
# Frequency of each distinct value in the Parch column.
train_data["Parch"].value_counts()

In [None]:
# Frequency of each distinct value in the SibSp column.
train_data["SibSp"].value_counts()

In [None]:
# Frequency of each distinct value in the Sex column.
train_data["Sex"].value_counts()

Note: the Cabin column has a lot of missing values.

Note: similarly, the Age column has some missing values.

Step: remove the Cabin column from the training data set.

Step: fill the missing entries of the Age column with the median age.

## Transformation of the dataset

1) Drop the Cabin column

2) Fill the missing values in the Age column with the median

3) One-hot encode the Sex column

4) One-hot encode the Pclass column

5) One-hot encode the Parch column

6) One-hot encode the SibSp column

In [None]:
# Separate the target: keep an independent copy of "Survived" as the labels,
# then rebind train_data to a frame without that column.
train_labels = train_data.loc[:, "Survived"].copy()
train_data = train_data.drop(columns="Survived")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# transformer that keeps only the PassengerId column of the data
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects the "PassengerId" column.

    It has no hyper-parameters and therefore no __init__. The previous
    constructor only printed a debug message, which was emitted every time
    an instance was created.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return self so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame holding X["PassengerId"].

        X must support column selection by label list (e.g. a pandas
        DataFrame) and contain a "PassengerId" column.
        """
        # Double brackets keep the result two-dimensional (a frame, not a Series).
        return X[["PassengerId"]]
        

In [None]:
# class FillNan(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         #do nothing
#         print("FillNan")
    
#     def fit(self, X, y=None):
#         return self
    
#     def tranform(self, X, y=None):
#         X["Age"].fillna(-1)
#         X["Fare"].fillna(-1)

In [None]:
# Column groups consumed by the preprocessing ColumnTransformer below.
all_attribs = ["PassengerId"]
num_attribs = ["Age", "Fare"]
cat_atribs = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]

# ColumnTransformer applies every entry to the ORIGINAL input columns, side by
# side; entries are not chained. Listing the imputer and the scaler as separate
# entries therefore scaled input columns 0 and 1 instead of the imputed
# Age/Fare values. Imputation followed by scaling has to be a nested Pipeline.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scalar", StandardScaler()),
])

# Fill missing categorical values with the most frequent one before encoding,
# and ignore categories that were not seen during fit, so the fitted pipeline
# can later transform new data into the same column layout.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("custom_transformer", CustomTransformer(), all_attribs),
    ("cat", cat_pipeline, cat_atribs),
], remainder="drop", sparse_threshold=0)  # sparse_threshold=0: always return a dense array

# Shape before and after preprocessing. Note that train_data is rebound to the
# transformed array, so this cell must not be re-run without reloading the data.
print(train_data.shape)
train_data = pipeline.fit_transform(train_data)
print(train_data.shape)

# Train Model

In [None]:
from sklearn.linear_model import SGDClassifier

# Fix the seed so the fitted model (and every score derived from it) is
# reproducible across runs; 42 matches the seed used elsewhere in the notebook.
sgd_cls = SGDClassifier(random_state=42)
sgd_cls.fit(train_data, train_labels)

In [None]:
# Predictions on the same data the classifier was fitted on, i.e. these are
# training-set predictions rather than held-out ones.
train_predict = sgd_cls.predict(train_data)

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix
# Signature is confusion_matrix(y_true, y_pred): true labels first, so that
# rows are the actual classes and columns the predicted ones.
confusion_matrix(train_labels, train_predict)

In [None]:
# implementing custom cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(train_data, train_labels):
    # Fresh, unfitted copy of the classifier for every fold.
    clone_clf = clone(sgd_cls)
    train_data_folds = train_data[train_index]
    test_data_folds = train_data[test_index]
    # split() yields positional indices, so select the labels by position
    # (.iloc). Plain [] on a Series is label-based and only happened to work
    # because the index is the default 0..n-1 range.
    train_labels_folds = train_labels.iloc[train_index]
    test_labels_folds = train_labels.iloc[test_index]

    clone_clf.fit(train_data_folds, train_labels_folds)
    prediction = clone_clf.predict(test_data_folds)
    # Accuracy on the held-out fold: fraction of matching predictions.
    n_correct = np.sum(prediction == test_labels_folds.to_numpy())
    print("score:", (n_correct)/len(prediction))

In [None]:
from sklearn.model_selection import cross_val_predict
# Predictions for the SGD classifier obtained through cross_val_predict with cv=3.
train_pred = cross_val_predict(sgd_cls, train_data, train_labels, cv=3)

In [None]:
# confusion_matrix expects (y_true, y_pred): true labels first, so that rows are
# the actual classes and columns the predicted ones.
confusion_matrix(train_labels, train_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed; train_pred is rebound to its cv=3
# cross_val_predict output, replacing the SGD predictions stored under that name.
forest_cls = RandomForestClassifier(random_state=42)
train_pred = cross_val_predict(forest_cls, train_data, train_labels, cv=3) 

In [None]:
# confusion_matrix expects (y_true, y_pred): true labels first, so that rows are
# the actual classes and columns the predicted ones.
confusion_matrix(train_labels, train_pred)

In [None]:
# we will settle for the RandomForestClassifier
from sklearn.metrics import precision_recall_curve

# method="predict_proba" makes cross_val_predict return the classifier's
# predict_proba output instead of class labels.
train_pred_forest = cross_val_predict(forest_cls, train_data, train_labels, cv=3, method="predict_proba")
print(train_pred_forest)

In [None]:
# Turn the probabilities into a single score per sample by keeping only the
# last column of the predict_proba output.
train_pred_scores = train_pred_forest[:,-1]

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# ROC curve points computed from the true labels and the per-sample scores.
fpr, tpr, thresholds = roc_curve(train_labels, train_pred_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve with pyplot.

    Plots `tpr` against `fpr`, adds a dashed black diagonal from (0, 0) to
    (1, 1), fixes both axes to [0, 1], labels them and switches the grid on.
    `label` is forwarded as the label of the curve.
    """
    unit_range = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(unit_range, unit_range, 'k--')  # reference diagonal
    plt.axis(unit_range * 2)                 # [xmin, xmax, ymin, ymax]
    for set_axis_label, text in (
        (plt.xlabel, 'False Positive Rate (Fall-Out)'),
        (plt.ylabel, 'True Positive Rate (Recall)'),
    ):
        set_axis_label(text, fontsize=16)
    plt.grid(True)

# recall_90_precision was used below without ever being defined (NameError).
# Compute it here: the recall at the first point of the precision/recall curve
# whose precision is at least 90%.
from sklearn.metrics import precision_recall_curve

precisions, recalls, pr_thresholds = precision_recall_curve(train_labels, train_pred_scores)
recall_90_precision = recalls[np.argmax(precisions >= 0.90)]

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
# False positive rate at the first ROC point whose recall reaches recall_90_precision.
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]
# Dotted red guide lines from both axes to that point, plus a marker on it.
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")
plt.plot([fpr_90], [recall_90_precision], "ro")
plt.show()

In [None]:
test_data = load_housing_data("test.csv")
forest_cls.fit(train_data, train_labels)
# Only transform the test set with the preprocessing already fitted on the
# training data. Calling fit_transform here would re-learn medians, scaling and
# one-hot categories from the test set, so its columns would no longer line up
# with the features the model was trained on. If transform raises on a category
# unseen during fit, fix the encoder settings in the pipeline definition rather
# than refitting.
test_data_transformed = pipeline.transform(test_data)
test_predict = forest_cls.predict(test_data_transformed)