# Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import scipy

import os
# Print the full path of every file mounted under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Read the training split into a DataFrame and preview its first 40 rows.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
train.iloc[:40]

In [None]:
# Let's first check for missing values: count the NaN entries in each column.
train.isna().sum()

As we can see, Age, Cabin, and Embarked have missing values.

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Age is a very promising variable, so we can use different methods to fill it,
# such as mode, mean, or grouped mean. Here we use the overall mean.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on train['Age']: that call returns None (so the old
# `train1` was always None) and, with pandas Copy-on-Write, it operates on a
# temporary Series and leaves `train` unchanged.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [None]:
# Re-count the missing values per column after the Age imputation.
train.isna().sum()

In [None]:
# We can simply fill the missing Embarked values with the mode (the most frequent port).
# Assign the result back to the column: an inplace fillna on train['Embarked'] acts on a
# temporary Series under pandas Copy-on-Write and would leave `train` unchanged.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

# Exploratory data analysis

In [None]:
# Numeric and categorical column subsets of the training frame.
num = train[['Age','SibSp','Parch','Fare']]
cat = train[['Survived','Pclass','Sex','Ticket','Cabin','Embarked']]

# set_palette applies the palette to subsequent plots; a bare
# sns.color_palette('muted') only returns the colours and changes nothing.
sns.set_palette('muted')

# Bar height is the mean of Survived for each sex. The x tick labels already name
# the two groups, so no legend is added (there is no hue for a legend to describe).
sns.barplot(data=train, x='Sex', y='Survived').set_title("Sex vs Survived")
plt.show()

Notes:

    1. There is a huge disparity in survival rate between the two sexes
    2. We can use this to fill NA values

In [None]:
# Bar height is seaborn's default estimator (the mean) of Survived for each Pclass.
sns.barplot(data=train,x='Pclass',y='Survived')

In [None]:
# Age distribution: histogram with a kernel density estimate overlaid.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement for the histogram + KDE drawing that distplot produced.
sns.histplot(train['Age'].dropna(), bins=50, kde=True, stat='density')

As we can see most people are in the age group of 30-40

In [None]:
# Age histogram with the bars stacked by the Survived value.
sns.histplot(data=train,x='Age',hue='Survived',multiple = 'stack')

In [None]:
# Pairwise relationships between the columns of train, coloured by Survived.
sns.pairplot(train, hue='Survived')

In [None]:
# One stacked Age histogram per passenger class (1, 2, 3 from left to right),
# with the bars split by the Survived value.
plot, ax = plt.subplots(1, 3, figsize=(15, 5))
for panel, pclass in zip(ax, (1, 2, 3)):
    class_rows = train.loc[train['Pclass'] == pclass]
    hist = sns.histplot(
        data=class_rows,
        x='Age',
        hue="Survived",
        multiple="stack",
        binwidth=7,
        ax=panel,
    )
    hist.set_title(f"Pclass - {pclass}")
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side stacked Age histograms: male passengers on the left, female on the right.
plot, ax = plt.subplots(1, 2, figsize=(15, 5))
male_panel, female_panel = ax
male_rows = train.loc[train['Sex'] == 'male']
female_rows = train.loc[train['Sex'] == 'female']
sns.histplot(
    data=male_rows, x='Age', hue='Survived', multiple='stack', binwidth=7, ax=male_panel
).set_title('Male')
sns.histplot(
    data=female_rows, x='Age', hue='Survived', multiple='stack', binwidth=7, ax=female_panel
).set_title('Females')


The number of females who survived is much greater than the number of males who survived.

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
# The encoded Series is assigned back to the column: an inplace replace on
# train['Sex'] acts on a temporary Series under pandas Copy-on-Write and would
# leave the strings in `train`.
# Like Series.replace, labels without a code are passed through untouched, so
# running this cell again on already-encoded data is harmless.
sex_codes = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(lambda label: sex_codes.get(label, label))

The Titanic left Southampton, calling first at Cherbourg, France, and then at Queenstown, so to make the column numeric we encode the ports in that order:

    0 = Southampton
    1 = Cherbourg
    2 = Queenstown

In [None]:
# Encode Embarked numerically: S -> 0, C -> 1, Q -> 2.
# The encoded Series is assigned back to the column: an inplace replace on
# train['Embarked'] acts on a temporary Series under pandas Copy-on-Write and
# would leave the strings in `train`.
# Like Series.replace, labels without a code are passed through untouched, so
# running this cell again on already-encoded data is harmless.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(lambda label: embarked_codes.get(label, label))

In [None]:
# Display the frame to inspect the encoded Sex and Embarked columns.
train

In [None]:
# Count the remaining missing values per column.
train.isna().sum()

In [None]:
# These columns do not contribute to the model in any way, so remove them from train.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train.drop(columns=unused_columns, inplace=True)

In [None]:
# Summarise the remaining columns: dtypes and non-null counts.
train.info()

In [None]:
# Pairwise correlation matrix of the columns left in train,
# drawn as an annotated heatmap on a 10x10 figure.
htmp = train.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(data=htmp, annot=True)

The strongest positive correlations are with **Age**, **Sex**, **Embarked**, in that order


# Model Building

In [None]:
import sklearn

In [None]:
# Target vector (Survived) and feature matrix (every other column).
y = train['Survived']
X = train.drop(columns='Survived')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator.
np.random.seed(45)

# Hold out 20% of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Baseline: a random forest with default hyperparameters and a fixed seed.
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# Mean accuracy on the held-out split.
model.score(X_test, y_test)

In [None]:
#This is a function to test the accuracy of a classifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
def classifier_accuracy(model, X=None, y=None):
    """Print and plot evaluation diagnostics for a fitted classifier.

    Shows, in order: the ROC curve with its AUC, the confusion matrix as a
    heatmap, and the text classification report.

    Parameters
    ----------
    model
        Fitted estimator (or fitted search object) exposing ``predict`` and
        ``predict_proba``.
    X
        Features to evaluate on. Defaults to the notebook-level ``X_test``.
    y
        True labels for ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        Everything is reported through printed text and matplotlib figures.
    """
    # Fall back to the notebook's held-out split so one-argument calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    # The ROC curve, built from the predicted probability of the positive class
    print("The ROC: ")
    y_probs_positive = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, y_probs_positive)
    plt.plot(fpr, tpr, color="orange", label='ROC')
    plt.plot([0, 1], [0, 1], label="No predictive Power Line", linestyle='--')
    print(f"AUC: {roc_auc_score(y, y_probs_positive)*100:.2f}%")
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    plt.legend()
    plt.show()

    # The confusion matrix (rows are true labels, columns are predicted labels)
    print("The Confusion Matrix: ")
    y_preds = model.predict(X)
    conf_mat = confusion_matrix(y, y_preds)
    sns.heatmap(data=conf_mat, annot=True, fmt='d')
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()

    # Classification report
    print("Classification Report:-\n")
    print(classification_report(y, y_preds))


classifier_accuracy(model)

This is our baseline model; we shall try to tune its hyperparameters to make it more accurate.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter values the randomized search samples from.
grid={
    "n_estimators":[100,150,200,250,300,350,400],
    "max_depth":[1,2,3,4,5,6,7,8],
    'min_samples_leaf':[1,2,4]
}

# Try 10 sampled combinations, each scored with 5-fold cross-validation.
# random_state pins which combinations are drawn; without it the draw depends on
# the state of NumPy's global generator at the moment fit() is called, so
# re-running cells out of order would tune (and submit) a different model.
rs_model=RandomizedSearchCV(
    estimator=model,
    param_distributions=grid,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=45,
)

In [None]:
# Run the randomized search on the training split (n_iter=10 candidates, cv=5 folds each).
rs_model.fit(X_train,y_train)

In [None]:
# Predictions of the fitted randomized-search object on the held-out split.
rs_model.predict(X_test)

In [None]:
# Same diagnostics as for the baseline model, now for the randomized-search model.
classifier_accuracy(rs_model)

In [None]:
from sklearn.model_selection import GridSearchCV

# Every combination of these values is evaluated with 5-fold cross-validation.
gs_grid = dict(
    n_estimators=list(range(100, 401, 50)),
    max_depth=list(range(1, 9)),
    min_samples_leaf=[2, 4],
)
gs_model = GridSearchCV(model, gs_grid, cv=5, verbose=2)

In [None]:
# Exhaustive search: 7 x 8 x 2 = 112 parameter combinations, each cross-validated with cv=5.
gs_model.fit(X_train,y_train)

In [None]:
# Predictions of the fitted grid-search object on the held-out split.
gs_model.predict(X_test)

In [None]:
# Same diagnostics as for the baseline model, now for the grid-search model.
classifier_accuracy(gs_model)

In [None]:
# Best cross-validated score found by the grid search.
gs_model.best_score_

In [None]:
# Load the competition test split and show its (rows, columns) shape.
test=pd.read_csv('/kaggle/input/titanic/test.csv')
test.shape

In [None]:
# Apply to the test split the same preparation that was applied to train.
# Every transformed column is assigned back: inplace replace/fillna on
# test['col'] acts on a temporary Series under pandas Copy-on-Write and would
# leave `test` unchanged.

# Same codes as used for train. Like Series.replace, labels without a code are
# passed through untouched.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
test['Sex'] = test['Sex'].map(lambda label: sex_codes.get(label, label))
test['Embarked'] = test['Embarked'].map(lambda label: embarked_codes.get(label, label))

# Drop the same columns that were removed from train.
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing Age and Fare with the column mean.
# NOTE(review): these means are computed on the test split itself, as in the
# original cell; consider reusing the training means so that both splits are
# imputed with the same values.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Re-read the raw test file: PassengerId was dropped from `test` but is needed for the submission.
test2 = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')

# Predict survival for the prepared test rows with the randomized-search model.
pred = rs_model.predict(test)

In [None]:
# Pair each PassengerId with its predicted Survived value and write the submission file.
submission_columns = {'PassengerId': test2['PassengerId'], 'Survived': pred}
output = pd.DataFrame(data=submission_columns)
output.to_csv(path_or_buf='submission.csv', index=False)
print("Predictions Saved")