# Loading the necessary libraries 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.preprocessing import MinMaxScaler

import os
# Walk the read-only Kaggle input directory and echo the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the dataset into the jupyter notebook

In [None]:
# Read the Titanic training CSV from the Kaggle input mount into a DataFrame.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")

# Data Overview

In [None]:
# Column labels of the describe() summary table for the raw frame.
train.describe().columns

In [None]:
# Print pandas' structural summary of the raw frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Show the first rows of the raw frame to get a feel for the values.
train.head()

> # EDA (Exploratory Data Analysis)

In [None]:
import seaborn as sns

# Number of passengers per outcome (0 = died, 1 = survived).
# countplot tallies the rows itself; the previous barplot paired a per-passenger
# x vector with a two-element value_counts() y vector, so the two did not line up.
sns.countplot(x='Survived', data=train, color='orange');

##### The number of passengers who died is nearly double the number who survived the Titanic.

In [None]:
import matplotlib.pyplot as plt

# Histogram of ticket class.
plt.hist(train['Pclass'], color='cyan')
# One tick per distinct class; passing the whole column would register a tick
# for every passenger row.
plt.xticks(sorted(train['Pclass'].unique()))
plt.xlabel('Pclass')
plt.ylabel('Passengers');

##### Most passengers travelled in Pclass 3, and the fare for that class was probably the most affordable.

In [None]:
# Passengers per port of embarkation; the counts are computed once and reused for both axes.
embarked_counts = train['Embarked'].value_counts()
sns.barplot(x=embarked_counts.index, y=embarked_counts, color='#66FF00')

##### The majority of passengers boarded the Titanic at port S.

In [None]:
# Age distribution split by outcome.
# Map the 0/1 flag to readable labels and let seaborn build the legend itself.
# Overwriting the legend afterwards with hand-typed labels relies on the draw
# order of the bars and can attach a label to the wrong group.
outcome = train['Survived'].map({0: 'Not survived', 1: 'Survived'}).rename('Outcome')
sns.histplot(data=train, x='Age', hue=outcome, hue_order=['Not survived', 'Survived'])
plt.xlabel('Age');

##### The histogram depicts the age distribution of the passengers onboard and their survival numbers.
##### The highest survival counts are among passengers aged 18 to 35, and the distribution seems left skewed, i.e. young adults had better chances of survival than older passengers and infants.

##### The infants between the age of 0 to 4 had high chances of not surviving along with the senior citizens aged above 75.

In [None]:
# Age histograms of survivors vs. non-survivors, one panel per sex.
# sns.histplot replaces sns.distplot(kde=False): distplot is deprecated in seaborn,
# and histplot is already used elsewhere in this notebook.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
women = train[train['Sex'] == 'female']
men = train[train['Sex'] == 'male']

# Left panel: female passengers. Explicit colors and a low alpha keep the two
# overlaid histograms distinguishable.
ax = sns.histplot(women[women['Survived'] == 1].Age, bins=18, label='Survived',
                  color='C0', alpha=0.4, ax=axes[0])
ax = sns.histplot(women[women['Survived'] == 0].Age, bins=40, label='Not survived',
                  color='C1', alpha=0.4, ax=axes[0])
ax.legend()
ax.set_title('Female')

# Right panel: male passengers, same binning as the left panel.
ax = sns.histplot(men[men['Survived'] == 1].Age, bins=18, label='Survived',
                  color='C0', alpha=0.4, ax=axes[1])
ax = sns.histplot(men[men['Survived'] == 0].Age, bins=40, label='Not Survived',
                  color='C1', alpha=0.4, ax=axes[1])
ax.legend()
ax.set_title('Male');

##### This distribution plot illustrates that females aged between 15 and 35 were more likely to survive, whereas males between the ages of 30 and 40 were more likely to live.
##### And mostly women have higher survival chances as compared to males on the ship.
##### Here infants also have higher probability to survive in both cases.

#### Average survival rate according to all the numerical data points

In [None]:
# Passengers per number of siblings/spouses aboard.
# x and y are passed by keyword (recent seaborn releases no longer accept them
# positionally) and both come from the same value_counts() result.
sibsp_counts = train['SibSp'].value_counts()
sns.barplot(x=sibsp_counts.index, y=sibsp_counts.values, color='#FF404C')
plt.xlabel('SibSp')
plt.ylabel('Passengers');

In [None]:
# Passengers per number of parents/children aboard.
# Labels and heights are both taken from one value_counts() result. Pairing
# unique() (order of first appearance) with value_counts() (order of frequency)
# put counts under the wrong Parch value.
parch_counts = train['Parch'].value_counts()
sns.barplot(x=parch_counts.index, y=parch_counts.values, color='#773277')
plt.xlabel('Parch')
plt.ylabel('Passengers');

In [None]:
# Mean of each numeric feature for non-survivors (0) and survivors (1).
train.pivot_table(index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
# Pairwise correlation of the numeric columns, drawn as a lower-triangle heatmap.
# Restricting to numeric dtypes first keeps corr() working on pandas versions that
# raise on string columns (Name, Sex, Ticket, ...) instead of silently dropping them.
corr = train.select_dtypes(include='number').corr()

# Boolean mask hiding the diagonal and the upper triangle, which only mirror
# the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(7, 5))
    # vmax=.3 saturates the color scale at 0.3 so weak correlations stay visible.
    ax = sns.heatmap(corr, annot=True, mask=mask, vmax=.3, square=True, cmap='Blues')

In [None]:
# The heatmap above illustrates that the Survived column (the target) is somewhat
# correlated with the fare, so the three panels below relate class, fare and survival.

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
fig.subplots_adjust(hspace=0.5)

ax_class_survival, ax_class_fare, ax_survival_fare = ax

# Mean survival per class, mean fare per class, mean fare per outcome.
sns.barplot(data=train, x='Pclass', y='Survived', palette="Blues_r", ax=ax_class_survival)
sns.barplot(data=train, x='Pclass', y='Fare', palette="Blues_r", ax=ax_class_fare)
sns.barplot(data=train, x='Survived', y='Fare', palette="Blues_r", ax=ax_survival_fare)

# The fare is highest for Pclass 1 and decreases for the 2nd and 3rd classes, which might indirectly imply that the richer passengers who travelled in 1st class had higher chances of survival, as fare is the attribute most correlated with survival in the dataset.

##### The barplots above indicate that people from Pclass 1 had the highest survival chances and paid the highest fare, and that people paying a higher fare had nearly double the survival rate of those who travelled in a lower class on a cheaper ticket.

In [None]:
# Share (in percent of all passengers) of every Sex x Survived combination,
# laid out with Sex as rows and Survived as columns.
sex_outcome_share = train[['Sex', 'Survived']].value_counts(normalize=True)
sex_outcome_share.mul(100).unstack()

In [None]:
# Grid of age histograms: one row per outcome, one column per class, colored by sex.
g = sns.FacetGrid(data=train, row='Survived', col='Pclass')
g.map_dataframe(sns.histplot, x="Age", hue="Sex", bins=20)
g.add_legend();

In [None]:
# Percentage of passengers of each sex.
train['Sex'].value_counts(normalize=True).mul(100)

In [None]:
# Percentage of passengers per outcome (0 = died, 1 = survived).
train['Survived'].value_counts(normalize=True).mul(100)

In [None]:
# Column labels currently present in the frame.
train.columns

In [None]:
# Build a separate frame with a `relatives` column (siblings/spouses + parents/children)
# so that `train` itself is left untouched.
data = train.assign(relatives=train['SibSp'] + train['Parch'])

# Plotting a point plot

sns.pointplot(data=data, x='relatives', y='Survived')

# A point plot shows an estimate of central tendency for a numeric variable as the
# position of each point, and indicates the uncertainty around that estimate
# with error bars.


##### Here we check how the number of relatives aboard relates to survival. It turns out that passengers with 1 to 3 relatives had better survival rates than passengers with any other number of relatives.

> # Data Preparation

In [None]:
# Per-column count and percentage of missing values.
is_null = train.isna()
missing = is_null.sum()
per = missing / is_null.count() * 100

# Side-by-side table, rounded to three decimals, with the percentage shown as text.
df = pd.concat([missing, per], axis=1, keys=['Total', 'Percentage']).round(3)
df['Percentage'] = df['Percentage'].map(str) + '%'
df

##### The table shows the total number and the percentage of null values for each column in the dataset. The Cabin attribute has the highest number of null values, followed by the Age attribute.

In [None]:
# Fill missing ages with the column median, which is less sensitive to outliers than the mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `train.Age` is chained assignment, which pandas warns about and which does not
# update `train` when Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(train['Age'].median())

In [None]:
# Fill missing ports of embarkation with the most frequently occurring port.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `train.Embarked` is chained assignment, which pandas warns about and which does
# not update `train` when Copy-on-Write is enabled.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

> # Feature Selection

In [None]:
# Drop PassengerId: it is not considered relevant to a passenger's survival chances.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns=['PassengerId'], errors='ignore')

In [None]:
# Number of distinct ticket values, to judge whether Ticket is usable as a categorical feature.
train.Ticket.nunique()

In [None]:
# With 681 distinct tickets in the dataset, turning Ticket into a categorical feature is impractical,
# and it is not considered helpful for predicting an individual's survival, so it is dropped too.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns=['Ticket'], errors='ignore')

In [None]:
# Drop the Cabin column.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns=['Cabin'], errors='ignore')

In [None]:
# The same is true for passenger names, so the Name column is dropped as well.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns=['Name'], errors='ignore')

> # Feature Engineering

In [None]:
# Replace the raw age with an ordinal age-group code:
#
#   0 to 20   -> young          -> 0
#   21 to 40  -> adult          -> 1
#   41 to 60  -> old            -> 2
#   61 to 100 -> senior citizen -> 3

# Whole years, taken once; every mask below is evaluated against this snapshot.
age_years = train['Age'].astype('int')

age_group = age_years.copy()
age_group[age_years <= 20] = 0
age_group[age_years.between(21, 40)] = 1
age_group[age_years.between(41, 60)] = 2
age_group[age_years.between(61, 100)] = 3

train['Age'] = age_group
train.head(20)

In [None]:
# Convert Embarked from a categorical label to a numeric code (S -> 0, C -> 1, Q -> 2).
# Writing integers into the string column with .loc leaves a mixed-type column
# whose dtype never becomes numeric; mapping and casting yields a true integer column.
# Values that are not a known port letter (e.g. codes from an earlier run of this
# cell) pass through unchanged, so the cell can be re-run safely.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = (
    train['Embarked']
    .map(lambda port: embarked_codes.get(port, port))
    .astype(int)
)
train.head()

In [None]:
# Convert Sex from a categorical label to a numeric code (male -> 1, female -> 0).
# Mapping and casting yields a true integer column, rather than the mixed-type
# column that .loc writes of integers into a string column leave behind.
# Values that are not a known label (e.g. codes from an earlier run of this cell)
# pass through unchanged, so the cell can be re-run safely.
sex_codes = {'male': 1, 'female': 0}
train['Sex'] = (
    train['Sex']
    .map(lambda label: sex_codes.get(label, label))
    .astype(int)
)
train.head()

In [None]:
# Rescale Fare with min-max scaling.
# NOTE(review): the scaler is fitted on the full frame, before the train/test split
# performed later in this notebook; confirm whether that is acceptable here.
scaler = MinMaxScaler()
train['Fare'] = scaler.fit_transform(train[['Fare']])[:, 0]

> # Building machine Learning models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Target vector and feature matrix (every remaining column except the target).
y = train['Survived']
X = train.drop(columns=['Survived'])

In [None]:
# Sanity check: X and y should have the same number of rows.
print(X.shape)
y.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training features and training target, one per line.
print(X_train.shape, y_train.shape, sep="\n")

In [None]:
# Support-vector classifier with default settings, scored by 5-fold cross-validation
# on the training split; the mean fold accuracy is reported in percent.
svc = SVC()
acc_svc = cross_val_score(estimator=svc, X=X_train, y=y_train, cv=5)
svc_score = 100 * acc_svc.mean()
print("Accuracy: " + str(svc_score))

In [None]:
# Logistic regression with default settings, scored by 5-fold cross-validation
# on the training split; the mean fold accuracy is reported in percent.
lr = LogisticRegression()
acc_lr = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)
lr_score = 100 * acc_lr.mean()
print("Accuracy : " + str(lr_score))

In [None]:
# Gaussian naive Bayes, scored by 5-fold cross-validation on the training split;
# the mean fold accuracy is reported in percent.
nb = GaussianNB()
acc_nb = cross_val_score(estimator=nb, X=X_train, y=y_train, cv=5)
nb_score = 100 * acc_nb.mean()
print("accuracy : " + str(nb_score))

In [None]:
# k-nearest-neighbours classifier with default settings, scored by 5-fold
# cross-validation on the training split; the mean fold accuracy is reported in percent.
knn = KNeighborsClassifier()
acc_knn = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=5)
knn_score = 100 * acc_knn.mean()
print("Accuracy : " + str(knn_score))

In [None]:
# Decision tree, scored by 5-fold cross-validation on the training split;
# the mean fold accuracy is reported in percent.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible across runs instead of changing with the estimator's internal randomness.
dt = tree.DecisionTreeClassifier(random_state=42)
acc_dt = cross_val_score(dt, X_train, y_train, cv=5)
dt_score = acc_dt.mean() * 100
print("Accuracy : " + str(dt_score))

In [None]:
# Fit the decision tree on the full training split and draw it on a large canvas.
dt = dt.fit(X_train, y_train)
plt.figure(figsize=(20, 20))
plot_tree(dt);

In [None]:
# Random forest, scored by 5-fold cross-validation on the training split;
# the mean fold accuracy is reported in percent.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible across runs instead of changing with the forest's bootstrap sampling.
rf = RandomForestClassifier(random_state=42)
acc_rf = cross_val_score(rf, X_train, y_train, cv=5)
rf_score = acc_rf.mean() * 100
print("Accuracy : " + str(rf_score))

In [None]:
# Collect the mean cross-validation accuracy of every model into one comparison table.
model_names = ['SVC', 'LOGISTIC REGRESSION', 'NAIVE BAYES', 'K NEAREST NEIGHBOUR', 'DECISION TREE', 'RANDOM FOREST']
model_scores = [svc_score, lr_score, nb_score, knn_score, dt_score, rf_score]
accuracy = {'Model': model_names, 'Accuracy': model_scores}

df = pd.DataFrame(accuracy)
# Round to two decimals, then render as text with a percent sign.
df['Accuracy'] = df['Accuracy'].round(2).map(str) + '%'
df