In [55]:
## import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import classification_report


In [56]:
## Read the three CSV inputs from the working directory:
## the training table, the test table and the gender_submission table.
train_data, test_data, gender_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [57]:
## Add a 'Survived' column to the test data as its second column (position 1),
## taking the values from gender_data['Survived'] (aligned on the row index).
## NOTE(review): the survival rates printed further down (~82.6% of women,
## ~12.9% of men) make these values look like a gender-based placeholder
## rather than recorded outcomes -- confirm before trusting metrics computed
## on rows that came from test.csv.
## The guard keeps the cell re-runnable: DataFrame.insert raises ValueError
## when the column is already present.
if 'Survived' not in test_data.columns:
    test_data.insert(1, 'Survived', gender_data['Survived'])

In [58]:
# Stack the training rows on top of the test rows (axis=0) and let concat
# renumber the result 0..n-1 instead of keeping each frame's own row labels.
concatenated_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [59]:
## preview the first five rows of the combined frame
concatenated_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [60]:
## column dtypes, non-null counts and memory usage of the combined frame
concatenated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   int64  
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


In [61]:
## Count the missing entries in every column (before imputation)
concatenated_data.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [62]:
## Fill the gaps in 'Age' with the median of the observed ages
age_median = concatenated_data['Age'].median()
concatenated_data['Age'] = concatenated_data['Age'].fillna(value=age_median)

In [63]:
## Fill the gaps in 'Fare' with the median of the observed fares
fare_median = concatenated_data['Fare'].median()
concatenated_data['Fare'] = concatenated_data['Fare'].fillna(value=fare_median)

In [64]:
# Most frequent value of the Cabin column (first one if several tie)
mode = concatenated_data['Cabin'].mode().values[0]
# Replace missing Cabin values with that mode. The filled column is assigned
# back to the frame instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form is a chained assignment that pandas
# warns about and that stops updating the frame under Copy-on-Write.
concatenated_data['Cabin'] = concatenated_data['Cabin'].fillna(mode)


In [65]:
# Most frequent value of the Embarked column (first one if several tie)
mode = concatenated_data['Embarked'].mode().values[0]
# Replace missing Embarked values with that mode. The filled column is
# assigned back to the frame instead of calling fillna(..., inplace=True) on
# the selected column: the in-place form is a chained assignment that pandas
# warns about and that stops updating the frame under Copy-on-Write.
concatenated_data['Embarked'] = concatenated_data['Embarked'].fillna(mode)

In [66]:
## Re-count the missing entries per column to confirm the imputation
concatenated_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [67]:
## statistical summary of the dataset
concatenated_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,655.0,0.377387,2.294882,29.503186,0.498854,0.385027,33.281086
std,378.020061,0.484918,0.837836,12.905241,1.041658,0.86556,51.7415
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,22.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,35.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [68]:
## Exploratory data analysis
## pairwise Pearson correlation between the numeric columns
corr_mat = concatenated_data.corr(method='pearson', numeric_only=True)
corr_mat

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.02037,-0.038354,0.025799,-0.055224,0.008942,0.031128
Survived,-0.02037,1.0,-0.26471,-0.043286,0.00237,0.108919,0.233773
Pclass,-0.038354,-0.26471,1.0,-0.377908,0.060832,0.018322,-0.558683
Age,0.025799,-0.043286,-0.377908,1.0,-0.189972,-0.125851,0.178182
SibSp,-0.055224,0.00237,0.060832,-0.189972,1.0,0.373587,0.160349
Parch,0.008942,0.108919,0.018322,-0.125851,0.373587,1.0,0.221635
Fare,0.031128,0.233773,-0.558683,0.178182,0.160349,0.221635,1.0


In [69]:
## Percentage of female passengers whose 'Survived' value is 1
is_female = concatenated_data['Sex'] == 'female'
women = concatenated_data.loc[is_female, 'Survived']
rate_women = int(women.sum()) / women.size * 100

print("% of women who survived:", rate_women)

% of women who survived: 82.61802575107296


In [70]:
## Percentage of male passengers whose 'Survived' value is 1
is_male = concatenated_data['Sex'] == 'male'
men = concatenated_data.loc[is_male, 'Survived']
rate_men = int(men.sum()) / men.size * 100
print("% of men who survived:", rate_men)

% of men who survived: 12.930011862396205


In [71]:
## Turn the 'Sex' strings into integer codes with a LabelEncoder
## (the preview further down shows male -> 1 and female -> 0)
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(concatenated_data['Sex'])
concatenated_data['Sex'] = label_encoder.transform(concatenated_data['Sex'])


In [73]:
# Turn the 'Embarked' strings into integer codes with a fresh LabelEncoder
# (the preview further down shows S -> 2 and C -> 0)
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(concatenated_data['Embarked'])
concatenated_data['Embarked'] = label_encoder.transform(concatenated_data['Embarked'])

In [74]:
## preview the first five rows after imputation and label encoding
concatenated_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,C23 C25 C27,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,C23 C25 C27,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,C23 C25 C27,2


In [75]:
## Separate the model inputs (five feature columns) from the target ('Survived')
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
X = concatenated_data[feature_columns]
y = concatenated_data.loc[:, 'Survived']

In [76]:
## Hold out 30% of the rows for testing and train on the remaining 70%;
## the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=32,
)

In [77]:
## Standard scaling: learn the scaling statistics on the training rows only,
## then apply that same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [78]:
## k-nearest-neighbours classifier using the 3 nearest neighbours,
## fitted on the training split
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X=X_train, y=y_train)

In [79]:
## KNN predictions for the held-out test split
y_pred_knn = knn_model.predict(X_test)

In [80]:
## Fraction of held-out rows the KNN model labelled correctly
knn_accuracy = metrics.accuracy_score(y_test, y_pred_knn)
print('Accurancy:', knn_accuracy)

Accurancy: 0.8447837150127226


In [81]:
## Actual vs. KNN-predicted labels side by side
pd.DataFrame(dict(Actual=y_test, predicted=y_pred_knn))

Unnamed: 0,Actual,predicted
872,0,0
270,0,1
987,1,1
1214,0,1
113,0,1
...,...,...
285,0,0
1170,0,0
1260,0,0
1069,1,1


In [82]:
## decision tree -- three variants compared below.
## random_state is fixed on each one (same seed as the train/test split) so
## repeated runs build the same trees; without it, ties between equally good
## splits are broken at random and the reported accuracies can change from
## run to run.

## gini criterion (the default) with no max depth
decision_tree_gini = DecisionTreeClassifier(random_state=32)

## entropy criterion with no max depth
decision_tree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=32)

## default criterion, limited to a max depth of 3
decision_tree_depth = DecisionTreeClassifier(max_depth=3, random_state=32)

In [83]:
## fit each decision-tree variant on the training split
decision_tree_gini.fit(X_train, y_train)
decision_tree_entropy.fit(X_train, y_train)
decision_tree_depth.fit(X_train, y_train)

In [84]:
## Predict the held-out test split with each decision-tree variant
## (gini, entropy, depth-limited -- in that order)
y_pred_gini, y_pred_entropy, y_pred_depth = (
    tree.predict(X_test)
    for tree in (decision_tree_gini, decision_tree_entropy, decision_tree_depth)
)

In [85]:
## model evaluation: test-split accuracy of each decision-tree variant
for label, predictions in (
    ('Accurancy(gini):', y_pred_gini),
    ('Accurancy(entropy):', y_pred_entropy),
    ('Accurancy(depth):', y_pred_depth),
):
    print(label, metrics.accuracy_score(y_test, predictions))

Accurancy(gini): 0.8498727735368957
Accurancy(entropy): 0.8295165394402035
Accurancy(depth): 0.8676844783715013


In [86]:
## support vector machine: one classifier per kernel
## (linear, sigmoid, rbf -- in that order)
clfLinear, clfsigmoid, clfrbf = (
    svm.SVC(kernel=kernel_name)
    for kernel_name in ('linear', 'sigmoid', 'rbf')
)

In [87]:
## fit each SVM variant on the training split
clfLinear.fit(X_train, y_train)
clfsigmoid.fit(X_train, y_train)
clfrbf.fit(X_train, y_train)

In [88]:
## Predict the held-out test split with each SVM
## (linear, sigmoid, rbf -- in that order)
y_pred_linear, y_pred_sigmoid, y_pred_rbf = (
    classifier.predict(X_test)
    for classifier in (clfLinear, clfsigmoid, clfrbf)
)

In [89]:
## model evaluation: test-split accuracy of each SVM kernel
for label, predictions in (
    ('Accurancy(Linear Kernel):', y_pred_linear),
    ('Accurancy(Sigmoid Kernel):', y_pred_sigmoid),
    ('Accurancy(RBF Kernel):', y_pred_rbf),
):
    print(label, metrics.accuracy_score(y_test, predictions))

Accurancy(Linear Kernel): 0.8727735368956743
Accurancy(Sigmoid Kernel): 0.727735368956743
Accurancy(RBF Kernel): 0.8676844783715013


In [90]:
## per-class precision / recall / f1 for the sigmoid-kernel SVM predictions
sigmoid_report = classification_report(y_true=y_test, y_pred=y_pred_sigmoid)
print(sigmoid_report)

              precision    recall  f1-score   support

           0       0.76      0.80      0.78       235
           1       0.67      0.63      0.65       158

    accuracy                           0.73       393
   macro avg       0.72      0.71      0.71       393
weighted avg       0.73      0.73      0.73       393



In [91]:
## create our Gaussian naive Bayes model
gausian_model = GaussianNB()
## fit it on the training split
gausian_model.fit(X_train, y_train)

In [92]:
## naive Bayes predictions for the held-out test split
y_pred = gausian_model.predict(X_test)

In [93]:
## Actual vs. naive-Bayes-predicted labels side by side
pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))

Unnamed: 0,Actual,Predicted
872,0,0
270,0,0
987,1,1
1214,0,0
113,0,1
...,...,...
285,0,0
1170,0,0
1260,0,0
1069,1,1


In [94]:
## model evaluation
## accuracy: share of test rows where the naive Bayes prediction equals the actual label
nb_accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accurancy:', nb_accuracy)

Accurancy: 0.8473282442748091


In [95]:
## confusion matrix of actual vs. naive-Bayes-predicted labels
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_mat

array([[205,  30],
       [ 30, 128]], dtype=int64)