### 20MAI0077 - Vivek Dadhich
> [Github repo Link](https://github.com/vivek20dadhich/dwm-ELA-CSE5021)

In [3]:
#import required packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

# Metrics
from sklearn.metrics import accuracy_score

In [4]:
# Load the Titanic passenger table into `train` and preview its first five rows.
# NOTE(review): the path below is absolute and specific to one machine; the
# notebook will only run elsewhere if the CSV is available at this location.
train = pd.read_csv(
    filepath_or_buffer='C:/Users/Vivek/Desktop/Machine Learning Techniques/titanic_data.csv'
)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Keep only the response (Survived) and the four predictors used for modelling.
# .copy() makes `df` an independent frame: later cells assign into its columns,
# and doing that on a bare selection of `train` triggers pandas'
# SettingWithCopyWarning.
df = train[['Survived','Pclass','Sex','Age','Fare']].copy()

### Feature engineering #1

In [7]:
# Binary-encode Sex with a vectorised comparison: 'male' -> 1, anything else -> 0.
# NOTE: running this cell a second time maps every row to 0, because the
# column no longer contains the string 'male'.
is_male = df['Sex'] == 'male'
df['Sex'] = is_male.astype('int64')

### Feature engineering #2

In [8]:

# Data imputation: audit missing values, then fill the gaps in Age.

# Null count per column (the recorded output shows Age as the only column with gaps).
print(df.isna().sum())

# Replace missing ages with the median of the observed ages.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

# Number of missing ages remaining after the fill.
print(df['Age'].isna().sum())

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64
0


In [9]:
# Preview the first five rows of the encoded and imputed frame.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


### Set the predictor and response variable

In [10]:
# Response: the survival label. Predictors: every remaining column of df.
Y = df['Survived']
X = df.drop(columns=['Survived'])

### Splitting into training and test sets with `train_test_split`

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

## *Support Vector Machine*

In [21]:
# Fit a linear support vector classifier on the training split.
# random_state is fixed (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the same fitted model.
linear_svc = LinearSVC(random_state=42)
linear_svc.fit(X_train, Y_train)

# Predictions for the held-out split; the confusion-matrix, accuracy and
# classification-report cells below all read Y_pred.
Y_pred = linear_svc.predict(X_test)

# Accuracy (in percent, 2 d.p.) measured on the held-out split rather than on
# the training rows, so the value describes generalisation and matches what
# the later accuracy cell reports for this model.
acc_linear_svc = round(linear_svc.score(X_test, Y_test) * 100, 2)

### Confusion matrix

In [22]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the linear SVM on the held-out split.
# The result is stored under its own name: assigning it back to
# `confusion_matrix` would replace the imported function with an array and
# make every later call to confusion_matrix(...) fail.
cfm_svc = confusion_matrix(Y_test, Y_pred)
cfm_svc

array([[130,  27],
       [ 21,  90]], dtype=int64)

In [23]:
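# 130 -> true negatives  (did not survive, and the model predicted they would not)
# 27  -> false positives (did not survive, but the model predicted they would)
# 21  -> false negatives (survived, but the model predicted they would not)
# 90  -> true positives  (survived, and the model predicted they would)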

### Accuracy Score

In [28]:
# Held-out accuracy of the linear SVM, expressed as a percentage with two decimals.
acc_linear_svc = round(
    accuracy_score(y_true=Y_test, y_pred=Y_pred) * 100,
    2,
)
acc_linear_svc

82.09

### Classification report 

In [25]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the linear SVM on the held-out split.
report = classification_report(y_true=Y_test, y_pred=Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.83      0.84       157
           1       0.77      0.81      0.79       111

    accuracy                           0.82       268
   macro avg       0.82      0.82      0.82       268
weighted avg       0.82      0.82      0.82       268



<br>

## *Decision Tree*

In [27]:
# Fit a decision tree on the training split.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the reported score, is reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)

# Predictions for the held-out split; reused by the confusion-matrix and
# classification-report cells for this model.
Y_pred_dt = decision_tree.predict(X_test)

# Held-out accuracy as a percentage with two decimals.
acc_decision_tree = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
acc_decision_tree

75.75

### Confusion matrix for decision tree classifier

In [30]:
# Imported in this cell so that `confusion_matrix` is guaranteed to refer to
# the sklearn function at the point of the call.
from sklearn.metrics import confusion_matrix

# Confusion matrix of the decision tree on the held-out split.
cfm = confusion_matrix(y_true=Y_test, y_pred=Y_pred_dt)
print(cfm)

[[127  30]
 [ 35  76]]


### Classification report for decision tree classifier

In [31]:
# Per-class precision, recall, F1 and support for the decision tree on the held-out split.
# NOTE(review): the variable is named report_nb although it holds the
# decision-tree report; the name is left unchanged here.
report_nb = classification_report(y_true=Y_test, y_pred=Y_pred_dt)
print(report_nb)

              precision    recall  f1-score   support

           0       0.78      0.81      0.80       157
           1       0.72      0.68      0.70       111

    accuracy                           0.76       268
   macro avg       0.75      0.75      0.75       268
weighted avg       0.76      0.76      0.76       268



## *Comparison*

In [32]:
# One row per model with its held-out accuracy (percent).
results = pd.DataFrame(
    {
        'Model': ['Support Vector Machines', 'Decision Tree'],
        'Score': [acc_linear_svc, acc_decision_tree],
    }
)

# Rank best-first, then use the score itself as the row label.
result_df = (
    results
    .sort_values(by='Score', ascending=False)
    .set_index('Score')
)
result_df.head()

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
82.09,Support Vector Machines
75.75,Decision Tree
