# Loading the data set

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
titanic = pd.read_csv('/content/titanic_dataset .csv')

In [33]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [35]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# handling the missing values

In [36]:
from sklearn.impute import SimpleImputer

In [37]:
print("Missing values before imputation:\n", titanic.isnull().sum())

Missing values before imputation:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


# Impute missing 'Age' values with the mean

In [38]:
age_imputer = SimpleImputer(strategy='mean')
titanic['Age'] = age_imputer.fit_transform(titanic[['Age']])

# Impute missing 'Embarked' values with the most frequent value

In [39]:
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

# Check for missing values

In [40]:
print("Missing values after imputation:\n", titanic.isnull().sum())

Missing values after imputation:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


# Drop columns not necessary for the analysis

In [41]:
titanic = titanic.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'])

Display the first few rows after dropping columns

In [42]:
print("Dataset after dropping unnecessary columns:\n", titanic.head())

Dataset after dropping unnecessary columns:
    Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S


# Encoding Categorical Variables

In [43]:
from sklearn.preprocessing import LabelEncoder

# Convert 'Sex' and 'Embarked' columns to numerical values

In [44]:
label_encoder = LabelEncoder()
titanic['Sex'] = label_encoder.fit_transform(titanic['Sex'])
titanic['Embarked'] = label_encoder.fit_transform(titanic['Embarked'])

# Display the first few rows after encoding

In [45]:
print("Dataset after encoding categorical variables:\n", titanic.head())

Dataset after encoding categorical variables:
    Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         2
1         1       1    0  38.0      1      0  71.2833         0
2         1       3    0  26.0      0      0   7.9250         2
3         1       1    0  35.0      1      0  53.1000         2
4         0       3    1  35.0      0      0   8.0500         2


# Feature Scaling

In [46]:
from sklearn.preprocessing import StandardScaler

In [47]:
X = titanic.drop(columns=['Survived'])
y = titanic['Survived']


In [48]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [49]:
print("Features after scaling:\n", X_scaled[:5])

Features after scaling:
 [[ 0.82737724  0.73769513 -0.5924806   0.43279337 -0.47367361 -0.50244517
   0.58595414]
 [-1.56610693 -1.35557354  0.63878901  0.43279337 -0.47367361  0.78684529
  -1.9423032 ]
 [ 0.82737724 -1.35557354 -0.2846632  -0.4745452  -0.47367361 -0.48885426
   0.58595414]
 [-1.56610693 -1.35557354  0.40792596  0.43279337 -0.47367361  0.42073024
   0.58595414]
 [ 0.82737724  0.73769513  0.40792596 -0.4745452  -0.47367361 -0.48633742
   0.58595414]]


# Creating kNN and SVM Models

In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [51]:
knn = KNeighborsClassifier(n_neighbors=5)
svm = SVC(kernel='linear')

In [52]:
print("kNN and SVM models created.")

kNN and SVM models created.


# k-Fold Cross-Validation

In [54]:
from sklearn.model_selection import KFold, cross_val_score

In [55]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate kNN model using k-fold cross-validation

In [56]:
knn_scores = cross_val_score(knn, X_scaled, y, cv=kf, scoring='accuracy')

In [57]:
print("kNN average accuracy with k-fold cross-validation: ", np.mean(knn_scores))

kNN average accuracy with k-fold cross-validation:  0.8013495700207143


# Evaluate SVM model using k-fold cross-validation

In [58]:
svm_scores = cross_val_score(svm, X_scaled, y, cv=kf, scoring='accuracy')
print("SVM average accuracy with k-fold cross-validation: ", np.mean(svm_scores))

SVM average accuracy with k-fold cross-validation:  0.7878852551628899


# Stratified k-Fold Cross-Validation

In [59]:
from sklearn.model_selection import StratifiedKFold

# Define stratified k-fold cross-validation

In [60]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate kNN model using stratified k-fold cross-validation

In [61]:
svm_scores = cross_val_score(svm, X_scaled, y, cv=kf, scoring='accuracy')
print("SVM average accuracy with k-fold cross-validation: ", np.mean(svm_scores))

SVM average accuracy with k-fold cross-validation:  0.7878852551628899


# Stratified k-Fold Cross-Validation

In [62]:
from sklearn.model_selection import StratifiedKFold

# Evaluate kNN model using stratified k-fold cross-validation

In [63]:
knn_stratified_scores = cross_val_score(knn, X_scaled, y, cv=skf, scoring='accuracy')
print("kNN average accuracy with stratified k-fold cross-validation: ", np.mean(knn_stratified_scores))

kNN average accuracy with stratified k-fold cross-validation:  0.8080848659845584


# Evaluate SVM model using stratified k-fold cross-validation

In [64]:
svm_stratified_scores = cross_val_score(svm, X_scaled, y, cv=skf, scoring='accuracy')
print("SVM average accuracy with stratified k-fold cross-validation: ", np.mean(svm_stratified_scores))

SVM average accuracy with stratified k-fold cross-validation:  0.7867553825874082
