# Import Libraries and Dataset

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

# Read both CSV files into DataFrames.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Preprocessing

In [3]:
# Remove the rows whose Embarked value is missing. This mutates df_train in
# place and the index is not reset afterwards, so the remaining rows keep
# their original labels.
df_train.dropna(subset=['Embarked'], inplace=True)

In [4]:
# Show the distinct values of the Sex column before it is encoded.
df_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [5]:
# Show the distinct values of the Embarked column before it is one-hot encoded.
df_train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Replace the Sex strings with integer codes. The fitted encoder is kept in
# `sex_encoder` so the same mapping can be reapplied or inverted.
sex_encoder = LabelEncoder()
df_train['Sex'] = sex_encoder.fit_transform(df_train['Sex'])

# One-hot encode Embarked into `Embarked_<value>` indicator columns, then swap
# the original column for the indicators (appended as the last columns).
# NOTE: this cell is not re-runnable on its own -- once Embarked has been
# dropped, a second run raises KeyError.
encoded_df = pd.get_dummies(df_train['Embarked'], prefix='Embarked')
df_train = df_train.drop(columns='Embarked').join(encoded_df)

In [7]:
# Convert the one-hot Embarked indicator columns to plain 0/1 integers.
#
# A direct dtype cast is used instead of a LabelEncoder: an encoder fitted on
# Embarked_C alone only knows the values that happen to occur in that column,
# so reusing it on Embarked_Q / Embarked_S can raise on unseen values or, if
# the fitted column holds a single value, map that value to 0 regardless of
# its meaning. Casting keeps False/0 -> 0 and True/1 -> 1 for every column
# independently.
embarked_columns = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
df_train = df_train.astype({column: 'int64' for column in embarked_columns})

In [25]:
# Write the current state of df_train to 'train_cleaned.csv' without the index
# column.
# NOTE(review): this cell's execution count is higher than that of the cells
# that follow it, so it was last run out of order -- confirm the saved file
# matches what a fresh top-to-bottom run produces.
df_train.to_csv('train_cleaned.csv', index=False)

In [8]:
# Display the frame after encoding: Sex is now numeric and Embarked has been
# replaced by the Embarked_C / Embarked_Q / Embarked_S indicator columns.
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,1,0,0


In [9]:
# Count the rows whose Age is missing.
# NOTE(review): Age is left out of the feature list selected below --
# presumably because of these gaps; confirm.
df_train['Age'].isna().sum()

177

In [10]:
# Columns used as model inputs; every one of them is numeric after the
# encoding steps. Age, Name, Ticket and Cabin are not included.
feature_columns = [
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]

# Feature matrix and the Survived target column.
X = df_train[feature_columns]
y = df_train['Survived']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the training features as the cell output.
X_train

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
708,1,0,0,0,151.5500,0,0,1
240,3,0,1,0,14.4542,1,0,0
382,3,1,0,0,7.9250,0,0,1
792,3,0,8,2,69.5500,0,0,1
683,3,1,5,2,46.9000,0,0,1
...,...,...,...,...,...,...,...,...
107,3,1,0,0,7.7750,0,0,1
271,3,1,0,0,0.0000,0,0,1
862,1,0,0,0,25.9292,0,0,1
436,3,0,2,2,34.3750,0,0,1


In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no statistics are learned from the held-out rows.
# NOTE: X_train / X_test are rebound to the scaled arrays, so re-running this
# cell without re-running the split would scale already-scaled data.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Decision Tree

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree that splits on entropy; random_state pins tie-breaking
# so the fitted tree is repeatable.
tree = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)

# Evaluate on the held-out split: print the confusion matrix (rows are the
# true classes, columns the predicted ones) and show accuracy as the cell
# output.
y_pred = tree.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[89 20]
 [20 49]]


0.7752808988764045

## Support Vector Classifier (SVC)

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyperparameter candidates: 5 values of C x 5 values of gamma x 2 kernels,
# i.e. 50 combinations.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'sigmoid'],
}

# Base estimator whose parameters are overridden by each grid candidate.
svc = SVC()

# Exhaustive search scored by accuracy with 2-fold cross-validation;
# verbose=3 logs the score of every individual fit.
grid_search = GridSearchCV(svc, param_grid, scoring='accuracy', cv=2, verbose=3)
grid_search.fit(X_train, y_train)



Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV 1/2] END ........C=0.1, gamma=1, kernel=rbf;, score=0.711 total time=   0.0s
[CV 2/2] END ........C=0.1, gamma=1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 1/2] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.640 total time=   0.0s
[CV 2/2] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.634 total time=   0.0s
[CV 1/2] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.775 total time=   0.0s
[CV 2/2] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 1/2] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.789 total time=   0.0s
[CV 2/2] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.780 total time=   0.0s
[CV 1/2] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.618 total time=   0.0s
[CV 2/2] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.625 total time=   0.0s
[CV 1/2] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.618 total time=   0.0s
[CV 2/2] END .C=0.1, gamma=0.01, kernel=sigmoid

In [23]:
# Report the hyperparameter combination that won the grid search, along with
# the estimator the search exposes as its best one.
print("Best parameters:", grid_search.best_params_)
best_svc = grid_search.best_estimator_
print("Best estimator:", best_svc)

# Score that best estimator on the held-out split.
accuracy = best_svc.score(X_test, y_test)
print("Accuracy on test set:", accuracy)

Best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best estimator: SVC(C=10, gamma=0.1)
Accuracy on test set: 0.7865168539325843


In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

# RBF-kernel SVC with hand-set hyperparameters.
# NOTE(review): C=0.2 / gamma=1 are not the values the grid search printed as
# best (C=10, gamma=0.1), and C=0.2 is not in the searched grid -- confirm
# this separate configuration is intentional.
SVM = SVC(C=0.2, gamma=1, kernel='rbf').fit(X_train, y_train)

# Evaluate on the held-out split: print the confusion matrix and show accuracy
# as the cell output.
y_pred = SVM.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[93 16]
 [25 44]]


0.7696629213483146

## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Logistic regression with library-default settings apart from the fixed seed.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Evaluate on the held-out split: print the confusion matrix and show accuracy
# as the cell output.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[85 24]
 [15 54]]


0.7808988764044944

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Forest of 10 entropy-criterion trees with a fixed seed for repeatability.
# NOTE: this rebinds `classifier`, which the logistic-regression cell also
# assigns, so the name refers to whichever of the two cells ran last.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Evaluate on the held-out split: print the confusion matrix and show accuracy
# as the cell output.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)



[[89 20]
 [17 52]]


0.7921348314606742

# Visualization

# Modelling