In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import naive_bayes

from sklearn.metrics import accuracy_score

# Read the Titanic passenger table from the CSV next to the notebook
data = pd.read_csv(filepath_or_buffer='Titanic.csv')

In [21]:
def calculate_variance(data):
    """Return the sample variance (n - 1 denominator) of a collection of numbers.

    Parameters
    ----------
    data : iterable of numbers
        Any iterable, including generators and one-shot iterators. It is
        materialized into a list once so it can be measured and traversed twice.

    Returns
    -------
    float
        The sample variance, or 0.0 when fewer than two values are supplied.
        A NaN anywhere in the input propagates to the result.
    """
    # Materialize once: len() is undefined for generators, and a one-shot
    # iterator would be exhausted by the first pass (the mean).
    values = list(data)
    n = len(values)
    if n <= 1:
        return 0.0
    mean = sum(values) / n
    # Two-pass formula: squared deviations from the mean over n - 1.
    return sum((x - mean) ** 2 for x in values) / (n - 1)

# Report the sample variance of every column stored as int64 or float64
numeric_dtypes = ('int64', 'float64')
for column in data.columns:
    column_dtype = data[column].dtype
    if any(column_dtype == name for name in numeric_dtypes):
        print(f"Variance of the column {column}:", calculate_variance(data[column]))

Variance of the column PassengerId: 66231.0
Variance of the column Survived: 0.2367722165474984
Variance of the column Pclass: 0.6990151199889065
Variance of the column Age: 169.05239993721085
Variance of the column SibSp: 1.2160430774662894
Variance of the column Parch: 0.6497282437357467
Variance of the column Fare: 2469.436845743117


In [3]:
# preprocess data
# Drop identifier / free-text columns that are not used as features.
# errors='ignore' keeps this cell re-runnable: `data` is overwritten here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
# Fill missing ages with the column mean.
# NOTE(review): the mean is taken over all rows before the train/test split below,
# so test-set ages influence the imputed value — confirm this leakage is acceptable.
data['Age'] = data['Age'].fillna(data['Age'].mean())
# One-hot encode categorical (object/category) columns; numeric columns pass through.
data = pd.get_dummies(data)

# split data into training and test sets (80/20, fixed seed for reproducibility)
X = data.drop(columns='Survived')
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Decision Tree

In [4]:
# train decision tree
# random_state is pinned (same seed as the split and the random forest): the tree
# permutes features at each split, so an unseeded tree can pick different splits on
# ties and report a different accuracy on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# make predictions on test set
y_pred = clf.predict(X_test)

# calculate accuracy (fraction of test rows whose predicted label matches)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7821229050279329


## Random Forest

In [5]:
# Create a Random Forest model object
'''
Some Important Parameters:-
1. n_estimators:- It defines the number of decision trees to be created in a random forest.
2. criterion:- The function used to measure the quality of a split: "gini" or "entropy".
3. min_samples_split:- The minimum number of samples a node must contain before 
a split of that node is attempted.
4. max_features:- The maximum number of features considered when looking for the best 
split at each node of each decision tree.
5. n_jobs:- The number of jobs to run in parallel for both fit and predict. Set it to -1 to 
use all the cores for parallel processing.
'''
# Forest of 100 trees; a node needs at least 5 samples to be split; seeded for
# repeatable results. fit() returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(
    random_state=42,
    min_samples_split=5,
    n_estimators=100,
).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = rf.predict(X_test)

# Share of held-out rows predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8324022346368715


## Support Vector Machine

In [6]:
'''
Kernel: The kernel function is used to transform the input data into a higher-dimensional space, where it may be easier to separate the classes. 
The most common types of kernel functions are linear, polynomial, and radial basis function (RBF).

C: C is the regularization parameter, which controls the trade-off between maximizing the margin and minimizing the classification error. 
A large value of C penalizes misclassified training points heavily, which results in a smaller margin and fewer training errors (at the risk of overfitting), while a small value of C results in a larger margin but tolerates more misclassifications.

Gamma: Gamma is a parameter of the RBF kernel function that controls the width of the kernel. 
A smaller value of gamma will result in a wider kernel, which may result in more points being considered support vectors. A larger value of gamma will result in a narrower kernel, which may result in overfitting.

Degree: Degree is a parameter of the polynomial kernel function that controls the degree of the polynomial. 
A higher degree polynomial will result in a more complex decision boundary, which may lead to overfitting.

Class Weight: Class weight is a parameter that is used to balance the weight of the different classes. 
This is useful when the classes are imbalanced, and the SVM model may be biased towards the majority class.

Probability: The probability parameter allows the SVM model to output the probability estimates of the predicted class.
'''

# Linear-kernel support vector classifier.
# gamma and degree are accepted but have no effect while kernel='linear'; they only
# come into play if the kernel is switched ('rbf'/'poly'/'sigmoid' use gamma,
# 'poly' uses degree).
# probability=True fits an extra internally cross-validated calibration that
# shuffles the data, so random_state is pinned (same seed as the other cells) to
# keep the probability estimates reproducible. Hard predictions are unaffected.
clf = svm.SVC(kernel='linear', C=1.0, gamma='auto', degree=3, probability=True, random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7821229050279329


In [7]:
# Create a support vector classifier object with grid search
clf = svm.SVC()

# Define the parameter grid to search.
# A list of per-kernel grids is used instead of one dict: a single dict forms the
# full cartesian product (4 * 5 * 5 * 10 = 1000 candidates, 5000 fits with cv=5),
# most of which are duplicates because degree only affects 'poly' and gamma is
# unused by 'linear'. The per-kernel grids cover the same distinct models with
# 5 + 50 + 250 = 305 candidates.
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
    {
        'kernel': ['poly'],
        'C': c_values,
        'gamma': gamma_values,
        'degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    },
]

# Perform 5-fold cross-validated grid search over kernel and hyperparameters;
# n_jobs=-1 evaluates candidates in parallel on all cores (results are unchanged).
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best kernel function and its mean cross-validated score
print("Best kernel function: ", grid_search.best_params_['kernel'])
print("Best score: ", grid_search.best_score_)

Best kernel function:  linear
Best score:  0.7878853540825371


## Naive Bayes Classifier

In [15]:
# Gaussian Naive Bayes, fitted on the training split (fit() returns the estimator).
# naive_bayes.MultinomialNB() is the alternative model left disabled here.
clf = naive_bayes.GaussianNB().fit(X_train, y_train)

# Per-class probabilities and hard label predictions for the held-out rows
y_prob = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

# Share of held-out rows predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.776536312849162
