# Predicting survivors of Titanic using SVM Algorithm

### Importing Libraries

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd

# for creating plots
import matplotlib.pyplot as plt

# for splitting of data sets
from sklearn.model_selection import train_test_split

# importing SVM from scikit learn
from sklearn.svm import SVC

# Visualization tools from adspy_shared_utilities
from adspy_shared_utilities import plot_labelled_scatter
from adspy_shared_utilities import plot_class_regions_for_classifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

### Reading Titanic Data Set

In [None]:
# Load the Titanic records from CSV; row 0 of the file supplies the column names
cl = pd.read_csv(filepath_or_buffer='titanic.csv', header=0)

# Preview the first five rows (rendered as this cell's output)
cl.head(n=5)

In [None]:
# Summary statistics for the loaded data set; as the last expression in the
# cell, the resulting table is rendered as the cell output.
cl.describe()

### Training and Test Data Split

In [None]:
# Impute missing values with each column's median
cl['age'] = cl['age'].fillna(cl['age'].median())
cl['fare'] = cl['fare'].fillna(cl['fare'].median())
# NOTE(review): 'embarked' appears to be a numerically encoded category (a
# median is taken on it); the mode may be a better fill value -- confirm
# against the data before changing.
cl['embarked'] = cl['embarked'].fillna(cl['embarked'].median())

# Independent variables: 'embarked' and 'age'
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same NumPy array on both old and new pandas.
X = cl[['embarked', 'age']].values
# Dependent variable: the 'survived' label
y = cl['survived']

# Split into training and test sets; random_state=0 keeps the shuffle
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Applying Linear SVM

In [None]:
# Train a linear-kernel SVM with C = 1.0 on the two-feature training split
clf = SVC(C=1.0, kernel='linear').fit(X=X_train, y=y_train)

In [None]:
# Hand the fitted classifier and the test split to the adspy plotting helper
plot_class_regions_for_classifier(
    clf, X_test, y_test, None, None, 'Titanic test data'
)

# Mean accuracy on each split, reported to two decimals
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of Linear SVC classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(test_accuracy))

### Linear SVC with varying C

In [None]:
# NOTE(review): LinearSVC is imported but the visible cells only use
# SVC(kernel='linear'); kept in case a later cell relies on it.
from sklearn.svm import LinearSVC

# One panel per regularisation strength, laid out side by side
c_values = [0.00001, 1.0, 100]
fig, subaxes = plt.subplots(1, 3, figsize=(8, 4))
for this_C, subplot in zip(c_values, subaxes):
    title = 'Linear SVC, C = {:.5f}'.format(this_C)
    # Refit on the training split with the current C, then draw into its panel
    clf = SVC(C=this_C, kernel='linear').fit(X_train, y_train)
    plot_class_regions_for_classifier_subplot(
        clf, X_train, y_train, None, None, title, subplot
    )

In [None]:
# Test-set accuracy of the linear-kernel SVM for a range of C values
for this_C in (0.00001, 0.1, 1.0, 10, 20, 50, 100):
    clf = SVC(C=this_C, kernel='linear')
    clf.fit(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print("Accuracy of Linear SVM with C = {:.5f}".format(this_C), "is ", test_score)

### Splitting Dataset to Training and Testing

In [None]:
# Impute missing values with each column's median. fillna is a no-op once the
# gaps are filled, so re-running this after an earlier imputation is harmless
# and keeps the cell self-contained.
cl['age'] = cl['age'].fillna(cl['age'].median())
cl['fare'] = cl['fare'].fillna(cl['fare'].median())
# NOTE(review): 'embarked' appears to be a numerically encoded category; the
# mode may be a better fill value than the median -- confirm against the data.
cl['embarked'] = cl['embarked'].fillna(cl['embarked'].median())

# Independent variables: seven feature columns, in the order the models are
# trained on (inputs passed to predict() must use the same column order).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same NumPy array on both old and new pandas.
X_2 = cl[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].values
# Dependent variable: the 'survived' label
y_2 = cl['survived']

# Split into training and test sets; random_state=0 keeps the shuffle
# reproducible between runs
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, random_state=0)

In [None]:
# Linear-kernel SVM (C = 1.0) trained on the seven-feature training split
clf2 = SVC(C=1.0, kernel='linear').fit(X=X_train2, y=y_train2)

# Mean accuracy on each split, reported to two decimals
train_accuracy2 = clf2.score(X_train2, y_train2)
test_accuracy2 = clf2.score(X_test2, y_test2)
print('Accuracy of Linear SVC classifier on training set: {:.2f}'.format(train_accuracy2))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(test_accuracy2))

In [None]:
# Fit an RBF-kernel SVM with C = 1.0 on the seven-feature training split

clf3 = SVC(kernel='rbf', C=1.0).fit(X_train2, y_train2)

# Report mean accuracy on each split. The labels name the RBF kernel so the
# output is not mistaken for the linear model's results.
print('Accuracy of RBF SVC classifier on training set: {:.2f}'
      .format(clf3.score(X_train2, y_train2)))
print('Accuracy of RBF SVC classifier on test set: {:.2f}'
      .format(clf3.score(X_test2, y_test2)))

In [None]:
# Fit a degree-3 polynomial-kernel SVM with C = 1.0 on the seven-feature
# training split

# C=1.0 is SVC's default; it is spelled out so the code matches the stated setup
clf4 = SVC(kernel='poly', degree=3, C=1.0).fit(X_train2, y_train2)

# Report mean accuracy on each split. The labels name the polynomial kernel so
# the output is not mistaken for the linear model's results.
print('Accuracy of Polynomial SVC classifier on training set: {:.2f}'
      .format(clf4.score(X_train2, y_train2)))
print('Accuracy of Polynomial SVC classifier on test set: {:.2f}'
      .format(clf4.score(X_test2, y_test2)))

In [None]:
# Refit the three classifiers on the seven-feature training split so that the
# prediction cells below can be run from this cell onward.

# Linear kernel, C = 1.0
clf2 = SVC(C=1.0, kernel='linear').fit(X=X_train2, y=y_train2)

# RBF kernel, C = 1.0
clf3 = SVC(C=1.0, kernel='rbf').fit(X=X_train2, y=y_train2)

# Polynomial kernel, degree 3
clf4 = SVC(degree=3, kernel='poly').fit(X=X_train2, y=y_train2)

In [None]:
# A single hand-built passenger record to run through the fitted models.
# The column order is pinned explicitly to the order used for training:
# pandas releases before 0.25 sort the columns of a list-of-dicts frame
# alphabetically, which would silently feed the features to predict() in the
# wrong positions.
sample_data = pd.DataFrame(
    [{'pclass': 1, 'sex': 0, 'age': 29, 'sibsp': 0, 'parch': 0,
      'fare': 211.3375, 'embarked': 1}],
    columns=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'],
)

In [None]:
# Predicted 'survived' label for the sample passenger from the linear-kernel
# model (clf2); shown as the cell output.
clf2.predict(sample_data)

In [None]:
# Predicted 'survived' label for the sample passenger from the RBF-kernel
# model (clf3); shown as the cell output.
clf3.predict(sample_data)

In [None]:
# Predicted 'survived' label for the sample passenger from the degree-3
# polynomial-kernel model (clf4); shown as the cell output.
clf4.predict(sample_data)