In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import preprocessing

# Use a 10x10-inch canvas as the default size for every figure in this notebook.
plt.rcParams.update({"figure.figsize": (10, 10)})

# NOTE still using training set from HW1
root = '/home/jimge/public/CAP5610/hw1/'
train_csv = root + 'train.csv'

# Load the training data into a DataFrame; later cells add derived columns to it.
train = pd.read_csv(train_csv)

We decided from assignment 1 to include the following features:
* Pclass
* Gender
* Age 
* Fare

Pclass is already numerically coded, so it needs no recoding.

Gender we will recode from the `Sex` column as 0 (male) and 1 (female).

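Age and Fare we will leave as continuous variables; missing ages are filled in below, and all three numeric features are also min-max normalized for the second experiment.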

In [55]:
'''
Recode gender
'''
# Vectorised bool -> int cast instead of a per-element Python lambda:
# 1 where Sex is exactly 'female', 0 for every other value (a missing Sex
# compares unequal to 'female' and therefore also becomes 0).
train['Gender'] = (train['Sex'] == 'female').astype('int64')

'''
Fix missing data in age
'''
# Fixed seed so the imputed ages -- and every score computed from them --
# are identical on each Restart & Run All.
AGE_IMPUTE_SEED = 5610
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

# Statistics of the observed ages (pandas skips NaN in these reductions).
age_mean = train.Age.mean()
age_std = train.Age.std()
age_lo = train.Age.min()
age_hi = train.Age.max()

def age_rand(size=None):
    """Draw imputed age(s) from a normal distribution fitted to the observed ages.

    Parameters
    ----------
    size : int or None
        Number of draws. ``None`` (the default) returns a single scalar,
        which keeps the original zero-argument call ``age_rand()`` working.

    Returns
    -------
    float or numpy.ndarray
        Draw(s) from N(age_mean, age_std), clipped to [age_lo, age_hi] so an
        imputed value can never be negative or fall outside the observed
        range (which would also stretch the later min-max normalisation).
    """
    return np.clip(age_rng.normal(age_mean, age_std, size), age_lo, age_hi)

# Boolean mask of the rows whose Age is missing; only those rows are touched.
age_missing = train.Age.isna()
if age_missing.any():
    train.loc[age_missing, 'Age'] = age_rand(int(age_missing.sum()))

'''
Normalize the fields we care about
Note that gender is 0..1 by default
'''
def norm(series):
    """Min-max scale ``series`` onto the interval [0, 1].

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``. When every value is identical the
        range is zero; instead of dividing 0 by 0 (which would fill the
        column with NaN) a float column of zeros is returned.
    """
    lo = series.min()
    span = series.max() - lo
    if span == 0:
        # Constant column: every value sits at the minimum, i.e. maps to 0.
        return (series - lo).astype('float64')
    return (series - lo) / span

# NOTE(review): `scaler` is created here but not used by any visible cell.
scaler = preprocessing.MinMaxScaler()

# Add a min-max scaled "<name>Norm" companion column for each numeric feature.
for feature in ('Age', 'Fare', 'Pclass'):
    train[feature + 'Norm'] = norm(train[feature])

# Raw (un-normalised) feature matrix and the survival target.
feature_columns = ['Age', 'Fare', 'Gender', 'Pclass']
X = train.loc[:, feature_columns]
y = train.Survived

In [53]:
'''
Results - run 5-fold test on Titanic data using SVM
'''
# (printed label, SVC keyword arguments) for each kernel, in reporting order.
kernel_runs = (
    ('linear    ', {'kernel': 'linear'}),
    ('RBF       ', {'kernel': 'rbf'}),
    ('quadratic ', {'kernel': 'poly', 'degree': 2}),
)

for label, svc_kwargs in kernel_runs:
    clf = SVC(**svc_kwargs)
    # Per-fold accuracies from 5-fold cross-validation on the current X, y.
    scores = cross_val_score(clf, X, y, cv=5)
    print(label, scores, scores.mean())


linear     [0.80446927 0.80337079 0.78651685 0.75280899 0.78651685] 0.7867365513778168
RBF        [0.58100559 0.71348315 0.66853933 0.67977528 0.69101124] 0.6667629150712446
quadratic  [0.59776536 0.70786517 0.65730337 0.66853933 0.69101124] 0.6644968928504174


In [57]:
'''
Results - run 5-fold test on Titanic data using SVM (using normalized data)
'''
# Swap in the min-max scaled columns; Gender is already 0/1 so it is used as is.
normalized_columns = ['AgeNorm', 'FareNorm', 'Gender', 'PclassNorm']
X = train.loc[:, normalized_columns]
y = train.Survived

# (printed label, SVC keyword arguments) for each kernel, in reporting order.
kernel_runs = (
    ('linear    ', {'kernel': 'linear'}),
    ('RBF       ', {'kernel': 'rbf'}),
    ('quadratic ', {'kernel': 'poly', 'degree': 2}),
)

for label, svc_kwargs in kernel_runs:
    clf = SVC(**svc_kwargs)
    # Per-fold accuracies from 5-fold cross-validation on the normalised X, y.
    scores = cross_val_score(clf, X, y, cv=5)
    print(label, scores, scores.mean())


linear     [0.80446927 0.80337079 0.78651685 0.75280899 0.78651685] 0.7867365513778168
RBF        [0.74860335 0.79213483 0.78651685 0.75280899 0.78651685] 0.773316176009039
quadratic  [0.74860335 0.80337079 0.78651685 0.75280899 0.78651685] 0.775563367020275
