In [2]:
import numpy as np
import pandas as pd

# Machine Learning - SVM 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


import os
# List every file available under the Kaggle input directory, one full path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [3]:
# Load the training data (train.csv) and preview the first rows
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preprocess the train data
# Drop the columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin'])
# Encode the categorical columns as integer codes (unmapped values become NaN).
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Fill N/A Values
# Numeric columns are filled with their column mean. Embarked holds category
# codes, so a mean would produce a fractional code that matches no port;
# fill it with the most frequent code instead.
fill_values = train_data.mean()
embarked_mode = train_data['Embarked'].mode()
if not embarked_mode.empty:  # mode() is empty only if every Embarked value is missing
    fill_values['Embarked'] = embarked_mode.iloc[0]
train_data = train_data.fillna(fill_values)

train_data # display

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.000000,1,0,7.2500,0.0
1,2,1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,3,1,26.000000,0,0,7.9250,0.0
3,4,1,1,1,35.000000,1,0,53.1000,0.0
4,5,0,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,13.0000,0.0
887,888,1,1,1,19.000000,0,0,30.0000,0.0
888,889,0,3,1,29.699118,1,2,23.4500,0.0
889,890,1,1,0,26.000000,0,0,30.0000,1.0


In [5]:
# Target: the Survived label.
# Features: every remaining column except the PassengerId identifier
# (Pclass, Sex, Age, SibSp, Parch, Fare, Embarked).
y = train_data['Survived']
X = train_data.drop(['PassengerId', 'Survived'], axis=1)

In [6]:
# Display the feature matrix (train_data without PassengerId and Survived).
X # Features

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.000000,1,0,7.2500,0.0
1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,26.000000,0,0,7.9250,0.0
3,1,1,35.000000,1,0,53.1000,0.0
4,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...
886,2,0,27.000000,0,0,13.0000,0.0
887,1,1,19.000000,0,0,30.0000,0.0
888,3,1,29.699118,1,2,23.4500,0.0
889,1,0,26.000000,0,0,30.0000,1.0


In [7]:
# Display the label column (train_data['Survived']) that the classifiers predict.
y # Target Variable

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [8]:
# Hold out 10% of train.csv for evaluation and train on the remaining 90%.
# The fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=20,
)

## Baseline: SVC with scikit-learn's default hyperparameters
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_predictions = svc_model.predict(X_test)

print("SVC Results")
print(classification_report(y_test, svc_predictions))

# Confusion matrix rows are the true labels, columns the predicted labels,
# so [1, 0] counts false negatives and [0, 1] counts false positives.
c_matrix = metrics.confusion_matrix(y_test, svc_predictions)
FN = c_matrix[1, 0]
FP = c_matrix[0, 1]
print("This is the false negative : ", FN, sep="")
print("This is the false positive : ", FP, sep="")

SVC Results
              precision    recall  f1-score   support

           0       0.71      0.88      0.79        57
           1       0.65      0.39      0.49        33

    accuracy                           0.70        90
   macro avg       0.68      0.64      0.64        90
weighted avg       0.69      0.70      0.68        90

This is the false negative : 20
This is the false positive : 7


# Parameters
- C - Regularization Parameter
- Kernel
    - `linear`, `poly`, `rbf` (default), `sigmoid`, `precomputed`
- degree - degree of the polynomial kernel (`poly` only; ignored by the other kernels)
- gamma - kernel coefficient for `rbf`, `poly`, and `sigmoid`

## Find Best Parameters
Use scikit-learn's `GridSearchCV` class to search for the best parameter combination with cross-validation.

In [9]:
# Hyperparameter search over C, gamma and the kernel type.
# The full grid is expensive to fit, so it only runs when RUN_GRID_SEARCH is
# set to True; otherwise the parameters recorded from the last full search
# are reused, which keeps Restart & Run All fast.
RUN_GRID_SEARCH = False

# Result of the last full search: C=10, gamma=0.1, kernel=linear; Score 81.2%
# NOTE: gamma only affects the rbf, poly and sigmoid kernels. It is ignored by
# the linear kernel and is kept here only to mirror the search output.
best_params = {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}

if RUN_GRID_SEARCH:
    # Define the parameter grid
    param_grid = {
        'C': np.arange(0.1, 10.1, 0.1),  # C values from 0.1 to 10 in increments of 0.1
        'gamma': np.arange(0.1, 1.05, 0.05),  # gamma values from 0.1 to 1 in increments of 0.05
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],  # considers all non-precomputed kernel types
    }
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
    grid.fit(X_train, y_train)
    best_params = grid.best_params_

# Train the tuned model and evaluate it on the held-out split.
svc_model = SVC(**best_params)
svc_model.fit(X_train, y_train)
svc_predictions = svc_model.predict(X_test)
print("SVC Results")
print(classification_report(y_test, svc_predictions))

# labels=[0, 1] guarantees a 2x2 matrix laid out as [[TN, FP], [FN, TP]],
# even if one class happens to be absent from the held-out split.
c_matrix = metrics.confusion_matrix(y_test, svc_predictions, labels=[0, 1])

print(c_matrix)
TN, FP, FN, TP = c_matrix.ravel()

FP_rate = FP / (FP + TN)  # FPR: share of actual non-survivors predicted as survivors
FN_rate = FN / (FN + TP)  # FNR: share of actual survivors predicted as non-survivors

print("This is the False Positive Rate: " + str(FP_rate))
print("This is the False Negative Rate: " + str(FN_rate))

SVC Results
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        57
           1       0.83      0.73      0.77        33

    accuracy                           0.84        90
   macro avg       0.84      0.82      0.83        90
weighted avg       0.84      0.84      0.84        90

[[52  5]
 [ 9 24]]
This is the False Positive Rate: 0.08771929824561403
This is the False Negative Rate: 0.2727272727272727


In [32]:
# Load and preprocess the test data with the same steps used for train.csv
test_data = pd.read_csv("test.csv")
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Impute missing values with statistics from the TRAINING features, not from
# the test set, so the test rows go through the same transformation the model
# was fitted with. Embarked is a category code, so it uses the most frequent
# training code rather than a mean.
fill_values = X.mean()
embarked_mode = X['Embarked'].mode()
if not embarked_mode.empty:
    fill_values['Embarked'] = embarked_mode.iloc[0]
test_data = test_data.fillna(fill_values)

# Select exactly the training feature columns, in the training order.
# A missing column raises a KeyError here instead of silently misaligning features.
X_test_final = test_data[X.columns]

# Predict
# Hold-out evaluation is already done, so the final model is fitted on all
# labelled rows (X, y) rather than only the 90% training split.
svc_model = SVC(C=10, gamma=0.1, kernel='linear')
svc_model.fit(X, y)
predictions = svc_model.predict(X_test_final)

# CSV Output
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('svc_svm_optimized.csv', index=False)