# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

import pickle

import warnings
warnings.filterwarnings("ignore")

sns.set()

# Importing Data

In [2]:
train_features_backup = pd.read_csv("train_features_df.csv")
test_features_backup = pd.read_csv("test_features_df.csv")

train_features_df = train_features_backup.copy()
train_features_df = train_features_df.drop(["Age_label_enc", "Fare_label_enc"], axis = 1)
train_features_df["Indices"] = train_features_df.index

test_features_df = test_features_backup.copy()
test_features_df = test_features_df.drop(["Age_label_enc", "Fare_mean_enc"], axis = 1)

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_preprocessed_df = pd.read_csv("preprocessed_train_df.csv")
test_preprocessed_df = pd.read_csv("preprocessed_test_df.csv")

# Train-Test Split

In [4]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Base SVM Model

In [5]:
SVC().fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [6]:
svm_model = SVC(random_state = 42).fit(X_train[:, :-1], y_train)

## Classification Accuracy

In [7]:
np.where(svm_model.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.6703910614525139

## F1 Score

In [8]:
f1_score(y_true = y_test, y_pred = svm_model.predict(X_test[:, :-1]))

0.4158415841584159

## Classification Summary

In [9]:
preds = svm_model.predict(X_test[:, :-1])

f1_df = pd.DataFrame(index = ["Predictions:0", "Predictions:1"], columns = ["Actuals:0", "Actuals:1"])

f1_df.at["Predictions:0", "Actuals:0"] = np.where((preds == 0) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:0", "Actuals:1"] = np.where((preds == 0) & (y_test == 1), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:0"] = np.where((preds == 1) & (y_test == 0), 1, 0).sum()
f1_df.at["Predictions:1", "Actuals:1"] = np.where((preds == 1) & (y_test == 1), 1, 0).sum()

f1_df

Unnamed: 0,Actuals:0,Actuals:1
Predictions:0,99,53
Predictions:1,6,21


## Creating Base Submission File

In [10]:
X, y = train_features_df.values, train_df["Survived"].values

full_svm_model = SVC(random_state = 42).fit(X[:, :-1], y)
full_preds = full_svm_model.predict(test_features_df.values)

sub4 = pd.read_csv("gender_submission.csv")
sub4["Survived"] = full_preds
sub4.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [11]:
sub4.to_csv("sub4_base_svm.csv", index = False)

# Optimizing SVM Model

## Data

In [12]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Base Model

In [13]:
base_svm_model = SVC(random_state = 42)

## Grid Search

### Parameters

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['linear'], 'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}
                   ]

In [14]:
param_grid = [
              {'kernel': ['rbf'], 
               'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
               'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
              
              {'kernel': ['sigmoid'], 
               'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
               'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
              
              {'kernel': ['linear'], 
               'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}
]

### All Possible SVM Models 

In [15]:
all_svm_models = GridSearchCV(base_svm_model, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

### Fitting All Models

In [16]:
all_svm_models_fit = all_svm_models.fit(X_train[:, :-1], y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 11.3min finished



### Best Estimator

In [21]:
int_best_svm = all_svm_models_fit.best_estimator_

In [18]:
filename = "intenal_best_svm.sav"
pickle.dump(all_svm_models_fit.best_estimator_, open(filename, 'wb'))

# pickle.dump(all_svm_models_fit.best_estimator_, "internal_best_svm.sav")

In [20]:
filename = "intenal_best_svm.sav"
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

#### Accuracy of Best Estimator

**Classification Accuracy**

In [22]:
np.where(int_best_svm.predict(X_test[:, :-1]) == y_test, 1, 0).mean()

0.8156424581005587

**F1 Score**

In [23]:
f1_score(y_pred = int_best_svm.predict(X_test[:, :-1]), y_true = y_test)

0.7692307692307693

# Submission File

## Fitting Full Data To Optimized SVM Model

In [24]:
X, y = train_features_df.values, train_df["Survived"].values

In [27]:
final_svm_models_fit = all_svm_models.fit(X[:, :-1], y)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  9.5min finished


### Best Estimator

In [29]:
final_best_svm = final_svm_models_fit.best_estimator_
final_best_svm

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

### Exporting Final SVM Model

In [30]:
filename = "final_best_svm.sav"
pickle.dump(final_best_svm, open(filename, 'wb'))

In [31]:
filename = "final_best_svm.sav"
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

## Creating Submission

In [34]:
sub4_final = pd.read_csv("gender_submission.csv")
sub4_final["Survived"] = final_best_svm.predict(test_features_df.values)
sub4_final.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [35]:
sub4_final.to_csv("sub4_final_svm.csv", index = False)