In [1]:
# Load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the 'penguins' example dataset through seaborn and display it.
# The bare name on the last line makes the notebook render the table;
# note in the output that some rows (e.g. 3 and 339) have no measurements.
penguins = sns.load_dataset('penguins')
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [3]:
# Remove the penguins with missing data in any column the model uses
# (the species label and the four numeric measurements).
# Checking every modelled column, rather than body_mass_g alone, guarantees
# that no NaN reaches the SVM, which cannot be fitted on missing values.
# Rows that are only missing `sex` are kept, because `sex` is not used below.
penguinsClean = penguins.dropna(
    subset=[
        'species',
        'bill_length_mm',
        'bill_depth_mm',
        'flipper_length_mm',
        'body_mass_g',
    ]
)

In [4]:
# Features: the four numeric body measurements. The categorical columns are
# left out here; they could be included after dummy-variable encoding.
X = penguinsClean.loc[
    :, ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
]
# Target: the species label.
Y = penguinsClean.loc[:, 'species']

# Hold out a test set (scikit-learn's default test fraction, since none is
# given); the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=20220621,
)

# Standardise the inputs: SVMs work with distances, so a feature measured on
# a larger scale would otherwise dominate. The scaler learns its parameters
# from the training set only and is then reused unchanged on the test set.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear SVM

In [5]:
# Build the linear-kernel SVM, then fit it on the standardised training data.
# Try values of C between 0.01 and 100 (move the decimal point / add zeros).
# C weights the hinge-loss penalty for margin violations: the larger it is,
# the harder the fit tries to avoid misclassifying training points.

penguinsSVMlinear = svm.SVC(
    C=0.01,
    kernel='linear',
)
penguinsSVMlinear.fit(X_train_scaled, Y_train)

In [6]:
# Predict the species of each test-set penguin with the linear SVM.
# The scaled test features are used, matching the scaled data the model was fitted on.
# NOTE: the name Y_pred is reused by the RBF and polynomial sections further down,
# so it always holds the predictions of whichever prediction cell ran most recently.
Y_pred = penguinsSVMlinear.predict(X_test_scaled)

In [7]:
# Display the confusion matrix for the linear SVM
# (Y_pred was produced by the linear model in the previous cell).
# Rows are the true species, columns are the predicted species.
confusion_matrix(Y_test, Y_pred)

array([[31,  0,  0],
       [ 7, 13,  0],
       [ 0,  0, 35]])

## Radial basis function

In [8]:
# Build and fit the RBF-kernel SVM on the standardised training data.
# Tune gamma and C by moving the decimal point:
#   - gamma sets how far a single point's influence reaches; smaller values
#     of gamma allow that influence to spread further.
#   - C is the penalty on margin violations.

penguinsSVMrbf = svm.SVC(
    C=10,
    gamma=0.01,
    kernel='rbf',
)
penguinsSVMrbf.fit(X_train_scaled, Y_train)

In [9]:
# Predict the species of each test-set penguin with the RBF SVM.
# This overwrites the Y_pred left by the linear model above.
Y_pred = penguinsSVMrbf.predict(X_test_scaled)

In [10]:
# Display the confusion matrix for the RBF SVM
# (Y_pred was produced by the RBF model in the previous cell).
# Rows are the true species, columns are the predicted species.
confusion_matrix(Y_test, Y_pred)

array([[30,  1,  0],
       [ 1, 19,  0],
       [ 0,  0, 35]])

## Polynomial

In [11]:
# Build and fit the polynomial-kernel SVM on the standardised training data.
# Tune C by moving the decimal point, and change degree in steps of 1;
# degree is the degree of the polynomial used by the kernel.

penguinsSVMpoly = svm.SVC(
    C=0.1,
    degree=5,
    kernel='poly',
)
penguinsSVMpoly.fit(X_train_scaled, Y_train)

In [12]:
# Predict the species of each test-set penguin with the polynomial SVM.
# This overwrites the Y_pred left by the RBF model above.
Y_pred = penguinsSVMpoly.predict(X_test_scaled)

In [13]:
# Display the confusion matrix for the polynomial SVM
# (Y_pred was produced by the polynomial model in the previous cell).
# Rows are the true species, columns are the predicted species.
confusion_matrix(Y_test, Y_pred)

array([[31,  0,  0],
       [17,  3,  0],
       [ 4,  0, 31]])

## Accessing information

In [14]:
# The number of support vectors for each class (here for the RBF model),
# one entry per species.
penguinsSVMrbf.n_support_

array([21, 21,  6], dtype=int32)

In [15]:
# Which instances in the training set are support vectors (RBF model).
# The values are row positions in X_train_scaled, the array the model was
# fitted on, not the index labels of the original penguins table.
penguinsSVMrbf.support_

array([ 12,  18,  38,  65,  99, 111, 114, 116, 120, 122, 140, 155, 157,
       165, 180, 212, 217, 218, 220, 241, 245,  16,  24,  36,  45,  56,
        58,  72,  78,  83,  89,  95, 113, 137, 183, 186, 187, 200, 226,
       227, 234, 237,  43,  86,  92, 112, 196, 247], dtype=int32)

In [16]:
# The coefficients (weights) of the separating hyperplanes of the linear SVM.
# There is one row per pair of classes and one column per input variable.
# Each hyperplane has the form
#   coefficient1*variable1 + coefficient2*variable2 + ... + intercept = 0
# where the variables are the standardised features the model was fitted on.
penguinsSVMlinear.coef_

array([[-0.62622598,  0.05401353, -0.108782  ,  0.08399517],
       [-0.24717395,  0.49072732, -0.33804034, -0.25984323],
       [ 0.08731435,  0.50450852, -0.36236078, -0.36043294]])

In [17]:
# The intercepts of the hyperplanes of the linear SVM, one per pair of classes,
# in the same order as the rows of coef_.
penguinsSVMlinear.intercept_

array([0.44537772, 0.30524277, 0.23923495])