In [11]:
import pandas as pd

oj = pd.read_csv("../../data/OJ.csv")
oj

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1,1.75,1.99,0.00,0.00,0,0,0.500000,1.99,1.75,0.24,No,0.000000,0.000000,0.24,1
1,CH,239,1,1.75,1.99,0.00,0.30,0,1,0.600000,1.69,1.75,-0.06,No,0.150754,0.000000,0.24,1
2,CH,245,1,1.86,2.09,0.17,0.00,0,0,0.680000,2.09,1.69,0.40,No,0.000000,0.091398,0.23,1
3,MM,227,1,1.69,1.69,0.00,0.00,0,0,0.400000,1.69,1.69,0.00,No,0.000000,0.000000,0.00,1
4,CH,228,7,1.69,1.69,0.00,0.00,0,0,0.956535,1.69,1.69,0.00,Yes,0.000000,0.000000,0.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,CH,252,7,1.86,2.09,0.10,0.00,0,0,0.587822,2.09,1.76,0.33,Yes,0.000000,0.053763,0.23,0
1066,CH,256,7,1.86,2.18,0.00,0.00,0,0,0.670258,2.18,1.86,0.32,Yes,0.000000,0.000000,0.32,0
1067,MM,257,7,1.86,2.18,0.00,0.00,0,0,0.736206,2.18,1.86,0.32,Yes,0.000000,0.000000,0.32,0
1068,CH,261,7,1.86,2.13,0.00,0.24,0,0,0.588965,1.89,1.86,0.03,Yes,0.112676,0.000000,0.27,0


In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X = oj.drop("Purchase", axis=1)
y = oj["Purchase"]

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=800, random_state=0, stratify=y
)

svc = SVC(C=0.01, kernel='linear')
svc.fit(X_train, y_train)

num_support_vectors = svc.support_vectors_.shape[0]
num_support_vectors

621

In [13]:
y_train_pred = svc.predict(X_train)
train_error = 1 - (y_train_pred == y_train).mean()

y_test_pred = svc.predict(X_test)
test_error = 1 - (y_test_pred == y_test).mean()

train_error, test_error

(np.float64(0.24875000000000003), np.float64(0.20740740740740737))

In [14]:
import sklearn.model_selection as skm
from sklearn.metrics import accuracy_score
import numpy as np 

svm_linear = SVC(kernel='linear')

kfold = skm.KFold(5, 
                  random_state=0,
                  shuffle=True)
grid = skm.GridSearchCV(svm_linear,
                        {'C':np.linspace(0.1, 1, 5)},
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X, y)
params = grid.best_params_

best_svm = grid.best_estimator_
cv_error = 1 - np.mean(skm.cross_val_score(best_svm, X, y, cv=5))

y_pred_train = best_svm.predict(X)
train_error = 1 - accuracy_score(y, y_pred_train)


print("Optimal params:", params)
print("CV error:", cv_error)
print("Training error:", train_error)

Optimal params: {'C': np.float64(1.0)}
CV error: 0.17570093457943925
Training error: 0.1607476635514019


In [15]:
svm_r = SVC(kernel='rbf')

kfold = skm.KFold(5, 
                  random_state=0,
                  shuffle=True)
grid = skm.GridSearchCV(svm_r,
                        {'C':np.linspace(0.1, 1, 5)},
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X, y)
params = grid.best_params_

best_r = grid.best_estimator_
cv_error = 1 - np.mean(skm.cross_val_score(best_r, X, y, cv=5))

y_pred_train = best_r.predict(X)
train_error = 1 - accuracy_score(y, y_pred_train)


print("Optimal params:", params)
print("CV error:", cv_error)
print("Training error:", train_error)

Optimal params: {'C': np.float64(0.1)}
CV error: 0.3897196261682243
Training error: 0.3897196261682243


In this case, the linear SVC significantly outperformed the nonlinear RBF SVC. The linear SVC, with an optimal **C = 1.0**, achieved a **cross-validation error of \~0.176** and a **training error of \~0.161**, demonstrating that a simple linear decision boundary was sufficient to model the data effectively. In contrast, the nonlinear RBF SVC, with **C = 0.1**, had much higher errors (\~0.39), indicating it failed to generalize well and likely suffered from underfitting due to inappropriate parameter choice or kernel complexity.