In [1]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import neighbors, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# legacy sklearn.grid_search module was removed in 0.20 and is kept only as a
# fallback import.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV



In [2]:
# Load the orange-juice purchase data from the working directory.
OJ = pd.read_csv(filepath_or_buffer="OJ.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
OJ.shape

(1070, 18)

In [4]:
# Per-column non-null counts and dtypes; in the output below, Purchase and
# Store7 are the only object (string) columns.
OJ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 18 columns):
Purchase          1070 non-null object
WeekofPurchase    1070 non-null int64
StoreID           1070 non-null int64
PriceCH           1070 non-null float64
PriceMM           1070 non-null float64
DiscCH            1070 non-null float64
DiscMM            1070 non-null float64
SpecialCH         1070 non-null int64
SpecialMM         1070 non-null int64
LoyalCH           1070 non-null float64
SalePriceMM       1070 non-null float64
SalePriceCH       1070 non-null float64
PriceDiff         1070 non-null float64
Store7            1070 non-null object
PctDiscMM         1070 non-null float64
PctDiscCH         1070 non-null float64
ListPriceDiff     1070 non-null float64
STORE             1070 non-null int64
dtypes: float64(11), int64(5), object(2)
memory usage: 150.5+ KB


In [5]:
# Preview the first five rows before any encoding is applied.
OJ.head()

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


In [6]:
# Replace the two string columns with their integer category codes so the
# estimators below receive numeric input. In the preview that follows,
# Purchase maps CH -> 0 / MM -> 1 and Store7 maps No -> 0 / Yes -> 1.
for col in ("Purchase", "Store7"):
    OJ[col] = OJ[col].astype("category").cat.codes
OJ.head()

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,0,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,0,0.0,0.0,0.24,1
1,0,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,0,0.150754,0.0,0.24,1
2,0,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,0,0.0,0.091398,0.23,1
3,1,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,0,0.0,0.0,0.0,1
4,0,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,1,0.0,0.0,0.0,0


In [7]:
# Separate the response (Purchase) from the predictors (every other column).
target_col = "Purchase"
df_y = OJ[target_col]
df_x = OJ.drop(target_col, axis=1)

# Class balance of the encoded response.
df_y.value_counts()

0    653
1    417
Name: Purchase, dtype: int64

In [8]:
from __future__ import division  # retained so a Python 2 kernel keeps true division in any later cell

# Train on N_TRAIN rows and hold out the rest. The hold-out size is passed as an
# integer row count derived from the data: a float fraction such as
# (1070-800)/1070 depends on the rounding of ceil(fraction * n_rows) and
# hard-codes the row count of this particular CSV.
N_TRAIN = 800
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=len(df_x) - N_TRAIN,
    random_state=10,  # fixed seed so the split is reproducible
)

In [9]:
# Baseline support-vector classifier with a very small cost C=0.01 and all
# other settings left at their defaults. fit() returns the estimator itself,
# so the fitted model is bound to `clf` directly and then displayed.
clf = SVC(C=0.01).fit(x_train, y_train)
clf

SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Precision/recall/F1 of the baseline model on the rows it was trained on.
train_preds = clf.predict(X=x_train)
print(classification_report(y_true=y_train, y_pred=train_preds))

             precision    recall  f1-score   support

          0       0.61      1.00      0.75       485
          1       0.00      0.00      0.00       315

avg / total       0.37      0.61      0.46       800



  'precision', 'predicted', average, warn_for)


In [11]:
# Baseline predictions for the held-out rows.
svm_preds = clf.predict(X=x_test)

In [12]:
# Precision/recall/F1 of the baseline model on the held-out rows.
print(classification_report(y_true=y_test, y_pred=svm_preds))

             precision    recall  f1-score   support

          0       0.62      1.00      0.77       168
          1       0.00      0.00      0.00       102

avg / total       0.39      0.62      0.48       270



In [13]:
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=svm_preds))

[[168   0]
 [102   0]]


In [14]:
# Candidate cost values: 0.01, 0.51, ..., 9.51 (20 values).
param_grid = {"C": np.arange(0.01, 10, 0.5)}

# 10-fold cross-validated search over C, starting from the baseline estimator's
# settings; refit=True retrains the best candidate on the whole training split.
cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
    n_jobs=4,
    refit=True,
    verbose=True,
)
cv.fit(x_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Done  80 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    6.1s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'C': array([ 0.01,  0.51,  1.01,  1.51,  2.01,  2.51,  3.01,  3.51,  4.01,
        4.51,  5.01,  5.51,  6.01,  6.51,  7.01,  7.51,  8.01,  8.51,
        9.01,  9.51])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [15]:
# Estimator refit on the full training split with the best C from the search
# (available because the search was created with refit=True).
cv.best_estimator_

SVC(C=6.5099999999999998, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# Hyper-parameter setting with the best cross-validated score.
cv.best_params_

{'C': 6.5099999999999998}

In [17]:
# Evaluate the tuned RBF model on the held-out rows. cv.predict uses the best
# estimator that the search refit on the full training split.
cv_test_preds = cv.predict(x_test)
# print(...) call form: runs on both Python 2 and 3 and matches the earlier
# cells; the bare `print expr` statement is a SyntaxError on Python 3.
print(classification_report(y_test, cv_test_preds))

             precision    recall  f1-score   support

          0       0.82      0.90      0.86       168
          1       0.80      0.67      0.73       102

avg / total       0.81      0.81      0.81       270



In [18]:
# Labelled confusion table for the tuned RBF model: true class down the rows,
# predicted class across the columns.
conf = pd.crosstab(
    index=y_test,
    columns=cv_test_preds,
    rownames=['True'],
    colnames=['Predicted'],
)
print(conf)

Predicted    0   1
True              
0          151  17
1           34  68


#### Polynomial kernel (degree 2)

In [None]:
# Same 10-fold search over C as for the RBF model, now with a quadratic
# (degree-2 polynomial) kernel.
svm_poly = SVC(kernel="poly", degree=2)
param_grid = {"C": np.arange(0.01, 10, 0.5)}

cv_poly = GridSearchCV(
    estimator=svm_poly,
    param_grid=param_grid,
    cv=10,
    n_jobs=4,
    refit=True,
    verbose=True,
)
cv_poly.fit(x_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:
# Polynomial-kernel estimator refit with the best C from the search.
cv_poly.best_estimator_

In [None]:
# Score the tuned polynomial-kernel model on both splits.
poly_train_preds = cv_poly.predict(x_train)
poly_test_preds = cv_poly.predict(x_test)

# Report this model's own predictions. The previous version passed
# `cv_test_preds` (the RBF model's test predictions) to both calls, which
# mismatches the 800 training labels in length and re-reports the RBF result
# for the test split.
print(classification_report(y_train, poly_train_preds))
print(classification_report(y_test, poly_test_preds))

#### Linear kernel

In [None]:
# Same 10-fold search over C, this time with a linear kernel.
svm_linear = SVC(kernel="linear")
param_grid = {"C": np.arange(0.01, 10, 0.5)}

cv_linear = GridSearchCV(
    estimator=svm_linear,
    param_grid=param_grid,
    cv=10,
    n_jobs=4,
    refit=True,
    verbose=True,
)
cv_linear.fit(x_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Done  91 tasks      | elapsed:   17.3s


In [None]:
# Linear-kernel estimator refit with the best C from the search.
cv_linear.best_estimator_

In [None]:
# Score the tuned linear-kernel model on both splits.
linear_train_preds = cv_linear.predict(x_train)
linear_test_preds = cv_linear.predict(x_test)

# Report this model's own predictions. The previous version passed
# `cv_test_preds` (the RBF model's test predictions) to both calls, which
# mismatches the 800 training labels in length and re-reports the RBF result
# for the test split.
print(classification_report(y_train, linear_train_preds))
print(classification_report(y_test, linear_test_preds))

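Overall, the Gaussian (RBF) kernel appears to produce the lowest misclassification error on both the training and test data. Note that the polynomial- and linear-kernel cells above have no recorded output in this notebook, so they should be re-run to confirm this comparison.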