# Linear SVC Assignment

In [12]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

### Import the admissions data set (admissions.csv).

In [13]:
# Location of the admissions CSV (read over HTTPS from S3).
admissions_url = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv'
data = pd.read_csv(admissions_url)

# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [14]:
# Target: the 'Admitted' label. Features: every remaining predictor column.
y = data['Admitted']

# 'Unnamed: 0' looks like a row index that was saved into the CSV (0, 1, 2, ...
# in the preview above), not an applicant attribute, so it is excluded from the
# features together with the target. errors='ignore' keeps this cell working
# if that index column is not present in the frame.
X = data.drop(columns=['Admitted', 'Unnamed: 0'], errors='ignore')

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported below, reproducible on Restart & Run All.
# (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [15]:
# SVC is already imported in the first cell; the unused mlxtend plotting import
# was dropped so this cell does not fail on a kernel without mlxtend installed.

# Linear-kernel support vector classifier with regularization parameter C = 10.
svc = SVC(C=10.0, kernel='linear')
svc.fit(X_train, y_train)

# Compare the score on the data the model was fitted on with the score on the
# held-out rows; a large gap between the two would indicate overfitting.
print('Train score: ', svc.score(X_train, y_train))
print('Test score: ', svc.score(X_test, y_test))

Train score:  0.8857142857142857
Test score:  0.85


### Choose some other values for C and show the difference between the scores for the train and test sets.

In [17]:
# Regularization strengths to compare against the C = 10 model above.
c_grid = [0.001, 0.01, 0.1, 1.0, 5.0]

for c_value in c_grid:
    # fit() returns the estimator itself, so building and training can chain.
    svc = SVC(kernel='linear', C=c_value).fit(X_train, y_train)

    train_accuracy = svc.score(X_train, y_train)
    test_accuracy = svc.score(X_test, y_test)

    print('C = ', c_value)
    print('Train score: ', train_accuracy)
    print('Test score: ', test_accuracy)


C =  0.001
Train score:  0.7964285714285714
Test score:  0.8416666666666667
C =  0.01
Train score:  0.8392857142857143
Test score:  0.875
C =  0.1
Train score:  0.8571428571428571
Test score:  0.8833333333333333
C =  1.0
Train score:  0.8821428571428571
Test score:  0.85
C =  5.0
Train score:  0.8821428571428571
Test score:  0.85


### What if we switched up the target variable? Let's assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was.

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [18]:
# New target: the multi-class 'SchoolRank' column. 'Admitted' now stays in the
# feature matrix, since the prompt assumes the admission outcome is known.
y = data['SchoolRank']

# 'Unnamed: 0' looks like a row index that was saved into the CSV rather than
# an applicant attribute, so it is excluded from the features together with
# the target. errors='ignore' keeps this cell working if that column is absent.
X = data.drop(columns=['SchoolRank', 'Unnamed: 0'], errors='ignore')

# Same 70/30 split as before; the fixed random_state makes the scores and the
# confusion matrices in the following cells reproducible on Restart & Run All.
# (train_test_split and SVC are already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Linear-kernel SVC with C = 10, now predicting the school rank.
svc = SVC(C=10.0, kernel='linear')
svc.fit(X_train, y_train)

print('Train score: ', svc.score(X_train, y_train))
print('Test score: ', svc.score(X_test, y_test))

Train score:  0.5892857142857143
Test score:  0.65


### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [21]:
# confusion_matrix and classification_report are already imported in the first
# cell, so the duplicate import was removed from this cell.

# Confusion matrix on the training split: rows are the true SchoolRank values,
# columns are the predicted ones, so off-diagonal counts are misclassifications.
train_predictions = svc.predict(X_train)
print(confusion_matrix(y_train, train_predictions))

[[13  5  0  0  0]
 [ 7 48 18  1  1]
 [ 0 30 52  5  1]
 [ 0  2 12 20 17]
 [ 0  1  6  9 32]]


In [24]:
# Confusion matrix on the held-out split (rows: true rank, columns: predicted rank).
test_predictions = svc.predict(X_test)
test_confusion = confusion_matrix(y_test, test_predictions)
print(test_confusion)

[[ 5  2  1  0  0]
 [ 4 21  7  0  0]
 [ 0  9 34  2  0]
 [ 0  2  7 10  4]
 [ 0  0  1  3  8]]


In [26]:
# Per-class precision, recall and F1 for the held-out split.
test_predictions = svc.predict(X_test)
test_report = classification_report(y_test, test_predictions)
print(test_report)

              precision    recall  f1-score   support

           1       0.56      0.62      0.59         8
           2       0.62      0.66      0.64        32
           3       0.68      0.76      0.72        45
           4       0.67      0.43      0.53        23
           5       0.67      0.67      0.67        12

    accuracy                           0.65       120
   macro avg       0.64      0.63      0.63       120
weighted avg       0.65      0.65      0.64       120

