# Linear SVC Assignment

In [31]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as report, confusion_matrix, accuracy_score

### Import the admissions data set (admissions.csv).

In [2]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv')
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


In [33]:
data.shape

(400, 8)

### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [3]:
x = data.drop(columns='Admitted')
y = data['Admitted']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [5]:
svc = SVC(kernel='linear', C=10).fit(x_train, y_train)

print('Train score:', svc.score(x_train, y_train))
print('Test score:', svc.score(x_test, y_test))

Train score: 0.8821428571428571
Test score: 0.8416666666666667


In [7]:
test_pred = svc.predict(x_test)

print('Test Accuracy Score:', accuracy_score(y_test, test_pred))

Test Accuracy Score: 0.8416666666666667


### Choose some other values for C and show the difference between the scores for the train and test sets.

In [18]:
C_list = [0.01, 0.1, 1, 10, 100]
for C in C_list:
  svc = SVC(kernel='linear', C=C).fit(x_train, y_train)
  print(f'Train score for C = {C}: {svc.score(x_train, y_train):.2f}')
  print(f'Test score for C = {C}: {svc.score(x_test, y_test):.2f}\n')


Train score for C = 0.01: 0.85
Test score for C = 0.01: 0.85

Train score for C = 0.1: 0.86
Test score for C = 0.1: 0.83

Train score for C = 1: 0.89
Test score for C = 1: 0.82

Train score for C = 10: 0.88
Test score for C = 10: 0.84

Train score for C = 100: 0.88
Test score for C = 100: 0.81



### What if we switched up the target variable? Let assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was. 

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [20]:
data

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.00,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0
...,...,...,...,...,...,...,...,...
395,324,110,3,3.5,3.5,9.04,1,1
396,325,107,3,3.0,3.5,9.11,1,1
397,330,116,4,5.0,4.5,9.45,1,1
398,312,103,3,3.5,4.0,8.78,0,0


In [25]:
x2 = data.drop(columns=['SchoolRank'])
y2 = data['SchoolRank']

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.3)


svc2 = SVC(kernel='linear').fit(x2_train, y2_train)
print(f'Train score : {svc2.score(x2_train, y2_train):.2f}')
print(f'Test score: {svc2.score(x2_test, y2_test):.2f}')

Train score : 0.60
Test score: 0.52


### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [32]:
train_pred = svc2.predict(x2_train)
test_pred = svc2.predict(x2_test)
print(f'Train report: {report(y2_train, train_pred)}')
print(f'Train confusion matrix:\n{confusion_matrix(y2_train, train_pred)}')
print(f'Test report: {report(y2_test, test_pred)}')
print(f'Test confusion matrix:\n{confusion_matrix(y2_test, test_pred)}')


Train report:               precision    recall  f1-score   support

           1       0.60      0.68      0.64        22
           2       0.68      0.66      0.67        76
           3       0.62      0.73      0.67        89
           4       0.49      0.65      0.56        57
           5       0.00      0.00      0.00        36

    accuracy                           0.60       280
   macro avg       0.48      0.54      0.51       280
weighted avg       0.53      0.60      0.56       280

Train confusion matrix:
[[15  6  1  0  0]
 [ 9 50 14  3  0]
 [ 1 17 65  6  0]
 [ 0  1 19 37  0]
 [ 0  0  6 30  0]]
Test report:               precision    recall  f1-score   support

           1       0.40      0.50      0.44         4
           2       0.53      0.58      0.55        31
           3       0.69      0.75      0.72        44
           4       0.27      0.53      0.36        17
           5       0.00      0.00      0.00        24

    accuracy                           0.52

  _warn_prf(average, modifier, msg_start, len(result))
