# Linear SVC Assignment

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

### Import the admissions data set (admissions.csv).

In [2]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv')
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [3]:
X = data.drop('Admitted', axis=1)
y = data.Admitted

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [4]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
def score_svc(C=1, svc=SVC()):
    svc.C = C
    svc.fit(X_train, y_train)

    train_score = svc.score(X_train, y_train)
    test_score = svc.score(X_test, y_test)
    return train_score, test_score

C = 10
train_score, test_score = score_svc(C, svc)
print(
    f'C: {svc.C}\n'
    f'Training score: {train_score}\n'
    f'Test score: {test_score}\n'
)

C: 10
Training score: 0.8678571428571429
Test score: 0.8916666666666667



### Choose some other values for C and show the difference between the scores for the train and test sets.

In [5]:
C_list = np.logspace(-3,4, 8)

for C in C_list:
    train_score, test_score = score_svc(C, svc)
    print(
        f'C: {svc.C}\n'
        f'Training score: {train_score:0.3f}\n'
        f'Test score: {test_score:0.3f}\n'
    )
    

C: 0.001
Training score: 0.829
Test score: 0.800

C: 0.01
Training score: 0.846
Test score: 0.850

C: 0.1
Training score: 0.857
Test score: 0.867

C: 1.0
Training score: 0.871
Test score: 0.892

C: 10.0
Training score: 0.868
Test score: 0.892

C: 100.0
Training score: 0.879
Test score: 0.858

C: 1000.0
Training score: 0.846
Test score: 0.850

C: 10000.0
Training score: 0.889
Test score: 0.867



### What if we switched up the target variable? Let assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was. 

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [12]:
X = data.drop('SchoolRank', axis=1)
y = data.SchoolRank

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

svc2 = SVC(kernel='linear')


train_score, test_score = score_svc(10, svc2)

print(
        f'Training score: {train_score:0.3f}\n'
        f'Test score: {test_score:0.3f}\n'
)
    

Training score: 0.632
Test score: 0.583



### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [13]:
y_test_pred = svc2.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.43      0.50      0.46         6
           2       0.66      0.59      0.62        39
           3       0.63      0.71      0.67        41
           4       0.44      0.37      0.40        19
           5       0.50      0.53      0.52        15

    accuracy                           0.58       120
   macro avg       0.53      0.54      0.53       120
weighted avg       0.58      0.58      0.58       120



In [8]:
confusion_matrix(y_test, y_test_pred)

array([[ 4,  2,  1,  0,  0],
       [ 5, 17, 16,  0,  1],
       [ 0,  4, 29,  4,  0],
       [ 1,  1,  5,  9,  8],
       [ 0,  0,  1,  5,  7]])

In [9]:
# The classification report and confusion matrix show the model is better at predicting 
# class 2, 3 which are more common in this particular test split.