# Linear SVC Assignment

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import math
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

### Import the admissions data set (admissions.csv).

In [3]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv')
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [4]:
y = data['Admitted']

X = data.drop('Admitted', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [5]:
svc = SVC(kernel='linear', C=10)

fit = svc.fit(X_train, y_train)

print('train score for c=10:', svc.score(X_train, y_train))
print('test score for c=10:', svc.score(X_test, y_test))

train score for c=10: 0.8785714285714286
test score for c=10: 0.8916666666666667


### Choose some other values for C and show the difference between the scores for the train and test sets.

In [None]:
svc = SVC(kernel='linear', C=100)

fit = svc.fit(X_train, y_train)

print('train score for c=100:', svc.score(X_train, y_train))
print('test score for c=100:', svc.score(X_test, y_test))

print()

svc = SVC(kernel='linear', C=1)

fit = svc.fit(X_train, y_train)

print('train score for c=1:', svc.score(X_train, y_train))
print('test score for c=1:', svc.score(X_test, y_test))

print()

svc = SVC(kernel='linear', C=0.1)
fit = svc.fit(X_train, y_train)
print('train score for c=0.1:', svc.score(X_train, y_train))
print('test score for c=0.1:', svc.score(X_test, y_test))

### What if we switched up the target variable? Let assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was. 

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [6]:
y = data['SchoolRank']

X = data.drop('SchoolRank', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

svc = SVC(kernel='linear', C=1)

fit = svc.fit(X_train, y_train)

print('train score for c=1:', svc.score(X_train, y_train))
print('test score for c=1:', svc.score(X_test, y_test))

train score for c=1: 0.675
test score for c=1: 0.5416666666666666


### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [9]:
train_pred = fit.predict(X_train)

test_pred = fit.predict(X_test)

print(confusion_matrix(y_train, train_pred), '\n')
print(confusion_matrix(y_test, test_pred), '\n')
print(classification_report(y_test, test_pred))

[[ 5 10  0  0  0]
 [ 2 51 22  0  1]
 [ 0 12 78  5  0]
 [ 0  2 10 22 14]
 [ 0  1  4  8 33]] 

[[ 3  6  2  0  0]
 [ 2 18 10  0  1]
 [ 0  9 22  4  3]
 [ 0  2  2 10 12]
 [ 0  0  1  1 12]] 

              precision    recall  f1-score   support

           1       0.60      0.27      0.37        11
           2       0.51      0.58      0.55        31
           3       0.59      0.58      0.59        38
           4       0.67      0.38      0.49        26
           5       0.43      0.86      0.57        14

    accuracy                           0.54       120
   macro avg       0.56      0.53      0.51       120
weighted avg       0.57      0.54      0.53       120

