# Linear SVC Assignment

In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

### Import the admissions data set (admissions.csv).

In [2]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv')
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [3]:
X = data.drop(columns='Admitted')
y = data['Admitted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [4]:
vectored = SVC(C=10, kernel='linear')
vectored.fit(X_train, y_train)
print('SVM Train Score: ', vectored.score(X_train, y_train))
print('SVM Test Score: ', vectored.score(X_test, y_test))

SVM Train Score:  0.8821428571428571
SVM Test Score:  0.875


### Choose some other values for C and show the difference between the scores for the train and test sets.

In [33]:
score_train = []
score_test = []
C_list = [.01, .1, .5, 1, 10]
for i in C_list:
  vec = SVC(C=i, kernel='linear')
  vec.fit(X_train, y_train)
  score_train.append(vec.score(X_train, y_train))
  score_test.append(vec.score(X_test, y_test))
  #f1_train.append(f1_score(y_train, neigh.predict(X_train)))
  #f1_test.append(f1_score(y_test, neigh.predict(X_test)))
  #y_pred_train = abs.predict(X_train_sca)
  #rmse_train.append(mean_squared_error(y_train, y_pred_train, squared=False))
  #y_pred_test = abs.predict(X_test_sca)
  #rmse_test.append(mean_squared_error(y_test, y_pred_test, squared=False))

print('Train scores: ', score_train)
print('Test scores: ', score_test)

Train scores:  [0.8464285714285714, 0.8607142857142858, 0.875, 0.8714285714285714, 0.8821428571428571]
Test scores:  [0.8666666666666667, 0.8833333333333333, 0.8583333333333333, 0.85, 0.875]


### What if we switched up the target variable? Let assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was. 

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [48]:
X1 = data.drop(columns=['SchoolRank'])
y1 = data['SchoolRank']

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.3)

vectored1 = SVC(C=1, kernel='linear')
vectored1.fit(X_train1, y_train1)
print('SVM Train Score: ', vectored1.score(X_train1, y_train1))
print('SVM Test Score: ', vectored1.score(X_test1, y_test1))

SVM Train Score:  0.6357142857142857
SVM Test Score:  0.6


### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [49]:
y_pred_train = vectored1.predict(X_train1)
y_pred_test = vectored1.predict(X_test1)

conf_train = confusion_matrix(y_train1, y_pred_train)
conf_test = confusion_matrix(y_test1, y_pred_test)

print(conf_train)
print('')
print(conf_test)
print('')
train_repo = classification_report(y_train1, y_pred_train)
test_repo = classification_report(y_test1, y_pred_test)
#print(train_repo)
print(test_repo)

[[10  6  2  0  0]
 [ 4 43 19  1  0]
 [ 1 17 66  8  1]
 [ 1  2  7 35 12]
 [ 0  1  4 16 24]]

[[ 4  4  0  0  0]
 [ 6 21 11  1  1]
 [ 0  2 33  5  0]
 [ 0  1  6  8  2]
 [ 0  0  2  7  6]]

              precision    recall  f1-score   support

           1       0.40      0.50      0.44         8
           2       0.75      0.53      0.62        40
           3       0.63      0.82      0.72        40
           4       0.38      0.47      0.42        17
           5       0.67      0.40      0.50        15

    accuracy                           0.60       120
   macro avg       0.57      0.54      0.54       120
weighted avg       0.63      0.60      0.60       120

