# Linear SVC Assignment

In [69]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

### Import the admissions data set (admissions.csv).

In [70]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv')
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [71]:
y = data['Admitted']
X = data.drop(columns='Admitted')

#Size of the test set.
SIZE = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SIZE)

print('There are {:d} training samples and {:d} test samples'.format(X_train.shape[0], X_test.shape[0]))

There are 280 training samples and 120 test samples


### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [72]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
model.fit(X_train, y_train)

prediction_train = model.predict(X_train)
predictions_test = model.predict(X_test)

print(classification_report(y_test, predictions_test))

              precision    recall  f1-score   support

           0       0.63      0.87      0.73        39
           1       0.92      0.75      0.83        81

    accuracy                           0.79       120
   macro avg       0.78      0.81      0.78       120
weighted avg       0.83      0.79      0.80       120



In [73]:
print(classification_report(y_train, prediction_train))


              precision    recall  f1-score   support

           0       0.90      0.91      0.90       114
           1       0.94      0.93      0.93       166

    accuracy                           0.92       280
   macro avg       0.92      0.92      0.92       280
weighted avg       0.92      0.92      0.92       280



### Choose some other values for C and show the difference between the scores for the train and test sets.

In [74]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto', C=2))
model.fit(X_train, y_train)

prediction_train = model.predict(X_train)
predictions_test = model.predict(X_test)

print(classification_report(y_test, predictions_test), '\n')
print(classification_report(y_train, prediction_train))

              precision    recall  f1-score   support

           0       0.63      0.79      0.70        39
           1       0.89      0.78      0.83        81

    accuracy                           0.78       120
   macro avg       0.76      0.79      0.77       120
weighted avg       0.80      0.78      0.79       120
 

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       114
           1       0.92      0.93      0.92       166

    accuracy                           0.91       280
   macro avg       0.91      0.91      0.91       280
weighted avg       0.91      0.91      0.91       280



In [75]:
def score_svc(c):
  model = make_pipeline(StandardScaler(), SVC(gamma='auto', C=c, kernel='linear'))
  #model = SVC(gamma='auto', C=c, kernel='linear')
  model.fit(X_train, y_train)

  prediction_train = model.predict(X_train)
  predictions_test = model.predict(X_test)

  print("Classification Report with c=", c)
  print(classification_report(y_test, predictions_test), '\n')
  print(classification_report(y_train, prediction_train))

In [76]:
cs = [90, 12, 3, 1]

for c in cs:
  score_svc(c)


Classification Report with c= 90
              precision    recall  f1-score   support

           0       0.65      0.77      0.71        39
           1       0.88      0.80      0.84        81

    accuracy                           0.79       120
   macro avg       0.77      0.79      0.77       120
weighted avg       0.80      0.79      0.80       120
 

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       114
           1       0.91      0.90      0.91       166

    accuracy                           0.89       280
   macro avg       0.88      0.89      0.89       280
weighted avg       0.89      0.89      0.89       280

Classification Report with c= 12
              precision    recall  f1-score   support

           0       0.65      0.77      0.71        39
           1       0.88      0.80      0.84        81

    accuracy                           0.79       120
   macro avg       0.77      0.79      0.77       120
weight

### What if we switched up the target variable? Let assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was. 

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [77]:
y = data['SchoolRank']
X = data.drop(columns='SchoolRank')

#Size of the test set.
SIZE = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SIZE)

print('There are {:d} training samples and {:d} test samples'.format(X_train.shape[0], X_test.shape[0]))

There are 280 training samples and 120 test samples


In [78]:
score_svc(10)

Classification Report with c= 10
              precision    recall  f1-score   support

           1       0.50      0.54      0.52        13
           2       0.49      0.53      0.51        32
           3       0.61      0.53      0.56        38
           4       0.50      0.57      0.53        21
           5       0.57      0.50      0.53        16

    accuracy                           0.53       120
   macro avg       0.53      0.53      0.53       120
weighted avg       0.54      0.53      0.53       120
 

              precision    recall  f1-score   support

           1       0.70      0.54      0.61        13
           2       0.65      0.67      0.66        75
           3       0.66      0.75      0.70        95
           4       0.55      0.51      0.53        53
           5       0.67      0.55      0.60        44

    accuracy                           0.64       280
   macro avg       0.64      0.60      0.62       280
weighted avg       0.64      0.64      0.6

### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [79]:
!pip install pygal



In [80]:
#!pip install pygal
import pygal
from IPython.display import display, HTML
from pygal.style import NeonStyle

base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js""></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""

def galplot(chart):
    rendered_chart = chart.render(is_unicode=True)
    plot_html = base_html.format(rendered_chart=rendered_chart)
    display(HTML(plot_html))
def plot_cm(y_true, y_pred):
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    dot_chart = pygal.Dot(x_label_rotation=30, interpolate='cubic', style=NeonStyle)
    dot_chart.title = 'Confusion Matrix'
    dot_chart.x_labels = labels
    for i in range(len(labels)):
        dot_chart.add(labels[i], cm[i,:])
    galplot(dot_chart)

plot_cm(y_test.astype(str), model.predict(X_test).astype(str))

In [81]:
confusion_matrix(y_train, model.predict(X_train))

array([[13,  0,  0,  0,  0],
       [75,  0,  0,  0,  0],
       [95,  0,  0,  0,  0],
       [53,  0,  0,  0,  0],
       [44,  0,  0,  0,  0]])