# Linear SVM 

## Setting Up

In [1]:
import pandas as pd 
import numpy as np

In [2]:
import sklearn
assert sklearn.__version__ >= "0.20"
import yfinance as yf

np.random.seed(42)

from sklearn.svm import LinearSVC 
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score

from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler

In [3]:
metrics_df = pd.DataFrame(columns =['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

## Import Data 

In [4]:
X_train = pd.read_excel("data/model_inputs/X_train.xlsx")
y_train = pd.read_excel("data/model_inputs/y_train.xlsx")

X_test = pd.read_excel("data/model_inputs/X_test.xlsx")
y_test = pd.read_excel("data/model_inputs/y_test.xlsx")

In [5]:
X_train = X_train.rename(columns ={'Unnamed: 0': 'Date'})
X_train = X_train.set_index('Date')

y_train = y_train.rename(columns ={'Unnamed: 0': 'Date'})
y_train = y_train.set_index('Date')

X_test = X_test.rename(columns ={'Unnamed: 0': 'Date'})
X_test = X_test.set_index('Date')

y_test = y_test.rename(columns ={'Unnamed: 0': 'Date'})
y_test = y_test.set_index('Date')

### RANDOM OVERSAMPLING RESAMPLE

In [6]:
def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='not majority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    return X_over, y_over

### Random Resample

In [7]:
data_resample = random_oversampler(X_train, y_train)

In [8]:
X_train = data_resample[0]
y_train = data_resample[1]

In [9]:
y_train['decision'].value_counts()

 0    1383
 1    1383
-1    1383
Name: decision, dtype: int64

## Initializing Linear SVM

In [10]:
linearsvc = LinearSVC(C=1, loss="hinge", random_state=42)

### Fitting Data

In [11]:
linearsvc.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


### Predicting on train set

In [12]:
y_predict = linearsvc.predict(X_train)
cm = confusion_matrix(y_train, y_predict)
accuracy = accuracy_score(y_train, y_predict)
precision = precision_score(y_train, y_predict, average = "weighted")
recall = recall_score(y_train, y_predict, average = "weighted")
f1 = f1_score(y_train, y_predict, average = "weighted")

In [13]:
model = 'Linear SVM TRAIN'
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [14]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[1176  207    0]
 [ 154 1197   32]
 [ 142  537  704]]
Accuracy_Score: 0.7416244878283924

Precision_Score: 0.7907090697309769

Recall Score: 0.7416244878283924

F1 Score: 0.7361662799605286



### Predicting on test set

In [15]:
y_predict = linearsvc.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict, average = "weighted")
recall = recall_score(y_test, y_predict, average = "weighted")
f1 = f1_score(y_test, y_predict, average = "weighted")

  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
model = 'Linear SVM TEST'
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [17]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[12  9  0]
 [38 99  0]
 [ 2  8  0]]
Accuracy_Score: 0.6607142857142857

Precision_Score: 0.7248129026146268

Recall Score: 0.6607142857142857

F1 Score: 0.679294648174934



## RandomizedSearchCV Tuning Linear SVM
### Parameters includes: penalty, loss, dual, C, multi_class

In [67]:
penalty = ['l1', 'l2']
loss = ['hinge', 'squared_hinge']
C_range = np.logspace(-2, 10, 13)

params = {'penalty': penalty, 'loss': loss, 'C': C_range}
rs = RandomizedSearchCV(linearsvc, params, random_state = 42, n_iter = 1000)
rs.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
130 fits failed out of a total of 260.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------


In [68]:
rs.best_params_

{'penalty': 'l2', 'loss': 'squared_hinge', 'C': 0.1}

In [69]:
rs.best_score_

0.6851872629238305

## Initialising tuned Linear SVM

In [18]:
linearsvc = LinearSVC(penalty = "l2", loss = 'squared_hinge', C= 0.1)

### Fitting Data

In [19]:
linearsvc.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


### Predicting on train set

In [20]:
y_predict = linearsvc.predict(X_train)
cm = confusion_matrix(y_train, y_predict)
accuracy = accuracy_score(y_train, y_predict)
precision = precision_score(y_train, y_predict, average = "weighted")
recall = recall_score(y_train, y_predict, average = "weighted")
f1 = f1_score(y_train, y_predict, average = "weighted")

In [21]:
model = 'Linear SVM TRAIN Tuned'
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [22]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[1148  235    0]
 [ 148 1134  101]
 [  98  437  848]]
Accuracy_Score: 0.7543986502771752

Precision_Score: 0.7816695232507688

Recall Score: 0.7543986502771752

F1 Score: 0.7550863202613519



### Predicting on test set

In [23]:
model = 'Linear SVM TEST Tuned'
y_predict = linearsvc.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict, average = "weighted")
recall = recall_score(y_test, y_predict, average = "weighted")
f1 = f1_score(y_test, y_predict, average = "weighted")

In [24]:
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [25]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[  8  12   1]
 [ 17 103  17]
 [  0  10   0]]
Accuracy_Score: 0.6607142857142857

Precision_Score: 0.7119523809523809

Recall Score: 0.6607142857142857

F1 Score: 0.6846541968928295



In [26]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Confusion Matrix
0,Linear SVM TRAIN,0.741624,0.790709,0.741624,0.736166,"[[1176, 207, 0], [154, 1197, 32], [142, 537, 7..."
0,Linear SVM TEST,0.660714,0.724813,0.660714,0.679295,"[[12, 9, 0], [38, 99, 0], [2, 8, 0]]"
0,Linear SVM TRAIN Tuned,0.754399,0.78167,0.754399,0.755086,"[[1148, 235, 0], [148, 1134, 101], [98, 437, 8..."
0,Linear SVM TEST Tuned,0.660714,0.711952,0.660714,0.684654,"[[8, 12, 1], [17, 103, 17], [0, 10, 0]]"


# Linear SVM Business Metrics

## Import Data 

In [27]:
X_train = pd.read_excel("data/model_inputs/X_train.xlsx")
y_train = pd.read_excel("data/model_inputs/y_train.xlsx")

X_test = pd.read_excel("data/model_inputs/X_test.xlsx")
y_test = pd.read_excel("data/model_inputs/y_test.xlsx")

In [28]:
X_train = X_train.rename(columns ={'Unnamed: 0': 'Date'})
X_train = X_train.set_index('Date')

y_train = y_train.rename(columns ={'Unnamed: 0': 'Date'})
y_train = y_train.set_index('Date')

X_test = X_test.rename(columns ={'Unnamed: 0': 'Date'})
X_test = X_test.set_index('Date')

y_test = y_test.rename(columns ={'Unnamed: 0': 'Date'})
y_test = y_test.set_index('Date')

In [29]:
whole_df_x = pd.concat([X_train, X_test])
whole_df_x = whole_df_x.reset_index()
whole_df_x['year'] = pd.DatetimeIndex(whole_df_x['Date']).year
tuned_model = linearsvc

In [30]:
business_metric_results = pd.DataFrame(columns=['exp_strategy_annual_return', 'exp_benchmark_annual_return', 'strategy_over_benchmark'])

for year in [2016, 2017, 2018, 2019, 2020, 2021, 2022]:
    year_data = whole_df_x[whole_df_x['year'] == year]
    year_data = year_data.set_index('Date')
    year_data = year_data.drop(['year'], axis = 1)
    predict_x = tuned_model.predict(np.array(year_data)) 
    predictions = predict_x

    df_pred = pd.DataFrame({'prediction':predictions}, index=year_data.index)
    df_pred = df_pred.replace({2:1, 1:-1}) # convert classes to buy hold sell
    dates = df_pred.index

    if year == 2022:
        end_date = "2022-09-02"
    else:
        end_date = str(year+1) + "-01-01"
    df_prices = yf.download("^GSPC", start=dates[0], end=end_date)[['Adj Close']]

    # create positions column
    positions = []
    prev = 0
    for i in range(len(df_pred)):
        positions.append(df_pred.iloc[i]['prediction'])
        # if df_pred.iloc[i]['prediction'] == 0:
        #     positions.append(prev)
        # else:
        #     prev = df_pred.iloc[i]['prediction']
        #     positions.append(prev)

    df_business = pd.DataFrame()
    df_business['stock_daily_log_return'] = np.log(df_prices /df_prices.shift(1))['Adj Close']
    df_business['prediction'] = df_pred['prediction']
    df_business['position'] = positions
    df_business['benchmark'] = 1 # long and hold strategy
    df_business["strategy_Returns"] = df_business["stock_daily_log_return"] * df_business["position"].shift(1)
    df_business["benchmark_Returns"] = df_business["stock_daily_log_return"] * df_business["benchmark"].shift(1)

    # Annual Mean Returns or Expected returns
    expected_strategy_annual_return = np.exp(df_business['strategy_Returns'].mean() * 252) - 1 
    expected_benchmark_annual_return = np.exp(df_business['benchmark_Returns'].mean() * 252) - 1 
    strategy_over_benchmark = expected_strategy_annual_return-expected_benchmark_annual_return
    business_metric_results.loc[year] = [expected_strategy_annual_return, expected_benchmark_annual_return, strategy_over_benchmark]
    # print(f'Expected Annual Returns: Strategy: {round(expected_strategy_annual_return*100, 2)}%  |  Stock: {round(expected_benchmark_annual_return*100, 2)}%')

business_metric_results



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed




Unnamed: 0,exp_strategy_annual_return,exp_benchmark_annual_return,strategy_over_benchmark
2016,-0.073302,0.107616,-0.180918
2017,-0.011202,0.185753,-0.196955
2018,0.109483,-0.070634,0.180118
2019,-0.110299,0.288443,-0.398742
2020,0.031884,0.152929,-0.121045
2021,-0.016053,0.28923,-0.305282
2022,0.274743,-0.249185,0.523928
