# Track TSMC with SVM, added diff to 5d avg

### Libraries

In [4]:
# Import Libraries
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression  # or RandomForestRegressor etc.
from sklearn.metrics import accuracy_score  # or mean_squared_error for regression
from datetime import time
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  # SVM Classifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


## Introduce Data from: 2330, S&P500, VIX, SOX, TSM

In [56]:
# Load Data
data = yf.download(["2330.TW" ,"^GSPC", "^VIX", "^SOX", "TSM"], period = '10y')

# deal with multi-index column
data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in data.columns]
# print(data.head())

# fill the n.a. data
data = data.ffill()

# Copy and store
df = data[['Open_2330.TW']].copy()
df['diff'] = data['Close_2330.TW'] - data['Open_2330.TW']

# Calculate diff last night
df['Prev_Close'] = data['Close_2330.TW'].shift(1)  # yesterday's close
df['diff_overnight'] = data['Open_2330.TW'] - df['Prev_Close']  # today's open minus yesterday's close

# Get S&P500 last day's volume and % and volitility last day
df['SP500_pct_change'] = data['Close_^GSPC'].pct_change() * 100
df['SP500_pct_change_lastday'] = df['SP500_pct_change'].shift(1)

# Get S&P500 5d avg
df['SP500_5d_avg'] = data['Close_^GSPC'].rolling(5).mean()
df['SP500_diff_to_5d_avg'] = data['Close_^GSPC'] - df['SP500_5d_avg']
# df['SP500_5d_avg_lastday'] = df['SP500_5d_avg'].shift(1)
df['SP500_diff_to_5d_avg_lastday'] = df['SP500_diff_to_5d_avg'].shift(1)

df['Volume_^GSPC_lastday'] = data['Volume_^GSPC'].shift(1)
df['Close_^VIX_lastday'] = data['Close_^VIX'].shift(1)

df['SOX_pct_change'] = data['Close_^SOX'].pct_change() * 100
df['SOX_pct_change_lastday'] = df['SOX_pct_change'].shift(1)
# df['Volume_^SOX_lastday'] = data['Volume_^SOX'].shift(1)

df['TSM_pct_change'] = data['Close_TSM'].pct_change() * 100
df['TSM_pct_change_lastday'] = df['TSM_pct_change'].shift(1)
df['Volume_TSM_lastday'] = data['Volume_TSM'].shift(1)


# drop unwanted columns
df = df.drop(columns = ['Open_2330.TW', 'Prev_Close', 'SP500_pct_change', 'SP500_5d_avg', 'SP500_diff_to_5d_avg', 'SOX_pct_change', 'TSM_pct_change'], axis=1)
print(df.head())

[*********************100%***********************]  5 of 5 completed

                diff  diff_overnight  SP500_pct_change_lastday  \
Date                                                             
2015-05-26 -1.488498             NaN                       NaN   
2015-05-27 -0.372124       -0.744277                       NaN   
2015-05-28  0.000000        1.488495                  0.916264   
2015-05-29 -0.744249        0.000017                 -0.126676   
2015-06-01 -0.372124       -0.372138                 -0.631847   

            SP500_diff_to_5d_avg_lastday  Volume_^GSPC_lastday  \
Date                                                             
2015-05-26                           NaN                   NaN   
2015-05-27                           NaN          3.342130e+09   
2015-05-28                           NaN          3.127960e+09   
2015-05-29                           NaN          2.980350e+09   
2015-06-01                           NaN          3.927390e+09   

            Close_^VIX_lastday  SOX_pct_change_lastday  \
Date            




## Scale the Volume

In [59]:
# Scale the volume
scaler = StandardScaler()
df['SP500_lastday_Volume_scaled'] = scaler.fit_transform(df[['Volume_^GSPC_lastday']])
df['TSM_lastday_Volume_scaled'] = scaler.fit_transform(df[['Volume_TSM_lastday']])
df = df.drop(columns = ['Volume_^GSPC_lastday', 'Volume_TSM_lastday'], axis=1)  # features



## Map the target into classes(1, 0, -1)

In [62]:
# Turn target 'diff' from real number to 0, 1, -1
conditions = [
    df['diff'] > 0.4,
    df['diff'].between(-0.4, 0.4),
    df['diff'] < -0.4
]
choices = [1, 0, -1]
df['label'] = np.select(conditions, choices)
print(df.head())

                diff  diff_overnight  SP500_pct_change_lastday  \
Date                                                             
2015-05-26 -1.488498             NaN                       NaN   
2015-05-27 -0.372124       -0.744277                       NaN   
2015-05-28  0.000000        1.488495                  0.916264   
2015-05-29 -0.744249        0.000017                 -0.126676   
2015-06-01 -0.372124       -0.372138                 -0.631847   

            SP500_diff_to_5d_avg_lastday  Close_^VIX_lastday  \
Date                                                           
2015-05-26                           NaN                 NaN   
2015-05-27                           NaN               14.06   
2015-05-28                           NaN               13.27   
2015-05-29                           NaN               13.31   
2015-06-01                           NaN               13.84   

            SOX_pct_change_lastday  TSM_pct_change_lastday  \
Date                      

## Assign Labels Features
### (S&P lastnight, vol; 2330 diff overnight; VIX lastnight; TSM lasnight changes, vol; SOX lastnight changes

In [65]:
# Clean the df
df_clean = df.dropna()

# Split your dataset first
X = df_clean.drop(columns = ['label', 'diff'], axis=1)  # features
y = df_clean['label']  # target
print(X.head())


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scale features (recommended for SVM and others)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

            diff_overnight  SP500_pct_change_lastday  \
Date                                                   
2015-06-02       -0.372115                  0.205946   
2015-06-03       -0.372123                 -0.100860   
2015-06-04       -0.744239                  0.211887   
2015-06-05       -1.116372                 -0.862317   
2015-06-08       -0.372138                 -0.143618   

            SP500_diff_to_5d_avg_lastday  Close_^VIX_lastday  \
Date                                                           
2015-06-02                     -1.787988               13.97   
2015-06-03                     -4.997900               14.24   
2015-06-04                      1.354053               13.66   
2015-06-05                    -11.885938               14.71   
2015-06-08                    -11.983984               14.21   

            SOX_pct_change_lastday  TSM_pct_change_lastday  \
Date                                                         
2015-06-02                0.089889

## Train the Model with SVM

In [70]:
model = SVC(kernel='rbf', C=1, gamma='auto')  # Common setup
model.fit(X_train, y_train)

## Evaluate the Result

In [73]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.45


In [75]:
print(y_train.value_counts(normalize=True))

label
-1    0.438495
 1    0.416305
 0    0.145200
Name: proportion, dtype: float64


In [77]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.46      0.62      0.53       227
           0       0.00      0.00      0.00        76
           1       0.43      0.43      0.43       216

    accuracy                           0.45       519
   macro avg       0.30      0.35      0.32       519
weighted avg       0.38      0.45      0.41       519



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Try finding Best-performing Hyper-perimeter

In [80]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'poly', 'linear'],
    'gamma': ['scale', 'auto', 0.01, 0.001]
}

grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

Best parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best score: 0.4597206216169024


## Potential Idea for features

In [None]:
# ideas: volumn change(1d - 5d.avg), 