<a href="https://colab.research.google.com/github/y-lims/DADS6003_Machine_Learning_Final_Project/blob/main/TISCO_Model/TISCO_Final_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

## to Install

In [None]:
# Install the packages this notebook imports that are not assumed to be present.
# pendulum is used further down to compute the download date range in a named timezone.
!pip install pendulum
# Download and unpack the TA-Lib 0.4.0 source tarball into the current directory.
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
# Build and install from inside the extracted directory with prefix /usr.
# NOTE(review): %cd changes the working directory and nothing in the visible
# notebook changes it back, so later relative paths would resolve inside ta-lib/.
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
# Python packages imported below as `talib` and `ta`.
# NOTE(review): Ta-Lib presumably needs the library built above — confirm before reordering.
!pip install Ta-Lib
!pip install ta

--2024-01-13 17:12:47--  http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
Resolving prdownloads.sourceforge.net (prdownloads.sourceforge.net)... 204.68.111.105
Connecting to prdownloads.sourceforge.net (prdownloads.sourceforge.net)|204.68.111.105|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz [following]
--2024-01-13 17:12:47--  http://downloads.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz
Resolving downloads.sourceforge.net (downloads.sourceforge.net)... 204.68.111.105
Reusing existing connection to prdownloads.sourceforge.net:80.
HTTP request sent, awaiting response... 302 Found
Location: http://gigenet.dl.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz [following]
--2024-01-13 17:12:47--  http://gigenet.dl.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz
Resolving gigenet.dl.sour

## to Import

In [None]:
# Import the Libraries

# Data Manipulation
import numpy as np
import pandas as pd
import pendulum
import scipy.stats as stats
from datetime import datetime, timedelta

# Technical Indicators
import talib
import ta

# Plotting graphs
import matplotlib.pyplot as plt
import seaborn as sns

# Standardization
from sklearn.preprocessing import MinMaxScaler

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_auc_score, confusion_matrix

# Data fetching
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override()

# Save model
import os
import joblib

import requests
import warnings

# Get Features Data

In [None]:
# Download 15-minute bars for every category and expose one DataFrame per
# category (bank, brent, cny, ...), each with the datetime index reset to a column.

# List of stock symbols (kept for reference; the download loop below iterates
# over `symbol_categories`, not over this list).
stock_symbols = ['TISCO.BK', 'ERW.BK', 'SPRC.BK',
                 'THB=X', 'CNY=X', 'GC=F', 'BZ=F', 'CL=F',
                 'BSET100.BK', 'TDEX.BK', 'TOP.BK', 'IRPC.BK', 'BCP.BK',
                 '^FINCIAL.BK', '^BANK.BK', '^SERVICE.BK', '^TOURISM.BK', '^RESOURC.BK', '^ENERG.BK']

# Mapping of category name -> ticker symbols downloaded for that category
symbol_categories = {
    'BANKING': ['^BANK.BK'],
    'BRENT': ['BZ=F'],
    'CNY': ['CNY=X'],
    'CRUDE': ['CL=F'],
    'ENERGY': ['^ENERG.BK'],
    'FINCIAL': ['^FINCIAL.BK'],
    'GOLD': ['GC=F'],
    'RESOURCE': ['^RESOURC.BK'],
    'SERVICE': ['^SERVICE.BK'],
    'SET50': ['TDEX.BK'],
    'SET100': ['BSET100.BK'],
    'TOURISM': ['^TOURISM.BK'],
    'USD': ['THB=X'],
    'ERW': ['ERW.BK'],
    'TISCO': ['TISCO.BK'],
    'SPRC': ['SPRC.BK'],
    'IRPC': ['IRPC.BK'],
    'TOP': ['TOP.BK'],
    'BCP': ['BCP.BK'],
}

# Date range: the 59 days ending today, evaluated once in Bangkok time so that
# start and end are derived from the same instant.
# NOTE(review): 59 days is presumably chosen to stay inside the data provider's
# intraday history limit for 15m bars — confirm.
bkk_tz = 'Asia/Bangkok'
now_bkk = pendulum.now(bkk_tz)
end_date = now_bkk.strftime('%Y-%m-%d')
start_date = (now_bkk - pendulum.duration(days=59)).strftime('%Y-%m-%d')
interval = '15m'

# One list of downloaded frames per category, keyed by the category name.
# A dict replaces the previous lookup of list variables through locals().
frames_by_category = {category: [] for category in symbol_categories}

for category, symbols in symbol_categories.items():
    print(f"\nProcessing data for category: {category}")

    for symbol in symbols:
        try:
            hist_data = yf.download(symbol, start=start_date, end=end_date, interval=interval)
            # Keep only bars whose index time-of-day is between 10:00 and 16:30.
            # NOTE(review): the filter uses whatever clock the returned index is
            # expressed in; verify that is the intended session for non-.BK tickers.
            hist_data = hist_data.between_time('10:00', '16:30')
        except Exception as e:
            print(f"Failed to download data for '{symbol}': {e}")
            continue

        if hist_data.empty:
            print(f"Warning: no rows between 10:00 and 16:30 for '{symbol}'")
        frames_by_category[category].append(hist_data)


def _combine_category(category):
    """Concatenate the frames downloaded for `category` and reset the index.

    Raises:
        ValueError: if every download for the category failed, so there is
            nothing to concatenate (pd.concat would raise a less specific
            ValueError in that case).
    """
    frames = frames_by_category[category]
    if not frames:
        raise ValueError(f"No data was downloaded for category '{category}'")
    return pd.concat(frames).reset_index()


# Convert each category to a DataFrame
bank = _combine_category('BANKING')
brent = _combine_category('BRENT')
cny = _combine_category('CNY')
crude = _combine_category('CRUDE')
energy = _combine_category('ENERGY')
fincial = _combine_category('FINCIAL')
gold = _combine_category('GOLD')
resource = _combine_category('RESOURCE')
service = _combine_category('SERVICE')
set50 = _combine_category('SET50')
set100 = _combine_category('SET100')
tour = _combine_category('TOURISM')
usd = _combine_category('USD')
erw = _combine_category('ERW')
tisco = _combine_category('TISCO')
sprc = _combine_category('SPRC')
irpc = _combine_category('IRPC')
top = _combine_category('TOP')
bcp = _combine_category('BCP')


Processing data for category: BANKING
[*********************100%%**********************]  1 of 1 completed

Processing data for category: BRENT
[*********************100%%**********************]  1 of 1 completed

Processing data for category: CNY
[*********************100%%**********************]  1 of 1 completed

Processing data for category: CRUDE
[*********************100%%**********************]  1 of 1 completed

Processing data for category: ENERGY
[*********************100%%**********************]  1 of 1 completed

Processing data for category: FINCIAL
[*********************100%%**********************]  1 of 1 completed

Processing data for category: GOLD
[*********************100%%**********************]  1 of 1 completed

Processing data for category: RESOURCE
[*********************100%%**********************]  1 of 1 completed

Processing data for category: SERVICE
[*********************100%%**********************]  1 of 1 completed

Processing data for category: SET50
[*

In [None]:
# Features (X): pull the price/volume Series out of each downloaded DataFrame.

# TISCO and the two peer stocks keep close and volume under new names.
tisco_close, tisco_vol = tisco['Close'], tisco['Volume']
erw_close, erw_vol = erw['Close'], erw['Volume']
sprc_close, sprc_vol = sprc['Close'], sprc['Volume']

# Every name below is REBOUND from a DataFrame to its 'Close' Series.
# NOTE(review): this makes the cell non-re-runnable (a second run indexes a
# Series with 'Close') and drops each frame's own 'Datetime' column.

# Currencies and gold
usd, cny = usd['Close'], cny['Close']
gold = gold['Close']

# Broad index trackers
set100, set50 = set100['Close'], set50['Close']

# Sector indices (the financial index is stored under the shorter name `fin`)
fin = fincial['Close']
bank = bank['Close']
service, tour = service['Close'], tour['Close']
resource, energy = resource['Close'], energy['Close']

# Oil benchmarks
brent, crude = brent['Close'], crude['Close']

# Refinery / energy stocks
bcp, top, irpc = bcp['Close'], top['Close'], irpc['Close']

# Define Variables (All Features + Technical Features)

In [None]:
## Set Y
# Close of the following bar. The last row has no following bar, so shift(-1)
# leaves NaN there; forward-filling copies the previous row's value into it.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` with the same result.
tisco['Close_Shift'] = tisco['Close'].shift(-1).ffill()

## Set Y Condition
# Signal is 1 when the next bar closes strictly higher than the current bar, else 0.
# NOTE(review): the final row's label is a by-product of the fill above rather
# than an observed outcome — consider excluding it from evaluation.
tisco['Signal'] = np.where(tisco['Close_Shift'] > tisco['Close'], 1, 0)

In [None]:
# Aliases used by the following cells:
#   tisco_signal - the 0/1 target column (Y)
#   ref_date     - TISCO's 'Datetime' column, used as the reference timestamps
tisco_signal, ref_date = tisco['Signal'], tisco['Datetime']

In [None]:
# Build one small DataFrame per variable, then left-join them all onto TISCO.

def _with_ref_dates(column_name, values):
    """Pair `values` with the TISCO reference timestamps under `column_name`.

    NOTE(review): pairing happens on the row index, not on each series' own
    timestamps — verify that every series has the same bars as TISCO.
    """
    return pd.DataFrame({'Datetime': ref_date, column_name: values})


tisco_df = pd.DataFrame({
    'Datetime': tisco['Datetime'],
    'Signal': tisco['Signal'],
    'tisco_close': tisco['Close'],
    'tisco_vol': tisco['Volume'],
})
usd_df = _with_ref_dates('usd_close', usd)
cny_df = _with_ref_dates('cny_close', cny)
set100_df = _with_ref_dates('set100_close', set100)
set50_df = _with_ref_dates('set50_close', set50)
gold_df = _with_ref_dates('gold_close', gold)
fin_df = _with_ref_dates('fincial_close', fin)
bank_df = _with_ref_dates('bank_close', bank)
brent_df = _with_ref_dates('brent_close', brent)
crude_df = _with_ref_dates('crude_close', crude)

# Left-join each feature frame onto the TISCO frame by 'Datetime', in this order.
tisco_merged = tisco_df
for feature_df in (usd_df, cny_df, set100_df, set50_df, gold_df,
                   fin_df, bank_df, brent_df, crude_df):
    tisco_merged = pd.merge(tisco_merged, feature_df, on='Datetime', how='left')

# Show the merged result as plain text
print(tisco_merged)

               Datetime  Signal  tisco_close  tisco_vol   gold_close  \
0   2023-11-16 10:00:00       1        97.25     107526  1985.699951   
1   2023-11-16 10:15:00       0        97.50      12823  1988.000000   
2   2023-11-16 10:30:00       0        97.25     188991  1987.099976   
3   2023-11-16 10:45:00       0        97.25     223111  1987.900024   
4   2023-11-16 11:00:00       0        97.25      98923  1991.000000   
..                  ...     ...          ...        ...          ...   
698 2024-01-12 15:15:00       0       101.00      32261  2065.300049   
699 2024-01-12 15:30:00       0       101.00      51658  2064.800049   
700 2024-01-12 15:45:00       0       101.00      87328  2064.399902   
701 2024-01-12 16:00:00       0       101.00      57612  2064.899902   
702 2024-01-12 16:15:00       0       100.50     630573  2065.100098   

     fincial_close  bank_close  
0       138.960007  373.209991  
1       139.229996  373.859985  
2       138.979996  372.890015  
3  

In [None]:
# Fill Null Values
# Carry the last known value forward into any gaps left by the left joins.
# `.ffill()` replaces the deprecated `fillna(method='ffill', inplace=True)`;
# rebinding the name gives the same filled frame without in-place mutation.
# Leading NaNs (no earlier value to carry) are left as NaN.
tisco_merged = tisco_merged.ffill()
tisco_merged

Unnamed: 0,Datetime,Signal,tisco_close,tisco_vol,gold_close,fincial_close,bank_close
0,2023-11-16 10:00:00,1,97.25,107526,1985.699951,138.960007,373.209991
1,2023-11-16 10:15:00,0,97.50,12823,1988.000000,139.229996,373.859985
2,2023-11-16 10:30:00,0,97.25,188991,1987.099976,138.979996,372.890015
3,2023-11-16 10:45:00,0,97.25,223111,1987.900024,138.679993,372.549988
4,2023-11-16 11:00:00,0,97.25,98923,1991.000000,138.720001,372.709991
...,...,...,...,...,...,...,...
698,2024-01-12 15:15:00,0,101.00,32261,2065.300049,139.220001,375.250000
699,2024-01-12 15:30:00,0,101.00,51658,2064.800049,139.009995,375.309998
700,2024-01-12 15:45:00,0,101.00,87328,2064.399902,138.820007,374.959991
701,2024-01-12 16:00:00,0,101.00,57612,2064.899902,138.649994,374.970001


In [None]:
# Work on a copy so the merged frame stays untouched by the added columns.
tisco_data = tisco_merged.copy()

# All indicators below are derived from the TISCO close (and volume for OBV).
close_prices = tisco_data['tisco_close']

# Adding Technical Features
tisco_data['ema'] = ta.trend.ema_indicator(close=close_prices, window=14)
tisco_data['rsi'] = ta.momentum.RSIIndicator(close=close_prices, window=14).rsi()
tisco_data['obv'] = ta.volume.OnBalanceVolumeIndicator(
    close=close_prices, volume=tisco_data['tisco_vol']
).on_balance_volume()

# Upper and lower Bollinger bands over a 20-bar window. The middle band was
# previously computed and thrown away into `_` (which also overwrote IPython's
# last-output variable), so it is no longer computed.
tisco_data['bb_upper'] = ta.volatility.bollinger_hband(close=close_prices, window=20)
tisco_data['bb_lower'] = ta.volatility.bollinger_lband(close=close_prices, window=20)

# Summary of columns, non-null counts and dtypes; windowed indicators have
# NaNs in their first rows, which the next cell fills.
tisco_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 703 entries, 0 to 702
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Datetime       703 non-null    datetime64[ns]
 1   Signal         703 non-null    int64         
 2   tisco_close    703 non-null    float64       
 3   tisco_vol      703 non-null    int64         
 4   gold_close     703 non-null    float64       
 5   fincial_close  703 non-null    float64       
 6   bank_close     703 non-null    float64       
 7   ema            690 non-null    float64       
 8   rsi            690 non-null    float64       
 9   obv            703 non-null    int64         
 10  bb_upper       684 non-null    float64       
 11  bb_lower       684 non-null    float64       
dtypes: datetime64[ns](1), float64(8), int64(3)
memory usage: 71.4 KB


In [None]:
# Model input columns, in the order they are passed to the scaler.
tisco_features = ['tisco_close', 'fincial_close', 'bank_close', 'ema', 'obv', 'bb_upper', 'bb_lower', 'gold_close', 'rsi']

# Replace remaining NaNs in each feature column with that column's median.
# Assigning the filled columns back replaces the chained
# `tisco_data[column].fillna(..., inplace=True)`, which operates on a temporary
# Series and does not update the frame when pandas Copy-on-Write is active.
# NOTE(review): the median is taken over the whole window, so early rows are
# filled using later observations.
feature_medians = tisco_data[tisco_features].median()
tisco_data[tisco_features] = tisco_data[tisco_features].fillna(feature_medians)

# Normalization

In [None]:
# Load the scaler
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts from a trusted source.
tisco_scaler = joblib.load("/content/tisco_scaler.pkl")

# Apply the loaded scaling as-is. The previous `fit_transform` call re-fitted
# the scaler on this prediction data, discarding whatever fit was persisted in
# the file and scaling the inputs differently from what the model saw.
# Assumes the scaler was saved already fitted on these columns in this order — TODO confirm.
tisco_scaled = tisco_scaler.transform(tisco_data[tisco_features])
tisco_scaled

array([[0.0625    , 0.56990657, 0.36643783, ..., 0.30043963, 0.10663258,
        0.47884966],
       [0.125     , 0.60189578, 0.39823792, ..., 0.30043963, 0.12594448,
        0.47884966],
       [0.0625    , 0.57227493, 0.35078339, ..., 0.30043963, 0.11838761,
        0.47884966],
       ...,
       [1.        , 0.55331897, 0.45205426, ..., 0.85851977, 0.76742088,
        0.61512977],
       [1.        , 0.5331752 , 0.45254398, ..., 0.87255394, 0.77161903,
        0.61512977],
       [0.875     , 0.54028572, 0.43786747, ..., 0.89204868, 0.77329993,
        0.42705447]])

In [None]:
# Inputs for the prediction step: scaled feature matrix and the 0/1 labels to score against.
X_predict, y_predict = tisco_scaled, tisco_signal

# Prediction

In [None]:
# Restore the persisted classifier.
# NOTE: joblib.load unpickles the file — only load artifacts from a trusted source.
tisco_model = joblib.load("/content/tisco_final_model.pkl")

# Hard class predictions and per-class probabilities for the same inputs.
y_pred = tisco_model.predict(X_predict)
y_prob = tisco_model.predict_proba(X_predict)



# AUC

In [None]:
# ROC AUC of the second probability column against the labels, scaled by 100
# (so the printed figure is on a 0-100 scale).
# NOTE(review): column 1 is presumably the probability of class 1 — confirm via tisco_model.classes_.
test_auc = 100 * roc_auc_score(y_predict, y_prob[:, 1])
print(f"AUC Score: {test_auc:.4f}")

AUC Score: 70.7311
