<a href="https://colab.research.google.com/github/tys203831/bursa-scraper/blob/main/notebook/github%20-%20bursa_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bursa Stock Scraper 

[Demo App here](https://tys203831-bursa-scraper-bursa-scraper-1okr1g.streamlitapp.com/)

## Input

In [None]:
rf: float = 4.00 # risk free rate (in percentage %) 
period: int = 5 # e.g. 5 = 5 years
interval: str = "1 day" #"1 week" # options = "1 day", "1 week", "1 month", 3 month""
confidence_level: float = 1.0 # confidence_level (in percentage %)
include_dividends: bool = False # include dividends into stock return calculation

## Setup

In [None]:
# no need amend, haven't worked perfectly as they should be 
skipna: bool = False # skip NaN values for annualized return & annualized standard deviation
exclude_warrant: bool = True

interval_dict = dict(zip(["1 day","1 week","1 month","3 month"], ["1d","1wk","1mo","3mo"]))
interval = interval_dict[interval]
confidence_level = confidence_level/100
rf = rf/100

In [None]:
pip install requests requests_html yfinance gspread statsmodels scipy bokeh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Step 1: connect to google spreadsheet
import google.auth
from google.colab import auth
import gspread

auth.authenticate_user()
credentials, project_id = google.auth.default()

gc = gspread.authorize(credentials)

gs = gc.open_by_url("https://docs.google.com/spreadsheets/d/1mJjBNTGAF3CLOi1aeJDUSWx6EvtRqB-RhJdvnBh1iAU/edit#gid=14691771")

## Retrieve Data & Data Cleaning

### (i) Get stock ticker from Bursa

In [None]:
# Step 4: Get stock tickers from i3investor and then upload to the google spreadsheet: "Bursa stock list" sheet
# start scraping stock list from i3investor
import pandas
import requests
import json

url = "https://klse.i3investor.com/wapi/web/stock/listing/datatables"
data = {"dtDraw":1,"start":0,"order":[{"column":1,"dir":"asc"}],"page":0,"size":100,"marketList":[],"sectorList":[],"subsectorList":[],"type":"","stockType":""}
retries = 2

session = requests.Session()
response = session.post(url, json=data)

df_stock_list = pandas.DataFrame()

#for idx in range(0,11,1):
while len(response.json()["data"]) != 0:
  try:
    response = session.post(url, json=data)
  
  except requests.exceptions.HTTPError as err:
    code = err.response.status_code

    if code in [429,500,502,503,504]:
      continue
    
    else:
      for retry in range(0,retries,1):
        response = session.post(url, json=data)
        print(response.raise_for_status())
    
  finally:
    df_stock_list= pandas.DataFrame(json.loads(response.text)["data"]) if df_stock_list.empty else pandas.concat((df_stock_list, pandas.DataFrame(json.loads(response.text)["data"])), axis=0)
    #workSheet.append_rows(df_stock_list_values, value_input_option='RAW', table_range ="A1")

    data["dtDraw"] +=1
    data["start"] +=100
    data["page"]+=1

# data cleaning for this stock ticker table, e.g., drop unwanted columns and reordering them
clean_df_stock_list = df_stock_list.drop([0,1,8,11], axis=1) # drop unwanted columns
clean_df_stock_list = clean_df_stock_list[[12,13,10,14,9, 2,3,4,5,6,7]] # reordering the columns
clean_df_stock_list.columns = ["STOCK_SYMBOL","STOCK CODE","SECTOR","SUBSECTOR", "MKT", "OPEN","LAST","CHG%","CHG","VOL","MKT CAP"] #rename the columns

clean_df_stock_list[["OPEN","LAST","CHG%","CHG","VOL","MKT CAP"]] \
= clean_df_stock_list[["OPEN","LAST","CHG%","CHG","VOL","MKT CAP"]].replace(
            {"<((\s|\w|[='-/])+)>":"",
                         "K": "E+03",
                         "M": "E+06",
                        "B": "E+09",}, regex=True)

clean_df_stock_list["MKT CAP"] = clean_df_stock_list["MKT CAP"].str.replace(" ","") #.map(pandas.eval).astype(float) # convert string (e.g., 897 M)
clean_df_stock_list["VOL"] = clean_df_stock_list["VOL"].str.replace(" ","") #.map(pandas.eval).astype(int)
clean_df_stock_list= clean_df_stock_list.apply(pandas.to_numeric, errors="ignore") 
print(clean_df_stock_list.dtypes)
clean_df_stock_list.sort_values("MKT CAP",ascending=False)


STOCK_SYMBOL     object
STOCK CODE       object
SECTOR           object
SUBSECTOR        object
MKT              object
OPEN            float64
LAST            float64
CHG%            float64
CHG             float64
VOL             float64
MKT CAP         float64
dtype: object


Unnamed: 0,STOCK_SYMBOL,STOCK CODE,SECTOR,SUBSECTOR,MKT,OPEN,LAST,CHG%,CHG,VOL,MKT CAP
61,MAYBANK,1155,FINANCIAL SERVICES,BANKING,MAIN,8.92,8.970,0.67,0.060,10501000.0,1.073710e+11
14,PBBANK,1295,FINANCIAL SERVICES,BANKING,MAIN,4.65,4.650,-0.21,-0.010,13102000.0,9.025972e+10
16,PCHEM,5183,INDUSTRIAL PRODUCTS & SERVICES,CHEMICALS,MAIN,8.76,8.750,-0.34,-0.030,6463000.0,7.000000e+10
15,IHH,5225,HEALTH CARE,HEALTH CARE PROVIDERS,MAIN,6.48,6.490,0.00,0.000,4811000.0,5.715122e+10
72,CIMB,1023,FINANCIAL SERVICES,BANKING,MAIN,5.44,5.450,0.37,0.020,14471000.0,5.708471e+10
...,...,...,...,...,...,...,...,...,...,...,...
75,MEITUAN-C12,058312,,,,0.63,0.630,8.62,0.050,20000.0,0.000000e+00
22,AGMO,0258,,,,,,,,,0.000000e+00
31,DSR,3055,,,,,,,,,0.000000e+00
46,PINGAN-C8,0527C8,,,,0.00,0.015,0.00,0.000,0.0,0.000000e+00


In [None]:
# data cleaning for this stock ticker table, e.g., drop unwanted columns and reordering them
clean_df_stock_list = df_stock_list.drop([0,1,8,11], axis=1) # drop unwanted columns
clean_df_stock_list = clean_df_stock_list[[12,13,10,14,9, 2,3,4,5,6,7]] # reordering the columns
clean_df_stock_list.columns = ["STOCK_SYMBOL","STOCK CODE","SECTOR","SUBSECTOR", "MKT", "OPEN","LAST","CHG%","CHG","VOL","MKT CAP"] #rename the columns

clean_df_stock_list[["OPEN","LAST","CHG%","CHG","VOL","MKT CAP"]] \
= clean_df_stock_list[["OPEN","LAST","CHG%","CHG","VOL","MKT CAP"]].replace(
            {"<((\s|\w|[='-/])+)>":"",
                         "K": "E+03",
                         "M": "E+06",
                        "B": "E+09",}, regex=True)

clean_df_stock_list["MKT CAP"] = clean_df_stock_list["MKT CAP"].str.replace(" ","") #.map(pandas.eval).astype(float) # convert string (e.g., 897 M)
clean_df_stock_list["VOL"] = clean_df_stock_list["VOL"].str.replace(" ","") #.map(pandas.eval).astype(int)
clean_df_stock_list= clean_df_stock_list.apply(pandas.to_numeric, errors="ignore") 
print(clean_df_stock_list.dtypes)
clean_df_stock_list.sort_values("MKT CAP",ascending=False)

STOCK_SYMBOL     object
STOCK CODE       object
SECTOR           object
SUBSECTOR        object
MKT              object
OPEN            float64
LAST            float64
CHG%            float64
CHG             float64
VOL             float64
MKT CAP         float64
dtype: object


Unnamed: 0,STOCK_SYMBOL,STOCK CODE,SECTOR,SUBSECTOR,MKT,OPEN,LAST,CHG%,CHG,VOL,MKT CAP
61,MAYBANK,1155,FINANCIAL SERVICES,BANKING,MAIN,8.92,8.970,0.67,0.060,10501000.0,1.073710e+11
14,PBBANK,1295,FINANCIAL SERVICES,BANKING,MAIN,4.65,4.650,-0.21,-0.010,13102000.0,9.025972e+10
16,PCHEM,5183,INDUSTRIAL PRODUCTS & SERVICES,CHEMICALS,MAIN,8.76,8.750,-0.34,-0.030,6463000.0,7.000000e+10
15,IHH,5225,HEALTH CARE,HEALTH CARE PROVIDERS,MAIN,6.48,6.490,0.00,0.000,4811000.0,5.715122e+10
72,CIMB,1023,FINANCIAL SERVICES,BANKING,MAIN,5.44,5.450,0.37,0.020,14471000.0,5.708471e+10
...,...,...,...,...,...,...,...,...,...,...,...
75,MEITUAN-C12,058312,,,,0.63,0.630,8.62,0.050,20000.0,0.000000e+00
22,AGMO,0258,,,,,,,,,0.000000e+00
31,DSR,3055,,,,,,,,,0.000000e+00
46,PINGAN-C8,0527C8,,,,0.00,0.015,0.00,0.000,0.0,0.000000e+00


In [None]:
# update stock code list that need to scrape

import re


stock_list = clean_df_stock_list["STOCK CODE"]
full_stock_list = ["^KLSE"]
full_stock_list.extend(stock_list)

if exclude_warrant: 
  full_stock_list = [stock for stock in full_stock_list if not bool(re.match(pattern="\d+[a-zA-Z]+",string=stock))]

print(full_stock_list)

['^KLSE', '03041', '0012', '7054', '5238', '7167', '7086', '2488', '03028', '7131', '0218', '5281', '7191', '9148', '7146', '0181', '6599', '5139', '5185', '5198', '7145', '0258', '7315', '7090', '0122', '0209', '5014', '2658', '7609', '5115', '5116', '2674', '058212', '1163', '0079', '5269', '1015', '5293', '0159', '5120', '03011', '7031', '6351', '7083', '0048', '4758', '0226', '6556', '5082', '9342', '03051', '5568', '5088', '5015', '6432', '0119', '7214', '7181', '7007', '5210', '5127', '0038', '1481', '0068', '7722', '7129', '4057', '0105', '7162', '03032', '6399', '0072', '8176', '5302', '7048', '5130', '7099', '03037', '8885', '5204', '7579', '6888', '5106', '7120', '2305', '5021', '7078', '03012', '0098', '7251', '4162', '5248', '6602', '0187', '6173', '5190', '9814', '7668', '8133', '7005', '5258', '0195', '6998', '0179', '5032', '3239', '3395', '5196', '4219', '1899', '5069', '0168', '9288', '7036', '6297', '5254', '5100', '5932', '9938', '7221', '2771', '0011', '7188', '1818

### (ii) Get stock & dividend

*   stock price, stock return

#### -- Get stock price history

In [None]:
# Step 2: create a function to get stock & index price history from yahoo finance
re
import pandas
import yfinance
from typing import Union

def getData(ticker_code: Union[str, list], period:str, interval:str) -> pandas.DataFrame:
  # get stock return dataframe of the Bursa stocks
  if isinstance(ticker_code, list):
    ticker_code = [str(ticker) + ".KL" if not str(ticker).endswith(".KL") and not ticker.startswith("^") else str(ticker) for ticker in ticker_code]
    ticker_code = " ".join(ticker_code)
  elif isinstance(ticker_code, str) and not ticker_code.endswith(".KL") and not ticker_code.startswith("^"): 
    ticker_code = ticker_code + ".KL"
  df_stock = yfinance.download(tickers=ticker_code, period=period, interval=interval, group_by="column")
  df_stock = pandas.DataFrame(df_stock["Adj Close"]).fillna(method="ffill", axis=0)
  return df_stock.reset_index()

def getReturn(df_stock: pandas.DataFrame) -> pandas.DataFrame:
  df_stock["Date"] = pandas.to_datetime(df_stock["Date"])
  df_stock_return = df_stock.set_index("Date").divide(df_stock.set_index("Date").shift(1)).sub(1) # df["A"].divide(df["A"].shift(1)) -1
  return df_stock_return[1:].reset_index()

import dateutil
import datetime

def filterDataBasedYear(df, period: int):
  # get stock history of previous n years (e.g. 2 years stock return data out of 5 years)
  n_years_from_today = datetime.date.today() - dateutil.relativedelta.relativedelta(years=+period)
  n_years_from_today = n_years_from_today.strftime("%Y-%m-%d")
  return df[df["Date"]>= n_years_from_today]

In [None]:
# Step 5: Download stock return dataframe of Bursa Malaysia stocks from yahoo finance using yfinance
#stock_list = workSheet.col_values(2)[1:]

stock_df = getData(ticker_code=full_stock_list, period=str(period)+"y", interval=interval)
stock_df.to_csv("stock_df.csv")
stock_df

[*********************100%***********************]  990 of 990 completed

9 Failed downloads:
- 03009.KL: No data found, symbol may be delisted
- 058312.KL: No data found, symbol may be delisted
- 054810.KL: No data found, symbol may be delisted
- 0258.KL: No data found, symbol may be delisted
- 0045.KL: No data found, symbol may be delisted
- 0259.KL: No data found, symbol may be delisted
- 058212.KL: No data found, symbol may be delisted
- 3055.KL: No data found, symbol may be delisted
- 054620.KL: No data found, symbol may be delisted


Unnamed: 0,Date,0001.KL,0002.KL,0005.KL,0006.KL,0007.KL,0008.KL,0010.KL,0011.KL,0012.KL,...,9814.KL,9822.KL,9873.KL,9881.KL,9938.KL,9946.KL,9954.KL,9962.KL,9997.KL,^KLSE
0,2017-08-16,0.174276,1.647284,0.075,0.35,0.625,0.621143,0.165,0.251474,1.228287,...,0.430,1.857921,0.618650,0.555330,0.248,0.432827,0.050699,0.390491,0.622442,1773.750000
1,2017-08-17,0.174276,1.638836,0.075,0.35,0.650,0.621143,0.165,0.251474,1.210224,...,0.460,1.850954,0.628316,0.564823,0.256,0.432827,0.057037,0.386003,0.636482,1776.310059
2,2017-08-18,0.169435,1.638836,0.080,0.35,0.650,0.617109,0.160,0.251474,1.201192,...,0.460,1.741801,0.628316,0.560077,0.256,0.427612,0.057037,0.377026,0.631801,1776.219971
3,2017-08-21,0.169435,1.689522,0.075,0.35,0.625,0.605009,0.165,0.251474,1.210224,...,0.460,1.681419,0.657315,0.550584,0.256,0.425874,0.048586,0.386003,0.631801,1771.619995
4,2017-08-22,0.169435,1.689522,0.075,0.35,0.600,0.613076,0.160,0.251474,1.183129,...,0.460,1.713932,0.676648,0.560077,0.252,0.429351,0.033799,0.377026,0.622442,1774.219971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,2022-08-10,1.230000,4.110000,0.090,1.01,0.025,0.365000,0.135,0.285000,0.860000,...,0.085,3.850000,0.495000,0.560000,0.185,0.160000,0.360000,0.280000,0.530000,1492.329956
1237,2022-08-11,1.350000,4.110000,0.095,1.01,0.025,0.370000,0.140,0.285000,0.865000,...,0.085,4.590000,0.500000,0.570000,0.185,0.165000,0.365000,0.280000,0.550000,1505.560059
1238,2022-08-12,1.370000,4.090000,0.090,1.01,0.025,0.370000,0.135,0.285000,0.860000,...,0.085,4.460000,0.500000,0.570000,0.185,0.160000,0.375000,0.285000,0.570000,1506.189941
1239,2022-08-15,1.400000,4.080000,0.090,1.01,0.020,0.380000,0.145,0.285000,0.855000,...,0.080,4.430000,0.495000,0.530000,0.180,0.155000,0.385000,0.275000,0.550000,1504.010010


#### -- Get beta and alpha using Linear Regression

In [None]:
# Step 3: create function to calculate beta using Linear Regression

import numpy
import pandas
from typing import Tuple, Union
import scipy
import logging
import scipy.stats

def calc_linregress_data(excess_stock_return:list, excess_index_return:list, confidence_level: float, nan_policy: str ="propagate") -> Tuple[float, float, float, float, float, str]:
  beta, intercept, rvalue, pvalue, stderr = scipy.stats.linregress(x=excess_index_return, y=excess_stock_return)
  rsquared = rvalue**2
  normality = normality_test(excess_stock_return, confidence_level)
  return beta, intercept, rsquared, pvalue, stderr, normality

def normality_test(excess_stock_return:list, confidence_level:float) -> Union[bool, str]:
  # The Kolmogorov-Smirnov test - to test normality of stock returns data
  # null hypothesis: the data sample is normal
  # Youtube link: https://www.youtube.com/watch?v=R-MBFCK3p9Q
  try:  
    k2, p = scipy.stats.kstest(excess_stock_return, scipy.stats.norm.cdf)
    if p > confidence_level: # if p-value greater than confidence level, accept null hypothesis 
      return "normal"
    elif p <= confidence_level: # if p-value lesser than confidence level, reject  null hypothesis
      return "not normal"
  except ValueError as error:
    logging.debug(error)

def getRegression(df: pandas.core.frame.DataFrame, period: int, interval:str, rf: float, confidence_level: float) -> pandas.core.frame.DataFrame:
  interval_dict = {"1d": 252,"1wk":52, "1mo":12, "3mo":4}
  df_ny = filterDataBasedYear(df, period).set_index("Date").sub((1+rf)**(1/interval_dict[interval]) -1)
  df_ny = df_ny.apply(lambda x: calc_linregress_data(x.values.tolist(), df_ny["^KLSE"].values.tolist(), confidence_level=confidence_level), axis=0)
  df_ny = df_ny.transpose().reset_index()
  df_ny.columns = ["STOCK CODE",f"BETA_{period}Y", f"INTERCEPT_{period}Y", f"R-SQUARED_{period}Y", f"P-VALUE_{period}Y", f"BETA STANDARD ERROR_{period}Y", f"NORMALITY TEST_{period}Y" ] 
  df_ny["STOCK CODE"] = df_ny["STOCK CODE"].replace({"[.]KL": ""}, regex=True)
  return df_ny

In [None]:
regression_df = getRegression(total_stock_return_df, period=period, interval=interval, rf=rf, confidence_level=confidence_level)
regression_df.sort_values(f"INTERCEPT_{period}Y", ascending=False)

Unnamed: 0,STOCK CODE,BETA_5Y,INTERCEPT_5Y,R-SQUARED_5Y,P-VALUE_5Y,BETA STANDARD ERROR_5Y,NORMALITY TEST_5Y
374,4316,15.962654,0.084795,0.001907,0.124293,10.378518,not normal
62,0091,-5.404349,0.058115,0.001367,0.193255,4.151702,not normal
381,4464,1.708991,0.036647,0.001965,0.118768,1.094773,not normal
86,0118,2.800721,0.016087,0.011328,0.000174,0.743643,not normal
111,0154,1.797769,0.014945,0.005584,0.008478,0.681846,not normal
...,...,...,...,...,...,...,...
615,5303,,,,,,
616,5305,,,,,,
617,5306,,,,,,
618,5308,,,,,,


#### -- Calculate Annualized Return and Standard Devation

In [None]:
import math

def getAnnualizedReturn(df:pandas.DataFrame, interval:str, calc_type:str="geometric", skipna:bool=False) -> float:
    """Params:
    interval : str
        Valid intervals: 1d, 1wk,1mo,3mo
    """
    annualized_return: float = None
    annualized_return_dict: dict = {"1d": 252,"1wk":52, "1mo":12, "3mo":4}
    if calc_type == "arithmetic":
        annualized_return: float = df.mean(skipna=skipna) * annualized_return_dict[interval]
    if calc_type == "geometric":
        annualized_return: float = df.add(1).cumprod(skipna=skipna).iloc[-1] ** (1/annualized_return_dict[interval]) -1
    return annualized_return

def getAnnualizedStdDeviation(df: pandas.DataFrame, interval:str, skipna:bool) -> float:
    """Params:
    interval : str
        Valid intervals: 1d,1wk,1mo,3mo
    """
    annualized_std_dict: dict = {"1d": 252,"1wk":52, "1mo":12, "3mo":4}
    annualized_std: float = df.std(skipna=skipna) * math.sqrt(annualized_std_dict[interval])
    return annualized_std

def getSkewness(df: pandas.DataFrame) -> float:
  # skewsness and kurtosis reference: https://www.analyticsvidhya.com/blog/2021/05/shape-of-data-skewness-and-kurtosis/
  skewness = scipy.stats.skew(df, nan_policy="propagate") # negative skewed return is preferred -> frequent small returns and less big losses -> Reference: https://corporatefinanceinstitute.com/resources/knowledge/other/negatively-skewed-distribution/#:~:text=Negatively%20Skewed%20Distribution%20in%20Finance&text=Although%20many%20finance%20theories%20and,and%20a%20few%20large%20losses.
  return skewness

def getPearsonKurtosis(df: pandas.DataFrame) -> float:
  pearson_kurtosis = scipy.stats.kurtosis(df, fisher=False, nan_policy="propagate") # Mesokurtic or normal distribution when kurtosis=3 
  return pearson_kurtosis

def getFisherKurtosis(df: pandas.DataFrame) -> float:
  fisher_kurtosis = scipy.stats.kurtosis(df, fisher=True, nan_policy="propagate") # unlike pearson kurtosis, already exclude 3
  return fisher_kurtosis

In [None]:
# Calculate Annualized return, standard deviation, skewness and kurtosis

import numpy, pandas
import scipy
import numpy

x_list = []
descriptive_df = pandas.DataFrame()

descriptive_df[f"annualized_return_of_equity_{period}Y"] = getAnnualizedReturn(filterDataBasedYear(total_stock_return_df, period=period).set_index("Date"), interval=interval, skipna=skipna, calc_type="geometric")
descriptive_df[f"annualized_standard_deviation_of_equity_{period}Y"] = getAnnualizedStdDeviation(filterDataBasedYear(total_stock_return_df, period =period).set_index("Date"), interval=interval, skipna=skipna)
descriptive_df[f"SKEWNESS_{period}Y"] = getSkewness(filterDataBasedYear(total_stock_return_df, period =period).set_index("Date"))
descriptive_df[f"PEARSON_KURTOSIS_{period}Y"] = getPearsonKurtosis(filterDataBasedYear(total_stock_return_df, period =period).set_index("Date"))
descriptive_df[f"FISHER_KURTOSIS_{period}Y"] = getFisherKurtosis(filterDataBasedYear(total_stock_return_df, period =period).set_index("Date"))

descriptive_df = descriptive_df.reset_index()
descriptive_df = descriptive_df.rename(columns={"index":"STOCK CODE"})
descriptive_df["STOCK CODE"] = descriptive_df["STOCK CODE"].replace("[.]KL", "", regex=True)

"""
df = df.replace(numpy.nan, "")

workSheet = gs.worksheet("Calculated")
workSheet.clear()
columns = ["Stock Code", "5Y Beta", "2Y Beta", "2Y Standard Deviation of Equity", "2Y Average Return of Equity"]
workSheet.append_row(columns)
workSheet.format("1:1", {"textFormat":{"bold":"True"}})

workSheet.append_rows(descriptive_df.values.tolist(), value_input_option='RAW', table_range="A1")
"""

descriptive_df.to_csv("descriptive_df.csv")
descriptive_df.sort_values(f"annualized_return_of_equity_{period}Y", ascending=False)

Unnamed: 0,STOCK CODE,annualized_return_of_equity_5Y,annualized_standard_deviation_of_equity_5Y,SKEWNESS_5Y,PEARSON_KURTOSIS_5Y,FISHER_KURTOSIS_5Y
75,0104,0.014571,5.930054,34.287794,1197.275294,1194.275294
797,7172,0.010762,0.438478,2.677799,21.506273,18.506273
93,0128,0.010324,0.516815,0.518058,11.527209,8.527209
695,6971,0.009668,0.624284,2.466141,18.308557,15.308557
28,0041,0.008286,0.860408,2.646789,21.747685,18.747685
...,...,...,...,...,...,...
615,5303,,,,,
616,5305,,,,,
617,5306,,,,,
618,5308,,,,,


financial_summary = pandas.DataFrame.from_dict(fov_multilevel_dict)
financial_summary

## Merge All DataFrame

In [None]:
# merge dataframes of `clean_df_stock_list`, `regression_df`, `descriptive_df`

from functools import reduce

main_dataframe = [clean_df_stock_list, regression_df, descriptive_df]

merged_df = reduce(lambda left, right: pandas.merge(left, right, on="STOCK CODE", how ="outer"), main_dataframe)

# then calculate other performance metrics such as SHARPE RATIO, Treynor Ratio, and Jensen Alpha
interval_dict: dict = {"1d": 252,"1wk":52, "1mo":12, "3mo":4}

merged_df[f"SHARPE_RATIO_{period}Y"] = merged_df[f"annualized_return_of_equity_{period}Y"].sub((1+rf)**(1/interval_dict[interval]) -1).divide(merged_df[f"annualized_standard_deviation_of_equity_{period}Y"])
merged_df[f"TREYNOR_RATIO_{period}Y"] = merged_df[f"annualized_return_of_equity_{period}Y"].sub((1+rf)**(1/interval_dict[interval]) -1).divide(merged_df[f"BETA_{period}Y"])

#beta_multiply_market_return: pandas.DataFrame = merged_df[f"BETA_{period}Y"].multiply(float(merged_df[merged_df["STOCK CODE"] == "^KLSE"][f"annualized_return_of_equity_{period}Y"]))
#merged_df[f"JENSEN_ALPHA_{period}Y"] = merged_df[f"annualized_return_of_equity_{period}Y"].sub((1+rf)**(1/interval_dict[interval]) -1).divide(beta_multiply_market_return)

merged_df.to_csv("merged_df.csv")
merged_df


Unnamed: 0,STOCK_SYMBOL,STOCK CODE,SECTOR,SUBSECTOR,MKT,OPEN,LAST,CHG%,CHG,VOL,...,P-VALUE_5Y,BETA STANDARD ERROR_5Y,NORMALITY TEST_5Y,annualized_return_of_equity_5Y,annualized_standard_deviation_of_equity_5Y,SKEWNESS_5Y,PEARSON_KURTOSIS_5Y,FISHER_KURTOSIS_5Y,SHARPE_RATIO_5Y,TREYNOR_RATIO_5Y
0,1TECH,03041,TECHNOLOGY,SOFTWARE,LEAP,0.000,0.200,0.00,0.000,0.0,...,,,,,,,,,,
1,3A,0012,CONSUMER PRODUCTS & SERVICES,FOOD& BEVERAGES,MAIN,0.865,0.860,0.58,0.005,101000.0,...,0.0,0.071685,not normal,-0.001413,0.324599,0.189911,9.316727,6.316727,-0.004834,-0.002169
2,AASIA,7054,PLANTATION,PLANTATION,MAIN,0.105,0.110,0.00,0.000,350000.0,...,0.0,0.164074,not normal,-0.002370,0.729139,2.050741,23.356902,20.356902,-0.003463,-0.002124
3,AAX,5238,CONSUMER PRODUCTS & SERVICES,"TRAVEL, LEISURE & HOSPITALITY",MAIN,0.500,0.490,-2.00,-0.010,93000.0,...,0.0,0.19516,not normal,-0.007937,0.884128,2.877602,28.800087,25.800087,-0.009153,-0.004084
4,ABFMY1,0800EA,EXCHANGE TRADED FUND-BOND,BOND FUND,ETF,1.178,1.178,0.00,0.000,1000.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,ZECON,7028,CONSTRUCTION,CONSTRUCTION,MAIN,0.385,0.385,-3.75,-0.015,3000.0,...,0.0,0.161662,not normal,-0.001378,0.714420,3.956915,56.703742,53.703742,-0.002147,-0.001536
1065,ZELAN,2283,CONSTRUCTION,CONSTRUCTION,MAIN,0.055,0.055,10.00,0.005,2000.0,...,0.0,0.24395,not normal,-0.003253,1.075250,1.589209,13.721931,10.721931,-0.003170,-0.00249
1066,ZENTECH,0094,TECHNOLOGY,SOFTWARE,ACE,0.025,0.025,25.00,0.005,1857000.0,...,0.00083,0.321833,not normal,-0.003785,1.407167,4.309839,63.543895,60.543895,-0.002800,-0.003654
1067,ZHULIAN,5131,CONSUMER PRODUCTS & SERVICES,CONSUMER SERVICES,MAIN,1.930,1.940,0.52,0.010,13000.0,...,0.0,0.058734,not normal,0.002208,0.262782,0.668821,16.509316,13.509316,0.007812,0.004176


## Aggregate Data

In [None]:
sector_overview_df = merged_df.groupby("SECTOR").agg({f"BETA_{period}Y": "mean",
                                    f"INTERCEPT_{period}Y": "mean",
                                    f"annualized_return_of_equity_{period}Y": "mean", 
                                    f"annualized_standard_deviation_of_equity_{period}Y": "mean",
                                    }).dropna().sort_values(f"INTERCEPT_{period}Y", ascending=False)
sector_overview_df.to_csv("sector_overview_df.csv")
sector_overview_df

Unnamed: 0_level_0,BETA_5Y,INTERCEPT_5Y,annualized_return_of_equity_5Y,annualized_standard_deviation_of_equity_5Y
SECTOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENERGY,1.483703,0.002854,-0.004119,1.393693
PLANTATION,1.004876,0.00245,1.7e-05,1.498689
TECHNOLOGY,1.225872,0.002191,-0.001576,1.018766
TELECOMMUNICATIONS & MEDIA,0.943271,0.001104,-0.003074,0.785227
HEALTH CARE,0.922184,0.000883,0.00079,0.531878
INDUSTRIAL PRODUCTS & SERVICES,0.844717,0.000795,-0.000957,0.640344
PROPERTY,0.636501,0.000638,-0.002233,0.59564
CONSUMER PRODUCTS & SERVICES,0.731389,0.00058,-0.00101,0.56632
TRANSPORTATION & LOGISTICS,0.769211,0.000493,-0.001861,0.595404
FINANCIAL SERVICES,0.74762,0.000371,-0.000433,0.359211


In [None]:
sub_sector_overview_df = merged_df.groupby(["SUBSECTOR","SECTOR"]).agg({f"BETA_{period}Y": "mean",
                                    f"INTERCEPT_{period}Y": "mean",
                                    f"annualized_return_of_equity_{period}Y": "mean", 
                                    f"annualized_standard_deviation_of_equity_{period}Y": "mean"
                                    }).dropna().sort_values(f"INTERCEPT_{period}Y", ascending=False)
sub_sector_overview_df.to_csv("sub_sector_overview_df.csv")
sub_sector_overview_df

Unnamed: 0_level_0,Unnamed: 1_level_0,BETA_5Y,INTERCEPT_5Y,annualized_return_of_equity_5Y,annualized_standard_deviation_of_equity_5Y
SUBSECTOR,SECTOR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"ENERGY INFRASTRUCTURE, EQUIPMENT & SERVICES",ENERGY,1.426321,0.003294,-0.004614,1.538961
TELECOMMUNICATIONS EQUIPMENT,TELECOMMUNICATIONS & MEDIA,0.998049,0.003227,-0.002393,1.234194
TECHNOLOGY EQUIPMENT,TECHNOLOGY,1.255456,0.00264,-0.001876,1.220029
SOFTWARE,TECHNOLOGY,1.226374,0.002454,-0.003303,1.161937
PLANTATION,PLANTATION,1.004876,0.00245,1.7e-05,1.498689
OTHER ENERGY RESOURCES,ENERGY,2.214791,0.002365,-0.00079,1.196608
DIGITAL SERVICES,TECHNOLOGY,1.237004,0.001859,-0.000982,0.866736
SEMICONDUCTORS,TECHNOLOGY,1.159915,0.001431,0.001579,0.630184
HEALTH CARE EQUIPMENT & SERVICES,HEALTH CARE,1.255985,0.00134,0.000796,0.703614
INDUSTRIAL SERVICES,INDUSTRIAL PRODUCTS & SERVICES,0.796005,0.001103,-0.001669,0.765788


## Chart Plotting

In [None]:
# Return vs Standard Deviation

from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, PanTool, ZoomInTool, ZoomOutTool, WheelZoomTool, ResetTool, SaveTool
from bokeh.palettes import Category20c
from bokeh.transform import factor_cmap

output_notebook()
# output_file("toolbar.html")

sector_list = ['TECHNOLOGY', 'CONSUMER PRODUCTS & SERVICES', 'PLANTATION',
      'EXCHANGE TRADED FUND-BOND', 'INDUSTRIAL PRODUCTS & SERVICES',
      'FINANCIAL SERVICES', 'PROPERTY', 'CONSTRUCTION', 'HEALTH CARE',
      '', 'TRANSPORTATION & LOGISTICS', 'ENERGY',
      'REAL ESTATE INVESTMENT TRUSTS', 'TELECOMMUNICATIONS & MEDIA', 'UTILITIES']

normalized: bool = False
filter_sector: bool= True

def normalize_data(df: pandas.core.frame.DataFrame, x:str, y:str):
  x = df[x]; y = df[y]
  df = df[x.between(x.quantile(0.05), x.quantile(0.95)) & y.between(y.quantile(0.05), y.quantile(0.95))]
  return df

def filter_value(df: pandas.core.frame.DataFrame, column: str, unique_val_in_col: list):
  return df[df[column].isin(unique_val_in_col)]

def plot_fig(df:pandas.core.frame.DataFrame, x:str, y:str):
  fig = figure(height=800,width=1000,tools="hover",  toolbar_location="above", 
              x_axis_label=x, y_axis_label=y,
              tooltips=[("STOCK","@STOCK_SYMBOL"),("SUBSECTOR", "@SUBSECTOR"), ("SECTOR", "@SECTOR"),
                        ("EXCESS RETURN", f"@annualized_return_of_equity_{period}Y"),  
                        ("STANDARD DEVIATION", f"@annualized_standard_deviation_of_equity_{period}Y")])

  fig.add_tools(PanTool(), ZoomInTool(), ZoomOutTool(), WheelZoomTool(), ResetTool(), SaveTool())

  adjusted_df = df.copy(deep=True)

  if filter_sector:
    adjusted_df = filter_value(df, "SECTOR", sector_list)
  if normalized:
    adjusted_df = normalize_data(df,x=x,y=y)

  source = ColumnDataSource(data=adjusted_df)

  fig.scatter(source=source, y=y, x=x, legend_field="SECTOR", 
              fill_color=factor_cmap("SECTOR", Category20c[14], factors=adjusted_df["SECTOR"].unique()),
              line_color=None, size=10)

  fig.add_layout(fig.legend[0], 'right')
  return fig

show(plot_fig(df=merged_df, x = f"annualized_standard_deviation_of_equity_{period}Y", y=f"annualized_return_of_equity_{period}Y"))

