<a href="https://colab.research.google.com/github/tys203831/bursa-scraper/blob/main/Bursa_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
# --- analysis parameters ---
rf: float = 0.04  # risk free rate
period: int = 5  # look-back window in years, e.g. 5 = 5 years
interval: str = "1wk"  # sampling interval requested for the price history
confidence_level: float = 0.01  # threshold the normality-test p-value is compared against

# --- switches ---
skipna: bool = False  # skip NaN values for annualized return & annualized standard deviation
include_dividends: bool = True
exclude_warrant: bool = True

In [None]:
pip install requests requests_html yfinance gspread statsmodels scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Retrieve Data & Data Cleaning

### (i) Get stock ticker from Bursa

In [None]:
# Step 4: Get stock tickers from i3investor and then upload to the google spreadsheet: "Bursa stock list" sheet

import requests
import time
from requests.exceptions import HTTPError
import pandas
import json
import re

# start scraping stock list from i3investor
# The listing endpoint is paged: each POST returns up to data["size"] rows and an
# empty "data" list once the listing is exhausted.
url = "https://klse.i3investor.com/wapi/web/stock/listing/datatables"
data = {"dtDraw":1,"start":0,"order":[{"column":1,"dir":"asc"}],"page":0,"size":100,"marketList":[],"sectorList":[],"subsectorList":[],"type":"","stockType":""}
retries = 2  # extra attempts allowed per page when the server answers with a transient status
transient_status_codes = {429, 500, 502, 503, 504}

session = requests.Session()
pages = []  # one DataFrame per non-empty page, concatenated once at the end

while True:
  # Fetch the current page, retrying transient failures a bounded number of times.
  for attempt in range(retries + 1):
    try:
      response = session.post(url, json=data)
      # requests only raises HTTPError from raise_for_status(); without this call
      # an error response would be parsed as if it were listing data.
      response.raise_for_status()
      break
    except HTTPError as err:
      code = err.response.status_code
      if code not in transient_status_codes or attempt == retries:
        raise  # permanent error, or transient error that outlived the retry budget
      time.sleep(attempt + 1)  # short, growing pause before asking again

  rows = response.json()["data"]
  if not rows:
    break  # an empty page marks the end of the listing

  pages.append(pandas.DataFrame(rows))
  #workSheet.append_rows(df_stock_list_values, value_input_option='RAW', table_range ="A1")

  # advance the paging cursor to the next page
  data["dtDraw"] += 1
  data["start"] += data["size"]
  data["page"] += 1

# ignore_index gives every row a unique label instead of restarting at 0 on each page
df_stock_list = pandas.concat(pages, axis=0, ignore_index=True) if pages else pandas.DataFrame()

# print out df_stock_list
df_stock_list

#df_stock_list[5] = df_stock_list[5].apply(lambda text: re.sub("<((\s|\w|[='-/])+)>","",text))
#df_stock_list_values = df_stock_list[[12,13,2,3,4,5,6,7,9,10,14]].values.tolist()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1708,"<div class="""" style=""min-width:250px""><div cla...",0.00,0.20,0.00,<span class='up fw-bold'>0.00</span>,0,15.39 M,,LEAP,TECHNOLOGY,,1TECH,03041,SOFTWARE
1,1,"<div class="""" style=""min-width:250px""><div cla...",0.865,0.865,0.00,<span class='up fw-bold'>0.00</span>,0.009 M,425.58 M,,MAIN,CONSUMER PRODUCTS & SERVICES,,3A,0012,FOOD& BEVERAGES
2,3,"<div class="""" style=""min-width:250px""><div cla...",0.00,0.10,0.00,<span class='up fw-bold'>0.00</span>,0,65.998 M,,MAIN,PLANTATION,,AASIA,7054,PLANTATION
3,4,"<div class="""" style=""min-width:250px""><div cla...",0.495,0.495,0.00,<span class='up fw-bold'>0.00</span>,0.031 M,205.333 M,,MAIN,CONSUMER PRODUCTS & SERVICES,,AAX,5238,"TRAVEL, LEISURE & HOSPITALITY"
4,5,"<div class="""" style=""min-width:250px""><div cla...",1.178,1.178,0.77,<span class='up fw-bold'>+0.009</span>,0.002 M,1652.642 M,,ETF,EXCHANGE TRADED FUND-BOND,,ABFMY1,0800EA,BOND FUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,4050,"<div class="""" style=""min-width:250px""><div cla...",0.205,0.205,0.00,<span class='up fw-bold'>0.00</span>,0.868 M,76.291 M,,ACE,CONSUMER PRODUCTS & SERVICES,,YXPM,0250,PERSONAL GOODS
65,1200,"<div class="""" style=""min-width:250px""><div cla...",0.00,0.38,0.00,<span class='up fw-bold'>0.00</span>,0,56.013 M,,MAIN,CONSTRUCTION,,ZECON,7028,CONSTRUCTION
66,1201,"<div class="""" style=""min-width:250px""><div cla...",0.045,0.055,0.00,<span class='up fw-bold'>0.00</span>,0.524 M,46.471 M,,MAIN,CONSTRUCTION,,ZELAN,2283,CONSTRUCTION
67,472,"<div class="""" style=""min-width:250px""><div cla...",0.025,0.02,-20.00,<span class='down fw-bold'>-0.005</span>,0.667 M,16.169 M,,ACE,TECHNOLOGY,,ZENTECH,0094,SOFTWARE


In [None]:
# data cleaning for this stock ticker table, e.g., drop unwanted columns and reordering them
# Selecting the wanted columns in their final order also discards the unused ones (0, 1, 8, 11).
clean_df_stock_list = df_stock_list[[12,13,10,14,9, 2,3,4,5,6,7]].copy()
clean_df_stock_list.columns = ["STOCK SYMBOL","STOCK CODE","SECTOR","SUBSECTOR", "MKT", "OPEN","LAST","CHG%","CHG","VOL","MKT CAP"] #rename the columns

# Some cells arrive wrapped in HTML (e.g. "<span class='up fw-bold'>0.00</span>"); strip the tags.
html_wrapped_columns = ["OPEN","LAST","CHG%","CHG","MKT CAP"]
clean_df_stock_list[html_wrapped_columns] = clean_df_stock_list[html_wrapped_columns].replace(
    {r"<((\s|\w|[='-/])+)>": ""}, regex=True)

# "MKT CAP" is text such as "897 M"; expand the K/M/B suffix arithmetically.
# This replaces the former pandas.eval call so scraped text is never evaluated as an expression.
suffix_multipliers = {"": 1, "K": 1e3, "M": 1e6, "B": 1e9}
market_cap_pattern = re.compile(r"\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*([KMB]?)\s*")

def _market_cap_to_int(text) -> int:
  """Convert one market-cap cell (e.g. "15.39 M", "0") to a whole number.

  Raises ValueError when the cell is not a number with an optional K/M/B suffix.
  """
  match = market_cap_pattern.fullmatch(str(text).replace(",", ""))
  if match is None:
    raise ValueError(f"unrecognised MKT CAP value: {text!r}")
  number, suffix = match.groups()
  # round before the int cast so float error (e.g. 65997999.99999999) is not truncated downwards
  return int(round(float(number) * suffix_multipliers[suffix]))

clean_df_stock_list["MKT CAP"] = clean_df_stock_list["MKT CAP"].map(_market_cap_to_int).astype(int)

# Convert only the price/change columns. A blanket to_numeric over every column could turn
# an all-digit "STOCK CODE" column into integers and drop its leading zeros.
# NOTE(review): "VOL" is left as scraped text (e.g. "0.009 M"), as before - confirm that is intended.
for price_column in ["OPEN","LAST","CHG%","CHG"]:
  try:
    clean_df_stock_list[price_column] = pandas.to_numeric(clean_df_stock_list[price_column])
  except (ValueError, TypeError):
    # same outcome as the deprecated errors="ignore": the column stays as text
    pass

clean_df_stock_list

Unnamed: 0,STOCK SYMBOL,STOCK CODE,SECTOR,SUBSECTOR,MKT,OPEN,LAST,CHG%,CHG,VOL,MKT CAP
0,1TECH,03041,TECHNOLOGY,SOFTWARE,LEAP,0.000,0.200,0.00,0.000,0,15390000
1,3A,0012,CONSUMER PRODUCTS & SERVICES,FOOD& BEVERAGES,MAIN,0.865,0.865,0.00,0.000,0.009 M,425580000
2,AASIA,7054,PLANTATION,PLANTATION,MAIN,0.000,0.100,0.00,0.000,0,65998000
3,AAX,5238,CONSUMER PRODUCTS & SERVICES,"TRAVEL, LEISURE & HOSPITALITY",MAIN,0.495,0.495,0.00,0.000,0.031 M,205333000
4,ABFMY1,0800EA,EXCHANGE TRADED FUND-BOND,BOND FUND,ETF,1.178,1.178,0.77,0.009,0.002 M,1652642000
...,...,...,...,...,...,...,...,...,...,...,...
64,YXPM,0250,CONSUMER PRODUCTS & SERVICES,PERSONAL GOODS,ACE,0.205,0.205,0.00,0.000,0.868 M,76291000
65,ZECON,7028,CONSTRUCTION,CONSTRUCTION,MAIN,0.000,0.380,0.00,0.000,0,56013000
66,ZELAN,2283,CONSTRUCTION,CONSTRUCTION,MAIN,0.045,0.055,0.00,0.000,0.524 M,46471000
67,ZENTECH,0094,TECHNOLOGY,SOFTWARE,ACE,0.025,0.020,-20.00,-0.005,0.667 M,16169000


In [None]:
# update stock code list that need to scrape

import re


# Codes to download: the ^KLSE index first, then every scraped stock code.
stock_list = clean_df_stock_list["STOCK CODE"]
full_stock_list = ["^KLSE"]
full_stock_list.extend(stock_list)

if exclude_warrant:
  # Drop every code that starts with digits immediately followed by letters.
  # NOTE(review): this also removes non-warrant codes of that shape
  # (the listing shows the ETF "0800EA") - confirm that is acceptable.
  # Raw string: "\d" in a plain literal is an invalid escape sequence.
  suffixed_code = re.compile(r"\d+[a-zA-Z]+")
  full_stock_list = [stock for stock in full_stock_list if suffixed_code.match(str(stock)) is None]

print(full_stock_list)

['^KLSE', '03041', '0012', '7054', '5238', '7167', '7086', '2488', '03028', '7131', '0218', '5281', '7191', '9148', '7146', '0181', '6599', '5139', '5185', '5198', '7145', '0258', '7315', '7090', '0122', '0209', '5014', '2658', '7609', '5115', '5116', '2674', '058212', '1163', '0079', '5269', '1015', '5293', '0159', '5120', '03011', '7031', '6351', '7083', '0048', '4758', '0226', '6556', '5082', '9342', '03051', '5568', '5088', '5015', '6432', '0119', '7214', '7181', '7007', '5210', '5127', '0038', '1481', '0068', '7722', '7129', '4057', '0105', '7162', '03032', '6399', '0072', '8176', '5302', '7048', '5130', '7099', '03037', '8885', '5204', '7579', '6888', '5106', '7120', '2305', '5021', '7078', '03012', '0098', '7251', '4162', '5248', '6602', '0187', '6173', '5190', '9814', '7668', '8133', '7005', '5258', '0195', '6998', '0179', '5032', '3239', '3395', '5196', '4219', '1899', '5069', '0168', '9288', '7036', '6297', '5254', '5100', '5932', '9938', '7221', '2771', '0011', '7188', '1818

### (ii) Get stock & dividend

*   stock price, stock return
*   dividend

#### -- Get stock price history

In [None]:
# Step 2: create a function to get stock & index price history from yahoo finance

import pandas
import yfinance
from typing import Union

def getData(ticker_code: Union[str, list], period:str, interval:str) -> pandas.DataFrame:
  """Download adjusted close prices from Yahoo Finance and return them as weekly means.

  Params:
  ticker_code : str or list
    One ticker or a list of tickers. ".KL" is appended to every ticker that
    neither ends with ".KL" nor starts with "^" (index symbols).
  period : str
    Look-back period forwarded to yfinance.download (the notebook passes e.g. "5y").
  interval : str
    Sampling interval forwarded to yfinance.download.

  Returns a DataFrame built from the "Adj Close" data, forward-filled, resampled
  to weekly ("W") means regardless of `interval`, with the index moved back into
  a column.
  """
  def _as_yahoo_symbol(ticker) -> str:
    # str() first, so non-string tickers (e.g. ints) no longer fail on .startswith
    symbol = str(ticker)
    if symbol.startswith("^") or symbol.endswith(".KL"):
      return symbol
    return symbol + ".KL"

  if isinstance(ticker_code, list):
    # yfinance accepts several tickers as one space-separated string
    ticker_code = " ".join(_as_yahoo_symbol(ticker) for ticker in ticker_code)
  elif isinstance(ticker_code, str):
    ticker_code = _as_yahoo_symbol(ticker_code)

  df_stock = yfinance.download(tickers=ticker_code, period=period, interval=interval, group_by="column")
  # .ffill() replaces the deprecated fillna(method="ffill")
  df_stock = pandas.DataFrame(df_stock["Adj Close"]).ffill(axis=0).resample("W").mean()
  return df_stock.reset_index()

def getReturn(df_stock: pandas.DataFrame) -> pandas.DataFrame:
  """Return period-over-period percentage changes for every price column.

  Params:
  df_stock : pandas.DataFrame
    Price table with a "Date" column plus one column per ticker.

  Returns a new DataFrame with "Date" as a column again and the first row
  (which has no previous price to compare with) removed. The input frame is
  left untouched.
  """
  # assign() works on a copy, so the caller's "Date" column is not overwritten
  prices = df_stock.assign(Date=pandas.to_datetime(df_stock["Date"])).set_index("Date")
  df_stock_return = prices.pct_change()
  return df_stock_return.iloc[1:].reset_index()

import dateutil
import datetime

def filterDataBasedYear(df, period: int):
  """Keep the rows of `df` whose "Date" is no older than `period` years before today.

  Params:
  df : pandas.DataFrame
    Table with a "Date" column.
  period : int
    Number of years to look back from today's date (e.g. 2 keeps the last 2 years).

  Returns the filtered rows; the cut-off date itself is included.
  """
  # pandas.DateOffset is used so the function does not rely on the
  # dateutil.relativedelta submodule being loaded by a bare `import dateutil`.
  cutoff = pandas.Timestamp.today().normalize() - pandas.DateOffset(years=period)
  # compare against the ISO date string, as before
  return df[df["Date"] >= cutoff.strftime("%Y-%m-%d")]

In [None]:
# Step 5: Download stock return dataframe of Bursa Malaysia stocks from yahoo finance using yfinance
#stock_list = workSheet.col_values(2)[1:]

# The look-back is given to getData as a string such as "5y".
stock_df = getData(ticker_code=full_stock_list, period=f"{period}y", interval=interval)
stock_df

[*********************100%***********************]  990 of 990 completed

9 Failed downloads:
- 058312.KL: No data found, symbol may be delisted
- 058212.KL: No data found, symbol may be delisted
- 0045.KL: No data found, symbol may be delisted
- 054620.KL: No data found, symbol may be delisted
- 0259.KL: No data found, symbol may be delisted
- 054810.KL: No data found, symbol may be delisted
- 3055.KL: No data found, symbol may be delisted
- 03009.KL: No data found, symbol may be delisted
- 0258.KL: No data found, symbol may be delisted


Unnamed: 0,Date,0001.KL,0002.KL,0005.KL,0006.KL,0007.KL,0008.KL,0010.KL,0011.KL,0012.KL,...,9814.KL,9822.KL,9873.KL,9881.KL,9938.KL,9946.KL,9954.KL,9962.KL,9997.KL,^KLSE
0,2017-08-13,0.159753,1.638836,0.075000,0.300,0.525000,0.588875,0.160000,0.251474,1.228287,...,0.430,1.806828,0.604150,0.550584,0.244000,0.422398,0.048586,0.390491,0.636482,1766.959961
1,2017-08-20,0.169435,1.638836,0.080000,0.350,0.650000,0.617109,0.160000,0.251474,1.201192,...,0.460,1.741801,0.628316,0.560077,0.256000,0.427612,0.057037,0.377026,0.631801,1776.219971
2,2017-08-27,0.164594,1.647284,0.145000,0.380,0.650000,0.588875,0.175000,0.251474,1.165066,...,0.460,1.688386,0.671815,0.555330,0.252000,0.417183,0.033799,0.377026,0.622442,1769.170044
3,2017-09-03,0.164594,1.638836,0.150000,0.450,0.625000,0.572742,0.170000,0.278979,1.147003,...,0.445,1.646583,0.671815,0.545838,0.244000,0.403277,0.033799,0.372538,0.613081,1773.160034
4,2017-09-10,0.159753,1.638836,0.235000,0.415,0.700000,0.536441,0.185000,0.247545,1.156034,...,0.430,1.597812,0.691148,0.612287,0.248000,0.396324,0.033799,0.372538,0.570961,1779.900024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2022-07-17,1.110000,3.950000,0.085000,1.010,0.045000,0.355000,0.110000,0.275000,0.840000,...,0.090,2.766553,0.456905,0.520000,0.180000,0.195000,0.330000,0.285000,0.525000,1418.439941
258,2022-07-24,1.180000,4.000000,0.105000,1.010,0.045000,0.365000,0.130000,0.285000,0.865000,...,0.090,3.053089,0.476143,0.570000,0.175000,0.165670,0.355000,0.280000,0.535000,1465.800049
259,2022-07-31,1.220000,4.100000,0.105000,1.010,0.050000,0.365000,0.125000,0.300000,0.875000,...,0.090,3.790000,0.505000,0.570000,0.190000,0.180000,0.335000,0.285000,0.540000,1492.229980
260,2022-08-07,1.210000,4.200000,0.095000,1.010,0.025000,0.365000,0.125000,0.290000,0.865000,...,0.090,4.440000,0.480952,0.560000,0.185000,0.160000,0.375000,0.285000,0.530000,1501.550049


#### -- Get dividend history

In [None]:
# Step 6: Download dividend dataframe of Bursa Malaysia stocks from yahoo finance using yfinance

#stock_list = workSheet.col_values(2)[1:]
import yfinance
import pandas 
import threading
from typing import Union, List, Dict
import datetime

# Shared state for the threaded dividend download.
multilevel_dict = dict()  # filled by dividend_scraper: ticker -> its dividend mapping
threads = list()  # started Thread objects, kept so they can be joined afterwards

def bursa_ticker_amend(ticker_code: Union[list, str]) -> Union[List[str], str]:
  """Append the ".KL" suffix to Bursa tickers that do not carry it yet.

  Params:
  ticker_code : list or str
    One ticker or a list of tickers. Tickers that already end with ".KL" or
    start with "^" (index symbols) are left unchanged.

  Returns a list of strings for list input and a string for string input;
  any other input is returned as-is.
  """
  def _amend(ticker) -> str:
    # str() first, so non-string tickers (e.g. ints) no longer fail on .startswith
    symbol = str(ticker)
    if symbol.startswith("^") or symbol.endswith(".KL"):
      return symbol
    return symbol + ".KL"

  if isinstance(ticker_code, list):
    return [_amend(ticker) for ticker in ticker_code]
  if isinstance(ticker_code, str):
    return _amend(ticker_code)
  return ticker_code

# scrape dividend history from yahoo finance
def dividend_scraper(stock: str) -> Dict[str, Dict[datetime.datetime ,float]]:
  """Fetch the dividend history of `stock` and store it in the shared `multilevel_dict`.

  On success the entry `multilevel_dict[stock]` is set and the whole shared
  dictionary is returned. When an AttributeError occurs a message is printed
  and nothing is returned (None).
  """
  try:
    history = yfinance.Ticker(stock).get_dividends()
    multilevel_dict[stock] = history.to_dict()
    return multilevel_dict

  except AttributeError:
    print("No data found! No dividends are distributed!")

if include_dividends:
  # One thread per ticker: start them all, then wait for every one to finish.
  full_bursa_stock_list = bursa_ticker_amend(full_stock_list)

  for bursa_ticker in full_bursa_stock_list:
    worker = threading.Thread(target=dividend_scraper, args=[bursa_ticker])
    worker.start()
    threads.append(worker)

  for worker in threads:
    worker.join()

- 0258.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 058212.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 054810.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 3055.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 0045.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 058312.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 03009.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 0259.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!
- 054620.KL: No data found, symbol may be delisted
No data found! No dividends are distributed!


#### -- Merge dataframe of stock price & dividend history

In [None]:
# Add the scraped dividend amounts to the weekly price table.
if include_dividends:
  # one column per ticker, one row per dividend date; the date index becomes a "Date" column
  dividend_history = pandas.DataFrame.from_dict(multilevel_dict)
  dividend_history = dividend_history.reset_index()
  dividend_history = dividend_history.rename(columns= {"index":"Date"})

  # create a daily date range series spanning the first to the last date in stock_df
  datetable = pandas.date_range(start=min(stock_df["Date"]), end = max(stock_df["Date"]), freq="D").to_series(name="Date")

  # merge two dataframes: datetable and dividend_history (left join keeps every calendar day)
  # NOTE(review): the join needs both "Date" columns to be comparable - confirm the dividend
  # timestamps use the same timezone-awareness as the dates in stock_df.
  sorted_dividend_history = pandas.merge(datetable, dividend_history, on="Date", how="left")

  # align the columns with stock_df (tickers without dividend data become all-NaN) and
  # treat every missing dividend as 0
  new_sorted_dividend_history= sorted_dividend_history.copy(deep=True).reset_index()
  new_sorted_dividend_history["Date"] = pandas.to_datetime(new_sorted_dividend_history["Date"])
  new_sorted_dividend_history = new_sorted_dividend_history.reindex(columns=stock_df.columns).fillna(0)
  new_sorted_dividend_history

  # NOTE(review): stock_df has weekly rows while the dividend table has daily rows, and .add()
  # is called without fill_value, so days missing from stock_df presumably become NaN and are
  # skipped by the weekly mean - confirm that dividends dated on other days are not lost.
  # NOTE(review): the prices come from "Adj Close"; confirm that adding dividends on top
  # does not count them twice.
  stock_with_div_df = stock_df.set_index("Date").add(new_sorted_dividend_history.set_index("Date"))
  stock_with_div_df = stock_with_div_df.resample("W").mean().reset_index()
  stock_with_div_df

#### -- Get stock return (including dividend)

In [None]:
# Returns are computed from the dividend-adjusted table only when dividends are enabled.
total_stock_return_df = getReturn(stock_with_div_df if include_dividends else stock_df)
total_stock_return_df

Unnamed: 0,Date,0001.KL,0002.KL,0005.KL,0006.KL,0007.KL,0008.KL,0010.KL,0011.KL,0012.KL,...,9814.KL,9822.KL,9873.KL,9881.KL,9938.KL,9946.KL,9954.KL,9962.KL,9997.KL,^KLSE
0,2017-08-20,0.060606,0.000000,0.066667,0.166667,0.238095,0.047945,0.000000,0.000000,-0.022059,...,0.069767,-0.035990,0.040000,0.017241,0.049180,0.012346,0.173926,-0.034483,-0.007353,0.005241
1,2017-08-27,-0.028571,0.005155,0.812500,0.085714,0.000000,-0.045752,0.093750,0.000000,-0.030075,...,0.000000,-0.030667,0.069231,-0.008475,-0.015625,-0.024390,-0.407421,0.000000,-0.014815,-0.003969
2,2017-09-03,0.000000,-0.005128,0.034483,0.184211,-0.038462,-0.027397,-0.028571,0.109375,-0.015504,...,-0.032609,-0.024759,0.000000,-0.017094,-0.031746,-0.033333,0.000000,-0.011905,-0.015038,0.002255
3,2017-09-10,-0.029412,0.000000,0.566667,-0.077778,0.120000,-0.063380,0.088235,-0.112676,0.007874,...,-0.033708,-0.029619,0.028777,0.121739,0.016393,-0.017241,0.000000,0.000000,-0.068702,0.003801
4,2017-09-17,0.060606,-0.025773,0.000000,-0.012048,0.035714,-0.037594,-0.027027,0.111111,-0.031250,...,0.023256,0.005814,-0.013986,0.108527,0.048387,-0.017544,0.750036,-0.012048,-0.016394,0.003613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,2022-07-17,-0.017699,0.007653,-0.150000,0.000000,0.000000,0.014286,-0.120000,0.000000,-0.017544,...,0.000000,-0.060403,-0.040404,-0.018868,0.000000,0.000000,0.000000,0.017857,-0.009434,-0.005155
257,2022-07-24,0.063063,0.012658,0.235294,0.000000,0.000000,0.028169,0.181818,0.036364,0.029762,...,0.000000,0.103571,0.042105,0.096154,-0.027778,-0.150410,0.075758,-0.017544,0.019048,0.033389
258,2022-07-31,0.033898,0.025000,0.000000,0.000000,0.111111,0.000000,-0.038462,0.052632,0.011561,...,0.000000,0.241366,0.060606,0.000000,0.085714,0.086497,-0.056338,0.017857,0.009346,0.018031
259,2022-08-07,-0.008197,0.024390,-0.095238,0.000000,-0.500000,0.000000,0.000000,-0.033333,-0.011429,...,0.000000,0.171504,-0.047619,-0.017544,-0.026316,-0.111111,0.119403,0.000000,-0.018519,0.006246


#### -- Get beta and alpha using Linear Regression

In [None]:
# Step 3: create function to calculate beta using Linear Regression

import numpy
import pandas
from typing import Tuple, Union
import scipy
import statsmodels.api as sm
import logging

def calc_beta(excess_stock_return:list, excess_index_return:list, confidence_level: float) -> Tuple[float, float, float, float, float, str]:
  """Regress the stock's excess returns on the index's excess returns.

  Params:
  excess_stock_return : list
    Dependent variable (y) of the linear regression.
  excess_index_return : list
    Independent variable (x) of the linear regression.
  confidence_level : float
    Threshold handed to normality_test.

  Returns (beta, intercept, r-squared, p-value, standard error, normality),
  where beta is the regression slope and normality is whatever
  normality_test returns for the stock's excess returns.
  """
  fit = scipy.stats.linregress(x=excess_index_return, y=excess_stock_return)
  return (
      fit.slope,
      fit.intercept,
      fit.rvalue**2,
      fit.pvalue,
      fit.stderr,
      normality_test(excess_stock_return, confidence_level),
  )

""" statsmodels library to calculate beta, intercept, etc 
def calc_beta(excess_stock_return:list, excess_index_return:list):
  X1 = sm.add_constant(data=excess_index_return)
  model = sm.OLS(endog=excess_stock_return, exog=X1)
  results = model.fit()
  beta, intercept, rsquared, pvalue, stderr = (results.params[1], results.params[0], results.rsquared, results.pvalues[1], results.bse[1])
  return beta, intercept, rsquared, pvalue, stderr
"""

def normality_test(excess_stock_return:list, confidence_level:float, nan_policy="propagate") -> Union[bool, str, None]:
  """Run D'Agostino's K-squared test on excess return data.

  The test's null hypothesis is that the sample comes from a normal
  distribution, so a p-value ABOVE the threshold means normality is not rejected.

  Params:
  excess_stock_return : list
    Sample to test.
  confidence_level : float
    Threshold the p-value is compared against (it acts as the significance level).
  nan_policy : str
    Forwarded to scipy.stats.normaltest.

  Returns "normal" when normality is not rejected (p > threshold), False when
  it is rejected (p <= threshold), and None when the p-value is NaN or the
  test raises ValueError (which is logged at debug level).
  """
  try:
    _, p = scipy.stats.normaltest(excess_stock_return, nan_policy=nan_policy)
    if p > confidence_level:
      return "normal"
    if p <= confidence_level:
      return False
    return None  # p is NaN: neither comparison holds
  except ValueError as error:
    logging.debug(error)
    return None

def getRegression(df: pandas.core.frame.DataFrame, period: int, rf: float, confidence_level: float) -> pandas.core.frame.DataFrame:
  """Regress every column's excess return on the ^KLSE excess return.

  Params:
  df : DataFrame
    Return table with a "Date" column, a "^KLSE" column and one column per ticker.
  period : int
    Number of most recent years kept for the regression; also used in the column names.
  rf : float
    Value subtracted from every return before regressing.
  confidence_level : float
    Threshold forwarded to calc_beta.

  Returns one row per column of `df`, with the ".KL" suffix removed from
  "STOCK CODE" and the regression outputs in period-suffixed columns.
  """
  # NOTE(review): rf is subtracted as-is from each periodic return; if rf is an annual rate
  # it presumably needs converting to the data's interval first - confirm.
  excess = filterDataBasedYear(df, period).set_index("Date").sub(rf)
  index_excess = excess["^KLSE"].values.tolist()  # same regressor for every column
  fitted = excess.apply(
      lambda column: calc_beta(column.values.tolist(), index_excess, confidence_level=confidence_level),
      axis=0,
  )
  table = fitted.transpose().reset_index()
  table.columns = ["STOCK CODE",f"BETA_{period}Y", f"INTERCEPT_{period}Y", f"R-SQUARED_{period}Y", f"P-VALUE_{period}Y", f"BETA STANDARD ERROR_{period}Y", f"NORMALITY TEST_{period}Y"]
  table["STOCK CODE"] = table["STOCK CODE"].replace({"[.]KL": ""}, regex=True)
  return table

  import pandas.util.testing as tm


In [None]:
# Fit the regressions, then display them ordered by intercept (largest first).
regression_df = getRegression(total_stock_return_df, period=period, rf=rf, confidence_level=confidence_level)
regression_df.sort_values(by=f"INTERCEPT_{period}Y", ascending=False)

Unnamed: 0,STOCK CODE,BETA_5Y,INTERCEPT_5Y,R-SQUARED_5Y,P-VALUE_5Y,BETA STANDARD ERROR_5Y,NORMALITY TEST_5Y
719,7045,3.11626,0.086377,0.096677,0.0,0.591894,normal
130,0174,2.199872,0.07993,0.005465,0.233951,1.843942,normal
29,0043,2.720588,0.074267,0.046508,0.00045,0.765437,normal
48,0072,2.744057,0.07171,0.099559,0.0,0.51278,normal
381,4464,1.785529,0.068192,0.008966,0.127043,1.166414,normal
...,...,...,...,...,...,...,...
615,5303,,,,,,
616,5305,,,,,,
617,5306,,,,,,
618,5308,,,,,,


#### -- Calculate Annualized Return and Standard Deviation

In [None]:
def getAnnualizedReturn(df, interval:str, type:str="geometric", skipna:bool=False):
  """Annualize the average periodic return of every column.

  Params:
  df : DataFrame
    Periodic returns, one column per asset.
  interval : str
    Sampling interval of `df`. Supported intervals: 1d, 1wk, 1mo, 3mo.
  type : str
    "arithmetic" for the plain mean or "geometric" for the geometric mean of (1 + return) minus 1.
  skipna : bool
    Whether NaN values are ignored when averaging.

  Returns the average periodic return multiplied by the number of periods
  per year (252, 52, 12 or 4).

  Raises:
  KeyError when `interval` is not supported.
  ValueError when `type` is neither "arithmetic" nor "geometric".
  """
  periods_per_year = {"1d": 252,"1wk":52, "1mo":12, "3mo":4}
  scale = periods_per_year[interval]  # fail fast on an unsupported interval

  if type == "arithmetic":
    periodic_return = df.mean(skipna=skipna)
  elif type == "geometric":
    growth = df.add(1)
    if skipna:
      periodic_return = growth.apply(lambda column: scipy.stats.gmean(column.dropna()), axis=0).sub(1)
    else:
      periodic_return = growth.apply(scipy.stats.gmean).sub(1)
  else:
    # previously an unknown type fell through and the raw returns were scaled and returned
    raise ValueError(f"type must be 'arithmetic' or 'geometric', got {type!r}")

  return periodic_return.multiply(scale)

def getAnnualizedStdDeviation(df, interval:str = interval, skipna=skipna):
  """Annualize the standard deviation of every column's periodic returns.

  Params:
  df : DataFrame
    Periodic returns, one column per asset.
  interval : str
    Sampling interval of `df`. Supported intervals: 1d, 1wk, 1mo, 3mo.
    Defaults to the notebook-level `interval` as it was when this function was defined.
  skipna : bool
    Whether NaN values are ignored. Defaults to the notebook-level `skipna`
    as it was when this function was defined.

  Returns the periodic standard deviation scaled by the square root of the
  number of periods per year.

  Raises:
  KeyError when `interval` is not supported.
  """
  periods_per_year = {"1d": 252,"1wk":52, "1mo":12, "3mo":4}
  periodic_std = df.std(skipna=skipna)
  # Variance grows linearly with the number of periods, so the standard deviation
  # grows with its square root; multiplying by the period count overstated it.
  return periodic_std.multiply(periods_per_year[interval] ** 0.5)

## Merge All DataFrames

In [None]:
# merge dataframes of `clean_df_stock_list`, `regression_df`, `descriptive_df`

from functools import reduce

# NOTE(review): descriptive_df is not created anywhere in the visible part of this notebook -
# confirm the cell that builds it has been run first.
main_dataframe = [clean_df_stock_list, regression_df, descriptive_df]

# Left-join everything onto the stock listing, keyed by "STOCK CODE".
merged_df = (
    clean_df_stock_list
    .merge(regression_df, on="STOCK CODE", how ="left")
    .merge(descriptive_df, on="STOCK CODE", how ="left")
)
merged_df.sort_values(by=f"INTERCEPT_{period}Y", ascending=False)

Unnamed: 0,STOCK SYMBOL,STOCK CODE,SECTOR,SUBSECTOR,MKT,OPEN,LAST,CHG%,CHG,VOL,MKT CAP,BETA_5Y,INTERCEPT_5Y,R-SQUARED_5Y,P-VALUE_5Y,BETA STANDARD ERROR_5Y,NORMALITY TEST_5Y,standard_deviation_of_equity_5Y,annualized_return_of_equity_5Y
839,SCOMIES,7045,ENERGY,"ENERGY INFRASTRUCTURE, EQUIPMENT & SERVICES",MAIN,0.000,0.025,0.00,0.000,0,11709000,3.11626,0.086377,0.096677,0.0,0.591894,normal,7.387408,-0.750296
275,EVD,0174,TECHNOLOGY,SOFTWARE,ACE,0.190,0.185,-2.63,-0.005,0.304 M,73568000,2.199872,0.07993,0.005465,0.233951,1.843942,normal,38.863415,0.492463
634,MTRONIC,0043,INDUSTRIAL PRODUCTS & SERVICES,INDUSTRIAL SERVICES,MAIN,0.055,0.055,0.00,0.000,32.008 M,84105000,2.720588,0.074267,0.046508,0.00045,0.765437,normal,15.100084,-1.208788
76,AT,0072,INDUSTRIAL PRODUCTS & SERVICES,"INDUSTRIAL MATERIALS, COMPONENTS & EQUIPMENT",ACE,0.010,0.010,-33.33,-0.005,0.03 M,60006000,2.744057,0.07171,0.099559,0.0,0.51278,normal,8.809609,-0.988343
742,PHB,4464,PROPERTY,PROPERTY,MAIN,0.000,0.010,0.00,0.000,0,108212000,1.785529,0.068192,0.008966,0.127043,1.166414,normal,15.197406,-0.682033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048,YB-LA,5048LA,INDUSTRIAL PRODUCTS & SERVICES,BUILDING MATERIALS,BOND,0.045,0.045,0.00,0.000,0.15 M,54330000,,,,,,,,
1050,YENHER,5300,CONSUMER PRODUCTS & SERVICES,AGRICULTURAL PRODUCTS,MAIN,0.800,0.800,0.00,0.000,0 M,240000000,,,,,,,,
1051,YEWLEE,0248,INDUSTRIAL PRODUCTS & SERVICES,"INDUSTRIAL MATERIALS, COMPONENTS & EQUIPMENT",ACE,0.265,0.260,-1.89,-0.005,1.227 M,138433000,,,,,,,,
1059,YONGTAI-PA,7066PA,PROPERTY,PROPERTY,MAIN,0.000,0.085,0.00,0.000,0,1452000,,,,,,,,


## Aggregate Data

In [None]:
# Per-sector averages of the four metrics, ordered by mean intercept (largest first).
sector_overview_df = (
    merged_df.groupby("SECTOR")
    .agg(dict.fromkeys(
        [
            f"BETA_{period}Y",
            f"INTERCEPT_{period}Y",
            f"annualized_return_of_equity_{period}Y",
            f"standard_deviation_of_equity_{period}Y",
        ],
        "mean",
    ))
    .dropna()
    .sort_values(f"INTERCEPT_{period}Y", ascending=False)
)
sector_overview_df

Unnamed: 0_level_0,BETA_5Y,INTERCEPT_5Y,annualized_return_of_equity_5Y,standard_deviation_of_equity_5Y
SECTOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENERGY,1.285268,0.018952,-0.217233,10.094581
TECHNOLOGY,1.231364,0.014597,-0.187891,7.569571
HEALTH CARE,1.005504,0.00461,-0.373515,3.150328
CONSTRUCTION,1.011877,0.000124,-0.119106,3.148551
INDUSTRIAL PRODUCTS & SERVICES,0.926662,-8.9e-05,0.0162,4.136353
TELECOMMUNICATIONS & MEDIA,0.936226,-0.000433,-0.181013,4.88854
TRANSPORTATION & LOGISTICS,0.869592,-0.003231,0.011599,4.22035
PLANTATION,0.832159,-0.004759,0.123334,2.62933
FINANCIAL SERVICES,0.828764,-0.005535,0.036985,2.235316
UTILITIES,0.81084,-0.006304,-0.036691,1.79308


In [None]:
# Per-subsector averages of the four metrics, ordered by mean intercept (largest first).
sub_sector_overview_df = (
    merged_df.groupby("SUBSECTOR")
    .agg(dict.fromkeys(
        [
            f"BETA_{period}Y",
            f"INTERCEPT_{period}Y",
            f"annualized_return_of_equity_{period}Y",
            f"standard_deviation_of_equity_{period}Y",
        ],
        "mean",
    ))
    .dropna()
    .sort_values(f"INTERCEPT_{period}Y", ascending=False)
)
sub_sector_overview_df

Unnamed: 0_level_0,BETA_5Y,INTERCEPT_5Y,annualized_return_of_equity_5Y,standard_deviation_of_equity_5Y
SUBSECTOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OTHER ENERGY RESOURCES,1.954564,0.046648,-0.370045,5.243031
OIL & GAS PRODUCERS,1.66714,0.028546,0.016695,4.429514
SOFTWARE,1.385653,0.020559,-0.322304,6.507558
HEALTH CARE EQUIPMENT & SERVICES,1.316023,0.019103,-0.865605,4.636583
TECHNOLOGY EQUIPMENT,1.318531,0.018123,-0.283561,5.334808
"ENERGY INFRASTRUCTURE, EQUIPMENT & SERVICES",1.165847,0.015045,-0.241467,11.339326
DIGITAL SERVICES,1.097248,0.009441,-0.092677,12.613479
INDUSTRIAL ENGINEERING,1.123913,0.007357,-0.058829,4.713028
METALS,1.098889,0.00668,0.226167,3.930723
SEMICONDUCTORS,0.976443,0.004449,0.053659,4.11938


In [None]:
from pathlib import Path

# Write the three result tables as CSV files, each stamped with the run parameters.
# The two earlier draft calls were removed: to_csv("") raised before any real export ran,
# and to_csv("../data/bursa") wrote an extension-less copy of merged_df.
# NOTE(review): the subsector file used "./data" while the others used "../data";
# all three now go to "../data" - confirm that is the intended folder.
output_dir = Path("../data")
output_dir.mkdir(parents=True, exist_ok=True)  # a missing folder would otherwise make to_csv fail
run_tag = f"p{period}_rf{rf}_int{interval}_cl{confidence_level}_exw{exclude_warrant}"

merged_df.to_csv(output_dir / f"bursa_companies_{run_tag}.csv")
sector_overview_df.to_csv(output_dir / f"sector_overview_{run_tag}.csv")
sub_sector_overview_df.to_csv(output_dir / f"subsector_overview_{run_tag}.csv")