In [1]:
import requests
import os
from dotenv import load_dotenv
load_dotenv()
from supabase import create_client
import pandas as pd
import datetime
from datetime import datetime
import logging
import argparse
import numpy as np
from bs4 import BeautifulSoup
import json
from cleansing import clean_daily_foreign_data, clean_periodic_foreign_data

In [6]:
url_supabase = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")
supabase = create_client(url_supabase, key)

In [16]:
data_code = supabase.table("sgx_companies").select("*").execute()
data_code = pd.DataFrame(data_code.data)

In [17]:
data_code

Unnamed: 0,name,symbol,currency,close,market_cap,volume,pe,revenue,eps,beta,...,operating_margin,net_profit_margin,quick_ratio,current_ratio,debt_to_equity,dividend_yield_5y_avg,dividend_growth_rate,payout_ratio,sector,sub_sector
0,Keppel Corp,KPLM,SGD,6.600,1.192000e+10,1723700,2.90,6.970000e+09,2.280,0.755,...,,,,1.04,,,,0.1454,Consumer Cyclicals,Apparel & Luxury Goods
1,HK Land Holdings,HKLD,USD,3.220,7.080000e+09,787500,-12.17,1.840000e+09,-0.263,0.539,...,0.4139,-0.3157,0.46,1.70,0.2056,0.0561,,-0.8350,Properties & Real Estate,Properties & Real Estate
2,Jardine C&C,JCYC,SGD,28.630,1.132000e+10,930800,6.89,2.223000e+10,3.080,0.459,...,0.1396,0.0547,0.46,1.26,0.9411,,0.3246,0.3644,Consumer Cyclicals,Apparel & Luxury Goods
3,City Development,CTDM,SGD,5.270,4.730000e+09,1212500,15.77,4.940000e+09,0.336,0.841,...,0.1345,0.0642,0.86,1.77,1.3871,0.0253,-0.4286,0.3070,Properties & Real Estate,Properties & Real Estate
4,Semb Corp,SCIL,SGD,5.030,8.970000e+09,637100,9.53,7.040000e+09,0.528,0.698,...,0.1681,0.1338,0.63,0.78,1.6486,0.0251,0.8571,0.1699,Infrastructures,Utilities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,IHH Healthcare,IHHH,MYR,6.290,5.540000e+10,387800,23.83,2.175000e+10,0.264,0.143,...,0.1987,0.1071,0.73,0.84,0.3385,0.0102,0.7500,0.7600,Healthcare,Healthcare Equipment & Providers
590,KOP Ltd,KOPL,SGD,0.034,3.767000e+07,0,-11.33,8.072000e+07,-0.003,-0.098,...,0.0450,-0.0363,0.27,1.19,0.4984,,0.0000,,Properties & Real Estate,Properties & Real Estate
591,Sinostar PEC Holdings Ltd,SNSR,SGD,0.131,8.384000e+07,0,1.62,5.550000e+09,0.434,0.593,...,0.0795,0.0500,1.55,2.33,0.3683,0.0143,-1.0000,,Energy,"Oil, Gas & Coal"
592,Hengyang Petrochemical,HNYG,EUR,0.062,1.261000e+07,0,-6.28,8.600000e+04,-0.077,-0.263,...,-59.1860,-183.0116,11.92,11.92,0.0002,,0.0000,,Energy,"Oil, Gas & Coal"


In [12]:
def clean_daily_foreign_data(foreign_daily_data):
    """
    SGX/KLSE Daily Data Fetching Cleansing.

    Parameters:
    - foreign_daily_data: dataframe, KLSE/SGX daily data from investing.com api hit

    Returns:
    - foreign_daily_data: dataframe, Cleaned KLSE/SGX daily data
    """

    # Replace '-' data with ''
    foreign_daily_data = foreign_daily_data.replace('-',np.nan)

    # Remove percentage and change data to decimal
    for i in ["daily",'weekly','monthly','ytd','one_year','three_year']:
        foreign_daily_data[f"{i}_percentage_change"] = foreign_daily_data[f"{i}_percentage_change"]/100 

    # Rename columns
    foreign_daily_data.rename(columns={"daily_percentage_change":"change_1d", "weekly_percentage_change":'change_7d', 
                        "monthly_percentage_change":"change_1m", 
                        "ytd_percentage_change":"change_ytd",
                        "one_year_percentage_change":"change_1y",
                        "three_year_percentage_change":"change_3y"}, inplace=True)

    # Delete redundant percentage change columns
    foreign_daily_data.drop("percentage_change",axis=1, inplace = True)

    # Change data type to float
    float_columns = ['close', 'market_cap', 'volume','pe', 'revenue', 'beta','change_1d',
       'change_7d', 'change_1m', 'change_ytd', 'change_1y', 'change_3y',]

    foreign_daily_data[float_columns] = foreign_daily_data[float_columns].applymap(lambda x:float(str(x).replace(',', '')))

    return foreign_daily_data

def clean_periodic_foreign_data(foreign_periodic_data, foreign_sectors):
    """
    SGX/KLSE Periodic Data Fetching Cleansing.

    Parameters:
    - foreign_periodic_data: dataframe, periodic data from investing.com data scraping using request
    - foreign_sectors: dataframe, KLSE/SGX sectors mapping to IDX sectors

    Returns:
    - foreign_periodic_data: dataframe, Cleaned KLSE/SGX periodic data
    """

    # Replace '-' data with ''
    foreign_periodic_data = foreign_periodic_data.replace('-',np.nan)

    foreign_periodic_data['dividend_yield'] = foreign_periodic_data['dividend_yield'].apply(lambda x: float(x.strip('%')) / 100 if pd.notnull(x) else np.nan)

    for i in ["gross_margin","operating_margin",'net_profit_margin',"debt_to_equity","five_year_dividend_average",'dividend_growth_rate',"payout_ratio","five_year_eps_growth","five_year_sales_growth","five_year_capital_spending_growth"]:
        foreign_periodic_data[i] = foreign_periodic_data[i].apply(lambda x: float(x.replace('%', '').replace(',', '')) / 100 if pd.notnull(x) else np.nan)

    foreign_periodic_data.rename(columns={"five_year_dividend_average":"dividend_yield_5y_avg"}, inplace=True) 
    
    float_columns = ['eps', 'dividend', 'dividend_yield', 'pe_ttm', 'ps_ttm', 'pcf', 'pcf_ttm', 'pb', 'five_year_eps_growth',
       'five_year_sales_growth', 'five_year_capital_spending_growth',
       'asset_turnover', 'inventory turnover (ttm)', 'receivable_turnover',
       'gross_margin', 'operating_margin', 'net_profit_margin', 'quick_ratio',
       'current_ratio', 'debt_to_equity', 'dividend_yield_5y_avg',
       'dividend_growth_rate', 'payout_ratio']

    foreign_periodic_data[float_columns] = foreign_periodic_data[float_columns].applymap(lambda x:float(str(x).replace(',', '')))

    foreign_periodic_data = foreign_periodic_data.merge(foreign_sectors, on = ["sector",'industry']).drop(["sector",'industry'], axis=1).rename(columns={"sectors_id":"sector","sub_sector_id":"sub_sector"})

    return foreign_periodic_data

In [8]:
data = pd.read_csv("data_full_clean_sg.csv")
data.head()

Unnamed: 0,name,symbol,currency,sector,industry,close,percentage_change,market_cap,volume,pe,...,receivable_turnover,gross_margin,operating_margin,net_profit_margin,quick_ratio,current_ratio,debt_to_equity,five_year_dividend_average,dividend_growth_rate,payout_ratio
0,Keppel Corp,KPLM,SGD,Consumer Non-Cyclicals,Consumer Goods Conglomerates,6.6,+0.76%,11920000000.0,1723700,2.9,...,4.95,28.15%,-,-,-,1.04,-,-,-,14.54%
1,HK Land Holdings,HKLD,USD,Real Estate,Real Estate Operations,3.22,-0.92%,7080000000.0,787500,-12.17,...,56.44,50.46%,41.39%,-31.57%,0.46,1.7,20.56%,5.61%,-,-83.5%
2,Jardine C&C,JCYC,SGD,Consumer Non-Cyclicals,Consumer Goods Conglomerates,28.63,+1.60%,11320000000.0,930800,6.89,...,10.83,22.71%,13.96%,5.47%,0.46,1.26,94.11%,-,32.46%,36.44%
3,City Development,CTDM,SGD,Real Estate,Real Estate Operations,5.27,-0.19%,4730000000.0,1212500,15.77,...,1.79,33.36%,13.45%,6.42%,0.86,1.77,138.71%,2.53%,-42.86%,30.7%
4,Semb Corp,SCIL,SGD,Utilities,Multiline Utilities,5.03,+0.80%,8970000000.0,637100,9.53,...,5.18,22.34%,16.81%,13.38%,0.63,0.78,164.86%,2.51%,85.71%,16.99%


In [14]:
clean_daily_foreign_data(data).reset_index(drop = True)

  foreign_daily_data[float_columns] = foreign_daily_data[float_columns].applymap(lambda x:float(str(x).replace(',', '')))


Unnamed: 0,name,symbol,currency,sector,industry,close,market_cap,volume,pe,revenue,...,receivable_turnover,gross_margin,operating_margin,net_profit_margin,quick_ratio,current_ratio,debt_to_equity,five_year_dividend_average,dividend_growth_rate,payout_ratio
0,Keppel Corp,KPLM,SGD,Consumer Non-Cyclicals,Consumer Goods Conglomerates,6.600,1.192000e+10,1723700.0,2.90,6.970000e+09,...,4.95,28.15%,,,,1.04,,,,14.54%
1,HK Land Holdings,HKLD,USD,Real Estate,Real Estate Operations,3.220,7.080000e+09,787500.0,-12.17,1.840000e+09,...,56.44,50.46%,41.39%,-31.57%,0.46,1.7,20.56%,5.61%,,-83.5%
2,Jardine C&C,JCYC,SGD,Consumer Non-Cyclicals,Consumer Goods Conglomerates,28.630,1.132000e+10,930800.0,6.89,2.223000e+10,...,10.83,22.71%,13.96%,5.47%,0.46,1.26,94.11%,,32.46%,36.44%
3,City Development,CTDM,SGD,Real Estate,Real Estate Operations,5.270,4.730000e+09,1212500.0,15.77,4.940000e+09,...,1.79,33.36%,13.45%,6.42%,0.86,1.77,138.71%,2.53%,-42.86%,30.7%
4,Semb Corp,SCIL,SGD,Utilities,Multiline Utilities,5.030,8.970000e+09,637100.0,9.53,7.040000e+09,...,5.18,22.34%,16.81%,13.38%,0.63,0.78,164.86%,2.51%,85.71%,16.99%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,IHH Healthcare,IHHH,MYR,Healthcare,Healthcare Providers & Services,6.290,5.540000e+10,387800.0,23.83,2.175000e+10,...,8.15,33.82%,19.87%,10.71%,0.73,0.84,33.85%,1.02%,75%,76%
592,KOP Ltd,KOPL,SGD,Real Estate,Real Estate Operations,0.034,3.767000e+07,0.0,-11.33,8.072000e+07,...,0.89,27.21%,4.5%,-3.63%,0.27,1.19,49.84%,,0%,
593,Sinostar PEC Holdings Ltd,SNSR,SGD,Energy,Oil & Gas,0.131,8.384000e+07,0.0,1.62,5.550000e+09,...,339.74,8.9%,7.95%,5%,1.55,2.33,36.83%,1.43%,-100%,
594,Hengyang Petrochemical,HNYG,EUR,Energy,Oil & Gas Related Equipment and Services,0.062,1.261000e+07,0.0,-6.28,8.600000e+04,...,,100%,"-5,918.6%","-18,301.16%",11.92,11.92,0.02%,,0%,


In [9]:
def GetGeneralData(country):
    if country == "sg":
        url ="https://api.investing.com/api/financialdata/assets/equitiesByCountry/default?fields-list=id%2Cname%2Csymbol%2CisCFD%2Chigh%2Clow%2Clast%2ClastPairDecimal%2Cchange%2CchangePercent%2Cvolume%2Ctime%2CisOpen%2Curl%2Cflag%2CcountryNameTranslated%2CexchangeId%2CperformanceDay%2CperformanceWeek%2CperformanceMonth%2CperformanceYtd%2CperformanceYear%2Cperformance3Year%2CtechnicalHour%2CtechnicalDay%2CtechnicalWeek%2CtechnicalMonth%2CavgVolume%2CfundamentalMarketCap%2CfundamentalRevenue%2CfundamentalRatio%2CfundamentalBeta%2CpairType&country-id=36&filter-domain=&page=0&page-size=1000&limit=0&include-additional-indices=false&include-major-indices=false&include-other-indices=false&include-primary-sectors=false&include-market-overview=false"
    elif country == "my":
        url = "https://api.investing.com/api/financialdata/assets/equitiesByCountry/default?fields-list=id%2Cname%2Csymbol%2CisCFD%2Chigh%2Clow%2Clast%2ClastPairDecimal%2Cchange%2CchangePercent%2Cvolume%2Ctime%2CisOpen%2Curl%2Cflag%2CcountryNameTranslated%2CexchangeId%2CperformanceDay%2CperformanceWeek%2CperformanceMonth%2CperformanceYtd%2CperformanceYear%2Cperformance3Year%2CtechnicalHour%2CtechnicalDay%2CtechnicalWeek%2CtechnicalMonth%2CavgVolume%2CfundamentalMarketCap%2CfundamentalRevenue%2CfundamentalRatio%2CfundamentalBeta%2CpairType&country-id=42&filter-domain=&page=0&page-size=2000&limit=0&include-additional-indices=false&include-major-indices=false&include-other-indices=false&include-primary-sectors=false&include-market-overview=false"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    for i in range(10):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            json_data = response.json()
            data = pd.DataFrame(json_data["data"])
            break
        else:
            continue
    return data

In [10]:
def GetAdditionalData(links):
    data_list = []
    failed_links = {
        "links" : [],
        "page" : []
    }
    for link in links:
        try:
            data_dict = {
                "Url" : link
            }
            # Page Overview
            url = f"https://www.investing.com{link}"
            response = requests.get(url)
            if response.status_code == 200:
                html_content = response.text
                soup = BeautifulSoup(html_content, "html.parser")
                close = soup.find(class_ = "text-5xl/9 font-bold text-[#232526] md:text-[42px] md:leading-[60px]").get_text()
                change_percent = soup.find('span', {'data-test': 'instrument-price-change-percent'}).get_text().replace("(", "").replace(")", "")
                currency = soup.find(class_ = "ml-1.5 font-bold").get_text()
                values = soup.find_all(class_ = "flex flex-wrap items-center justify-between border-t border-t-[#e6e9eb] pt-2.5 sm:pb-2.5 pb-2.5")
                expected_values = ["Volume", "Market Cap", "Revenue", "P/E Ratio", "EPS",  "Dividend (Yield)"]
                data_dict["close"] = close
                data_dict["change_percent"] = change_percent
                data_dict["currency"] = currency
                for value in values:
                    value = value.get_text()
                    for expected_value in expected_values:
                        if expected_value in value:
                            if expected_value == "Dividend (Yield)":
                                value = value.replace(expected_value, "")
                                try:
                                    dividend, yields = value.split("(")
                                    data_dict["dividend"] = dividend
                                    data_dict["dividend_yield"] = yields.replace(")", "")
                                except:
                                    data_dict["dividend"] = "-"
                                    data_dict["dividend_yield"] = "-"
                            else:
                                data_dict[expected_value] = value.replace(expected_value, "")
                company_profile = soup.find(class_ = "mt-6 font-semibold md:mt-0")
                desired_infos = ["Industry", "Sector"]
                for info in company_profile:
                    info = info.get_text()
                    for desired_info in desired_infos:
                        if desired_info in info:
                            data_dict[desired_info] = info.replace(desired_info, "")
            else:
                failed_links["links"].append(link)
                failed_links["page"].append("overview")
                logging.error(f"error at overview page with link: {link}")
                print(f"error at overview page with link: {link}")
                
            # Page Ratios
            url = f"https://www.investing.com{link}-ratios"
            response = requests.get(url)
            if response.status_code == 200:
                html_content= response.text
                soup = BeautifulSoup(html_content, "html.parser")
                values = soup.find_all("tr")
                expected_values = [
                    "P/E Ratio TTM", "Price to Sales TTM", "Price to Cash Flow MRQ", "Price to Free Cash Flow TTM", "Price to Book MRQ",
                    "5 Year EPS Growth 5YA", "5 Year Sales Growth 5YA", "5 Year Capital Spending Growth 5YA", "Asset Turnover TTM",
                    "Inventory Turnover TTM", "Receivable Turnover TTM", "Gross margin TTM", "Operating margin TTM", "Net Profit margin TTM",
                    "Quick Ratio MRQ", "Current Ratio MRQ", "Total Debt to Equity MRQ", "Dividend Yield 5 Year Avg. 5YA", "Dividend Growth Rate ANN",
                    "Payout Ratio TTM"
                ]
                for value in values:
                    temp = value.get_text()
                    for expected_value in expected_values:
                        if expected_value in temp:
                            metric = value.find_all("td")[1].get_text()
                            data_dict[expected_value] = metric
            else:
                failed_links["links"].append(link)
                failed_links["page"].append("ratios")
                logging.error(f"error at ratios page with link: {link}")
                print(f"error at ratios page with link: {link}")
            data_list.append(data_dict)
        except Exception as e:
            logging.error(f"error in {link}: ", e)
            failed_links["links"].append(link)
            failed_links["page"].append("all")
    extension = pd.DataFrame(data_list)
    return extension, failed_links

In [27]:
def convert_to_number(x):
    if isinstance(x, str):
        if 'T' in x:
            return float(x.replace('T', '')) * 1e12
        elif 'B' in x:
            return float(x.replace('B', '')) * 1e9
        elif 'M' in x:
            return float(x.replace('M', '')) * 1e6
        elif 'K' in x:
            return float(x.replace('K', '')) * 1e3
        else:
            try:
                return float(x.replace(',', ''))
            except ValueError:
                return np.nan
    elif isinstance(x, (int, float)):
        return x
    else:
        return np.nan

def rename_and_convert(data, period):
    if period == "monthly":
        rename_cols = {
            'Name' : 'name', 
            'Symbol' : 'symbol',
            'currency' : 'currency',
            'Sector' : 'sector', 
            'Industry' : 'industry', 
            'Last' : 'close', 
            'ChgPct' : 'percentage_change',
            'FundamentalMarketCap' : 'market_cap', 
            'Volume_x' : 'volume', 
            'FundamentalRatio' : 'pe', 
            'FundamentalRevenue' : 'revenue',
            'EPS' : 'eps',
            'FundamentalBeta' : 'beta', 
            'dividend' : 'dividend',
            'dividend_yield' : 'dividend_yield',
            'TechnicalDay' : 'daily_signal',
            'TechnicalWeek' : 'weekly_signal', 
            'TechnicalMonth' : 'monthly_signal',
            'PerformanceDay' : 'daily_percentage_change', 
            'PerformanceWeek' : 'weekly_percentage_change',
            'PerformanceMonth' : 'monthly_percentage_change', 
            'PerformanceYtd' : 'ytd_percentage_change',
            'PerformanceYear' : 'one_year_percentage_change', 
            'Performance3Year' : 'three_year_percentage_change', 
            'P/E Ratio TTM' : 'pe_ttm',
            'Price to Sales TTM' : 'ps_ttm', 
            'Price to Cash Flow MRQ' : 'pcf', 
            'Price to Free Cash Flow TTM' : 'pcf_ttm', 
            'Price to Book MRQ' : 'pb', 
            '5 Year EPS Growth 5YA' : 'five_year_eps_growth',
            '5 Year Sales Growth 5YA' : 'five_year_sales_growth', 
            '5 Year Capital Spending Growth 5YA' : 'five_year_capital_spending_growth',
            'Asset Turnover TTM' : 'asset_turnover', 
            'Inventory Turnover TTM' : 'inventory turnover (ttm)', 
            'Receivable Turnover TTM' : 'receivable_turnover',
            'Gross margin TTM' : 'gross_margin', 
            'Operating margin TTM' : 'operating_margin', 
            'Net Profit margin TTM' : 'net_profit_margin', 
            'Quick Ratio MRQ' : 'quick_ratio',
            'Current Ratio MRQ' : 'current_ratio', 
            'Total Debt to Equity MRQ' : 'debt_to_equity', 
            'Dividend Yield 5 Year Avg. 5YA' : 'five_year_dividend_average',
            'Dividend Growth Rate ANN' : 'dividend_growth_rate', 
            'Payout Ratio TTM' : 'payout_ratio'
        }
        cleaned_data = data[rename_cols.keys()].rename(rename_cols, axis = 1)

        cleaned_data.replace(['-', 'N/A'], np.nan, inplace=True)
        cleaned_data['revenue'] = cleaned_data['revenue'].apply(convert_to_number)
        cleaned_data['market_cap'] = cleaned_data['market_cap'].apply(convert_to_number)
        return cleaned_data
    elif period == "daily":
        rename_cols = {
            'Symbol' : 'symbol',
            'Last' : 'close', 
            'ChgPct' : 'percentage_change',
            'FundamentalMarketCap' : 'market_cap', 
            'Volume' : 'volume', 
            'FundamentalRatio' : 'pe', 
            'FundamentalRevenue' : 'revenue',
            'FundamentalBeta' : 'beta', 
            'TechnicalDay' : 'daily_signal',
            'TechnicalWeek' : 'weekly_signal', 
            'TechnicalMonth' : 'monthly_signal',
            'PerformanceDay' : 'daily_percentage_change', 
            'PerformanceWeek' : 'weekly_percentage_change',
            'PerformanceMonth' : 'monthly_percentage_change', 
            'PerformanceYtd' : 'ytd_percentage_change',
            'PerformanceYear' : 'one_year_percentage_change', 
            'Performance3Year' : 'three_year_percentage_change',
        }
        cleaned_data = data[rename_cols.keys()].rename(rename_cols, axis = 1)

        cleaned_data.replace(['-', 'N/A'], np.nan, inplace=True)
        cleaned_data['revenue'] = cleaned_data['revenue'].apply(convert_to_number)
        cleaned_data['market_cap'] = cleaned_data['market_cap'].apply(convert_to_number)
        return cleaned_data

In [77]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.request

In [78]:
def GetGeneralData(country):
    if country == "sg":
        url ="https://api.investing.com/api/financialdata/assets/equitiesByCountry/default?fields-list=id%2Cname%2Csymbol%2CisCFD%2Chigh%2Clow%2Clast%2ClastPairDecimal%2Cchange%2CchangePercent%2Cvolume%2Ctime%2CisOpen%2Curl%2Cflag%2CcountryNameTranslated%2CexchangeId%2CperformanceDay%2CperformanceWeek%2CperformanceMonth%2CperformanceYtd%2CperformanceYear%2Cperformance3Year%2CtechnicalHour%2CtechnicalDay%2CtechnicalWeek%2CtechnicalMonth%2CavgVolume%2CfundamentalMarketCap%2CfundamentalRevenue%2CfundamentalRatio%2CfundamentalBeta%2CpairType&country-id=36&filter-domain=&page=0&page-size=1000&limit=0&include-additional-indices=false&include-major-indices=false&include-other-indices=false&include-primary-sectors=false&include-market-overview=false"
    elif country == "my":
        url = "https://api.investing.com/api/financialdata/assets/equitiesByCountry/default?fields-list=id%2Cname%2Csymbol%2CisCFD%2Chigh%2Clow%2Clast%2ClastPairDecimal%2Cchange%2CchangePercent%2Cvolume%2Ctime%2CisOpen%2Curl%2Cflag%2CcountryNameTranslated%2CexchangeId%2CperformanceDay%2CperformanceWeek%2CperformanceMonth%2CperformanceYtd%2CperformanceYear%2Cperformance3Year%2CtechnicalHour%2CtechnicalDay%2CtechnicalWeek%2CtechnicalMonth%2CavgVolume%2CfundamentalMarketCap%2CfundamentalRevenue%2CfundamentalRatio%2CfundamentalBeta%2CpairType&country-id=42&filter-domain=&page=0&page-size=2000&limit=0&include-additional-indices=false&include-major-indices=false&include-other-indices=false&include-primary-sectors=false&include-market-overview=false"
    # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # data_from_api = None
    print("url :", url)
    proxy = os.environ.get("proxy")

    proxy_support = urllib.request.ProxyHandler({'http': proxy,'https': proxy})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(url) as response:
        html = response.read()

    data_from_api = json.loads(html)
    # for i in range(10):
        # response = requests.get(url, headers=headers)

        # if response.status_code == 200:
        #     json_data = response.json()
        #     data_from_api = pd.DataFrame(json_data["data"])
        #     break
    return data_from_api

In [79]:
data = GetGeneralData("sg")

url : https://api.investing.com/api/financialdata/assets/equitiesByCountry/default?fields-list=id%2Cname%2Csymbol%2CisCFD%2Chigh%2Clow%2Clast%2ClastPairDecimal%2Cchange%2CchangePercent%2Cvolume%2Ctime%2CisOpen%2Curl%2Cflag%2CcountryNameTranslated%2CexchangeId%2CperformanceDay%2CperformanceWeek%2CperformanceMonth%2CperformanceYtd%2CperformanceYear%2Cperformance3Year%2CtechnicalHour%2CtechnicalDay%2CtechnicalWeek%2CtechnicalMonth%2CavgVolume%2CfundamentalMarketCap%2CfundamentalRevenue%2CfundamentalRatio%2CfundamentalBeta%2CpairType&country-id=36&filter-domain=&page=0&page-size=1000&limit=0&include-additional-indices=false&include-major-indices=false&include-other-indices=false&include-primary-sectors=false&include-market-overview=false


TypeError: expected string or bytes-like object, got 'NoneType'

In [36]:
data = rename_and_convert(data, "daily")

In [43]:
data

Unnamed: 0,symbol,close,percentage_change,market_cap,volume,pe,revenue,beta,daily_signal,weekly_signal,monthly_signal,daily_percentage_change,weekly_percentage_change,monthly_percentage_change,ytd_percentage_change,one_year_percentage_change,three_year_percentage_change
0,KPLM,6.500,-1.66,1.172000e+10,5048100,2.840,6.970000e+09,0.755,strong_sell,strong_sell,buy,-1.66,-1.96,-3.27,-8.06,-2.99,97.15
1,HKLD,3.220,0.31,7.080000e+09,1551600,-12.210,1.840000e+09,0.539,strong_sell,strong_sell,strong_sell,0.31,-1.53,-7.20,-7.47,-19.30,-34.02
2,JCYC,28.840,0.80,1.138000e+10,666700,6.910,2.223000e+10,0.459,strong_buy,strong_buy,neutral,0.80,6.62,6.19,-3.09,-14.78,35.21
3,CTDM,5.280,0.38,4.720000e+09,2338700,15.650,4.940000e+09,0.841,strong_sell,strong_sell,strong_sell,0.38,-1.68,-9.74,-20.60,-23.03,-24.01
4,SCIL,4.980,-1.19,8.900000e+09,1667500,9.450,7.040000e+09,0.698,strong_sell,strong_sell,strong_buy,-1.19,-0.80,-4.23,-6.21,-7.43,131.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,SHEF,0.200,0.00,3.632000e+07,0,9.610,2.759000e+07,0.000,strong_sell,strong_sell,buy,0.00,2.56,0.50,-14.89,0.00,0.00
592,WINK,0.270,0.00,7.412000e+07,1000,25.620,2.928000e+07,0.000,strong_buy,strong_buy,strong_buy,0.00,1.89,5.88,28.57,0.00,0.00
593,SINI,0.100,0.00,1.097900e+08,11000,-7.030,1.623000e+07,0.000,sell,neutral,strong_buy,0.00,-1.96,0.00,0.00,0.00,0.00
594,ADVANC,0.770,1.32,2.265000e+10,171000,20.190,1.954500e+11,0.027,buy,buy,strong_buy,1.32,-0.65,0.00,0.00,0.00,0.00


In [45]:
data = clean_daily_foreign_data(data)

  foreign_daily_data[float_columns] = foreign_daily_data[float_columns].applymap(lambda x:float(str(x).replace(',', '')))


In [66]:
drop_cols = ['close', 'market_cap', 'volume', 'pe',
 'revenue', 'beta', 'daily_signal', 'weekly_signal',
 'monthly_signal', 'change_1d', 'change_7d', 'change_1m',
 'change_ytd', 'change_1y', 'change_3y']
data_code.drop(drop_cols, axis = 1, inplace = True)

In [67]:
pd.merge(data, data_code, on = "symbol", how = "inner")

Unnamed: 0,symbol,close,market_cap,volume,pe,revenue,beta,daily_signal,weekly_signal,monthly_signal,...,operating_margin,net_profit_margin,quick_ratio,current_ratio,debt_to_equity,dividend_yield_5y_avg,dividend_growth_rate,payout_ratio,sector,sub_sector
0,KPLM,6.500,1.172000e+10,5048100.0,2.840,6.970000e+09,0.755,strong_sell,strong_sell,buy,...,,,,1.04,,,,0.1454,Consumer Cyclicals,Apparel & Luxury Goods
1,HKLD,3.220,7.080000e+09,1551600.0,-12.210,1.840000e+09,0.539,strong_sell,strong_sell,strong_sell,...,0.4139,-0.3157,0.46,1.70,0.2056,0.0561,,-0.8350,Properties & Real Estate,Properties & Real Estate
2,JCYC,28.840,1.138000e+10,666700.0,6.910,2.223000e+10,0.459,strong_buy,strong_buy,neutral,...,0.1396,0.0547,0.46,1.26,0.9411,,0.3246,0.3644,Consumer Cyclicals,Apparel & Luxury Goods
3,CTDM,5.280,4.720000e+09,2338700.0,15.650,4.940000e+09,0.841,strong_sell,strong_sell,strong_sell,...,0.1345,0.0642,0.86,1.77,1.3871,0.0253,-0.4286,0.3070,Properties & Real Estate,Properties & Real Estate
4,SCIL,4.980,8.900000e+09,1667500.0,9.450,7.040000e+09,0.698,strong_sell,strong_sell,strong_buy,...,0.1681,0.1338,0.63,0.78,1.6486,0.0251,0.8571,0.1699,Infrastructures,Utilities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,NIKS,0.150,1.950000e+07,0.0,12.490,1.106000e+07,0.000,strong_sell,strong_sell,neutral,...,0.2578,0.1524,,,0.0000,,0.0000,5.6346,Consumer Non-Cyclicals,Nondurable Household Products
590,SHEF,0.200,3.632000e+07,0.0,9.610,2.759000e+07,0.000,strong_sell,strong_sell,buy,...,0.1774,0.0897,3.04,3.59,0.0167,0.0459,0.0000,,Industrials,Industrial Services
591,WINK,0.270,7.412000e+07,1000.0,25.620,2.928000e+07,0.000,strong_buy,strong_buy,strong_buy,...,0.0446,0.0608,,,0.0000,,0.0000,,Industrials,Industrial Services
592,SINI,0.100,1.097900e+08,11000.0,-7.030,1.623000e+07,0.000,sell,neutral,strong_buy,...,-0.8463,-1.1132,,,0.0000,0.0000,0.0000,,Healthcare,Healthcare Equipment & Providers


In [69]:
pd.read_csv("sectors_mapping/sectors_sg.csv", sep = ";")

Unnamed: 0,sector,industry,sectors_id,sub_sector_id
0,Academic & Educational Services,Miscellaneous Educational Service Providers,Consumer Cyclicals,Consumer Services
1,Academic & Educational Services,Professional & Business Education,Consumer Cyclicals,Professional & Business Education
2,Basic Materials,Chemicals,Basic Materials,Basic Materials
3,Basic Materials,Construction Materials,Basic Materials,Basic Materials
4,Basic Materials,Containers & Packaging,Basic Materials,Basic Materials
5,Basic Materials,Metals & Mining,Basic Materials,Basic Materials
6,Basic Materials,Paper & Forest Products,Basic Materials,Basic Materials
7,Consumer Cyclicals,Automobiles & Auto Parts,Consumer Cyclicals,Automobiles & Components
8,Consumer Cyclicals,Diversified Retail,Consumer Cyclicals,Retailing
9,Consumer Cyclicals,Homebuilding & Construction Supplies,Infrastructures,Heavy Constructions & Civil Engineering


In [23]:
logging.basicConfig(filename="logs.log", level=logging.INFO)
data = GetGeneralData("sg")
links = data["Url"].tolist()
extension, failed_links = GetAdditionalData(links)
data_full = pd.merge(data, extension, on = "Url", how = "inner")
# Retry the failed links
n_try = 0
failed_links["links"] = [link.split("?")[0] if "?" in link else link for link in failed_links["links"]]
while len(failed_links["links"]) != 0 or n_try < 10:
    print(f"iterasi ke-{n_try+1}")
    if len(failed_links["links"]) == 0:
        break
    new_extension, failed_links = GetAdditionalData(failed_links["links"])
    n_try += 1
remaining = data[data["Url"].isin(failed_links["links"])]
remaining = remaining.assign(Url = [link.split("?")[0] if "?" in link else link for link in failed_links["links"]])
updated_extension = pd.merge(remaining, new_extension, on = "Url", how = "inner")
data_final = pd.concat([data_full[~data_full["Url"].isin(failed_links["links"])], updated_extension])
data_final = rename_and_convert(data_final)

error at overview page with link: /equities/ihh-healthcare-bhd?cid=955231
error at overview page with link: /equities/kop-ltd
error at overview page with link: /equities/sinostar-pec-holdings-ltd
error at ratios page with link: /equities/hengyang-petrochemical-logistics-lt?cid=991275
error at overview page with link: /equities/netlink


In [101]:
data_final.to_csv("data_full_sg_clean.csv")