## Import packages

In [96]:
import os
import investpy
import pandas as pd

from bs4 import BeautifulSoup
import requests
import time

## Get list of Malaysian stocks


In [81]:
# Fetch the list of ticker symbols investpy knows for Malaysia;
# every later cell indexes or iterates over this list.
stocks = investpy.get_stocks_list(country="malaysia")


In [4]:
# Report how many tickers were returned for Malaysia
print("Number of stocks:", len(stocks))

Number of stocks: 929


In [26]:
# Peek at the first entry to see what a ticker symbol looks like
stocks[0]

'MDCH'

## Extract counter id

In [187]:
# Use the ticker at index 926 as a sample for working out the extraction steps
stock = stocks[926]
print(stock)
# Look up the company profile; only its 'url' entry is used from here on
info = investpy.stocks.get_stock_company_profile(stock=stock, country='malaysia')
print(info['url'])

MEST
https://www.investing.com/equities/mestron-holdings-company-profile


In [188]:
# Send a browser-like User-Agent with the request for the profile page
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
url = info['url']
response = requests.get(url, timeout=5, headers=headers)
# Fail loudly on 4xx/5xx responses instead of parsing an error page and
# hitting a confusing IndexError in the extraction cell that follows
response.raise_for_status()
content = BeautifulSoup(response.content, "html.parser")

In [189]:
# Collect every <span class="elp"> on the profile page.
# `find_all` replaces the deprecated `findAll` alias.
elp_spans = content.find_all('span', {"class": "elp"})
# The counter id is read from the `title` attribute of the fourth such span
# (position-dependent: this breaks if the page layout changes)
c_id = elp_spans[3].get("title")
c_id

'0207'

In [190]:
# Show the ticker symbol the counter id above was extracted for
stock

'MEST'

In [191]:
# Pair the ticker symbol with its counter id in a single record
stock_id = dict(TICKER=stock, COUNTER_ID=c_id)

In [192]:
# Inspect the record; this is the shape collected per stock in the loop below
stock_id

{'TICKER': 'MEST', 'COUNTER_ID': '0207'}

## Loop to get counter id for all Malaysian stocks

In [156]:
# %%time
# Scrape the counter id of every stock in the selected slice.
#   stock_stack  -> one {"TICKER", "COUNTER_ID"} dict per successful stock
#   faulty_stack -> one {"TICKER", "S_URL"} dict per stock that failed at any step
stock_stack = []
faulty_stack = []  # stocks whose counter_id could not be retrieved

# The User-Agent never changes, so build the headers once instead of on every iteration
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

# Only the stocks from index 899 onward are processed here
# (presumably resuming after an earlier partial scrape -- adjust the slice as needed)
for s in stocks[899:]:
    # Mark starting time (used below to slow down the spider)
    t0 = time.time()

    # Reset per stock so a failed lookup can never report the previous stock's URL
    url = None

    try:
        # Get stock url. Kept inside the try so that a failed profile lookup
        # is recorded in faulty_stack instead of aborting the whole scrape.
        info = investpy.stocks.get_stock_company_profile(stock=s, country='malaysia')
        url = info['url']

        # Request from investing.com; treat HTTP error responses as failures
        response = requests.get(url, timeout=5, headers=headers)
        response.raise_for_status()
        content = BeautifulSoup(response.content, "html.parser")

        # Get counter_id: `title` attribute of the fourth <span class="elp">
        c_id = content.find_all('span', {"class": "elp"})[3].get("title")
        if not c_id:
            # A span without a title would otherwise be stored silently as None
            raise ValueError("counter id not found for {}".format(s))

        stock_stack.append({"TICKER": s, "COUNTER_ID": c_id})

    # Put s aside for further action if any step above fails
    except Exception:
        faulty_stack.append({"TICKER": s, "S_URL": url})

    # Pause for as long as this iteration took, plus one second
    response_delay = time.time() - t0
    time.sleep(response_delay + 1)
    # Slower alternative: time.sleep(10 * response_delay) waits 10x longer
    # than it took them to respond
    

In [157]:
# Turn the collected records into a table and summarise the scrape outcome
df = pd.DataFrame(stock_stack)
print(f"Total number of Malaysian stock: {len(stocks)}")
print(f"Number of retrieved counter id: {len(stock_stack)}")
print(f"Number of failures: {len(faulty_stack)}")

Total number of Malaysian stock: 929
Number of retrieved counter id: 29
Number of failures: 1


In [164]:
# List the stocks whose counter id could not be scraped, with the URL that was tried
faulty_stack

[{'TICKER': 'HPMT',
  'S_URL': 'https://www.investing.com/equities/hpmt-holdings-bhd-company-profile'}]

In [165]:
# Save this scrape run's ticker/counter-id table as JSON
df.to_json('~/Desktop/c_id.json')

## Combining the JSON files from two scrape attempts (split by website maintenance)

In [183]:
# Read both columns as strings so counter ids keep their leading zeros
dtype_dic = {'TICKER': str, 'COUNTER_ID': str}

# Manually add the row for the stock that failed during scraping
hpmt = pd.DataFrame({'TICKER': ['HPMT'], 'COUNTER_ID': ['5291']})

# Load the results of the two scrape attempts
js1 = pd.read_json("~/Desktop/c_id.json", dtype=dtype_dic)
js2 = pd.read_json("~/Desktop/c_id_2.json", dtype=dtype_dic)

# Stack all three frames and renumber the rows from 0
cid_df = pd.concat([js1, js2, hpmt], ignore_index=True)

In [184]:
# Inspect the combined ticker/counter-id table
cid_df

Unnamed: 0,TICKER,COUNTER_ID
0,MDCH,5090
1,AMMB,1015
2,CIMB,1023
3,RHBC,1066
4,HLCB,1082
...,...,...
924,GREA,0208
925,MEST,0207
926,PARL,0022
927,MTAG,0213


In [186]:
# Export the combined table. `cid_df` holds both scrape attempts plus the
# manually added row; `df` only holds the last partial scrape.
cid_df.to_json("~/Desktop/counter_id.json")