# **Seleksi Asisten Lab Basdat 2024**
## **Discover the Titans: Top Companies Dominating Each Sector in Indonesia**
### *Data Scraping idnfinancials.com*
### Anindita Widya Santoso/18222128
---

#### **Setup Environment**

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import json
from sqlalchemy import create_engine, text
from datetime import datetime

# Shared Chrome configuration used by every scraping call in this notebook.
chrome_options = Options()
for flag in (
    "--log-level=3",  # reduce Chrome's logging to the minimum level
    "--headless",  # run without opening a browser window
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    chrome_options.add_argument(flag)
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

In [2]:
# Set to True to load previously saved data from JSON instead of scraping again
LOAD_STATE = False

#### **Scraping Sectors**

In [3]:
# Fetch the rendered company page. The try/finally guarantees that the
# headless Chrome process is shut down even when loading the page raises.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
try:
    driver.get("https://www.idnfinancials.com/company")
    html_content = driver.page_source
finally:
    driver.quit()
soup = BeautifulSoup(html_content, "html.parser")

# Locate the sector filter drop-down and collect its options
elements = soup.find("select", id="company-filter-sectors")
if elements is None:
    # Fail with a clear message instead of an AttributeError on None
    raise RuntimeError('Sector filter <select id="company-filter-sectors"> was not found on the page')
sectors = elements.find_all("option")

# Keep the displayed name of every option that carries a non-empty value;
# options without a value are skipped.
sectors_name = [
    sector.text.strip()
    for sector in sectors
    if sector.get("value")
]

##### **Data Sectors**

In [4]:
# Store a single key/value pair as a JSON file in the data directory
def save(filename, key, value):
    """Overwrite ``../data/<filename>.json`` with the JSON object ``{key: value}``."""
    with open(f"../data/{filename}.json", "w") as handle:
        handle.write(json.dumps({key: value}))

# Read a JSON file back from the data directory
def load(filename):
    """Return the parsed content of ``../data/<filename>.json``."""
    with open(f"../data/{filename}.json", "r") as handle:
        raw_text = handle.read()
        return json.loads(raw_text)

In [105]:
# Display the scraped sector names
sectors_name

['Energy',
 'Basic Materials',
 'Industrials',
 'Consumer Non-Cyclicals',
 'Consumer Cyclicals',
 'Healthcare',
 'Financials',
 'Properties and Real Estate',
 'Technology',
 'Infrastructure',
 'Transportation and Logistics',
 'Listed Investment Products']

In [6]:
# Without scraping: restore the sector names from the saved JSON file.
# The list is stored under the "sectors-name" key of that file.
if LOAD_STATE:
    sectors_name = load("sectors_name")["sectors-name"]
    sectors_name

#### **Scraping Holdings**

In [7]:
# Text of a parsed element, or "" when the element was not found
def _element_text(element):
    """Return the stripped text of *element*, or "" if *element* is None."""
    return element.get_text(strip=True) if element is not None else ""


# Scrape every company row listed on one sector page
def scrape_sector_data(sector_name, sector_url):
    """Scrape the company table of a single sector page.

    Args:
        sector_name: Sector label copied into the "Sector" field of each row.
        sector_url: URL of the sector listing page to load.

    Returns:
        A list with one dict per ``div.table-row`` element. Each dict has the
        keys "Code", "Company Name", "Sector", "Market Cap", "Price",
        "Change" and "Profit 2019" through "Profit 2023". Values are the raw
        page text; anything missing on the page is stored as "". Rows
        without a company code (such as the table header) are kept as-is.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        driver.get(sector_url)
        html_content = driver.page_source
    finally:
        # Always release the browser, even if loading the page failed
        driver.quit()
    soup = BeautifulSoup(html_content, "html.parser")

    data = []
    for holding in soup.find_all("div", class_="table-row"):
        entry = {
            "Code": _element_text(holding.find("span", class_="code")),
            "Company Name": _element_text(holding.find("span", class_="name")),
            "Sector": sector_name,
            "Market Cap": _element_text(holding.find("div", class_="tc-market-cap")),
            "Price": _element_text(holding.find("div", class_="tc-price")),
            "Change": _element_text(holding.find("div", class_="tc-change")),
        }

        # One column per reported year, blank unless the page supplies a value
        profits = {f"Profit {year}": "" for year in range(2019, 2024)}

        # A row without a net-profit cell simply keeps the blank defaults
        net_profit_cell = holding.find("div", class_="tc-net-profit")
        net_profits = (
            net_profit_cell.find_all("div", class_="tp-item")
            if net_profit_cell is not None
            else []
        )
        for profit in net_profits:
            year_element = profit.find("div", class_="asof")
            value_element = profit.find("div", class_="val")
            if year_element is None or value_element is None:
                continue
            key = f"Profit {year_element.get_text(strip=True)}"
            # Years outside 2019-2023 are ignored
            if key in profits:
                profits[key] = value_element.get_text(strip=True)

        entry.update(profits)
        data.append(entry)

    return data

# URL slug of each sector page, keyed by the sector name shown on the site
sector_url_back = {
    'Energy': 'energy-a',
    'Basic Materials': 'basic-materials-b',
    'Industrials': 'industrials-c',
    'Consumer Non-Cyclicals': 'consumer-non-cyclicals-d',
    'Consumer Cyclicals': 'consumer-cyclicals-e',
    'Healthcare': 'healthcare-f',
    'Financials': 'financials-g',
    'Properties and Real Estate': 'properties-and-real-estate-h',
    'Technology': 'technology-i',
    'Infrastructure': 'infrastructure-j',
    'Transportation and Logistics': 'transportation-and-logistics-k',
    'Listed Investment Products': 'listed-investment-products-l'
}

# Common prefix of every sector page URL
base_url = 'https://www.idnfinancials.com/company/sector/'

# Collects the rows of all sectors in one flat list
all_data = []

# Scrape the sectors one by one, in the order they were listed
for sector_name in sectors_name:
    sector_url = base_url + sector_url_back[sector_name]
    sector_data = scrape_sector_data(sector_name, sector_url)
    # Print the rows as a progress check that the scraper is working
    print(sector_data)
    all_data += sector_data

[{'Code': '', 'Company Name': '', 'Sector': 'Energy', 'Market Cap': 'Market Cap.(Million IDR)', 'Price': 'Price', 'Change': 'Change', 'Profit 2019': '', 'Profit 2020': '', 'Profit 2021': '', 'Profit 2022': '', 'Profit 2023': ''}, {'Code': 'BYAN', 'Company Name': 'PT. Bayan Resources Tbk', 'Sector': 'Energy', 'Market Cap': '568.332.755,00', 'Price': '17.050', 'Change': '-25 (-0,15%)', 'Profit 2019': '3.105.405(16.05%)', 'Profit 2020': '4.636.862(23.56%)', 'Profit 2021': '17.304.927(42.52%)', 'Profit 2022': '34.269.584(46.31%)', 'Profit 2023': '19.093.997(34.58%)'}, {'Code': 'DSSA', 'Company Name': 'PT. Dian Swastatika Sentosa Tbk', 'Sector': 'Energy', 'Market Cap': '182.314.198,00', 'Price': '29.575', 'Change': '-25 (-0,25%)', 'Profit 2019': '717.440(3.01%)', 'Profit 2020': '-1.197.456(-5.56%)', 'Profit 2021': '1.715.393(5.55%)', 'Profit 2022': '9.921.080(9.99%)', 'Profit 2023': '7.102.984(8.5%)'}, {'Code': 'ADRO', 'Company Name': 'PT. Adaro Energy Indonesia Tbk', 'Sector': 'Energy', 'M

In [8]:
# Without scraping: restore the full data set from the saved JSON file.
# NOTE(review): load() returns the parsed JSON as-is. The sectors cell indexes
# its result by key (["sectors-name"]) while this one does not, and the only
# JSON dump of the data visible in this notebook writes "all_data.json" —
# confirm the name and shape of "sectors_data.json".
if LOAD_STATE:
    all_data = load("sectors_data")
    all_data

#### **Data Frame**

In [124]:
# Preview the raw scraped data (head, i.e. the first five rows)
df = pd.DataFrame(all_data)
df.head()

Unnamed: 0,Code,Company Name,Sector,Market Cap,Price,Change,Profit 2019,Profit 2020,Profit 2021,Profit 2022,Profit 2023
0,,,Energy,Market Cap.(Million IDR),Price,Change,,,,,
1,BYAN,PT. Bayan Resources Tbk,Energy,"568.332.755,00",17.050,"-25 (-0,15%)",3.105.405(16.05%),4.636.862(23.56%),17.304.927(42.52%),34.269.584(46.31%),19.093.997(34.58%)
2,DSSA,PT. Dian Swastatika Sentosa Tbk,Energy,"182.314.198,00",29.575,"-25 (-0,25%)",717.440(3.01%),-1.197.456(-5.56%),1.715.393(5.55%),9.921.080(9.99%),7.102.984(8.5%)
3,ADRO,PT. Adaro Energy Indonesia Tbk,Energy,"99.128.650,00",3.210,"-10 (-0,31%)",5.613.750(11.69%),2.069.394(5.8%),13.335.600(23.38%),38.954.375(30.77%),25.252.868(25.18%)
4,CUAN,PT. Petrindo Jaya Kreasi Tbk,Energy,"95.556.152,00",8.525,"+50 (+1,00%)",,,,,238.327(15.95%)


#### **Data Cleaning**
- Menghapus header yang tidak dibutuhkan
- Mengubah tipe data numerik (awalnya string)
- Membersihkan data (tanda baca numerik)
- Memisahkan data angka dengan persentasenya

In [130]:
# Build a fresh DataFrame from the scraped rows before cleaning
df = pd.DataFrame(all_data)

# Strip number punctuation from a scraped value
def clean_numeric(value):
    """Return the leading number of *value* without its thousands dots.

    Everything from the first "," or "(" onwards is dropped (the decimal part
    or the percentage in parentheses), then every "." is removed. Non-string
    input is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    leading_part = value.partition(",")[0].partition("(")[0]
    return leading_part.replace(".", "")

# Convert a cleaned value to a number where possible
def convert_to_numeric(value):
    """Return ``float(value)``, or *value* itself if that raises ValueError."""
    try:
        number = float(value)
    except ValueError:
        return value
    else:
        return number

def get_percentage(value):
    """Return the percentage embedded in a scraped "value(percent%)" string.

    Args:
        value: Raw cell content such as "3.105.405(16.05%)" or
            "-25 (-0,15%)". Non-string input is returned unchanged.

    Returns:
        0 for an empty string; *value* itself when it contains no "(";
        otherwise the percentage as a float (a decimal comma is accepted).

    Raises:
        ValueError: If the text in parentheses is still not a number after
            its "+" / "-" characters have been removed.
    """
    if not isinstance(value, str):
        return value
    if value == "":
        return 0

    parts = value.split("(")
    if len(parts) == 1:
        return value

    percent_text = parts[1].replace(")", "").replace("%", "").replace(",", ".")
    try:
        return float(percent_text)
    except ValueError:
        # The sign of the percentage is malformed (e.g. "- 0.15"): drop the
        # sign characters and take the sign from the leading value instead.
        # Checking the leading "-" rather than parsing the value keeps this
        # working for numbers with thousands dots such as "-1.197.456".
        magnitude = float(percent_text.replace("+", "").replace("-", ""))
        if parts[0].strip().startswith("-"):
            return -magnitude
        return magnitude

def remove_na(value):
    """Return 0 for an empty or "n/a" string (any letter case), else *value*."""
    is_missing = isinstance(value, str) and value.lower() in ("n/a", "")
    if is_missing:
        return 0
    return value

# Drop rows without a company code (the table header scraped with each sector).
# .copy() makes df an independent frame, so the column assignments below do
# not write into a filtered slice of the original DataFrame.
df = df[df['Code'] != ''].copy()

# Source column -> name of the percentage column derived from it
percentage_columns = {
    "Profit 2019": "Profit 2019 %",
    "Profit 2020": "Profit 2020 %",
    "Profit 2021": "Profit 2021 %",
    "Profit 2022": "Profit 2022 %",
    "Profit 2023": "Profit 2023 %",
    "Change": "Change %"}

# Extract the percentages first, while the source columns still hold the raw
# "value(percent%)" text; missing / "n/a" entries become 0.
for column, percentage_column in percentage_columns.items():
    df[percentage_column] = df[column].apply(get_percentage).apply(remove_na)

# Clean the numeric columns: strip punctuation, convert to float, N/A -> 0
numerical_columns = ["Market Cap", "Price", "Change", "Profit 2019", "Profit 2020", "Profit 2021", "Profit 2022", "Profit 2023"]
for column in numerical_columns:
    df[column] = (
        df[column]
        .apply(clean_numeric)
        .apply(convert_to_numeric)
        .apply(remove_na)
    )

# Display the cleaned data
df

Unnamed: 0,Code,Company Name,Sector,Market Cap,Price,Change,Profit 2019,Profit 2020,Profit 2021,Profit 2022,Profit 2023,Profit 2019 %,Profit 2020 %,Profit 2021 %,Profit 2022 %,Profit 2023 %,Change %
1,BYAN,PT. Bayan Resources Tbk,Energy,568332755.0,17050.0,-25.0,3105405.0,4636862.0,17304927.0,34269584.0,19093997.0,16.05,23.56,42.52,46.31,34.58,-0.15
2,DSSA,PT. Dian Swastatika Sentosa Tbk,Energy,182314198.0,29575.0,-25.0,717440.0,-1197456.0,1715393.0,9921080.0,7102984.0,3.01,-5.56,5.55,9.99,8.50,-0.25
3,ADRO,PT. Adaro Energy Indonesia Tbk,Energy,99128650.0,3210.0,-10.0,5613750.0,2069394.0,13335600.0,38954375.0,25252868.0,11.69,5.80,23.38,30.77,25.18,-0.31
4,CUAN,PT. Petrindo Jaya Kreasi Tbk,Energy,95556152.0,8525.0,50.0,0.0,0.0,0.0,0.0,238327.0,0.00,0.00,0.00,0.00,15.95,1.00
5,ADMR,PT. Adaro Minerals Indonesia Tbk,Energy,54986692.0,1345.0,5.0,-5636.0,-398386.0,2215865.0,5190789.0,6784950.0,-0.37,-22.94,33.71,36.58,40.61,0.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,TRJA,PT. Transkon Jaya Tbk,Transportation and Logistics,280897.0,186.0,3.0,0.0,38318.0,45991.0,36445.0,14982.0,0.00,9.52,11.42,7.55,2.47,1.64
227,RCCC,PT. Utama Radar Cahaya Tbk,Transportation and Logistics,255150.0,322.0,8.0,0.0,2116.0,3029.0,3759.0,3501.0,0.00,7.05,8.57,6.17,4.62,2.55
228,MPXL,PT. MPX Logistics International Tbk,Transportation and Logistics,240001.0,120.0,-1.0,0.0,0.0,10554.0,6013.0,9430.0,0.00,0.00,7.39,5.14,6.72,-1.00
229,GTRA,PT. Grahaprima Suksesmandiri Tbk,Transportation and Logistics,238692.0,126.0,-4.0,7303.0,6728.0,7788.0,15834.0,33421.0,6.26,5.32,5.29,7.33,9.69,-3.08


#### **JSON File**

In [132]:
# Write the cleaned data to a JSON file as a list of records
with open('../data/all_data.json', 'w') as json_file:
    records_json = df.to_json(orient='records', indent=4)
    json_file.write(records_json)

#### **SQL File**

In [135]:
# Create the database engine
# NOTE(review): the connection URL embeds the database user and password.
engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/titans')

# DDL for the tables of the relational diagram
# (sectors, companies, profit, pricehistory)
create_tables_sql = """
CREATE TABLE IF NOT EXISTS sectors (
    sector_id SERIAL PRIMARY KEY,
    sector_name VARCHAR (30) UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS companies (
    company_code VARCHAR(4) PRIMARY KEY,
    sector_id INTEGER NOT NULL,
    company_name VARCHAR (100) NOT NULL,
    market_cap FLOAT NOT NULL,
    FOREIGN KEY (sector_id) REFERENCES sectors(sector_id)
);

CREATE TABLE IF NOT EXISTS profit (
    year INTEGER NOT NULL,
    company_code VARCHAR(255) NOT NULL,
    profit_value FLOAT NOT NULL,
    profit_percentage FLOAT NOT NULL,
    PRIMARY KEY (year, company_code),
    FOREIGN KEY (company_code) REFERENCES companies(company_code)
);

CREATE TABLE IF NOT EXISTS pricehistory (
    date DATE NOT NULL,
    company_code VARCHAR(255) NOT NULL,
    price FLOAT NOT NULL,
    change FLOAT NOT NULL,
    change_percentage FLOAT NOT NULL,
    PRIMARY KEY (date, company_code),
    FOREIGN KEY (company_code) REFERENCES companies(company_code)
);
"""

# Split the script on ";" and run the statements one at a time, committing
# after each; the empty piece after the final ";" is skipped.
ddl_statements = [
    piece.strip()
    for piece in create_tables_sql.strip().split(";")
    if piece
]
with engine.connect() as connection:
    for ddl in ddl_statements:
        connection.execute(text(ddl))
        connection.commit()

In [136]:
# sectors table: one row per sector name, ids numbered from 1 in list order
sectors = pd.DataFrame(sectors_name).rename(columns={0: 'sector_name'})
sectors['sector_id'] = sectors.index + 1
sectors = sectors[['sector_id', 'sector_name']]

# Attach sector_id to every company row by matching on the sector name
df_merge = df.merge(sectors, left_on='Sector', right_on='sector_name')
df_merge = df_merge.drop(columns=['Sector'])

# companies table
companies = df_merge[['Code', 'sector_id', 'Company Name', 'Market Cap']]
companies = companies.rename(
    columns={'Code': 'company_code', 'Company Name': 'company_name', 'Market Cap': 'market_cap'}
)


# profit table
def _melt_by_year(value_columns, value_name):
    """Unpivot per-year columns of df into (Code, Year, <value_name>) rows."""
    melted = df.melt(id_vars=['Code'], value_vars=value_columns,
                     var_name='Year', value_name=value_name)
    # The year is the four-digit number inside the original column name
    melted['Year'] = melted['Year'].str.extract(r'(\d{4})').astype(int)
    return melted


# Profit value per company and year
melted_profit_value = _melt_by_year(
    ['Profit 2019', 'Profit 2020', 'Profit 2021', 'Profit 2022', 'Profit 2023'],
    'Profit Value',
)

# Profit percentage per company and year
melted_profit_percentage = _melt_by_year(
    ['Profit 2019 %', 'Profit 2020 %', 'Profit 2021 %', 'Profit 2022 %', 'Profit 2023 %'],
    'Profit Percentage',
)

# Join values and percentages on company code and year
profit = melted_profit_value.merge(melted_profit_percentage, on=['Code', 'Year'])
profit = profit.rename(
    columns={'Code': 'company_code', 'Year': 'year', 'Profit Value': 'profit_value', 'Profit Percentage': 'profit_percentage'}
)

# pricehistory table: current price data stamped with today's date
pricehistory = (
    df[['Code', 'Price', 'Change', 'Change %']]
    .rename(columns={'Code': 'company_code', 'Price': 'price', 'Change': 'change', 'Change %': 'change_percentage'})
    .assign(date=datetime.today().strftime('%Y-%m-%d'))
)


In [9]:
# Create the database engine
engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/titans')

# Trigger function and trigger for the automated schedule. Before every insert
# into pricehistory, the function checks whether the row's company_code exists
# in companies; if not, it inserts placeholder rows into companies and profit.
#
# DROP TRIGGER IF EXISTS makes this script safe to execute more than once:
# a plain CREATE TRIGGER raises an error when the trigger already exists.
#
# NOTE(review): companies.sector_id is declared NOT NULL in the table DDL, but
# the INSERT INTO companies below does not supply a sector_id — confirm which
# sector a placeholder company should be assigned to.
create_function_trigger_sql = """
CREATE OR REPLACE FUNCTION check_and_insert_company() RETURNS TRIGGER AS $$
DECLARE
    company_exists BOOLEAN;
BEGIN
    SELECT EXISTS(SELECT 1 FROM companies WHERE company_code = NEW.company_code) INTO company_exists;

    IF NOT company_exists THEN
        INSERT INTO companies (company_code, company_name, market_cap)
        VALUES (NEW.company_code, 'Unknown Company', 0);

        -- Masukkan data profit default
        INSERT INTO profit (year, company_code, profit_value, profit_percentage)
        VALUES (extract(year from NEW.date), NEW.company_code, 0, 0);
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS trigger_pricehistory_insert ON pricehistory;

CREATE TRIGGER trigger_pricehistory_insert
BEFORE INSERT ON pricehistory
FOR EACH ROW
EXECUTE FUNCTION check_and_insert_company();
"""
# Install the trigger function and the trigger in the database
with engine.connect() as connection:
    connection.execute(text(create_function_trigger_sql))
    connection.commit()


In [137]:
# Insert the data into the SQL tables and add the trigger for automated scraping.
# Parent tables go first so the foreign keys of the later tables resolve.
with engine.connect() as connection:
    sectors.to_sql('sectors', connection, if_exists='append', index=False)
    companies.to_sql('companies', connection, if_exists='append', index=False)
    profit.to_sql('profit', connection, if_exists='append', index=False)
    pricehistory.to_sql('pricehistory', connection, if_exists='append', index=False)
    # Drop any existing trigger first, so re-creating it cannot fail with
    # "already exists" and leave this block before the commit below is
    # reached, which would discard the rows inserted above.
    connection.execute(text("DROP TRIGGER IF EXISTS trigger_pricehistory_insert ON pricehistory"))
    connection.execute(text(create_function_trigger_sql))
    connection.commit()