# Import libraries

In [3]:
!pip install curl_cffi --upgrade

Collecting curl_cffi
  Using cached curl_cffi-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Using cached curl_cffi-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.1 MB)
Installing collected packages: curl_cffi
Successfully installed curl_cffi-0.7.0


In [4]:
import string
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import os
from curl_cffi import requests
import socket

# Prepare data

### Define stock exchange

In [5]:
stock_exchange = ['HOSE', 'UPCOM', 'HNX']
stock_exchange.sort()

## Crawl data

In [6]:
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0"}

In [7]:
ssi_url = "https://iboard-query.ssi.com.vn/v2/stock/exchange/"

In [8]:
fromDate = '01/01/2024'
toDate = '27/06/2024'

### Crawl more company data from SSI

In [9]:
def get_company_list(company_list, stock_exchange):
    for exchange in stock_exchange:
        url = f"{ssi_url}{exchange}"
        print(url)
        response = requests.get(url, headers = headers)
        if response.status_code == 200:
            response = response.json()
            if 'data' in response:
                data = response['data']
                company_list.extend(company['ss'].upper() for company in data if company['ss'].upper() not in company_list)

In [10]:
company_list = []
get_company_list(company_list, stock_exchange)
company_list.sort()
print(f"Number of company: {len(company_list)}")

https://iboard-query.ssi.com.vn/v2/stock/exchange/HNX
https://iboard-query.ssi.com.vn/v2/stock/exchange/HOSE
https://iboard-query.ssi.com.vn/v2/stock/exchange/UPCOM
Number of company: 1592


### Define crawler function

In [11]:
def get_company_price(company_list, fromDate, toDate, company_news):
    for company in company_list:
        company_date = list(company_news[company]['timeline'])
        company_news[company]['price'] = []
        startDate = company_date[-1]
        startDate = datetime.strptime(startDate, "%d/%m/%Y").strftime("%d%%2F%m%%2F%Y")
        endDate = company_date[0]
        endDate = datetime.strptime(endDate, "%d/%m/%Y").strftime("%d%%2F%m%%2F%Y")
        finished = False
        index = 1
        max_index = 1
        while not finished:
            # url = f"https://s.cafef.vn/Ajax/PageNew/DataHistory/PriceHistory.ashx?Symbol={company}&StartDate={startDate}&EndDate={endDate}&PageIndex={index}"
            url = f"https://iboard-api.ssi.com.vn/statistics/company/stock-price?symbol={company}&page={index}&pageSize=10&fromDate={fromDate}&toDate={endDate}"
            response = requests.get(url, headers)
            if response.status_code == 200:
                data = response.json()
                total = data['paging']['total']
                if total % 10 == 0:
                    max_index = total // 10
                else:
                    max_index = total // 10 + 1
                if 'data' in data:
                    data = data['data']
                    for date in company_date:
                        for item in data:
                            if item['tradingDate'].split(' ')[0] == date:
                                company_news[company]['price'].append(float(item['closePrice']))
            else:
                print(f"Failed to fetch price data for {company}")
            if index == max_index:
                finished = True
            else:
                index += 1
            price_len = len(company_news[company]['price'])
            date_len = len(company_date)
            if price_len < date_len:
                last_price = company_news[company]['price'][-1] if price_len > 0 else 0
                company_news[company]['price'].extend([last_price] * (date_len - price_len))

In [12]:
def check_network_connectivity(base_url):
    try:
        # Check network connectivity
        socket.create_connection((f"{base_url}", 80))
        print(f"Network connectivity to {base_url} is fine.")
    except OSError as e:
        print(f"Network connectivity issue: {e}")
        return False
    return True

In [13]:
def get_cafef_news(company_list, fromDate, toDate, site_url, company_news):
    for company in company_list:
        print(f"Fetching data for {company}")
        index = 1
        finished = False
        company_news[company] = {}
        company_news[company]['news'] = []
        company_news[company]['timeline'] = []
        company_news[company]['content'] = []
        company_news[company]['link'] = []
        while not finished:
            if not check_network_connectivity(cafef_base_url):
                print("Network connection lost. Exiting data fetching.")
                return
            url = f"{site_url}?symbol={company}&floorID=0&configID=0&PageIndex={index}&PageSize=30&Type=2"
            response = requests.get(url, headers = headers)
            if response.status_code == 200:
                news_body = BeautifulSoup(response.content, "html.parser")
                titles = news_body.find_all("a", class_="docnhanhTitle")
                titles = [title.text.strip().split(": ")[-1] for title in titles]
                links = news_body.find_all("a")
                links = [link['href'] for link in links]
                if len(company_news[company]['link']) == 0:
                    company_news[company]['link'] = links
                else:
                    company_news[company]['link'].extend(links)
                for link in links:
                    if not check_network_connectivity(cafef_base_url):
                        print("Network connection lost. Exiting data fetching.")
                        return
                    news_url = f"{cafef_base_url}{link}"
                    try:
                        news_response = requests.get(url, headers=headers, timeout=10)  # Add a timeout to the request
                    except requests.exceptions.RequestException as e:
                        print(f"Request timed out for {company} - link: {link}. Skipping to the next page.")
                        links.remove(link)  # Remove the failed link from the list
                        continue  # Skip to the next page if timeout occurs
                    print(news_url)
                    if news_response.status_code == 200:
                        content_body = BeautifulSoup(news_response.content, "html.parser")
                        contents = content_body.find_all("div", {"id": "newscontent"})
                        contents = [content.text.strip() for content in contents]
                        if len(company_news[company]['content']) == 0:
                            company_news[company]['content'] = contents
                        else:
                            company_news[company]['content'].extend(contents)
                timeline = news_body.find_all("span", class_="timeTitle")
                timeline = [time.text.strip().split(' ')[0] for time in timeline]
                if len(company_news[company]['news']) == 0:
                    company_news[company]['news'] = titles
                    company_news[company]['timeline'] = timeline
                else:
                    company_news[company]['news'].extend(titles)
                    company_news[company]['timeline'].extend(timeline)
                print(f"Fetched data for {company} - page: {index}")
            else:
                print(f"Failed to fetch data for {company}")
            if datetime.strptime(timeline[-1], "%d/%m/%Y") <= datetime.strptime(fromDate, "%d/%m/%Y"):
                finished = True
            else:
                index += 1
    get_company_price(company_list, fromDate, toDate, company_news)

In [14]:
def get_vietstock_news(company_list, fromDate, toDate, site_url, company_news):
    for company in company_list:
        print(f"Fetching data for {company}")
        index = 1
        max_index = 1
        finished = False
        company_news[company] = {}
        company_news[company]['news'] = []
        company_news[company]['timeline'] = []
        company_news[company]['content'] = []
        company_news[company]['link'] = []
        while not finished:
            if not check_network_connectivity(vietstock_base_url):
                print("Network connection lost. Exiting data fetching.")
                return
            data = {
                'view': '1',
                'code': company,
                'type': '1',
                'fromDate': fromDate,
                'toDate': toDate,
                'channelID': '-1',
                'page': index,
                'pageSize': '20'
            }
            url = site_url
            response = requests.post(url, headers = headers, data = data)
            if response.status_code == 200:
                news_body = BeautifulSoup(response.content, "html.parser")
                total_pages = news_body.find_all("div", class_="m-b pull-left")
                if (len(total_pages) > 0):
                    total_pages = int(total_pages[-1].text.split(' ')[-1])
                    max_index = total_pages
                titles = news_body.find_all("a", class_="text-link news-link")
                titles = [title.text.strip().split(": ")[-1] for title in titles]
                links = news_body.find_all("a", class_="text-link news-link")
                links = [link['href'] for link in links]
                if len(company_news[company]['link']) == 0:
                    company_news[company]['link'] = links
                else:
                    company_news[company]['link'].extend(links)
                for link in links:
                    if not check_network_connectivity(link.split('/')[2]):
                        print("Network connection lost. Exiting data fetching.")
                        return
                    if link.startswith('https:'):
                        news_url = link 
                    else:
                        news_url = f"https:{link}"
                    news_url = news_url.replace("\n", "")
                    print(news_url)
                    news_response = requests.get(news_url, headers = headers)
                    if news_response.status_code == 200:
                        content_body = BeautifulSoup(news_response.content, "html.parser")
                        contents = content_body.find_all("div", {"id": "vst_detail"})
                        contents = [content.text.strip() for content in contents]
                        if len(company_news[company]['content']) == 0:
                            company_news[company]['content'] = contents
                        else:
                            company_news[company]['content'].extend(contents)
                timeline = news_body.find_all("td", class_="col-date")
                timeline = [time.text.strip().split(' ')[0] for time in timeline]
                if len(company_news[company]['news']) == 0:
                    company_news[company]['news'] = titles
                    company_news[company]['timeline'] = timeline
                else:
                    company_news[company]['news'].extend(titles)
                    company_news[company]['timeline'].extend(timeline)
                print(f"Fetched data for {company} - page: {index}")
            else:
                print(f"Failed to fetch data for {company}")
            if index == max_index:
                finished = True
            else:
                index += 1
    get_company_price(company_list, fromDate, toDate, company_news)

In [15]:
def export_company_news(company_news):
    restructured_data = []
    for company, info in company_news.items():
        for news, content, link, price, timeline in zip(info['news'], info['content'], info['link'], info['price'], info['timeline']):
            restructured_data.append({'Company': company, 'News': news, 'Content': content, 'Link': link, 'Price': price, 'Timeline': timeline})

    df = pd.DataFrame(restructured_data)
    df.to_csv(f"{crawl_news_site}.csv", index=False)

### Crawl from CafeF

In [16]:
company_news = {}
crawl_news_site = "cafef"
cafef_base_url = "s.cafef.vn"
site_url = f"https://{cafef_base_url}/Ajax/Events_RelatedNews_New.aspx"
get_cafef_news(company_list, fromDate, toDate, site_url, company_news)
export_company_news(company_news)

Fetching data for A32
Network connectivity to s.cafef.vn is fine.
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-2013742/a32-tam-hoan-thoi-gian-to-chuc-dai-hoi-dong-co-dong-thuong-nien-nam-2024.chn
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-2004031/a32-tai-lieu-hop-dai-hoi-dong-co-dong.chn
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-1981286/a32-nghi-quyet-gia-han-to-chuc-dai-hoi-dong-co-dong-thuong-nien-nam-2024.chn
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-1960051/a32-29032024-ngay-gdkhq-to-chuc-dai-hoi-dong-co-dong-thuong-nien-nam-2024.chn
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-1959289/a32-bao-cao-thuong-nien-2023.chn
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-1956605/a32-nghi-quyet-hoi-dong-quan-tri-ve-ke-hoach-to-chuc-dai-hoi-dong-co-dong-thuong-nien-nam-2024.chn
Network connectivity to s.cafef.vn is fine.
s.cafef.vn/a32-1929662/a32-bao-cao-quan-tri-cong-ty-nam-2023.chn
Network connectiv

NameError: name 'datetime' is not defined

### Crawl from VietStock

In [None]:
company_news = {}
crawl_news_site = "vietstock"
vietstock_base_url = "finance.vietstock.vn"
site_url = f"https://{vietstock_base_url}/View/PagingNewsContent"
headers["Content-Type"] = "application/x-www-form-urlencoded"
get_vietstock_news(company_list, fromDate, toDate, site_url, company_news)
export_company_news(company_news)

## List company data

In [None]:
for company in company_list:
    print(company)
    print(f"Number of news: {len(company_news[company]['news'])}")
    print(f"Number of content: {len(company_news[company]['content'])}")
    print(f"Number of link: {len(company_news[company]['link'])}")
    print(f"Number of price: {len(company_news[company]['price'])}")
    print(f"Number of timeline: {len(company_news[company]['timeline'])}")