# Import libraries

In [29]:
import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests as curl_requests
from datetime import datetime
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import os
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

True

# Connect to MongoDB

In [2]:
from requests import get

# Look up this machine's public IP address and show it.
# NOTE(review): presumably used to allow-list the address for the MongoDB
# deployment — confirm. The printed value is stored in the notebook output,
# so clear it before sharing the notebook.
# `requests` waits indefinitely by default; the timeout keeps "Run All" from
# hanging when the service is unreachable.
ip = get('https://api.ipify.org', timeout=10).content.decode('utf8')
print(ip)

115.78.231.117


In [3]:
# The connection string comes from the environment (see load_dotenv() above).
uri = os.getenv("MONGO_URI")
if uri is None:
    raise ValueError("The MONGO_URI environment variable is not set.")

# Client declaring Stable API version 1.
client = MongoClient(uri, server_api=ServerApi('1'))

# Round-trip a ping; a failure is only printed, the notebook keeps running.
try:
    client.admin.command('ping')
except Exception as error:
    print(error)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


In [4]:
# Database handle used by the rest of the notebook; collections in it are
# looked up by company ticker.
db = client.get_database("stock_company")

## Store data to MongoDB

In [5]:
def save_data_to_mongo(data, collection):
    """Insert a batch of documents into a collection (best effort).

    Args:
        data: Iterable of documents to insert; ``None`` is treated as empty.
        collection: Object exposing ``insert_many`` (a pymongo collection in
            this notebook).

    Returns:
        None. The outcome is printed; errors are reported, not raised, so one
        failing batch does not abort the caller's loop.
    """
    documents = list(data) if data is not None else []
    if not documents:
        # insert_many rejects an empty batch, so skip the call instead of
        # printing a confusing driver error for a company without any news.
        print("No data to save")
        return
    try:
        collection.insert_many(documents)
        print("Data saved successfully")
    except Exception as e:
        print(e)

# Prepare data

## Choose to crawl data from source or get data from cloud

In [6]:
# Normalise the answer so that "Y" or " y " is accepted; the later cells
# compare against exactly 'y' / 'n' and would otherwise silently do nothing.
crawl_get = input("Do you want to crawl data from the website? (y/n): ").strip().lower()
if crawl_get not in ("y", "n"):
    raise ValueError(f"Expected 'y' or 'n', got {crawl_get!r}")

## Crawl data

### Define stock exchange, headers

In [7]:
# Exchanges to crawl, kept in alphabetical order.
stock_exchange = sorted(['HOSE', 'UPCOM', 'HNX'])

In [8]:
# Browser-like User-Agent passed to the crawler's HTTP requests.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0"
    )
}

In [9]:
# SSI endpoints: per-exchange symbol listing and per-symbol price history.
ssi_url = "https://iboard-query.ssi.com.vn/v2/stock/exchange/"
ssi_iboard_url = "https://iboard-api.ssi.com.vn/statistics/company/stock-price"

In [10]:
# Crawl window as dd/mm/YYYY strings (the CafeF crawler parses them with
# "%d/%m/%Y"); they are also passed unchanged to the SSI and Vietstock requests.
fromDate = '01/01/2023'
toDate = '01/01/2024'

### Crawl more company data from SSI

In [11]:
def get_company_list(company_list, stock_exchange):
    """Append the ticker symbols listed on each exchange to ``company_list``.

    Symbols are upper-cased and added in the order they are returned; a symbol
    already present in ``company_list`` is not added again.

    Args:
        company_list: List extended in place with the new symbols.
        stock_exchange: Iterable of exchange codes appended to ``ssi_url``.

    Returns:
        None; ``company_list`` is mutated.
    """
    # Set lookup instead of rescanning the growing list for every symbol.
    seen = set(company_list)
    for exchange in stock_exchange:
        url = f"{ssi_url}{exchange}"
        print(url)
        # Same timeout as the other crawler requests so a stalled connection
        # cannot hang the notebook.
        response = curl_requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            # Previously skipped silently, leaving an incomplete list unnoticed.
            print(f"Failed to fetch company list for {exchange} (HTTP {response.status_code})")
            continue
        payload = response.json()
        for company in payload.get('data') or []:
            symbol = company['ss'].upper()
            if symbol not in seen:
                seen.add(symbol)
                company_list.append(symbol)

In [12]:
# get_company_list fills the list in place, so start from an empty one and
# sort once every exchange has been queried.
company_list = []
get_company_list(company_list, stock_exchange)
company_list.sort()
print(f"Number of company: {len(company_list)}")

https://iboard-query.ssi.com.vn/v2/stock/exchange/HNX
https://iboard-query.ssi.com.vn/v2/stock/exchange/HOSE
https://iboard-query.ssi.com.vn/v2/stock/exchange/UPCOM
Number of company: 1588


### Remove companies that already exist in the database

In [13]:
if crawl_get == 'y':
    # Collection names are company tickers; sort them alphabetically.
    existed_company = sorted(entry['name'] for entry in db.list_collections())
    # Drop the alphabetically last name so that company is crawled again.
    # NOTE(review): presumably because the most recent crawl may have been
    # interrupted part-way through that company — confirm.
    if len(existed_company) > 0:
        existed_company.pop()
    print(f"Number of existed company: {len(existed_company)}")
    company_list = [
        ticker for ticker in company_list if ticker not in existed_company
    ]
    print(f"Number of new company: {len(company_list)}")

### Define crawler function

## Get and update company closing prices

In [14]:
def update_price(timeline, list_price, ssi_date, close_price):
    """Store a closing price at every position of ``timeline`` dated ``ssi_date``.

    Args:
        timeline: Sequence of date strings, parallel to ``list_price``.
        list_price: List of prices, updated in place.
        ssi_date: Date string to look for in ``timeline``.
        close_price: Value convertible with ``float``; anything else is ignored.

    Returns:
        None; ``list_price`` is mutated.
    """
    try:
        price = float(close_price)
    except (TypeError, ValueError):
        # Best effort: an unparsable price leaves the existing values alone.
        return
    # Several articles can share one date; fill all of them, not just the
    # first match (which is all that timeline.index() would find).
    for position, news_date in enumerate(timeline):
        if news_date == ssi_date and position < len(list_price):
            list_price[position] = price

In [15]:
def duplicate_price(timeline, list_price):
    """Pad ``list_price`` in place until it is as long as ``timeline``.

    The padding value is the last known price, or 0 when the list is empty.
    Nothing happens when ``list_price`` is already long enough.
    """
    missing = len(timeline) - len(list_price)
    if missing <= 0:
        return
    filler = list_price[-1] if list_price else 0
    list_price.extend(filler for _ in range(missing))

In [16]:
def get_company_price(company, fromDate, toDate, vietstock_news, cafef_news):
    """Attach closing prices to the Vietstock news entries of one company.

    Pages through the SSI price-history endpoint and, for every trading day
    that also appears in ``vietstock_news[company]['Timeline']``, stores the
    closing price at the matching position. Dates without a match keep 0.

    Args:
        company: Ticker symbol, sent as the ``symbol`` query parameter.
        fromDate: Start of the range, passed through to the query string.
        toDate: End of the range, passed through to the query string.
        vietstock_news: Mapping ``company -> {'Timeline': [...], ...}``; the
            ``'Price'`` entry of ``vietstock_news[company]`` is written.
        cafef_news: Accepted for call compatibility; not used.

    Returns:
        None; ``vietstock_news`` is mutated.
    """
    page_size = 10
    # Give up after this many failed requests in a row instead of looping forever.
    max_consecutive_failures = 3

    vietstock_timeline = list(vietstock_news[company]['Timeline'])
    vietstock_price = [0] * len(vietstock_timeline)

    index = 1
    max_index = 1
    total_known = False
    consecutive_failures = 0
    while True:
        url = f"{ssi_iboard_url}?symbol={company}&page={index}&pageSize={page_size}&fromDate={fromDate}&toDate={toDate}"
        try:
            response = curl_requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                payload = response.json()
                total = payload['paging']['total']
                # Ceiling division: number of pages needed for `total` rows.
                max_index = (total + page_size - 1) // page_size
                total_known = True
                consecutive_failures = 0
                for item in payload.get('data') or []:
                    ssi_date = item['tradingDate'].split(' ')[0]
                    update_price(vietstock_timeline, vietstock_price, ssi_date, item['closePrice'])
                print(f"Fetched price data on page: {index}")
            else:
                print(f"Failed to fetch price data for {company}")
        except Exception as e:
            consecutive_failures += 1
            print(f"Request failed for {company}. Skipping to the next page. ({e})")
            if consecutive_failures >= max_consecutive_failures or (
                total_known and index >= max_index
            ):
                break
        else:
            # ">=" rather than "==": a total of 0 gives max_index == 0, which
            # an equality test against index (starting at 1) never reaches.
            if index >= max_index:
                break
        index += 1
    duplicate_price(vietstock_timeline, vietstock_price)
    vietstock_news[company]['Price'] = vietstock_price

## Get content from title link

In [17]:
def get_content(url, source):
    """Download an article page and return the text of its content container(s).

    The container is the ``div`` with id ``newscontent`` when ``source`` is
    ``"cafef"`` and ``vst_detail`` for any other source.

    Returns:
        A list with the stripped text of each matching ``div``; an empty list
        when the request fails, raises, or does not answer with status 200.
    """
    container_id = "newscontent" if source == "cafef" else "vst_detail"
    contents = list()
    try:
        response = curl_requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            body = BeautifulSoup(response.content, "html.parser")
            containers = body.find_all("div", {"id": container_id})
            contents = [container.text.strip() for container in containers]
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print(f"Request timed out for link: {url}.")
    return contents

## Get title, publication date and content URL

### Crawl data from CafeF

In [18]:
def get_cafef_news(company, fromDate, toDate, site_url):
    """Collect news titles, dates and links for one company from CafeF.

    Pages are requested in increasing order until the last item of a page is
    dated on or before ``fromDate``, a page has no usable items, or several
    requests in a row fail.

    Args:
        company: Ticker symbol, sent as the ``symbol`` query parameter.
        fromDate: Lower date bound as ``dd/mm/YYYY``.
        toDate: Accepted for signature parity with ``get_vietstock_news``;
            not used here.
        site_url: URL of the paged listing endpoint.

    Returns:
        Tuple ``(list_title, list_timeline, list_link)`` of parallel lists.

    Raises:
        ValueError: if ``fromDate`` is not in ``dd/mm/YYYY`` format.
    """
    date_format = "%d/%m/%Y"
    # Stop after this many failed pages in a row instead of retrying forever.
    max_consecutive_failures = 3
    start_date = datetime.strptime(fromDate, date_format)

    list_title = list()
    list_timeline = list()
    list_link = list()
    index = 1
    consecutive_failures = 0
    while consecutive_failures < max_consecutive_failures:
        url = f"{site_url}?symbol={company}&floorID=0&configID=0&PageIndex={index}&PageSize=30&Type=2"
        try:
            response = curl_requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                body = BeautifulSoup(response.content, "html.parser")
                anchors = body.find_all("a", class_="docnhanhTitle")
                titles = [anchor.text.strip().split(": ")[-1] for anchor in anchors]
                date_cells = body.find_all("span", class_="timeTitle")
                timelines = [cell.text.strip().split(" ")[0] for cell in date_cells]
                if not titles or len(titles) != len(timelines):
                    print(f"Data not found for {company} on page: {index}")
                    break
                list_title.extend(titles)
                list_timeline.extend(timelines)
                # Take each link from the same anchor as its title so the three
                # lists stay aligned; scanning every <a> on the page mixes in
                # unrelated links and fails on anchors without an href.
                list_link.extend(anchor.get("href", "") for anchor in anchors)
                print(f"Fetched data on page: {index}")
                consecutive_failures = 0
                if datetime.strptime(timelines[-1], date_format) <= start_date:
                    break
            else:
                consecutive_failures += 1
                print(f"Failed to fetch data for {company} on page: {index}")
        except Exception as e:
            # Any failure (timeout, parse error, ...) skips to the next page;
            # the handler itself must not assume a particular exception shape.
            consecutive_failures += 1
            print(f"Request failed for {company} on page: {index} ({e}). Skipping to the next page.")
        index += 1
    return list_title, list_timeline, list_link

### Crawl data from VietStock

In [19]:
def get_vietstock_news(company, fromDate, toDate, site_url):
    """Collect news titles, dates and links for one company from Vietstock.

    Posts the paging form to ``site_url`` page by page. The page count is read
    from the last ``div.m-b.pull-left`` element of a response when present;
    paging stops after that many pages, on a page without usable items, or
    after several failed requests in a row.

    Args:
        company: Ticker symbol, sent as the ``code`` form field.
        fromDate: Start of the range, sent as the ``fromDate`` form field.
        toDate: End of the range, sent as the ``toDate`` form field.
        site_url: URL of the paging endpoint.

    Returns:
        Tuple ``(list_title, list_timeline, list_link)`` of parallel lists.
    """
    # Stop after this many failed pages in a row instead of retrying forever.
    max_consecutive_failures = 3

    list_title = list()
    list_timeline = list()
    list_link = list()
    index = 1
    max_index = 1
    pages_known = False
    consecutive_failures = 0
    while consecutive_failures < max_consecutive_failures:
        data = {
            "view": "1",
            "code": company,
            "type": "1",
            "fromDate": fromDate,
            "toDate": toDate,
            "channelID": "-1",
            "page": index,
            "pageSize": "20",
        }
        try:
            # Work on a copy so the shared headers dict is not modified.
            request_headers = headers.copy()
            request_headers["Content-Type"] = "application/x-www-form-urlencoded"
            response = curl_requests.post(
                site_url, headers=request_headers, data=data, timeout=10
            )
            if response.status_code == 200:
                body = BeautifulSoup(response.content, "html.parser")
                pager = body.find_all("div", class_="m-b pull-left")
                if len(pager) > 0:
                    max_index = int(pager[-1].text.split(" ")[-1])
                pages_known = True
                anchors = body.find_all("a", class_="text-link news-link")
                titles = [anchor.text.strip().split(": ")[-1] for anchor in anchors]
                date_cells = body.find_all("td", class_="col-date")
                timelines = [cell.text.strip().split(" ")[0] for cell in date_cells]
                if not titles or len(titles) != len(timelines):
                    print(f"Data not found for {company} on page: {index}")
                    break
                list_title.extend(titles)
                list_timeline.extend(timelines)
                list_link.extend(anchor.get("href", "") for anchor in anchors)
                print(f"Fetched data on page: {index}")
                consecutive_failures = 0
            else:
                consecutive_failures += 1
                print(f"Failed to fetch data for {company} on page: {index}")
        except Exception as e:
            # Any failure (timeout, parse error, ...) skips to the next page;
            # the handler itself must not assume a particular exception shape.
            consecutive_failures += 1
            print(f"Request failed for {company} on page: {index} ({e}). Skipping to the next page.")
        index += 1
        # Only trust max_index once at least one page has been read.
        if pages_known and index > max_index:
            break
    return list_title, list_timeline, list_link

### Define data path

In [20]:
# Directory for the exported CSV files. Keep the trailing slash:
# export_data_to_csv builds file names by plain string concatenation.
data_path = "./data/"

In [21]:
# Create the export directory (and any missing parents) on first run.
if not os.path.exists(data_path):
    os.makedirs(data_path, exist_ok=True)

### Export data to CSV function

In [22]:
def export_data_to_csv(company_news, source):
    """Flatten per-company news into rows and write ``<data_path><source>.csv``.

    One row is written per entry of each company's ``'Timeline'`` list, with
    the columns Company, Title, Link, Price and Timeline. Positions missing
    from ``'Title'``, ``'Link'`` or ``'Price'`` get a "No ..." placeholder.

    Args:
        company_news: Mapping ``company -> {'Timeline': [...], 'Title': [...],
            'Link': [...], 'Price': [...]}``; absent keys count as empty lists.
        source: Base name of the CSV file.

    Returns:
        None; the CSV file is written as a side effect.
    """
    columns = ["Company", "Title", "Link", "Price", "Timeline"]
    exported_data = []
    for company, item in company_news.items():
        # Default to empty lists: indexing a placeholder string such as
        # "No link" position by position would emit single characters.
        timeline = item.get('Timeline') or []
        title = item.get('Title') or []
        link = item.get('Link') or []
        price = item.get('Price') or []

        for i, published in enumerate(timeline):
            exported_data.append({
                "Company": company,
                "Title": title[i] if i < len(title) else "No title",
                "Link": link[i] if i < len(link) else "No link",
                "Price": price[i] if i < len(price) else "No price",
                "Timeline": published,
            })

    # Explicit columns keep the header row even when there are no rows.
    company_news_df = pd.DataFrame(exported_data, columns=columns)
    company_news_df.to_csv(f"{data_path}{source}.csv", index=False)

### Store data from list to company news

In [23]:
def store_data_to_company_news(company_news, company, list_title, list_timeline, list_link, list_content):
    """Replace ``company_news[company]`` with a fresh record of the given lists.

    The record has the keys ``'Title'``, ``'Timeline'``, ``'Link'`` and
    ``'Content'``; any previous entry for ``company`` is discarded. The lists
    are stored by reference, not copied.
    """
    company_news[company] = {
        'Title': list_title,
        'Timeline': list_timeline,
        'Link': list_link,
        'Content': list_content,
    }

### Crawl from source

In [24]:
# Host names of the two news sites (scheme and path are added by the crawler).
cafef_base_url, vietstock_base_url = "s.cafef.vn", "finance.vietstock.vn"

# Per-source stores, keyed by company ticker.
cafef_news = {}
vietstock_news = {}

In [25]:
if crawl_get == 'y':
    for company in tqdm(company_list, desc="Fetching data", unit="company"):
        print(f"Fetching data for {company}")

        # Get news data from CafeF
        source = "cafef"
        site_url = f"https://{cafef_base_url}/Ajax/Events_RelatedNews_New.aspx"
        print(f"Get news data from CafeF for {company}")
        list_title, list_timeline, list_link = get_cafef_news(
            company, fromDate, toDate, site_url
        )
        list_content = list()
        for link in list_link:
            content_url = f"https://{cafef_base_url}{link}"
            # Fetch inside the loop (every link, not only the last one) and
            # store exactly one entry per link so contents stay aligned with
            # titles even when a page yields no text or several blocks.
            list_content.append("\n".join(get_content(content_url, source)))
        store_data_to_company_news(
            cafef_news, company, list_title, list_timeline, list_link, list_content
        )

        # Get news data from Vietstock
        source = "vietstock"
        site_url = f"https://{vietstock_base_url}/View/PagingNewsContent"
        print(f"Get news data from Vietstock for {company}")
        # get_vietstock_news takes four arguments; passing the news store as a
        # fifth one raised TypeError.
        list_title, list_timeline, list_link = get_vietstock_news(
            company, fromDate, toDate, site_url
        )
        list_content = list()
        for link in list_link:
            content_url = f"https:{link}" if not link.startswith("https:") else link
            content_url = content_url.replace("\n", "")
            list_content.append("\n".join(get_content(content_url, source)))
        store_data_to_company_news(
            vietstock_news, company, list_title, list_timeline, list_link, list_content
        )

        # Get price data
        get_company_price(company, fromDate, toDate, vietstock_news, cafef_news)

        # Convert data to documents and save them to MongoDB
        # NOTE(review): convert_data_to_document is not defined in the cells
        # visible here — confirm it is defined before this cell runs.
        cafef_document = convert_data_to_document(cafef_news[company], company, "cafef")
        vietstock_document = convert_data_to_document(vietstock_news[company], company, "vietstock")

        contain_document = cafef_document + vietstock_document
        collection = db.get_collection(company)
        save_data_to_mongo(contain_document, collection)

        print(f"Data fetched successfully for {company}")

## Get data from MongoDB

In [26]:
# Collection names (one per company), in alphabetical order.
mongo_company_list = sorted(db.list_collection_names())

In [30]:
if crawl_get == 'n':
    # (source name, in-memory store) pairs handled identically below.
    news_stores = (("cafef", cafef_news), ("vietstock", vietstock_news))

    for company in tqdm(mongo_company_list, desc="Processing companies"):
        # One group of parallel lists per source for this company.
        collected = {
            source: {"title": [], "timeline": [], "link": [], "content": [], "price": []}
            for source, _ in news_stores
        }

        all_records = db.get_collection(company).find()
        for record in tqdm(all_records, desc=f"Processing {company}"):
            # Every key except "_id" is stored as the timeline value; its
            # value holds the article fields.
            for key, entry in record.items():
                if key == "_id":
                    continue
                bucket = collected.get(entry["source"])
                if bucket is None:
                    # Entries from any other source are ignored.
                    continue
                bucket["timeline"].append(key)
                bucket["title"].append(entry["title"])
                bucket["link"].append(entry["url"])
                bucket["content"].append(entry["content"])
                bucket["price"].append(entry["closed price"])

        for source, news in news_stores:
            bucket = collected[source]
            # store_data_to_company_news replaces news[company] entirely, so
            # the price list has to be attached afterwards.
            store_data_to_company_news(
                news,
                company,
                bucket["title"],
                bucket["timeline"],
                bucket["link"],
                bucket["content"],
            )
            news[company]["Price"] = bucket["price"]

Processing A32: 229it [00:00, 509.89it/s]494 [00:00<?, ?it/s]
Processing AAA: 860it [00:00, 2389.65it/s]94 [00:00<03:43,  2.21it/s]
Processing AAH: 47it [00:00, 1017.29it/s]494 [00:00<03:16,  2.50it/s]
Processing AAM: 384it [00:00, 1479.51it/s]
Processing AAS: 377it [00:00, 2121.15it/s]94 [00:01<02:01,  4.05it/s]
Processing AAT: 396it [00:00, 2052.39it/s]94 [00:01<01:50,  4.42it/s]
Processing AAV: 717it [00:00, 2592.27it/s]94 [00:01<01:46,  4.60it/s]
Processing ABB: 897it [00:00, 1119.60it/s]94 [00:01<01:55,  4.23it/s]
Processing ABC: 414it [00:00, 2340.84it/s]94 [00:02<03:19,  2.43it/s]
Processing ABI: 457it [00:00, 1981.30it/s]94 [00:02<02:45,  2.93it/s]
Processing ABR: 292it [00:00, 1941.67it/s]494 [00:03<02:29,  3.24it/s]
Processing ABS: 527it [00:00, 1886.16it/s]494 [00:03<02:06,  3.81it/s]
Processing ABT: 382it [00:00, 1341.34it/s]494 [00:03<02:09,  3.73it/s]
Processing ABW: 152it [00:00, 1485.24it/s]494 [00:03<02:11,  3.64it/s]
Processing ACB: 1309it [00:01, 907.65it/s]494 [00:0

In [31]:
# Write one CSV per news source into the data directory.
for source_name, source_news in (("cafef", cafef_news), ("vietstock", vietstock_news)):
    export_data_to_csv(source_news, source_name)