# Import libraries

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests as curl_requests
from datetime import datetime
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Connect to MongoDB

In [None]:
from requests import get

# Look up this machine's public IP address via api.ipify.org and print it.
# NOTE(review): presumably needed to allow-list the address on the MongoDB
# cluster — confirm.
# A timeout is passed explicitly: without one, requests waits indefinitely
# when the service does not answer.
ip = get('https://api.ipify.org', timeout=10).content.decode('utf8')
print(ip)

In [None]:
import os

# SECURITY NOTE(review): the fallback connection string below embeds a
# database password in the notebook. Rotate that credential and supply the
# URI through the MONGODB_URI environment variable instead; the literal is
# kept only so existing runs keep working.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://mongodb:lPCsg7TdVyNPSMU9@cluster0.e46mp6n.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best effort: report the connection problem without aborting the notebook
    print(e)

In [None]:
# Handle to the "stock_company" database; the crawl loop stores each
# company's documents in a collection named after the company.
db = client.get_database("stock_company")

## Store data to MongoDB

In [None]:
def save_data_to_mongo(data, collection):
    """Insert a list of documents into a MongoDB collection (best effort).

    Args:
        data: List of documents (dicts) to insert.
        collection: Object exposing ``insert_many`` (a pymongo collection).

    Returns:
        None. Failures are printed instead of raised so one company's
        failure does not stop the crawl.
    """
    # insert_many rejects an empty document list, so skip the call entirely
    if not data:
        print("No data to save")
        return
    try:
        collection.insert_many(data)
        print("Data saved successfully")
    except Exception as e:
        print(e)

In [None]:
def convert_data_to_document(data, company, source):
    """Turn the parallel news lists of one company into MongoDB documents.

    Args:
        data: Dict with the parallel lists ``timeline``, ``link``, ``title``,
            ``content`` and ``price``.
        company: Company symbol (unused here; kept for the existing callers).
        source: Name of the news source stored in every document.

    Returns:
        A list with one ``{timeline: {...}}`` dict per timeline entry. A field
        whose list is shorter than the timeline gets a "No ..." placeholder.
    """
    fields = (
        ("title", data["title"], "No title"),
        ("content", data["content"], "No content"),
        ("url", data["link"], "No url"),
        ("closed price", data["price"], "No price"),
    )
    contain_document = list()
    # Use the position from enumerate rather than list.index(): index() returns
    # the first match, which gave every item sharing a date the first item's data.
    for position, timeline in enumerate(data["timeline"]):
        entry = {
            key: values[position] if position < len(values) else fallback
            for key, values, fallback in fields
        }
        entry["source"] = source
        contain_document.append({timeline: entry})
    return contain_document

# Prepare data

## Crawl data

### Define stock exchange, headers

In [None]:
# Exchange codes appended to the SSI exchange URL, kept in alphabetical order
stock_exchange = sorted(['HOSE', 'UPCOM', 'HNX'])

In [None]:
# Browser-like User-Agent sent with every crawl request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0"
    ),
}

In [None]:
# Base URL for listing an exchange's companies; the exchange code is appended
ssi_url = "https://iboard-query.ssi.com.vn/v2/stock/exchange/"
# Endpoint queried for a symbol's paged price statistics
ssi_iboard_url = "https://iboard-api.ssi.com.vn/statistics/company/stock-price"

In [None]:
# Crawl window as dd/mm/YYYY strings (the format the news crawler parses)
fromDate, toDate = '07/07/2019', '07/07/2024'

### Crawl more company data from SSI

In [None]:
def get_company_list(company_list, stock_exchange):
    """Append the upper-cased symbols listed on each exchange to company_list.

    Args:
        company_list: List extended in place; symbols already present are
            not added again.
        stock_exchange: Iterable of exchange codes appended to ``ssi_url``.

    Returns:
        None. ``company_list`` is mutated in place.
    """
    # A set gives O(1) duplicate checks and avoids inspecting the list while
    # it is being extended.
    seen = set(company_list)
    for exchange in stock_exchange:
        url = f"{ssi_url}{exchange}"
        print(url)
        # Timeout added for consistency with the other requests in this file
        response = curl_requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            continue
        payload = response.json()
        if 'data' not in payload:
            continue
        for company in payload['data']:
            symbol = company['ss'].upper()
            if symbol not in seen:
                seen.add(symbol)
                company_list.append(symbol)

In [None]:
# Collect the symbols of every configured exchange into one list
company_list = []
get_company_list(company_list, stock_exchange)
# Sort in place so the companies are processed in alphabetical order
company_list.sort()
print(f"Number of company: {len(company_list)}")

### Remove companies that already exist in the database

In [None]:
# Every collection name in the database counts as an already-stored company
existed_company = sorted(info['name'] for info in db.list_collections())
# Keep only the companies that have no collection yet
company_list = [symbol for symbol in company_list if symbol not in existed_company]

### Define crawler function

## Get and update each company's closing price

In [None]:
def update_price(timeline, list_price, ssi_date, close_price):
    """Write a closing price into every slot whose date equals ssi_date.

    Args:
        timeline: List of date strings, parallel to ``list_price``.
        list_price: List of prices, updated in place.
        ssi_date: Date string to look for in ``timeline``.
        close_price: Value convertible to ``float``.

    Returns:
        None. Nothing changes when the price cannot be converted or the
        date is absent.
    """
    try:
        price = float(close_price)
    except (TypeError, ValueError):
        # Best effort: an unusable price leaves the existing values untouched
        return
    # Several news items can share a date; update all of them, not only the
    # first match that list.index() would return.
    for position, day in enumerate(timeline):
        if day == ssi_date and position < len(list_price):
            list_price[position] = price

In [None]:
def duplicate_price(timeline, list_price):
    """Pad list_price in place until it is as long as timeline.

    Missing slots repeat the last known price, or 0 when there is none.
    """
    missing = len(timeline) - len(list_price)
    if missing <= 0:
        return
    filler = list_price[-1] if list_price else 0
    list_price.extend([filler] * missing)

In [None]:
def get_company_price(company, fromDate, toDate, vietstock_news, cafef_news):
    """Fetch closing prices from SSI and attach them to both news dicts.

    Every page of the SSI price endpoint is requested for ``company``. Each
    returned trading date is matched against the news timelines, and the
    matching slots receive the closing price.

    Args:
        company: Symbol to query.
        fromDate: Start of the range, passed to the endpoint unchanged.
        toDate: End of the range, passed to the endpoint unchanged.
        vietstock_news: Dict keyed by company; ``[company]['timeline']`` is
            read and ``[company]['price']`` is written.
        cafef_news: Same structure as ``vietstock_news``.

    Returns:
        None. Both news dicts are updated in place.
    """
    page_size = 10
    # How many extra pages to probe when requests fail before the page
    # count is known
    max_early_failures = 3

    vietstock_timeline = list(vietstock_news[company]['timeline'])
    vietstock_price = [0] * len(vietstock_timeline)
    cafef_timeline = list(cafef_news[company]['timeline'])
    cafef_price = [0] * len(cafef_timeline)

    index = 1
    max_index = 1
    total_known = False
    early_failures = 0
    # Bounding the loop by max_index guarantees termination; the previous
    # `index == max_index` test never matched when total was 0 or when a
    # failure pushed index past the last page.
    while index <= max_index:
        url = f"{ssi_iboard_url}?symbol={company}&page={index}&pageSize={page_size}&fromDate={fromDate}&toDate={toDate}"
        try:
            response = curl_requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                payload = response.json()
                total = payload['paging']['total']
                # Ceiling division: number of pages needed for `total` rows
                max_index = (total + page_size - 1) // page_size
                total_known = True
                for item in payload.get('data', []):
                    ssi_date = item['tradingDate'].split(' ')[0]
                    close_price = item['closePrice']
                    update_price(vietstock_timeline, vietstock_price, ssi_date, close_price)
                    update_price(cafef_timeline, cafef_price, ssi_date, close_price)
                print(f"Fetched price data on page: {index}")
            else:
                print(f"Failed to fetch price data for {company}")
        except Exception as e:
            print(f"Error occurred: {str(e)}")
            print(f"Request failed for {company}. Skipping to the next page.")
            if not total_known and early_failures < max_early_failures:
                # Page count still unknown: allow probing one more page
                early_failures += 1
                max_index = index + 1
        index += 1

    duplicate_price(vietstock_timeline, vietstock_price)
    duplicate_price(cafef_timeline, cafef_price)
    vietstock_news[company]['price'] = vietstock_price
    cafef_news[company]['price'] = cafef_price

## Get content from title link

In [None]:
def get_content(url, source):
    """Download an article page and return the text of its content blocks.

    The content ``div`` is looked up by id: ``newscontent`` when ``source`` is
    "cafef", ``vst_detail`` otherwise. Returns a list of stripped texts; the
    list is empty when the request fails or the status is not 200.
    """
    contents = []
    try:
        response = curl_requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            container_id = "newscontent" if source == "cafef" else "vst_detail"
            nodes = soup.find_all("div", {"id": container_id})
            contents = [node.text.strip() for node in nodes]
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print(f"Request timed out for link: {url}.")
    return contents

## Get titles, publication dates and content URLs

In [None]:
def get_cafef_news(company, fromDate, toDate, site_url):
    """Crawl the paged CafeF news list of one company.

    Pages are requested one after another until the last item of a page is
    dated on or before ``fromDate``, a page yields no usable data, or too many
    consecutive requests fail.

    Args:
        company: Symbol sent as the ``symbol`` query parameter.
        fromDate: Lower date bound in ``dd/mm/YYYY`` format.
        toDate: Unused; kept for the existing callers.
        site_url: URL of the news endpoint.

    Returns:
        Tuple ``(list_title, list_timeline, list_link)`` of parallel lists.
    """
    # Stop instead of looping forever when the site keeps failing
    max_consecutive_failures = 3
    start_date = datetime.strptime(fromDate, "%d/%m/%Y")
    index = 1
    failures = 0
    list_title = list()
    list_timeline = list()
    list_link = list()
    while True:
        url = f"{site_url}?symbol={company}&floorID=0&configID=0&PageIndex={index}&PageSize=30&Type=2"
        try:
            response = curl_requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                body = BeautifulSoup(response.content, "html.parser")
                anchors = body.find_all("a", class_="docnhanhTitle")
                titles = [anchor.text.strip().split(": ")[-1] for anchor in anchors]
                timelines = [
                    stamp.text.strip().split(" ")[0]
                    for stamp in body.find_all("span", class_="timeTitle")
                ]
                if not titles or len(titles) != len(timelines):
                    print(f"Data not found for {company} on page: {index}")
                    break
                # Take each href from its title anchor so links stay aligned
                # with titles even if the page contains other <a> tags.
                links = [anchor.get("href", "") for anchor in anchors]
                # Parse before storing so a malformed page is dropped whole
                last_date = datetime.strptime(timelines[-1], "%d/%m/%Y")
                list_title.extend(titles)
                list_timeline.extend(timelines)
                list_link.extend(links)
                print(f"Fetched data on page: {index}")
                failures = 0
                if last_date <= start_date:
                    break
            else:
                print(f"Failed to fetch data for {company} on page: {index}")
                failures += 1
        except Exception as e:
            # The old handler indexed e.args[0]["code"], which itself raised
            # for any real exception (timeouts, parse errors).
            print(f"Error occurred: {str(e)}")
            print(f"Request timed out for {company}. Skipping to the next page.")
            failures += 1
        if failures >= max_consecutive_failures:
            print(f"Stopped fetching data for {company} after {failures} consecutive failures")
            break
        index += 1
    return list_title, list_timeline, list_link

In [None]:
def get_vietstock_news(company, fromDate, toDate, site_url, company_news):
    """Crawl the paged Vietstock news list of one company.

    The page count is read from the response when present; crawling stops
    after the last page or as soon as a page yields no usable data.

    Args:
        company: Symbol sent as the ``code`` form field.
        fromDate: Start of the range, sent unchanged in the form data.
        toDate: End of the range, sent unchanged in the form data.
        site_url: URL the form is posted to.
        company_news: Unused; kept for the existing callers.

    Returns:
        Tuple ``(list_title, list_timeline, list_link)`` of parallel lists.
    """
    index = 1
    max_index = 1
    list_title = list()
    list_timeline = list()
    list_link = list()
    # Work on a copy so the shared headers dict is not modified
    request_headers = headers.copy()
    request_headers["Content-Type"] = "application/x-www-form-urlencoded"
    # Bounding by max_index guarantees termination even when requests fail
    while index <= max_index:
        data = {
            "view": "1",
            "code": company,
            "type": "1",
            "fromDate": fromDate,
            "toDate": toDate,
            "channelID": "-1",
            "page": index,
            "pageSize": "20",
        }
        try:
            response = curl_requests.post(
                site_url, headers=request_headers, data=data, timeout=10
            )
            if response.status_code == 200:
                body = BeautifulSoup(response.content, "html.parser")
                total_pages = body.find_all("div", class_="m-b pull-left")
                if total_pages:
                    # The last word of the pager text is read as the page count
                    max_index = int(total_pages[-1].text.split(" ")[-1])
                anchors = body.find_all("a", class_="text-link news-link")
                titles = [anchor.text.strip().split(": ")[-1] for anchor in anchors]
                timelines = [
                    cell.text.strip().split(" ")[0]
                    for cell in body.find_all("td", class_="col-date")
                ]
                if not titles or len(titles) != len(timelines):
                    print(f"Data not found for {company} on page: {index}")
                    break
                # Build links before storing anything so the lists stay aligned
                links = [anchor.get("href", "") for anchor in anchors]
                list_title.extend(titles)
                list_timeline.extend(timelines)
                list_link.extend(links)
                print(f"Fetched data on page: {index}")
            else:
                print(f"Failed to fetch data for {company} on page: {index}")
        except Exception as e:
            # The old handler indexed e.args[0]["code"], which itself raised
            # for any real exception (timeouts, parse errors).
            print(f"Error occurred: {str(e)}")
            print(f"Request timed out for {company}. Skipping to the next page.")
        index += 1
    return list_title, list_timeline, list_link

In [29]:
def export_data_to_csv(company_news, source):
    """Write the collected news of every company to ``data/<source>.csv``.

    Args:
        company_news: Dict keyed by company; each value is a dict of fields.
        source: Name used for the CSV file.

    Returns:
        None. The CSV has one row per company, with the company symbol in
        the ``company`` column.
    """
    import os

    company_news_df = (
        pd.DataFrame(company_news)
        .T
        .reset_index()
        .rename(columns={'index': 'company'})
    )
    # to_csv does not create missing directories
    os.makedirs("data", exist_ok=True)
    company_news_df.to_csv(f"data/{source}.csv", index=False)

In [None]:
def store_data_to_company_news(company_news, company, list_title, list_timeline, list_link, list_content):
    """Replace company_news[company] with a fresh record of the crawled lists."""
    company_news[company] = {
        'title': list_title,
        'timeline': list_timeline,
        'link': list_link,
        'content': list_content,
    }

### Crawl from CafeF & VietStock

In [None]:
# Crawled CafeF news, keyed by company symbol
cafef_news = {}
# Host used to build the CafeF news-list and article URLs
cafef_base_url = "s.cafef.vn"

In [None]:
# Crawled Vietstock news, keyed by company symbol
vietstock_news = {}
# Host used to build the Vietstock news-list URL
vietstock_base_url = "finance.vietstock.vn"

In [None]:
# For every remaining company: crawl news from CafeF and Vietstock, attach
# closing prices, convert everything to documents and store them in MongoDB.
for company in company_list:
    print(f"Fetching data for {company}")

    # Get news data from CafeF
    cafef_news[company] = {}
    source = "cafef"
    site_url = f"https://{cafef_base_url}/Ajax/Events_RelatedNews_New.aspx"
    list_title, list_timeline, list_link = get_cafef_news(
        company, fromDate, toDate, site_url
    )
    list_content = list()
    print(f"Get content for {company}")
    for link in list_link:
        content_url = f"https://{cafef_base_url}{link}"
        # The fetch must run inside the loop: it used to sit after the loop,
        # so only the last link's content was downloaded.
        contents = get_content(content_url, source)
        # Store exactly one entry per link so contents stay aligned with titles
        list_content.append("\n".join(contents) if contents else "No content")
    store_data_to_company_news(
        cafef_news, company, list_title, list_timeline, list_link, list_content
    )

    # Get news data from Vietstock
    vietstock_news[company] = {}
    source = "vietstock"
    site_url = f"https://{vietstock_base_url}/View/PagingNewsContent"
    list_title, list_timeline, list_link = get_vietstock_news(
        company, fromDate, toDate, site_url, vietstock_news
    )
    list_content = list()
    for link in list_link:
        # Links that do not start with "https:" get the scheme prepended
        content_url = f"https:{link}" if not link.startswith("https:") else link
        content_url = content_url.replace("\n", "")
        contents = get_content(content_url, source)
        # Store exactly one entry per link so contents stay aligned with titles
        list_content.append("\n".join(contents) if contents else "No content")
    store_data_to_company_news(
        vietstock_news, company, list_title, list_timeline, list_link, list_content
    )

    # Get price data
    get_company_price(company, fromDate, toDate, vietstock_news, cafef_news)

    # Convert data to document and save to MongoDB
    cafef_document = convert_data_to_document(cafef_news[company], company, "cafef")
    vietstock_document = convert_data_to_document(vietstock_news[company], company, "vietstock")

    # Save data to MongoDB: one collection per company holds both sources
    contain_document = cafef_document + vietstock_document
    collection = db.get_collection(company)
    save_data_to_mongo(contain_document, collection)

    print(f"Data fetched successfully for {company}")

# Export data to CSV

In [30]:
# Export the news collected from each source to its own CSV file
# (data/cafef.csv and data/vietstock.csv)
export_data_to_csv(cafef_news, "cafef")
export_data_to_csv(vietstock_news, "vietstock")