# Dataset collection

In [None]:
# NOTE: The folder ./data/htmls/data must be created before running this notebook.

In [20]:
import requests
import re
import time
import random
from tqdm import tqdm, trange
from nltk.tokenize import sent_tokenize

In [2]:
# Browser-like request headers sent with the article downloads below.
# The key has to be the real HTTP header name "User-Agent"; any other spelling
# is transmitted as an unrelated custom header and the library's default
# User-Agent is sent instead.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
}

### CNBC

##### Extract HTML links to news articles

In [3]:
# Download the CNBC world-news landing page and cache it on disk so the link
# extraction cell can be re-run without hitting the network again.
# - headers=HEADERS: same request headers as the per-article downloads.
# - timeout: without one, requests can block indefinitely on a stalled socket.
# - raise_for_status(): stop here on an HTTP error instead of saving the error
#   page as if it were the real listing.
with requests.get("https://www.cnbc.com/world/", headers=HEADERS, timeout=30) as response:
    response.raise_for_status()
    with open("data/htmls/links.html", "w", encoding="utf-8") as f:
        f.write(response.text)

In [4]:
# Scan the cached landing page for <a href="..."> targets and write each
# distinct link that contains both "https" and "html" to ref.txt, one per line.
# links_set remembers every href already seen (kept or not) so that repeated
# anchors are only considered once.
links_set = set()
href_pattern = re.compile(r'<a href="(?P<title>.*?)"', re.S)

with open("data/htmls/links.html", "r", encoding="utf-8") as file:
    data = file.read().rstrip()

with open("data/htmls/ref.txt", "w", encoding="utf-8") as wf:
    for found in href_pattern.finditer(data):
        href = found.group("title")
        if href in links_set:
            continue
        links_set.add(href)
        # Only absolute https links that point at an .html article are kept.
        if "https" in href and "html" in href:
            wf.write(f"{href}\n")

##### Download the HTML content of each link

In [None]:
# Download every article URL listed in ref.txt and store the raw HTML as
# data/htmls/data/response<idx>.html, where idx is the line number in ref.txt.
# ref.txt is written as UTF-8, so it is read back with the same encoding
# rather than the platform default.
with open("data/htmls/ref.txt", "r", encoding="utf-8") as rf:
    for idx, ref in enumerate(rf):
        # timeout: without one, a stalled connection would block the crawl forever.
        with requests.get(ref.strip(), headers=HEADERS, timeout=30) as response:
            with open(f"data/htmls/data/response{idx}.html", "w", encoding="utf-8") as f:
                f.write(response.text)
        # Random 20-40 second pause between requests to avoid hammering the site.
        time.sleep(random.randint(20, 40))

##### Extract the article text from each downloaded HTML page

In [None]:
from bs4 import BeautifulSoup
from glob import glob
# Convert every downloaded article page into plain text lines in data/data.txt:
# the page title first, then the text of each <p> and <li> found inside
# <div class="group"> containers.

# Collect the files that were actually downloaded instead of assuming that
# "number of directory entries" == "highest index + 1"; a stray file in the
# folder would otherwise trigger a FileNotFoundError. Files are processed in
# numeric index order (response2 before response10).
article_files = []
for path in glob("data/htmls/data/response*.html"):
    found = re.search(r"response(\d+)\.html$", path)
    if found:
        article_files.append((int(found.group(1)), path))
article_files.sort()

with open("data/data.txt", "w", encoding="utf-8") as wf:
    for _, path in tqdm(article_files):
        with open(path, 'r', encoding="utf-8") as f:
            webpage = f.read()

        # Name the parser explicitly: "html" is only a feature hint, so the
        # parser picked (and therefore the output) depends on what is installed.
        soup = BeautifulSoup(webpage, "html.parser")
        # Error pages or truncated downloads may have no <title> element.
        if soup.title is not None:
            wf.write(soup.title.text + '\n')
        for g in soup.find_all("div", class_="group"):
            for p in g.find_all("p"):
                wf.write(p.text + '\n')
            for li in g.find_all("li"):
                wf.write(li.text + '\n')


#### News API

##### Extract API content

In [None]:
# Replace ... with your own API key
APIKEY = "..."
# Replace ... with the company you want to search for, e.g. QUERY = "apple"
QUERY = "..."

# NOTE(review): this prints the API key into the cell output; clear the output
# before sharing or committing the notebook.
print(f"https://newsapi.org/v2/everything?q={QUERY}&apiKey={APIKEY}&language=en")

# Distinct article titles and descriptions collected from the API.
titleset = set()
articleset = set()

# Fetch result pages 1..5 of the "everything" endpoint.
for i in trange(1, 6):
    # params= lets requests URL-encode the query (spaces, "&", ...) correctly.
    with requests.get(
        "https://newsapi.org/v2/everything",
        params={"q": QUERY, "apiKey": APIKEY, "language": "en", "page": i},
        timeout=30,
    ) as response:
        resjson = response.json()

    # Error payloads (bad key, page limit reached, ...) carry no "articles"
    # key; report the problem and keep whatever was collected so far.
    if resjson.get("status") != "ok":
        print(f"News API request for page {i} failed: {resjson.get('message', resjson)}")
        break

    for article in resjson.get("articles", []):
        # Either field may be missing or null; "[Removed]" marks withdrawn articles.
        title = article.get('title') or ""
        description = article.get('description') or ""
        if title and title != "[Removed]":
            titleset.add(title)
        if description and description != "[Removed]":
            articleset.add(description)

# Append to the text gathered from the scraped pages, one entry per line.
with open("data/data.txt", "a", encoding="utf-8") as file:
    for title in titleset:
        file.write(title + '\n')
    for descrpt in articleset:
        file.write(descrpt + '\n')

## Split data into meaningful sentences

In [None]:
from langdetect import detect, LangDetectException


def lngdtct(text):
    """Return True when langdetect classifies *text* as English.

    Text for which langdetect raises LangDetectException (it cannot detect a
    language) is reported as not English. Only that exception is caught, so
    unrelated errors and KeyboardInterrupt are no longer silently swallowed.
    """
    try:
        lang = detect(text)
    except LangDetectException:
        return False
    return lang == "en"

# Distinct English lines from data/data.txt with ellipses removed.
lineset = set()


def line_process(line: str) -> list:
    """Split *line* into sentences using nltk's sent_tokenize."""
    return sent_tokenize(line)


with open("data/data.txt", "r", encoding="utf-8") as readfile:
    for line in readfile:
        if lngdtct(line):
            # Drop both the three-dot and the single-character ellipsis.
            lineset.add(line.replace("...", "").replace("…", ""))

# The same sentence can occur in several different lines; remember what has
# been written so every sentence appears only once in the output. The later
# cells match sentences by value, so duplicates would break their alignment.
written_sentences = set()

with open("data/processed_sentence.txt", "w", encoding="utf-8") as writefile:
    for line in lineset:
        for sentence in line_process(line):
            # Keep only sentences longer than 63 characters.
            if len(sentence) > 63 and sentence not in written_sentences:
                written_sentences.add(sentence)
                writefile.write(sentence + '\n')


## Label data

In [None]:
import google.generativeai as genai
from google.api_core import retry

In [None]:
# Replace ... with your own Google Generative AI API key.
# The key is registered with the genai client library by configure() below.
GOOGLE_API_KEY = "..."
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Retry behaviour passed to generate_content() via request_options: retry on
# transient errors, starting with a delay of 10 that grows by a factor of 1.5,
# and give up after a total of 300 (seconds, per google.api_core.retry).
transient_retry = retry.Retry(
    predicate=retry.if_transient_error,
    initial=10,
    multiplier=1.5,
    timeout=300,
)
retry_policy = {"retry": transient_retry}

# Low temperature and a 5-token output cap: the model is only expected to
# answer with a single label word.
label_generation_config = genai.GenerationConfig(
    temperature=0.1,
    top_p=1,
    max_output_tokens=5,
)
model = genai.GenerativeModel(
    'gemini-1.5-flash-001',
    generation_config=label_generation_config,
)

def prompt(sentence):
    """Build the sentiment-classification prompt for *sentence*.

    The returned text asks the model to label the sentence as POSITIVE,
    NEUTRAL or NEGATIVE, includes one worked example and ends with
    "Sentiment: " so that the model completes it with the label.
    """
    return f"""
You are a financial expert.
Classify the financial sentence as POSITIVE, NEUTRAL or NEGATIVE.
Sentence: {sentence}

EXAMPLE:
Sentence: Investors prioritize Nvidia’s earnings over risk
Response: POSITIVE

Sentiment: """

# sentiment_list: one label per successfully classified sentence, in order.
# linelist:       every sentence read from processed_sentence.txt (with its
#                 trailing newline).
# removed_line:   sentences for which no label could be extracted.
sentiment_list = []
removed_line = []

# The file is written as UTF-8, so read it back with the same encoding.
with open("data/processed_sentence.txt", encoding="utf-8") as file:
    linelist = file.readlines()

label_pattern = re.compile(r'\b(POSITIVE|NEGATIVE|NEUTRAL)\b')

for line in tqdm(linelist):
    response = model.generate_content(prompt(line), request_options=retry_policy)
    try:
        reply = response.text
    except ValueError:
        # response.text raises ValueError when the response holds no usable
        # text (e.g. it was blocked); treat that as "no label" instead of
        # aborting the whole labelling run.
        reply = ""

    found = label_pattern.search(reply)
    if found:
        sentiment_list.append(found.group(1))
    else:
        removed_line.append(line)

    # Pause 1-2 seconds after every request, successful or not, to stay
    # within the API rate limit.
    time.sleep(random.randint(1, 2))

In [None]:
# Persist the predicted labels, one per line, in the order they were produced.
with open("data/sentiment.txt", "w") as file:
    file.writelines(sentiment + '\n' for sentiment in sentiment_list)

## Build CSV file combining sentences and sentiments

In [None]:
import csv

# Pair every successfully labelled sentence with its sentiment and write the
# result to data.csv with a header row.

# Set membership keeps the filter linear in the number of sentences.
removed_sentences = set(removed_line)
# linelist entries still carry the trailing newline from the input file;
# strip it so the CSV field does not contain an embedded line break.
sentence_list = [
    line.rstrip("\n") for line in linelist if line not in removed_sentences
]

# zip() would silently truncate to the shorter list and pair sentences with
# the wrong labels; fail loudly instead.
if len(sentence_list) != len(sentiment_list):
    raise ValueError(
        f"{len(sentence_list)} sentences but {len(sentiment_list)} sentiments; "
        "the labelling results are out of sync with the sentence list"
    )

# newline="" is what the csv module expects so it controls line endings
# itself (avoids blank rows on Windows).
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Sentence", "Sentiment"])
    for sentence, sentiment in zip(sentence_list, sentiment_list):
        csv_writer.writerow([sentence, sentiment.lower()])

## Remove used files

In [None]:
import shutil

# Delete the working directory with all intermediate files and report the
# outcome; a missing folder or missing permissions are reported, not raised.
folder_path = "./data"

try:
    shutil.rmtree(folder_path)
except FileNotFoundError:
    print(f"Folder '{folder_path}' does not exist.")
except PermissionError:
    print(f"Permission denied while trying to delete '{folder_path}'.")
else:
    print(f"Folder '{folder_path}' has been removed.")
