<a href="https://colab.research.google.com/github/thaivo02/Sentiment-analysis/blob/main/Sentiment_analysis_to_predict_stock_market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [None]:
!pip install nltk
!pip install underthesea
!pip install transformers torch
!pip install SentencePiece

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.10 underthesea-6.8.4 underthesea-core-1.0.4
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-

In [None]:
import nltk
import math
import string
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FormatStrFormatter
import underthesea
from underthesea import word_tokenize
from underthesea import text_normalize
from datetime import datetime
import timeit
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

In [None]:
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Download the 'vader_lexicon' resource; SentimentIntensityAnalyzer (imported above) is instantiated later in this notebook.
nltk.download('vader_lexicon')

# Prepare data

In [None]:
# Ticker symbols to analyse; each one is substituted into the cafef.vn request URLs built later in the notebook.
company_list = ['API', 'AGG', 'BID', 'FPT', 'VCB', 'ACV', 'OIL', 'ABB', 'ABC']

## Call API to get data

In [None]:
# Browser-like User-Agent sent with the news requests (presumably so the site answers the scraper like a regular browser — TODO confirm it is required).
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0"}

In [None]:
# Per-ticker store filled in by later cells with the keys 'news', 'timeline', 'price', 'tokenized_news' and 'sentiment'.
company_news = {}

In [None]:
# Scrape headlines and their publication dates for every ticker from cafef.vn.
# Pages are requested until at least `min_news` headlines are collected, the
# site returns an empty page, or a request fails. Without the last two exits
# the loop would never end for a ticker with few headlines or a failing endpoint.
min_news = 100        # target number of headlines per ticker
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the cell

for company in company_list:
    company_news[company] = {'news': [], 'timeline': []}
    index = 1
    while len(company_news[company]['news']) < min_news:
        url = f"https://s.cafef.vn/Ajax/Events_RelatedNews_New.aspx?symbol={company}&floorID=0&configID=0&PageIndex={index}&PageSize=30&Type=2"
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Failed to fetch data for {company}: {error}")
            break
        if response.status_code != 200:
            print(f"Failed to fetch data for {company}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        # Keep only the text after the last ": " of each headline link.
        titles = [link.text.strip().split(": ")[-1]
                  for link in soup.find_all("a", class_="docnhanhTitle")]
        # Keep only the first space-separated field of each timestamp (parsed later as %d/%m/%Y).
        timeline = [stamp.text.strip().split(' ')[0]
                    for stamp in soup.find_all("span", class_="timeTitle")]

        # Later cells plot 'timeline' against per-headline values, so both lists
        # must stay the same length; keep only the entries that have both parts.
        paired = min(len(titles), len(timeline))
        if paired == 0:
            # An empty page means there is nothing more to fetch for this ticker.
            break

        company_news[company]['news'].extend(titles[:paired])
        company_news[company]['timeline'].extend(timeline[:paired])
        print(f"Fetched data for {company}")
        index += 1

In [None]:
# Attach one closing price to every news date of every ticker.
#
# All price-history pages covering the news period are first collected into a
# date -> close mapping; each news date is then looked up in that mapping, so
# company_news[company]['price'][i] always belongs to
# company_news[company]['timeline'][i]. A news date without a trading session
# (weekend, holiday) takes the most recent earlier close, or the earliest
# fetched close when nothing earlier exists, or 0 when no price was fetched.
date_format = "%d/%m/%Y"  # format of the scraped news dates and of the API's 'Ngay' field
max_pages = 200           # hard stop so an unexpected response can never loop forever
request_timeout = 30      # seconds

for company in company_list:
    company_date = list(company_news[company]['timeline'])
    company_news[company]['price'] = []
    if not company_date:
        print(f"No news dates for {company}, skipping price lookup")
        continue

    news_days = [datetime.strptime(day, date_format) for day in company_date]
    first_day = min(news_days)
    # The endpoint is queried with month-first dates.
    startDate = first_day.strftime("%m/%d/%Y")
    endDate = max(news_days).strftime("%m/%d/%Y")

    close_by_day = {}
    for index in range(1, max_pages + 1):
        url = f"https://s.cafef.vn/Ajax/PageNew/DataHistory/PriceHistory.ashx?Symbol={company}&StartDate={startDate}&EndDate={endDate}&PageIndex={index}"
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            payload = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError):
            # ValueError covers a body that is not valid JSON.
            payload = None
        if not isinstance(payload, dict):
            print(f"Failed to fetch price data for {company}")
            break

        rows = (payload.get('Data') or {}).get('Data') or []
        if not rows:
            break
        for row in rows:
            close_by_day[datetime.strptime(row['Ngay'], date_format)] = float(row['GiaDongCua'])
        # Stop once the last row of the page reaches the first news date
        # (assumes rows are listed newest first — TODO confirm).
        if datetime.strptime(rows[-1]['Ngay'], date_format) <= first_day:
            break

    trading_days = sorted(close_by_day)
    for news_day in news_days:
        if news_day in close_by_day:
            price = close_by_day[news_day]
        else:
            earlier_days = [day for day in trading_days if day < news_day]
            if earlier_days:
                price = close_by_day[earlier_days[-1]]
            elif trading_days:
                price = close_by_day[trading_days[0]]
            else:
                price = 0
        company_news[company]['price'].append(price)

In [None]:
# Sanity check: show the headlines and confirm that each ticker has as many prices as headlines.
for company in company_list:
    print(
        f"{company}: {company_news[company]['news']}",
        f"Number of price: {len(company_news[company]['price'])}",
        f"Number of news: {len(company_news[company]['news'])}",
        sep="\n",
    )

## Preprocess data

### Tokenize Vietnamese

In [None]:
!wget https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords.txt

In [None]:
# Build the stop-word set consulted by clean_text().
stop_words = set()
# The list is Vietnamese text, so read it as UTF-8 regardless of the platform default.
with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        if not word:
            continue
        stop_words.add(word)
        # clean_text() compares against tokens obtained by splitting on spaces,
        # so an entry that itself contains a space can never match. Store the
        # underscore-joined form as well, which is how word_tokenize(format="text")
        # writes multi-syllable words.
        stop_words.add(word.replace(' ', '_'))

In [None]:
def clean_text(text):
    """Normalise a headline and return its tokens joined by single spaces.

    Steps, in order: lowercase; collapse immediately repeated ASCII letter runs;
    space out and de-duplicate punctuation; trim punctuation/whitespace from both
    ends; delete all remaining punctuation; collapse whitespace; run underthesea's
    ``text_normalize`` and ``word_tokenize``; drop tokens found in ``stop_words``.
    Accent marks are kept.

    Args:
        text: The raw headline string.

    Returns:
        The cleaned, tokenised headline as one space-separated string
        (an empty string when nothing is left).
    """
    # Escaped so every punctuation character, backslash included, is a literal
    # inside a regex character class.
    punct = re.escape(string.punctuation)

    text = text.lower()

    # Collapse immediate repetitions of an ASCII letter run (e.g. 'aaabbb' -> 'ab').
    text = re.sub(r'([a-z]+?)\1+', r'\1', text)

    # Ensure a space before and after any punctuation mark that follows a word character.
    text = re.sub(rf"(\w)\s*([{punct}])\s*(\w)", r"\1 \2 \3", text)
    text = re.sub(rf"(\w)\s*([{punct}])", r"\1 \2", text)

    # Reduce a run of consecutive punctuation marks to its first character.
    text = re.sub(rf"([{punct}])([{punct}])+", r"\1", text)

    # Trim whitespace, then any punctuation/whitespace left at either end.
    text = text.strip().strip(string.punctuation + string.whitespace)

    # Remove every remaining punctuation character, then collapse whitespace runs.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"\s+", " ", text)

    # underthesea helpers: text_normalize presumably canonicalises Vietnamese
    # diacritic placement (TODO confirm); word_tokenize(format="text") returns
    # the sentence as a single string of tokens.
    text = text_normalize(text)
    text = word_tokenize(text, format="text")

    # Drop stop words and empty tokens.
    tokens = [word for word in text.split(' ') if word and word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Show the raw headlines of each ticker before they are tokenised.
for company in company_list:
    print(f"{company}: {company_news[company]['news']}")

In [None]:
# Store the cleaned, tokenised form of every headline next to the raw one.
for company, item in company_news.items():
    company_news[company]['tokenized_news'] = [clean_text(title) for title in item['news']]

In [None]:
# Compare raw and tokenised headlines side by side for each ticker.
for company in company_list:
    print(
        f"{company}: {company_news[company]['news']}",
        f"Tokenized news: {company_news[company]['tokenized_news']}",
        f"Number of news: {len(company_news[company]['news'])}",
        sep="\n",
    )

### Plot word cloud

In [None]:
!wget https://upload.wikimedia.org/wikipedia/commons/f/f2/Logo_Twitter.png

In [None]:
# Word cloud of all tokenised headlines, drawn inside the downloaded logo shape.
# The image is opened relative to the working directory, which is where the
# wget cell above saves it; this also works outside Colab's /content folder.
mask = np.array(Image.open('Logo_Twitter.png'))

# Concatenate the tokenised headlines of every ticker into one text.
titles = []
for company, item in company_news.items():
    titles.extend(item['tokenized_news'])

titles = ' '.join(titles)

# Create WordCloud object
wordcloud = WordCloud(background_color='white',
                    mask=mask,
                    max_font_size=50,
                    contour_width=1,
                    contour_color='steelblue',
                    min_font_size=10)

# Generate word cloud
wordcloud.generate(titles)

# Plot the WordCloud image
plt.figure(figsize=(8,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Plot news length

In [None]:
company_names = list(company_news.keys())

# For each company keep the three largest headline lengths, longest first.
# Length is measured in tokens (the y axis is labelled as a word count), so the
# tokenised headline is split on whitespace instead of counting characters.
top_lengths = []
for company in company_names:
    tokenized_news_lengths = [len(item.split()) for item in company_news[company]['tokenized_news']]
    top_lengths.append(sorted(tokenized_news_lengths, reverse=True)[:3])

# Draw the line chart
plt.figure(figsize=(10, 6))

# x positions, one per company
index = np.arange(1, len(company_names) + 1)

# One line per rank; a company with fewer headlines than the rank contributes 0.
rank_styles = [
    (0, 'skyblue', 'Độ dài lớn nhất'),
    (1, 'orange', 'Độ dài lớn thứ hai'),
    (2, 'green', 'Độ dài lớn thứ ba'),
]
for rank, color, label in rank_styles:
    rank_values = [lengths[rank] if len(lengths) > rank else 0 for lengths in top_lengths]
    plt.plot(index, rank_values, marker='o', linestyle='-', color=color, label=label)

plt.title('Phân bố độ dài tin tức của từng công ty')
plt.xlabel('Công ty')
plt.ylabel('Số lượng từ')
plt.xticks(index, company_names, rotation=45, ha='right')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

## Use the VADER model from NLTK

### Load model

In [None]:
# VADER analyzer from NLTK; its polarity_scores() result is read via the 'compound' key in the next cell.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score every tokenised headline with VADER and time the whole pass.
# NOTE(review): VADER ships with an English lexicon — confirm it scores Vietnamese tokens meaningfully.
start = timeit.default_timer()

for company, item in company_news.items():
    compound_scores = []
    company_news[company]['sentiment'] = compound_scores
    for title in item['tokenized_news']:
        sentiment = analyzer.polarity_scores(title)
        # Only the 'compound' value is kept per headline.
        compound_scores.append(sentiment['compound'])
        print(f"{company}: {title} - {sentiment}")

stop = timeit.default_timer()

# Wall-clock seconds spent scoring, used by the timing comparison at the end.
vader_time_execution = stop - start

In [None]:
# Dump each ticker's record and check that news, timeline and price have the same length.
for company, item in company_news.items():
    print(
        f"{company}: {item}",
        f"Number of news: {len(item['news'])}",
        f"Number of timeline: {len(item['timeline'])}",
        f"Number of price: {len(item['price'])}",
        sep="\n",
    )

In [None]:
# Per-ticker closing-price lists, in company_news order, for the final comparison plot.
stock_price = [item['price'] for item in company_news.values()]

In [None]:
# Per-ticker news-date lists, in company_news order, for the final comparison plot.
date_time = [item['timeline'] for item in company_news.values()]

### Plot sentiment score

In [None]:
# Filled by the plotting cell below with one list of VADER scores per ticker; re-run this cell before re-running that one to avoid duplicates.
vader_sentiment_scores = []

In [None]:
# One subplot per ticker: VADER compound score (left axis) against closing price (right axis).
plot_col = 3
plot_row = (len(company_list) + plot_col - 1) // plot_col  # ceiling division: every ticker gets a slot

fig = plt.figure(figsize=(30, 14))

for company, item in company_news.items():
    date = item['timeline']
    price = item['price']
    sentiment_scores = item['sentiment']
    plot_index = company_list.index(company) + 1
    ax = fig.add_subplot(plot_row, plot_col, plot_index, frameon=False)
    ax.grid(True)
    ax.set_title(company)
    ax.set_ylim(-1, 1)
    # Scraped dates are day-first (dd/mm/YYYY).
    dates = pd.to_datetime(date, dayfirst=True)
    ax.plot(dates, sentiment_scores, color='c', label='Sentiment score')
    ax.legend(loc='upper left')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    # Second y axis so the price keeps its own scale.
    axt = ax.twinx()
    axt.plot(dates, price, color='m', label='Closed price')
    axt.legend(loc='upper right')
    axt.set_ylim(min(price) - 10, max(price) + 10)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    # Keep this model's scores: item['sentiment'] is overwritten by the next model.
    vader_sentiment_scores.append(sentiment_scores)

fig.suptitle('VADER sentiment scores')
# No trailing plt.legend(): it would act on the last twin axis only and
# replace the 'upper right' price legend set inside the loop.
fig.tight_layout()
fig.savefig('vader_sentiment_scores.png')
plt.show()

### Plot volatility

In [None]:
# Bar chart of sentiment volatility per ticker, where volatility is the
# standard deviation of the scores currently stored under 'sentiment'.
fig, ax = plt.subplots(figsize=(18, 4))

volatility_plot = {
    company: np.std(item['sentiment'])
    for company, item in company_news.items()
}

# Write each value just above the position of its bar.
for company, volatility in volatility_plot.items():
    ax.text(company_list.index(company), volatility, f"{volatility:.2f}", ha='center', va='bottom')

ax.bar(company_list, volatility_plot.values(), 0.3, color='w', edgecolor='k')
ax.set_ylim(0, 1)

fig.suptitle('VADER volatility')
fig.savefig('vader_volatility.png')
plt.show()

## Use the pretrained PhoBERT model from wonrax

### Load model

In [None]:
# Sequence-classification model loaded from the "wonrax/phobert-base-vietnamese-sentiment" checkpoint.
wonrax = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")

# Tokenizer from the same checkpoint; use_fast=False requests the non-fast tokenizer implementation.
wonrax_tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)

In [None]:
# Score every tokenised headline with the PhoBERT sentiment model and time the pass.
start = timeit.default_timer()

sentiments = {}

for company, item in company_news.items():
    # Replaces the scores left by the previous model.
    company_news[company]['sentiment'] = []
    for title in item['tokenized_news']:
        input_ids = torch.tensor([wonrax_tokenizer.encode(title)])
        with torch.no_grad():
            out = wonrax(input_ids)
        # Class probabilities of the single headline in the batch, read as
        # index 0 = negative, 1 = positive, 2 = neutral.
        probabilities = out.logits.softmax(dim=-1).tolist()[-1]
        sentiments['neg'] = round(probabilities[0], 3)
        sentiments['pos'] = round(probabilities[1], 3)
        sentiments['neu'] = round(probabilities[2], 3)
        # Compound score in [-1, 1]: positive minus negative probability.
        sentiments['compound'] = round(sentiments['pos'] - sentiments['neg'], 4)
        company_news[company]['sentiment'].append(sentiments['compound'])
        print(f"{company}: {title} - {sentiments}")

stop = timeit.default_timer()

# Wall-clock seconds spent scoring, used by the timing comparison at the end.
phobert_time_execution = stop - start

In [None]:
# Dump each ticker's record and check that news, timeline and price have the same length.
for company, item in company_news.items():
    print(
        f"{company}: {item}",
        f"Number of news: {len(item['news'])}",
        f"Number of timeline: {len(item['timeline'])}",
        f"Number of price: {len(item['price'])}",
        sep="\n",
    )

### Plot sentiment score

In [None]:
# Filled by the plotting cell below with one list of PhoBERT scores per ticker; re-run this cell before re-running that one to avoid duplicates.
phobert_sentiment_scores = []

In [None]:
# One subplot per ticker: PhoBERT compound score (left axis) against closing price (right axis).
plot_col = 3
plot_row = (len(company_list) + plot_col - 1) // plot_col  # ceiling division: every ticker gets a slot

fig = plt.figure(figsize=(30, 14))

for company, item in company_news.items():
    date = item['timeline']
    price = item['price']
    sentiment_scores = item['sentiment']
    plot_index = company_list.index(company) + 1
    ax = fig.add_subplot(plot_row, plot_col, plot_index, frameon=False)
    ax.grid(True)
    ax.set_title(company)
    ax.set_ylim(-1, 1)
    # Scraped dates are day-first (dd/mm/YYYY).
    dates = pd.to_datetime(date, dayfirst=True)
    ax.plot(dates, sentiment_scores, color='c', label='Sentiment score')
    ax.legend(loc='upper left')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    # Second y axis so the price keeps its own scale.
    axt = ax.twinx()
    axt.plot(dates, price, color='m', label='Closed price')
    axt.legend(loc='upper right')
    axt.set_ylim(min(price) - 10, max(price) + 10)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    # Keep this model's scores: item['sentiment'] is overwritten by the next model.
    phobert_sentiment_scores.append(sentiment_scores)

fig.suptitle('PHOBERT sentiment scores')
# No trailing plt.legend(): it would act on the last twin axis only and
# replace the 'upper right' price legend set inside the loop.
fig.tight_layout()
fig.savefig('phobert_sentiment_scores.png')
plt.show()

### Plot volatility

In [None]:
# Bar chart of sentiment volatility per ticker, where volatility is the
# standard deviation of the scores currently stored under 'sentiment'.
fig, ax = plt.subplots(figsize=(18, 4))

volatility_plot = {
    company: np.std(item['sentiment'])
    for company, item in company_news.items()
}

# Write each value just above the position of its bar.
for company, volatility in volatility_plot.items():
    ax.text(company_list.index(company), volatility, f"{volatility:.2f}", ha='center', va='bottom')

ax.bar(company_list, volatility_plot.values(), 0.3, color='w', edgecolor='k')
ax.set_ylim(0, 1)

fig.suptitle('PHOBERT volatility')
fig.savefig('phobert_volatility.png')
plt.show()

## Use ViSoBERT

### Load model

In [None]:
# Sequence-classification model and tokenizer loaded from the 'uitnlp/visobert' checkpoint.
# NOTE(review): if this checkpoint is a base language model without a fine-tuned
# classification head, the head is freshly initialised on load and the scores it
# produces are not meaningful or reproducible — confirm against the model card.
uit = AutoModelForSequenceClassification.from_pretrained('uitnlp/visobert')
uit_tokenizer = AutoTokenizer.from_pretrained('uitnlp/visobert')

In [None]:
# Score every tokenised headline with ViSoBERT and time the pass.
start = timeit.default_timer()

sentiments = {}

for company, item in company_news.items():
    # Replaces the scores left by the previous model.
    company_news[company]['sentiment'] = []
    for title in item['tokenized_news']:
        encoding = uit_tokenizer(title, return_tensors='pt')
        with torch.no_grad():
            output = uit(**encoding)
        # Class probabilities of the single headline in the batch.
        # NOTE(review): index 0 is read as positive and index 1 as negative —
        # confirm against the checkpoint's label mapping.
        probabilities = output.logits.softmax(dim=-1).tolist()[-1]
        sentiments['pos'] = round(probabilities[0], 3)
        sentiments['neg'] = round(probabilities[1], 3)
        # Compound score in [-1, 1]: positive minus negative probability.
        sentiments['compound'] = round(sentiments['pos'] - sentiments['neg'], 4)
        company_news[company]['sentiment'].append(sentiments['compound'])
        print(f"{company}: {title} - {sentiments}")

stop = timeit.default_timer()

# Wall-clock seconds spent scoring, used by the timing comparison at the end.
visobert_time_execution = stop - start

### Plot sentiment score

In [None]:
# Filled by the plotting cell below with one list of ViSoBERT scores per ticker; re-run this cell before re-running that one to avoid duplicates.
visobert_sentiment_scores = []

In [None]:
# One subplot per ticker: ViSoBERT compound score (left axis) against closing price (right axis).
plot_col = 3
plot_row = (len(company_list) + plot_col - 1) // plot_col  # ceiling division: every ticker gets a slot

fig = plt.figure(figsize=(30, 14))

for company, item in company_news.items():
    date = item['timeline']
    price = item['price']
    sentiment_scores = item['sentiment']
    plot_index = company_list.index(company) + 1
    ax = fig.add_subplot(plot_row, plot_col, plot_index, frameon=False)
    ax.grid(True)
    ax.set_title(company)
    ax.set_ylim(-1, 1)
    # Scraped dates are day-first (dd/mm/YYYY).
    dates = pd.to_datetime(date, dayfirst=True)
    ax.plot(dates, sentiment_scores, color='c', label='Sentiment score')
    ax.legend(loc='upper left')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    # Second y axis so the price keeps its own scale.
    axt = ax.twinx()
    axt.plot(dates, price, color='m', label='Closed price')
    axt.legend(loc='upper right')
    axt.set_ylim(min(price) - 10, max(price) + 10)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    # Keep this model's scores for the comparison plot.
    visobert_sentiment_scores.append(sentiment_scores)

fig.suptitle('VISOBERT sentiment scores')
# No trailing plt.legend(): it would act on the last twin axis only and
# replace the 'upper right' price legend set inside the loop.
fig.tight_layout()
fig.savefig('visobert_sentiment_scores.png')
plt.show()

### Plot volatility

In [None]:
# Bar chart of sentiment volatility per ticker, where volatility is the
# standard deviation of the scores currently stored under 'sentiment'.
fig, ax = plt.subplots(figsize=(18, 4))

volatility_plot = {
    company: np.std(item['sentiment'])
    for company, item in company_news.items()
}

# Write each value just above the position of its bar.
for company, volatility in volatility_plot.items():
    ax.text(company_list.index(company), volatility, f"{volatility:.2f}", ha='center', va='bottom')

ax.bar(company_list, volatility_plot.values(), 0.3, color='w', edgecolor='k')
ax.set_ylim(0, 1)

fig.suptitle('VISOBERT volatility')
fig.savefig('visobert_volatility.png')
plt.show()

## Compare 3 models

In [None]:
# Run the three models on one sample word and print their scores side by side.
sentence = 'tươi'

# PhoBERT: probabilities read as index 0 = negative, 1 = positive, 2 = neutral.
wonrax_sentiment = {}
input_ids = torch.tensor([wonrax_tokenizer.encode(sentence)])
with torch.no_grad():
  out = wonrax(input_ids)
wonrax_probabilities = out.logits.softmax(dim=-1).tolist()[-1]
wonrax_sentiment['neg'] = round(wonrax_probabilities[0], 3)
wonrax_sentiment['pos'] = round(wonrax_probabilities[1], 3)
wonrax_sentiment['neu'] = round(wonrax_probabilities[2], 3)
wonrax_sentiment['compound'] = round(wonrax_sentiment['pos'] - wonrax_sentiment['neg'], 4)
print("PHOBERT", wonrax_sentiment)

# ViSoBERT: probabilities read as index 0 = positive, 1 = negative.
uit_sentiment = {}
encoding = uit_tokenizer(sentence, return_tensors='pt')
with torch.no_grad():
  output = uit(**encoding)
uit_probabilities = output.logits.softmax(dim=-1).tolist()[-1]
uit_sentiment['pos'] = round(uit_probabilities[0], 3)
uit_sentiment['neg'] = round(uit_probabilities[1], 3)
uit_sentiment['compound'] = round(uit_sentiment['pos'] - uit_sentiment['neg'], 4)
print("VISOBERT", uit_sentiment)

print("VADER", analyzer.polarity_scores(sentence))

In [None]:
# One subplot per ticker comparing the three models' scores (left axis) with the closing price (right axis).
plot_col = 3
# Ceiling division, as in the per-model plots: floor division leaves too few
# slots when the number of tickers is not a multiple of plot_col.
plot_row = (len(company_list) + plot_col - 1) // plot_col

fig = plt.figure(figsize=(30, 14))

for idx, company in enumerate(company_list):
    date = date_time[idx]
    plot_index = idx + 1
    ax = fig.add_subplot(plot_row, plot_col, plot_index, frameon=False)
    ax.grid(True)
    ax.set_title(company)
    ax.set_ylim(-1, 1)
    # Scraped dates are day-first (dd/mm/YYYY).
    dates = pd.to_datetime(date, dayfirst=True)
    ax.plot(dates, vader_sentiment_scores[idx], color='c', label='VADER')
    ax.plot(dates, phobert_sentiment_scores[idx], color='g', label='PHOBERT')
    ax.plot(dates, visobert_sentiment_scores[idx], color='y', label='VISOBERT')
    ax.legend(loc='upper left')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    # Second y axis so the price keeps its own scale.
    axt = ax.twinx()
    axt.plot(dates, stock_price[idx], color='m', label='Closed price')
    axt.legend(loc='upper right')
    axt.set_ylim(min(stock_price[idx]) - 10, max(stock_price[idx]) + 10)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

fig.suptitle('Sentiment scores comparison')
# No trailing plt.legend(): it would act on the last twin axis only and
# replace the 'upper right' price legend set inside the loop.
fig.tight_layout()
fig.savefig('compare_sentiment_scores.png')
plt.show()

In [None]:
# Bar chart of the wall-clock scoring time measured for each model.
time_execution = {
    'VADER': vader_time_execution,
    'PHOBERT': phobert_time_execution,
    'VISOBERT': visobert_time_execution,
}

fig, ax = plt.subplots(figsize=(10, 6))
# Grid below the bars (lower zorder) so the lines do not cross them.
ax.yaxis.grid(zorder=0)
ax.bar(
    time_execution.keys(),
    time_execution.values(),
    color='lime',
    width=0.3,
    linewidth=2.5,
    zorder=3,
)
ax.set(title='Time execution', xlabel='Model', ylabel='Time (seconds)')
fig.savefig('compare_time_execution.png')
plt.show()

# Download the .png files

In [None]:
# Optional, Colab only: uncomment the lines below to download every .png file found in /content.
# from google.colab import files
# import os

# for filename in os.listdir('/content'):
#     if filename.endswith('.png'):
#         files.download(filename)