In [1]:
import pandas as pd
import numpy as np
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import os
import torch

In [2]:
news_headline_cafebiz = pd.read_csv("cafebiz/cafebiz_news_sync.csv")
news_headline_vneconomy = pd.read_csv("vneconomy/vneconomy_news_sync.csv")
news_headline_vietstock = pd.read_csv("vietstock/vietstock_news_sync.csv")

news_headline = pd.concat([news_headline_cafebiz, news_headline_vneconomy, news_headline_vietstock], axis=0, ignore_index=True)


news_headline['Published_date'] = pd.to_datetime(news_headline['Published_time']).dt.date
# Group by 'Published_date' and aggregate 'news_content' by concatenating the values
news_headline = news_headline.groupby('Published_date')['news_content'].agg(lambda x: '. '.join(x)).reset_index()

news_headline = news_headline.sort_values(by='Published_date')
news_headline

Unnamed: 0,Published_date,news_content
0,2020-01-01,Đầu tư dài hạn vào doanh nghiệp chưa niêm yết ...
1,2020-01-02,Ông Trịnh Văn Quyết vào top 3 người giàu trên ...
2,2020-01-03,"Trung Quốc bơm 115 tỷ USD, chứng khoán thế giớ..."
3,2020-01-04,Chứng khoán Mỹ trượt khỏi đỉnh vì tin xấu từ T...
4,2020-01-05,"Cuối năm, một loạt công ty chứng khoán bị phạt..."
...,...,...
1304,2024-02-21,VinaCapital: Lợi nhuận doanh nghiệp sẽ tăng 50...
1305,2024-02-22,"Nhóm Dragon Capital bán tiếp 1 triệu cp GEX, c..."
1306,2024-02-23,Chứng khoán Mỹ xanh rực nhờ cổ phiếu chip Nvid...
1307,2024-02-24,Thị trường chứng khoán diễn biến ra sao sau ng...


In [15]:
# nltk.download('vader_lexicon')
from pyvi import ViTokenizer
sia = SentimentIntensityAnalyzer()

# Assuming 'news_headline' is your DataFrame
# Replace 'news_content' with the actual column name containing the news text
# Replace 'Published_time' with the actual column name containing the timestamp

# Run the polarity score on the entire dataset
res = {}
for i, row in tqdm(news_headline.iterrows(), total=len(news_headline)):
    text = ViTokenizer.tokenize(row['news_content'])
    mytime = row['Published_date']
    myid = i  # Use the index 'i' as the identifier
    res[myid] = sia.polarity_scores(text)
    res[myid]['Published_date'] = mytime

vaders = pd.DataFrame(res).T
vaders.reset_index(inplace=True, drop=True)
vaders.columns = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 'Published_date']

  0%|          | 0/1309 [00:00<?, ?it/s]

In [16]:
new_df.sort_values('neu')

Unnamed: 0,neg,neu,pos,compound,Published_date,news_content
49,0.0,0.778,0.222,0.9432,2020-05-22,Lễ ký kết hợp tác chiến lược giữa Công ty chứn...
298,0.159,0.841,0.0,-0.802,2021-04-10,Vợ Tổng Giám đốc VRC bị xử phạt 55 triệu đồng:...
6,0.0,0.842,0.158,0.7783,2020-01-07,Thị phần của 5 công ty chứng khoán lớn nhất Vi...
86,0.153,0.847,0.0,-0.802,2020-08-21,"Không công bố thông tin, Cảng Rau Quả bị xử ph..."
346,0.0,0.847,0.153,0.802,2021-05-30,Rich kid RMIT và cuộc sống của người nhiều tiề...
...,...,...,...,...,...,...
526,0.0,1.0,0.0,0.0,2021-12-03,"""Thứ 6 đen tối"" của chứng khoán Việt: 30 phút ..."
527,0.0,1.0,0.0,0.0,2021-12-04,"Sau phiên giảm sâu 39 điểm, chứng khoán Việt N..."
529,0.0,1.0,0.0,0.0,2021-12-06,Kỷ lục mới: Nhà đầu tư cá nhân trong nước mở h...
483,0.0,1.0,0.0,0.0,2021-10-20,"Chứng khoán Mỹ tăng liền 5 phiên, giá dầu gần ..."


In [3]:
from transformers import AutoTokenizer
# from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

# MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
# tokenizer = AutoTokenizer.from_pretrained(MODEL)
# model = AutoModelForSequenceClassification.from_pretrained(MODEL)

from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")

tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)

os.environ['TRANSFORMERS_CACHE'] = '/huggingface/cache/'

In [7]:
from torch.nn.utils.rnn import pad_sequence
# Set the maximum sequence length for the tokenizer
max_seq_length = model.config.max_position_embeddings #258
# Set the batch size
batch_size = 100  # You can adjust this based on your system's capabilities

# Tokenize and pad sequences in batches
tokenized_batches = []
for i in range(0, len(news_headline), batch_size):
    batch = tokenizer(
        news_headline['news_content'][i:i + batch_size].tolist(),
        add_special_tokens=False,
        max_length=max_seq_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    tokenized_batches.append(batch)

# Flatten the list of batches
flat_input_ids = [batch['input_ids'] for batch in tokenized_batches]
input_ids = pad_sequence([item for sublist in flat_input_ids for item in sublist], batch_first=True, padding_value=tokenizer.pad_token_id)

# Create attention mask in batches
attention_mask = (input_ids != tokenizer.pad_token_id).float()

# Manually set position IDs based on the truncated sequence length
position_ids = torch.arange(0, max_seq_length).unsqueeze(0).expand(input_ids.shape)

# Perform sentiment analysis
with torch.no_grad():
    out = model(input_ids, attention_mask=attention_mask, position_ids=position_ids)
    probabilities = torch.nn.functional.softmax(out.logits, dim=-1).tolist()

# Create a new dataframe with sentiment analysis results
sentiment_df = pd.DataFrame(probabilities, columns=['roberta_neg', 'roberta_pos', 'roberta_neu'])
result_df = pd.concat([news_headline, sentiment_df], axis=1)

# Print the final dataframe
result_df

Unnamed: 0,Published_date,news_content,roberta_neg,roberta_pos,roberta_neu
0,2020-01-01,Đầu tư dài hạn vào doanh nghiệp chưa niêm yết ...,0.587031,0.276965,0.136004
1,2020-01-02,Ông Trịnh Văn Quyết vào top 3 người giàu trên ...,0.008298,0.917595,0.074107
2,2020-01-03,"Trung Quốc bơm 115 tỷ USD, chứng khoán thế giớ...",0.133858,0.373538,0.492604
3,2020-01-04,Chứng khoán Mỹ trượt khỏi đỉnh vì tin xấu từ T...,0.968832,0.009985,0.021183
4,2020-01-05,"Cuối năm, một loạt công ty chứng khoán bị phạt...",0.026826,0.924794,0.048380
...,...,...,...,...,...
1304,2024-02-21,VinaCapital: Lợi nhuận doanh nghiệp sẽ tăng 50...,0.003374,0.983424,0.013202
1305,2024-02-22,"Nhóm Dragon Capital bán tiếp 1 triệu cp GEX, c...",0.647978,0.170457,0.181565
1306,2024-02-23,Chứng khoán Mỹ xanh rực nhờ cổ phiếu chip Nvid...,0.032217,0.877634,0.090149
1307,2024-02-24,Thị trường chứng khoán diễn biến ra sao sau ng...,0.984123,0.005331,0.010546


In [16]:
merge_df = result_df.merge(vaders, on='Published_date')
merge_df

Unnamed: 0,Published_date,news_content,roberta_neg,roberta_pos,roberta_neu,vader_neg,vader_neu,vader_pos,vader_compound
0,2020-01-01,Đầu tư dài hạn vào doanh nghiệp chưa niêm yết ...,0.587031,0.276965,0.136004,0.0,1.0,0.0,0.0
1,2020-01-02,Ông Trịnh Văn Quyết vào top 3 người giàu trên ...,0.008298,0.917595,0.074107,0.0,0.957,0.043,0.3818
2,2020-01-03,"Trung Quốc bơm 115 tỷ USD, chứng khoán thế giớ...",0.133858,0.373538,0.492604,0.0,1.0,0.0,0.0
3,2020-01-04,Chứng khoán Mỹ trượt khỏi đỉnh vì tin xấu từ T...,0.968832,0.009985,0.021183,0.0,1.0,0.0,0.0
4,2020-01-05,"Cuối năm, một loạt công ty chứng khoán bị phạt...",0.026826,0.924794,0.048380,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1304,2024-02-21,VinaCapital: Lợi nhuận doanh nghiệp sẽ tăng 50...,0.003374,0.983424,0.013202,0.0,1.0,0.0,0.0
1305,2024-02-22,"Nhóm Dragon Capital bán tiếp 1 triệu cp GEX, c...",0.647978,0.170457,0.181565,0.0,1.0,0.0,0.0
1306,2024-02-23,Chứng khoán Mỹ xanh rực nhờ cổ phiếu chip Nvid...,0.032217,0.877634,0.090149,0.0,1.0,0.0,0.0
1307,2024-02-24,Thị trường chứng khoán diễn biến ra sao sau ng...,0.984123,0.005331,0.010546,0.0,1.0,0.0,0.0


In [18]:
merge_df.to_csv('nlp_for_all_news_data.csv', encoding='utf-8-sig')

In [18]:
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict
res = {}
for i, row in tqdm(news_headline.iterrows(), total=len(news_headline)):
    try:
        text = row['news_content']  # Replace with the actual column name containing the news text
        mytime = row['Published_date']  # Replace with the actual column name containing the timestamp
        myid = i  # Use the index 'i' as the identifier
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
        res[myid]['Published_date'] = mytime     
    except RuntimeError:
        print(f'Broke for id {myid}')