# Imports and setup

In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from datetime import datetime
from datetime import timedelta, date
import re

# Pre-processing

In [20]:
# Load the raw labelled-news table produced by the sentiment-scoring step.
label_news = pd.read_csv(filepath_or_buffer='sentiment_before.csv')

In [21]:
# Drop the text-heavy columns that are not used anywhere below.
label_news = label_news.drop(columns=['href', 'subtitle', 'content'])

In [22]:
# Preview the first five remaining rows.
label_news.head(n=5)

Unnamed: 0,title,time,result
0,"HPG tăng vùn vụt, ông Trần Đình Long bỏ túi th...",25-06-2023 - 08:08 AM,['Điểm số: 70']
1,"Tầng lớp trung lưu đang gia tăng, 1 doanh nghi...",21-06-2023 - 09:14 AM,"['Từ các thông tin đã cung cấp, tôi đánh giá b..."
2,Ông Phạm Nhật Vượng và các tỷ phú Việt giàu cỡ...,16-06-2023 - 16:25 PM,['Điểm số: 75']
3,Ông Phạm Nhật Vượng kêu gọi ‘xe tốt – giá tốt ...,14-06-2023 - 15:47 PM,['Điểm số cho bài báo: 65']
4,"Chưa chính thức chào bán, Vinfast VF3 của tỷ p...",11-06-2023 - 10:09 AM,['Điểm số: 78']


## Sentiment score to float

In [51]:
def extract_num(x):
    """Return the piece of ``x`` that sits between its first and second ':'.

    Raises:
        IndexError: if ``x`` contains no ':' at all.
    """
    pieces = x.split(':')
    return pieces[1]

def conv_to_num(x):
    """Convert a wrapped decimal string to a float.

    Commas are turned into dots, then the first two and the last two
    characters are discarded before the remainder is handed to ``float``.

    Raises:
        ValueError: if what is left after trimming is not a valid float.
    """
    dotted = x.replace(",", ".")
    # The replacement is one-for-one, so the length matches the input's.
    stop = len(dotted) - 2
    trimmed = dotted[2:stop]
    return float(trimmed)

def conv_to_num2(x):
    """Coerce ``x`` to a float with the built-in ``float`` constructor."""
    as_float = float(x)
    return as_float

def conv_to_timestamp(x):
    """Parse a news timestamp string into a ``datetime``.

    The expected text looks like ``"16-06-2023 - 16:25 PM"``: the last three
    characters are dropped and the rest is parsed as
    ``"%d-%m-%Y - %H:%M"`` (24-hour clock).

    Args:
        x: The raw value from the ``time`` column.

    Returns:
        A ``datetime`` for string input; ``x`` itself when it is already a
        ``datetime`` (this includes ``pandas.Timestamp``); ``None`` for any
        other type (for example a float NaN from a missing cell).

    Raises:
        ValueError: if a string does not match the expected format.
    """
    # Pass already-parsed values through untouched. Without this, applying
    # the function a second time to the same column (e.g. re-running the
    # conversion cell) would turn every timestamp into None.
    if isinstance(x, datetime):
        return x
    if not isinstance(x, str):
        return None
    return datetime.strptime(x[:-3], "%d-%m-%Y - %H:%M")
def get_date_from_timestamp(x):
    """Convert a POSIX timestamp (seconds since the epoch) to its UTC calendar date.

    The conversion is pinned to UTC so the result does not depend on the
    time zone of the machine running the notebook. The price timestamps
    displayed in this notebook fall on 00:00 UTC, so a local-time conversion
    on a machine west of UTC would shift every row to the previous day.

    Args:
        x: Seconds since 1970-01-01 00:00:00 UTC.

    Returns:
        A ``datetime.date``.
    """
    # Imported here so the function does not rely on a change to the
    # notebook's import cell.
    from datetime import timezone

    return datetime.fromtimestamp(x, tz=timezone.utc).date()

In [25]:
# Turn each raw model reply into a numeric score: take the first run of
# digits in the text; replies without any digits (including missing values,
# which stringify to "nan") fall back to the neutral score 50.
#
# This is vectorised on purpose. The earlier per-row form
# `label_news['result'][i] = ...` was chained assignment, which pandas does
# not guarantee to write back into the frame (and never does under
# copy-on-write), and it also assumed a 0..n-1 index. The column now ends up
# as float64 rather than object.
label_news['result'] = (
    label_news['result']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(50.0)
)

In [27]:
# Persist the frame with numeric scores for later reuse.
label_news.to_csv(path_or_buf='sentiment_after.csv')

# Group by date

In [28]:
def daily_new(x, cutoff_hour=14):
    """Map a publication time to the calendar date its score is counted under.

    Items published before ``cutoff_hour`` stay on their own date; items
    published at or after it are moved to the following calendar day.

    Args:
        x: A ``datetime``-like object exposing ``.hour`` and ``.date()``.
        cutoff_hour: First hour (0-23) that rolls over to the next day.
            Defaults to 14, the value previously hard-coded here.
            NOTE(review): presumably chosen to approximate the end of the
            trading session - confirm.

    Returns:
        The ``date`` the item is attributed to. Weekends are not skipped.
    """
    published_on = x.date()
    if x.hour < cutoff_hour:
        return published_on
    return published_on + timedelta(days=1)

In [34]:
# Parse the raw 'time' strings, then put the articles in chronological order.
label_news['time'] = label_news['time'].apply(conv_to_timestamp)
label_news = label_news.sort_values(by='time', ascending=True)

In [35]:
# Preview the five earliest articles after sorting.
label_news.head(n=5)

Unnamed: 0,title,time,result
330,Ông Phạm Nhật Vượng chính thức trở thành người...,2010-12-29 10:50:00,50
329,Em trai Phạm Nhật Vượng muốn kinh doanh viễn t...,2011-04-06 13:32:00,50
328,VIC: Ông Phạm Nhật Vượng giảm tỷ lệ sở hữu từ ...,2011-06-13 22:56:00,50
327,Ông Phạm Nhật Vượng đứng ra đảm bảo cho Vincom...,2011-07-27 13:44:00,50
326,VIC: ĐHCĐ thông qua việc ông Phạm Nhật Vượng đ...,2011-09-12 09:08:00,50


In [36]:
# Give the score column a descriptive name.
label_news = label_news.rename(columns={'result': 'sentiment_score'})

In [37]:
# Work on a copy of the article table, tagging every row with the date that
# daily_new attributes it to.
day_news = label_news.assign(Date=label_news['time'].apply(daily_new))

In [40]:
# Collapse to one row per date: the mean score of all items attributed to it.
day_news = (
    day_news
    .groupby('Date')['sentiment_score']
    .agg('mean')
    .reset_index()
)

In [41]:
# Show the per-date mean sentiment table built by the groupby above.
day_news

Unnamed: 0,Date,sentiment_score
0,2010-12-29,50.0
1,2011-04-06,50.0
2,2011-06-14,50.0
3,2011-07-27,50.0
4,2011-09-12,50.0
...,...,...
256,2023-06-11,78.0
257,2023-06-15,65.0
258,2023-06-17,75.0
259,2023-06-21,70.0


# Combine with date range

In [42]:
def daterange(start, end):
    """List every day from ``start`` up to, but not including, ``end``.

    Args:
        start: First day of the range.
        end: Day after the last one wanted (exclusive bound).

    Returns:
        A list of consecutive days one day apart; empty when ``end`` is not
        after ``start``.
    """
    span_days = int((end - start).days)
    days = []
    for offset in range(span_days):
        days.append(start + timedelta(offset))
    return days

In [43]:
# One row per calendar day from 2008-01-01 through 2023-07-10
# (the end bound passed to daterange is exclusive).
full_days = pd.DataFrame(
    data=daterange(start=date(2008, 1, 1), end=date(2023, 7, 11)),
    columns=['Date'],
)

In [44]:
# Left-join the scores onto the full calendar; days without news get NaN.
day_news = pd.merge(full_days, day_news, how='left', on='Date')

In [45]:
# Show the calendar-aligned table; days without news hold NaN at this point.
day_news

Unnamed: 0,Date,sentiment_score
0,2008-01-01,
1,2008-01-02,
2,2008-01-03,
3,2008-01-04,
4,2008-01-05,
...,...,...
5665,2023-07-06,
5666,2023-07-07,
5667,2023-07-08,
5668,2023-07-09,


# Combine with price

In [46]:
# Carry the latest known score forward over days without news, then use the
# neutral score 50 for the leading days that precede the first scored item.
day_news = day_news.ffill().fillna(50)

In [56]:
# Load the price history for the stock.
price_df = pd.read_csv(filepath_or_buffer='VIC_SSI.csv')

In [57]:
# Derive a calendar date from each row's epoch-seconds column 't'.
price_df['Date'] = price_df['t'].map(get_date_from_timestamp)

In [60]:
# Attach the daily sentiment score to every price row by date.
price_df = pd.merge(price_df, day_news, on='Date', how='left')

In [62]:
# Remove the 'Unnamed: 0' column that came in from the price CSV.
price_df = price_df.drop(columns=['Unnamed: 0'])

In [63]:
# Show the price table with the merged 'Date' and 'sentiment_score' columns.
price_df

Unnamed: 0,t,c,o,h,l,v,s,Date,sentiment_score
0,1199232000,6.13,6.09,6.13,6.05,156160,ok,2008-01-02,50.0
1,1199318400,6.01,6.05,6.09,6.01,111990,ok,2008-01-03,50.0
2,1199404800,5.93,5.97,5.97,5.93,66670,ok,2008-01-04,50.0
3,1199664000,5.85,5.89,5.93,5.78,117490,ok,2008-01-07,50.0
4,1199750400,5.89,5.85,6.01,5.85,109010,ok,2008-01-08,50.0
...,...,...,...,...,...,...,...,...,...
3849,1688515200,51.00,51.40,51.60,50.90,2479200,ok,2023-07-05,70.0
3850,1688601600,50.40,51.00,51.40,50.40,2599600,ok,2023-07-06,70.0
3851,1688688000,50.10,50.50,50.60,50.00,2941600,ok,2023-07-07,70.0
3852,1688947200,50.90,50.50,51.20,50.20,3345300,ok,2023-07-10,70.0


In [64]:
# Give price rows that found no matching sentiment date the neutral score 50.
# The fill is limited to 'sentiment_score': a frame-wide fillna(50) would
# also write 50 into any missing price or volume cell.
price_df = price_df.fillna({'sentiment_score': 50})

In [68]:
# Pairwise correlation between the price/volume columns and the sentiment score.
(
    price_df
    .loc[:, ['c', 'o', 'h', 'l', 'v', 'sentiment_score']]
    .corr()
)

Unnamed: 0,c,o,h,l,v,sentiment_score
c,1.0,0.999622,0.999819,0.999754,0.42646,0.463974
o,0.999622,1.0,0.999792,0.999797,0.426444,0.464984
h,0.999819,0.999792,1.0,0.999694,0.431056,0.464558
l,0.999754,0.999797,0.999694,1.0,0.422077,0.464807
v,0.42646,0.426444,0.431056,0.422077,1.0,0.293209
sentiment_score,0.463974,0.464984,0.464558,0.464807,0.293209,1.0


In [69]:
# Save the combined price + sentiment table.
price_df.to_csv(path_or_buf='VIC_sentiment.csv')