In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.api import VAR
import os
import gc

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# change dir
if os.path.exists("../data"):
    os.chdir("../data")
else:
    print("The directory ../data does not exist")

In [4]:
"""whole process for data filtration"""
# if choose other data
# set path
directory = 'roberta'

# get all Parquet file in this path
parquet_files = [f for f in os.listdir(directory) if f.endswith('.parquet')]

# read and merge all the Parquet files
df_list = []
for file in parquet_files:
    file_path = os.path.join(directory, file)
    temp_df = pd.read_parquet(file_path)
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)

# use map table to filter these sentiment data
map_table = pd.read_pickle('final_map_table.pkl')
df = pd.merge(df, map_table, on=['id'], how='right')
# del combined_df
gc.collect()

0

In [5]:
df['date'] = df['date'] = [x[:10] for x in df['publishedDate']]

df = df[df['date'] >= '2017-08-17']
gc.collect()

0

In [6]:
# counts data
ticker_counts = pd.read_pickle('sNv.pkl').groupby('datadate').apply(lambda x: len(x['tic'].unique()))
ticker_counts_sector = pd.read_pickle('sNv.pkl').groupby(['gsector', 'datadate']).apply(lambda x: len(x['tic'].unique()))
ticker_counts = ticker_counts[(ticker_counts.index >= '2017-08-17') & (ticker_counts.index <= '2022-09-06')]
ticker_counts_sector = ticker_counts_sector[(ticker_counts_sector.index.get_level_values(1) >= '2017-08-17'
                                            ) & (ticker_counts_sector.index.get_level_values(1) <= '2022-09-06')]

In [7]:
def sentiment_cluster(df_, stock_counts, analysis_on = 'finbert', freq='d', impact=3):
    """ps: 1. some stocks have many news during same period, we should avoid double counting.
           2. """
    df_ = df_.copy()
    if freq == 'h':
        freq_col = 'publishedDate'
        stock_counts.name = 'count'
        df_ = pd.merge(df_, stock_counts, left_on='date', right_index=True, how='left')
        stock_counts = df_[['count', freq_col]].drop_duplicates().set_index([freq_col])
    else:
        freq_col = 'date'
        stock_counts.name = 'count'
    
    pos_col = analysis_on + '_pos'
    neg_col = analysis_on + '_neg'
    neu_col = analysis_on + '_neu'

    # for the repeated news about the same stock at a certain period, we take the average.
    temp_senti_s = df_.groupby(['ticker', freq_col]).apply(lambda x: x[[pos_col, neg_col, neu_col]].mean(axis=0))
    # suppose the stocks have no news remain the same sentiment score as last period.
    temp_senti_freq = temp_senti_s.groupby(level=1).mean()
    temp_count_freq = temp_senti_s.groupby(level=1).apply(lambda x: x.shape[0])
    temp_count_freq.name = 'inner_count'
    temp_merge = pd.concat([stock_counts, temp_count_freq, temp_senti_freq], axis=1).sort_index().ffill()
    cluster_dict = {pos_col: [], neg_col: [], neu_col: []}
    last_senti = {pos_col: temp_merge.head(1)[pos_col].values[0],
                  neg_col: temp_merge.head(1)[neg_col].values[0],
                  neu_col: temp_merge.head(1)[neu_col].values[0]}
    for ind, row in temp_merge.iterrows():
        print(ind)
        for s in [pos_col, neg_col, neu_col]:
            temp_put = (row[s] * impact * row['inner_count'] + last_senti[s] * (row['count'] - row['inner_count'])
                           ) / (impact * row['inner_count'] + row['count'] - row['inner_count'])
            
            cluster_dict[s].append(temp_put)
            last_senti[s] = temp_put
    return pd.DataFrame(data=cluster_dict, index=temp_merge.index)


def sector_cluster(df_, stock_counts_sector):
    sec_dict = {}
    for sector, group in df_.groupby('gsector'):
        tc = stock_counts_sector[stock_counts_sector.index.get_level_values(0) == sector].reset_index(level=0, drop=True)
        temp_d = sentiment_cluster(group, tc, analysis_on='roberta', freq='d')
        sec_dict[sector] = temp_d
    return sec_dict

In [8]:
roberta_cluster_sector = sector_cluster(df, ticker_counts_sector)

2017-08-17
2017-08-18
2017-08-19
2017-08-20
2017-08-21
2017-08-22
2017-08-23
2017-08-24
2017-08-25
2017-08-26
2017-08-27
2017-08-28
2017-08-29
2017-08-30
2017-08-31
2017-09-01
2017-09-02
2017-09-03
2017-09-04
2017-09-05
2017-09-06
2017-09-07
2017-09-08
2017-09-09
2017-09-10
2017-09-11
2017-09-12
2017-09-13
2017-09-14
2017-09-15
2017-09-16
2017-09-17
2017-09-18
2017-09-19
2017-09-20
2017-09-21
2017-09-22
2017-09-23
2017-09-24
2017-09-25
2017-09-26
2017-09-27
2017-09-28
2017-09-29
2017-09-30
2017-10-01
2017-10-02
2017-10-03
2017-10-04
2017-10-05
2017-10-06
2017-10-07
2017-10-08
2017-10-09
2017-10-10
2017-10-11
2017-10-12
2017-10-13
2017-10-14
2017-10-15
2017-10-16
2017-10-17
2017-10-18
2017-10-19
2017-10-20
2017-10-21
2017-10-22
2017-10-23
2017-10-24
2017-10-25
2017-10-26
2017-10-27
2017-10-28
2017-10-29
2017-10-30
2017-10-31
2017-11-01
2017-11-02
2017-11-03
2017-11-04
2017-11-05
2017-11-06
2017-11-07
2017-11-08
2017-11-09
2017-11-10
2017-11-11
2017-11-12
2017-11-13
2017-11-14
2017-11-15

In [22]:
import pickle
# Combine all sector DataFrames into one multi-index DataFrame
combined_df = pd.concat(
    roberta_cluster_sector,
    keys=roberta_cluster_sector.keys(),
    names=['sector', 'date']
)

# Save the combined DataFrame
combined_df.to_pickle('roberta_sector_sentiment.pkl')

df1 = pd.read_pickle('roberta_sector_sentiment.pkl')

Unnamed: 0_level_0,Unnamed: 1_level_0,roberta_pos,roberta_neg,roberta_neu
sector,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10.0,2017-08-17,0.206346,0.114786,0.678868
10.0,2017-08-18,0.216834,0.134863,0.648303
10.0,2017-08-19,0.230936,0.134963,0.634101
10.0,2017-08-20,0.251720,0.121456,0.626824
10.0,2017-08-21,0.212612,0.135311,0.652077
...,...,...,...,...
60.0,2022-09-04,0.250048,0.127832,0.622120
60.0,2022-09-05,0.294789,0.122438,0.582773
60.0,2022-09-06,0.283463,0.125826,0.590711
60.0,2022-09-07,0.286544,0.110889,0.602568


In [11]:
roberta_cluster = sentiment_cluster(df, ticker_counts, analysis_on='roberta')

2017-08-17
2017-08-18
2017-08-19
2017-08-20
2017-08-21
2017-08-22
2017-08-23
2017-08-24
2017-08-25
2017-08-26
2017-08-27
2017-08-28
2017-08-29
2017-08-30
2017-08-31
2017-09-01
2017-09-02
2017-09-03
2017-09-04
2017-09-05
2017-09-06
2017-09-07
2017-09-08
2017-09-09
2017-09-10
2017-09-11
2017-09-12
2017-09-13
2017-09-14
2017-09-15
2017-09-16
2017-09-17
2017-09-18
2017-09-19
2017-09-20
2017-09-21
2017-09-22
2017-09-23
2017-09-24
2017-09-25
2017-09-26
2017-09-27
2017-09-28
2017-09-29
2017-09-30
2017-10-01
2017-10-02
2017-10-03
2017-10-04
2017-10-05
2017-10-06
2017-10-07
2017-10-08
2017-10-09
2017-10-10
2017-10-11
2017-10-12
2017-10-13
2017-10-14
2017-10-15
2017-10-16
2017-10-17
2017-10-18
2017-10-19
2017-10-20
2017-10-21
2017-10-22
2017-10-23
2017-10-24
2017-10-25
2017-10-26
2017-10-27
2017-10-28
2017-10-29
2017-10-30
2017-10-31
2017-11-01
2017-11-02
2017-11-03
2017-11-04
2017-11-05
2017-11-06
2017-11-07
2017-11-08
2017-11-09
2017-11-10
2017-11-11
2017-11-12
2017-11-13
2017-11-14
2017-11-15

In [23]:
roberta_cluster.reset_index().to_parquet(
    'roberta_cluster_sector.parquet',
    index=False,
    engine='pyarrow',
    compression='snappy'
)