In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.api import VAR
import os
import gc

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Make ../data the working directory so the relative paths used below resolve;
# only report the problem (without raising) when that folder is missing.
if not os.path.exists("../data"):
    print("The directory ../data does not exist")
else:
    os.chdir("../data")

In [47]:
"""whole process for data filtration"""
# Folder holding the sentiment parquet shards; point this at another folder to use other data.
directory = 'roberta'

# Collect every parquet shard in the folder. os.listdir returns entries in arbitrary
# order, so sort them to keep the concatenation order reproducible across runs.
parquet_files = sorted(f for f in os.listdir(directory) if f.endswith('.parquet'))
if not parquet_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No .parquet files found in '{directory}'")

# Read all shards and stack them into one frame.
df_list = [pd.read_parquet(os.path.join(directory, file)) for file in parquet_files]
df = pd.concat(df_list, ignore_index=True)
# Release the per-shard frames; while this list is alive gc.collect() cannot reclaim them.
del df_list

# Use the map table to filter the sentiment data. The right join keeps every row of the
# map table, so map rows with no matching 'id' in the sentiment data come back with NaN
# in the sentiment columns.
map_table = pd.read_pickle('final_map_table.pkl')
df = pd.merge(df, map_table, on=['id'], how='right')
gc.collect()

939

In [48]:
# Keep the first 10 characters of the publication timestamp string as the 'date' key.
# The vectorised .str slice passes missing values through as NaN, whereas slicing each
# element by hand raises TypeError on a NaN (possible after the right join with the map table).
df['date'] = df['publishedDate'].str[:10]

# Drop everything before the start of the sample; rows with a missing date compare
# as False and are dropped as well.
df = df[df['date'] >= '2017-08-17']
gc.collect()

0

In [49]:
# Counts data: number of distinct tickers per date, overall and per (sector, date).
# The pickle is read once and reused for both aggregations instead of being loaded twice.
_universe = pd.read_pickle('sNv.pkl')
# dropna=False mirrors len(x['tic'].unique()), which counts a missing ticker as one value;
# rename(None) keeps the result unnamed, as the apply-based version produced.
ticker_counts = _universe.groupby('datadate')['tic'].nunique(dropna=False).rename(None)
ticker_counts_sector = _universe.groupby(['gsector', 'datadate'])['tic'].nunique(dropna=False).rename(None)
del _universe

# Restrict both series to the sample window (inclusive on both ends).
ticker_counts = ticker_counts[(ticker_counts.index >= '2017-08-17') & (ticker_counts.index <= '2022-09-06')]
_sector_dates = ticker_counts_sector.index.get_level_values(1)
ticker_counts_sector = ticker_counts_sector[(_sector_dates >= '2017-08-17') & (_sector_dates <= '2022-09-06')]
del _sector_dates

In [50]:
def sentiment_cluster(df_, stock_counts, analysis_on = 'finbert', freq='d', impact=3):
    """Build a smoothed, market-wide sentiment series from per-news sentiment scores.

    Steps:
      1. Average the scores of all news about the same ticker in the same period, so a
         ticker with many news items in one period is not counted several times.
      2. Average those per-ticker scores across tickers for each period (``s_t``) and
         count how many tickers had news (``n_t``).
      3. Align with ``stock_counts`` (``N_t``), sort by period and forward-fill gaps.
      4. Apply the recurrence, in which tickers without news are assumed to keep the
         previous cluster value:

             c_t = (s_t * impact * n_t + c_{t-1} * (N_t - n_t)) / (impact * n_t + N_t - n_t)

         The first usable period is seeded with ``c_{t-1} = s_t``.

    Parameters
    ----------
    df_ : DataFrame
        Needs the columns 'ticker', 'date', ``<analysis_on>_pos/_neg/_neu`` and,
        when ``freq == 'h'``, 'publishedDate'.
    stock_counts : Series
        Number of stocks per period, indexed by the same keys as ``df_['date']``.
        It is not modified; a renamed copy is used internally.
    analysis_on : str
        Prefix of the sentiment columns to aggregate.
    freq : str
        'h' groups by 'publishedDate' (counts are looked up through 'date');
        any other value groups by 'date'.
    impact : number
        Extra weight given to tickers that have news in the period.

    Returns
    -------
    DataFrame
        One row per period with the three clustered sentiment columns. Periods before
        the first one that has both a sentiment value and a stock count are NaN; they
        no longer poison every later period.
    """
    # rename() returns a new Series, so the caller's object keeps its own name.
    stock_counts = stock_counts.rename('count')
    if freq == 'h':
        freq_col = 'publishedDate'
        # Attach the per-date stock count to every news row, then re-key it by timestamp.
        df_ = pd.merge(df_, stock_counts, left_on='date', right_index=True, how='left')
        stock_counts = df_[['count', freq_col]].drop_duplicates().set_index([freq_col])
    else:
        freq_col = 'date'

    pos_col = analysis_on + '_pos'
    neg_col = analysis_on + '_neg'
    neu_col = analysis_on + '_neu'
    senti_cols = [pos_col, neg_col, neu_col]

    # For repeated news about the same stock in one period, take the average.
    per_stock = df_.groupby(['ticker', freq_col])[senti_cols].mean()
    # Cross-sectional mean per period and number of tickers that had news.
    period_mean = per_stock.groupby(level=1).mean()
    period_n = per_stock.groupby(level=1).size().rename('inner_count')

    # NOTE(review): ffill also carries 'inner_count' and the raw sentiment into periods
    # that have no news at all, so such periods re-apply the previous period's news
    # instead of using n_t = 0 -- confirm this is the intended model.
    merged = pd.concat([stock_counts, period_n, period_mean], axis=1).sort_index().ffill()

    counts = merged['count'].to_numpy(dtype=float)
    inner = merged['inner_count'].to_numpy(dtype=float)
    cluster_dict = {}
    for s in senti_cols:
        scores = merged[s].to_numpy(dtype=float)
        smoothed = np.full(len(merged), np.nan)
        last = None  # previous cluster value; None until the first usable period
        for i in range(len(merged)):
            denom = impact * inner[i] + counts[i] - inner[i]
            # Leading periods that ffill could not complete (or a zero total weight) are
            # left as NaN and skipped, so they do not propagate NaN through the recurrence.
            if not np.isfinite(denom) or denom == 0:
                continue
            prev = scores[i] if last is None else last
            value = (scores[i] * impact * inner[i] + prev * (counts[i] - inner[i])) / denom
            if not np.isfinite(value):
                continue
            smoothed[i] = value
            last = value
        cluster_dict[s] = smoothed
    return pd.DataFrame(data=cluster_dict, index=merged.index)


def sector_cluster(df_, stock_counts_sector, analysis_on='roberta', freq='d', impact=3):
    """Run ``sentiment_cluster`` separately for every sector.

    Parameters
    ----------
    df_ : DataFrame
        News-level sentiment data with a 'gsector' column, plus whatever
        ``sentiment_cluster`` needs.
    stock_counts_sector : Series
        Stock counts with a two-level index whose first level is the sector and
        whose second level is the period.
    analysis_on, freq, impact
        Forwarded to ``sentiment_cluster``. The defaults reproduce the previously
        hard-coded behaviour ('roberta' columns, daily grouping, impact of 3).

    Returns
    -------
    dict
        Maps each sector value found in ``df_['gsector']`` to its clustered
        sentiment DataFrame.
    """
    sector_level = stock_counts_sector.index.get_level_values(0)
    sec_dict = {}
    for sector, group in df_.groupby('gsector'):
        # Keep this sector's counts and drop the sector level so the period is the index.
        # A boolean mask is used so that a sector missing from the counts gives an
        # empty Series rather than a KeyError.
        tc = stock_counts_sector[sector_level == sector].reset_index(level=0, drop=True)
        sec_dict[sector] = sentiment_cluster(group, tc, analysis_on=analysis_on, freq=freq, impact=impact)
    return sec_dict

In [None]:
# Per-sector clustered sentiment: a dict keyed by the values of df['gsector'].
see = sector_cluster(df_=df, stock_counts_sector=ticker_counts_sector)

In [18]:
# Select the 'roberta_*' columns explicitly: the function's default analysis_on='finbert'
# would read 'finbert_*' columns, which does not match this variable's name or the
# 'roberta' setting used by sector_cluster.
# NOTE(review): presumably df only carries RoBERTa scores here -- confirm.
roberta_cluster = sentiment_cluster(df, ticker_counts, analysis_on='roberta')