In [1]:
import numpy as np
from scipy.stats import pearsonr
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import os
from tqdm import tqdm
import re
from nltk.corpus import stopwords

In [92]:
stop_words = set(stopwords.words('english'))
punctuations = set(['.', ',', '!', '?', ';', ':', '"', "'", '(', ')', '[', ']', '{', '}']) 

In [93]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

In [94]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
same=pd.read_csv('word importance/2000-2003.csv') 
# this table contains the stock volatility and company index in these years
data_index1=same['2000'].tolist()
data_index2=same['2001'].tolist()
data_index3=same['2002'].tolist()
data_index4=same['2003'].tolist()
trainY=same['2000-12'].tolist()+same['2001-12'].tolist()+same['2002-12'].tolist()+same['2003-12'].tolist()

In [3]:
trainX=[]
# get the tokenized text
folder_path1='token/2000'
folder_path2='token/2001'
folder_path3='token/2002'
folder_path4='token/2003'

In [4]:
for file_name in data_index1:
    file_path = os.path.join(folder_path1, file_name+'.mda')
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            trainX.append(content)

In [98]:
for file_name in data_index2:
    file_path = os.path.join(folder_path2, file_name+'.mda')
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            trainX.append(content)

In [99]:
for file_name in data_index3:
    file_path = os.path.join(folder_path3, file_name+'.mda')
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            trainX.append(content)

In [100]:
for file_name in data_index4:
    file_path = os.path.join(folder_path4, file_name+'.mda')
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            trainX.append(content)

In [101]:
len(trainX),len(trainY)

(4492, 4492)

In [102]:
model_name = 'distilbert-base-uncased'  
batch_size = 16                   
max_length = 128                 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [103]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_attentions=True).to(device)
model.eval()

x = trainX
y = trainY

word_correlations = defaultdict(lambda: {'scores': [], 'ys': []})

In [None]:
for batch_idx in tqdm(range(0, len(x), batch_size), desc="Processing"):
    batch_texts = x[batch_idx:batch_idx+batch_size]
    batch_labels = y[batch_idx:batch_idx+batch_size]
    
    preprocessed_texts = [preprocess_text(text) for text in batch_texts]
    
    inputs = tokenizer(
        preprocessed_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
        return_offsets_mapping=True
    )
    # offset_mapping is needed to map tokens to words
    offset_mapping = inputs.pop('offset_mapping').cpu().numpy() 
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        attentions = [a.cpu() for a in outputs.attentions]
    del outputs
    
    # get the last layer attention and average over heads
    last_layer_att = attentions[-1].mean(1).mean(1).numpy() 
    
    # process each sample in the batch
    for i in range(len(batch_texts)):
        text = preprocessed_texts[i]
        original_text = batch_texts[i]  
       
        words = text.split()
      
        word_offsets = []
        current_pos = 0
        for word in words:
            if word in stop_words:
                current_pos += len(word) + 1  
                continue
            start = current_pos
            end = current_pos + len(word)
            word_offsets.append( (start, end) )
            current_pos = end + 1

        valid_words = [word for word in words if word not in stop_words]
    
        sample_offsets = offset_mapping[i]
        sample_scores = last_layer_att[i]
        word_scores = [[] for _ in valid_words]
        
        # map tokens to words
        for token_idx, (start, end) in enumerate(sample_offsets):
            token_text = tokenizer.decode(inputs['input_ids'][i][token_idx])
            if token_text in punctuations:
                continue
            if start == end == 0:
                continue
            # find the word that the token belongs to
            for word_idx, (word_start, word_end) in enumerate(word_offsets):
                if start >= word_start and end <= word_end:
                    word_scores[word_idx].append(sample_scores[token_idx])
                    break
        # calculate the average score for each word
        for word_idx, scores in enumerate(word_scores):
            if not scores:
                continue
            word = valid_words[word_idx]
            avg_score = np.mean(scores)
            word_correlations[word]['scores'].append(avg_score)
            word_correlations[word]['ys'].append(batch_labels[i])

Processing: 100%|██████████| 281/281 [03:21<00:00,  1.39it/s]


In [None]:
correlation_results = []
# calculate the correlation between the word importance and the stock volatility
for word, data in word_correlations.items():
    if len(data['scores']) < 2:
        continue
    corr, p_value = pearsonr(data['scores'], data['ys'])
    if p_value > 0.05: 
        continue
    correlation_results.append( (word, corr) )

  corr, p_value = pearsonr(data['scores'], data['ys'])


In [None]:
correlation_results = [(word, corr if not np.isnan(corr) else 0) for word, corr in correlation_results]
correlation_results.sort(key=lambda x: abs(x[1]), reverse=True)

In [107]:
print("\nTop influential words:")
for word, corr in correlation_results[:10]:
    print(f"{word:<15} | Correlation: {corr:.3f}")


Top influential words:
appraisal       | Correlation: -1.000
court           | Correlation: 1.000
emphasis        | Correlation: -1.000
hubandspoke     | Correlation: -1.000
stringent       | Correlation: -1.000
longform        | Correlation: -1.000
broadly         | Correlation: 1.000
reselling       | Correlation: 1.000
achieving       | Correlation: -1.000
month           | Correlation: -1.000


In [None]:
df = pd.DataFrame(correlation_results, columns=["Word", "Correlation"])
df.to_csv('word importance/2000-2003result-12.csv',index=False)