In [1]:
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
import string

In [None]:
# This code is used for the comparison between different models.
# using data from 1996 and 1997

In [132]:
# download the stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zzz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
trainX=[]
testX=[]

In [2]:
# this table contains the stock log volatility of 1996 and 1997 and related company index
same=pd.read_csv('logvol/9697same.csv')

In [5]:
data96_index=same['1'].tolist() # the company index of 1996
data97_index=same['3'].tolist() # the company index of 1997

In [3]:
folder_path96='origin/1996'
folder_path97='origin/1997'

In [8]:
# read the data from 1996
for file_name in data96_index:
    file_path = os.path.join(folder_path96, file_name+'.mda')
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            trainX.append(content)

In [None]:
# read the data from 1997
for file_name in data97_index:
    file_path = os.path.join(folder_path97, file_name+'.mda')
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            testX.append(content)

In [139]:
len(trainX),len(testX)

(997, 997)

In [None]:
train_X=same['1996-12'].tolist() # the stock log volatility of preceding year of 1996
test_X=same['1997-12'].tolist()
trainY=same['1996+12'].tolist()
testY=same['1997+12'].tolist()

In [141]:
len(trainY),len(testY)

(997, 997)

In [142]:
def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(words)

In [143]:
trainX = [preprocess_text(text) for text in trainX]
testX = [preprocess_text(text) for text in testX]

In [144]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [145]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW

In [None]:
from transformers import AutoModel, AutoTokenizer

model_name = "distilbert-base-uncased" 
# distilbert-base-uncased contains amlost all functions of bert-base-uncased but with fewer parameters
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)

In [147]:
class EarlyStopping:
    def __init__(self, patience=10):
        self.patience = patience
        
        self.best_score = None
        self.counter = 0
        
    def __call__(self, val_loss):
        if self.best_score is None:
            self.best_score = val_loss
        elif val_loss > self.best_score:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        else:
            self.best_score = val_loss
            self.counter = 0
        return False                        

In [148]:
class TransformerRegressor(nn.Module):
    def __init__(self, base_model):  
        super().__init__()
        self.base = base_model
        self.regressor = nn.Linear(base_model.config.hidden_size+1, 1)
        self.to(device)

    def forward(self, input_ids, attention_mask, features):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        features = features.to(device).unsqueeze(1)
        outputs = self.base(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        combined_output = torch.cat((pooled_output, features), dim=1)
        return self.regressor(combined_output)


In [149]:
class TextDataset(Dataset):
    def __init__(self, texts, features, labels, tokenizer, max_length):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.features = features
        self.labels = labels

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]).to(device),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]).to(device),
            "features": torch.tensor(self.features[idx], dtype=torch.float).to(device),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float).to(device)
        }
   
    def __len__(self):
        return len(self.labels) 

In [150]:
model = TransformerRegressor(base_model).to(device)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), 
                lr=5e-5)
criterion = nn.MSELoss()



In [151]:
dataset = TextDataset(trainX, train_X, trainY, tokenizer, max_length=512)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [152]:
early_stopping = EarlyStopping(patience=10)

In [None]:
model.train()

for epoch in range(20):
    
    for batch in dataloader: 
        optimizer.zero_grad()
        outputs = model(batch["input_ids"], batch["attention_mask"], batch["features"])
        loss = criterion(outputs.squeeze(), batch["labels"])
        
        loss.backward()
        optimizer.step()  
    
    print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 0, Loss: 0.07373645901679993
Epoch 1, Loss: 0.3753010928630829
Epoch 2, Loss: 0.19525334239006042
Epoch 3, Loss: 0.45261621475219727
Epoch 4, Loss: 0.21517781913280487
Epoch 5, Loss: 0.14003035426139832
Epoch 6, Loss: 0.026722243055701256
Epoch 7, Loss: 0.03200840577483177
Epoch 8, Loss: 0.012472349219024181
Epoch 9, Loss: 0.012616624124348164
Epoch 10, Loss: 0.02600136399269104
Epoch 11, Loss: 0.004939985927194357
Epoch 12, Loss: 0.03155200555920601
Epoch 13, Loss: 0.0035327679943293333
Epoch 14, Loss: 0.01031049806624651
Epoch 15, Loss: 0.005409530829638243
Epoch 16, Loss: 0.008338897489011288
Epoch 17, Loss: 0.006625916343182325
Epoch 18, Loss: 0.00439546350389719
Epoch 19, Loss: 0.009470946155488491


In [154]:
#---------------------------------------------------------------------------

In [155]:
def predict(texts, features, batch_size=16):
    model.eval()
    predictions = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_features = features[i:i+batch_size]
        
        inputs = tokenizer(
            batch_texts, 
            return_tensors="pt", 
            truncation=True, 
            max_length=512, 
            padding=True  
        )
        inputs.pop("token_type_ids", None)  
        inputs = {k: v.to(device) for k, v in inputs.items()}
        batch_features = torch.tensor(batch_features, dtype=torch.float).to(device)
        with torch.no_grad():
            outputs = model(**inputs
                            ,features=batch_features
                            )
        predictions.extend(outputs.squeeze().cpu().tolist())  
    return predictions  

In [156]:
predictions = predict(testX,test_X)  

In [157]:
torch.cuda.empty_cache()

In [158]:
from sklearn.metrics import mean_squared_error, r2_score

In [159]:
mse=mean_squared_error(testY, predictions)
print('Mean Squared Error:', mse)

r2 = r2_score(testY, predictions)
print(f'R² Score: {r2}')

Mean Squared Error: 0.16201278283325635
R² Score: 0.5677439045532094
