In [None]:
'''
imports
'''
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from collections import Counter
import matplotlib.pyplot as plt

## 1: Preprocessing

### 1.1 Prepare Dataframe

In [None]:
# Read the combined headlines / DJIA label CSV into the working dataframe.
df = pd.read_csv(filepath_or_buffer="./data/Combined_News_DJIA.csv")

In [None]:
# Drop rows containing null values.
# The original cell computed `df[df.isnull().any(axis=1)]` and discarded it (only
# the last expression of a cell is displayed), so report the count explicitly.
rows_with_nulls = df[df.isnull().any(axis=1)]
print(f"dropping {len(rows_with_nulls)} rows containing null values")
#can drop rows before melting, removing 9/15/2009, 12/24/2009,4/21/2011.
df = df.dropna()

In [None]:
# Parse the dates, order the dataframe chronologically and report how many days
# fall in each class (the training dataframe is balanced later on).
df["Date"] = pd.to_datetime(df["Date"])
# Probably sorted already; sort anyway to be safe.
df = df.sort_values(by="Date")
n_fell = int((df["Label"] == 0).sum())
n_rose = int((df["Label"] == 1).sum())
print(f"number of days price fell: {n_fell}, number of days price rose {n_rose}")
df.head(5)

### 1.2 Clean and Tokenize

In [None]:
'''
load spacy, clean posts and split into tokens.
'''

# Load the spaCy "en_core_web_sm" pipeline once; the tokenize function reuses this object.
nlp = spacy.load("en_core_web_sm")

def clean_post(post):
    '''
    post:str
    strips remnants of a b"..." / b'...' wrapper (the data appears to have been
    scraped as bytes and then stringified), trims whitespace and lowercases.
    returns the cleaned headline as str; non-string input yields "".
    '''
    # Non-string cells (e.g. NaN) cannot be cleaned. The original printed the value
    # and implicitly returned None, which only failed later inside the tokenizer;
    # keep the print for visibility but hand back an empty headline.
    if not isinstance(post, str):
        print(post)
        return ""

    # Optional leading quotes, then a bytes prefix `b` that is consumed ONLY when a
    # quote follows it, the captured text, and one optional trailing quote.
    # The original `b?` also swallowed the first letter of headlines that simply
    # start with a lowercase "b" (e.g. "bin laden ..." -> "in laden ...").
    pattern = r"^['\"]*(?:b['\"])?(.*?)['\"]?$"
    # Every element of the pattern is optional, so it always matches a str.
    match = re.search(pattern, post, re.DOTALL)
    #just convert to lowercase here, going to use spacy to tokenize
    return match.group(1).strip().lower()

def tokenize(post):
    '''
    post:str
    tokenizes given post. (spacy)
    returns a list of token texts with whitespace, punctuation and stop-word
    tokens removed.
    '''
    # is_space / is_punct / is_stop are lexical flags available right after
    # tokenization, so nlp.make_doc (tokenizer only) is enough; nlp(post) would
    # additionally run the remaining pipeline components and discard their output.
    doc = nlp.make_doc(post)
    # NOTE(review): the original comment suggested stop words may help with
    # context, yet they are filtered out here; that behavior is kept unchanged.
    return [token.text for token in doc if not (token.is_space or token.is_punct or token.is_stop)]

# Clean each of the 25 headline columns, then replace the text by its token list.
headline_columns = [f"Top{i}" for i in range(1,26)]
for column in headline_columns:
    df[column] = df[column].apply(clean_post).apply(tokenize)

### 1.3: Create Vocabulary

In [None]:
import random 
# Seed Python's `random` module so the random.choice calls in
# balance_training_dataframe select the same rows on every run.
random.seed(19)

def balance_training_dataframe(train_df):
    '''
    train_df: pandas dataframe with a "Label" column holding 0 or 1
    balances data distribution by randomly dropping rows of the larger class
    until both labels have the same number of rows.
    returns a balanced copy; train_df itself is left untouched.
    '''
    copy = train_df.copy()

    n_zeros = int((copy["Label"] == 0).sum())
    n_ones = int((copy["Label"] == 1).sum())

    # Downsample whichever class is larger. The original only ever dropped
    # label-1 rows, so it never terminated when label 0 was the majority.
    majority_label = 1 if n_ones > n_zeros else 0
    excess = abs(n_ones - n_zeros)
    candidates = copy[copy["Label"] == majority_label].index.tolist()

    # Keep the choice-then-remove draw pattern so that, for a given seed, the
    # dropped rows are the same ones the original loop selected when label 1
    # was the majority.
    to_drop = []
    for _ in range(excess):
        picked = random.choice(candidates)
        candidates.remove(picked)
        to_drop.append(picked)

    # One drop call instead of one dataframe copy per removed row.
    # NOTE(review): assumes index labels are unique - TODO confirm for other inputs.
    copy = copy.drop(to_drop)
    print((copy["Label"] == 0).sum(), (copy["Label"] == 1).sum())
    return copy

In [None]:
# Set up test and train dataframes (chronological 80/20 split) and build the
# vocabulary from the tokens of the balanced training dataframe only.

train_size = int(len(df) * 0.8)
train_df = df.iloc[0:train_size]
# Explicit copy: the headline columns of test_df are overwritten below, and
# assigning into a slice of df triggers pandas' copy-vs-view warning.
test_df = df.iloc[train_size:].copy()

train_df = balance_training_dataframe(train_df)

all_tokens = []
unique_tokens = set()

for _,row in train_df.iterrows():
    for i in range(1,26):
        col = row[f"Top{i}"]
        all_tokens.extend(col)
        unique_tokens.update(col)

vocab = sorted(unique_tokens)

#remove any tokens from test not in train
# Membership is tested against the set; `token in vocab` on the sorted list is a
# linear scan of the whole vocabulary for every single test token.
for i in range(1,26):
    test_df[f"Top{i}"] = test_df[f"Top{i}"].apply(lambda post: [token for token in post if token in unique_tokens])

#padding char is going to be zero, add 1 to idx
encoder = {token:idx+1 for idx,token in enumerate(vocab)}
encode = lambda post: [encoder[token] for token in post]

In [None]:
# Sanity check that no entries were completely removed: True when every test
# row still has at least one token in each of its 25 headline columns.
cols = [f"Top{i}" for i in range(1,26)]
rows_fully_populated = test_df[cols].map(lambda x : len(x) > 0 ).all(axis=1)
rows_fully_populated.sum() == len(test_df)

In [None]:
# Longest encoded post across the training dataframe; handle_padding pads to this length.
max_len_of_post = float("-inf")
for column in [f"Top{i}" for i in range(1,26)]:
    for post in train_df[column]:
        max_len_of_post = max(len(encode(post)), max_len_of_post)
max_len_of_post

## 2: Visualization Of Data

In [None]:
# Occurrence count of every training token.
token_count = Counter(all_tokens)
def plot_freq_k(k=20,stop=False):
    '''
    k: int, how many of the most frequent tokens to show
    stop: accepted but not used by the function body
    draws a bar chart of the k most common tokens and their counts.
    '''
    labels,heights = zip(*token_count.most_common(k))
    plt.figure(figsize=(20,10))
    plt.bar(labels,heights)
    plt.title(f"Top {k} Most Common Tokens")
    plt.show()

plot_freq_k()

In [None]:
def plot_token_distribution():
    '''
    plots a histogram of token occurrence counts: the x-axis is how often a
    token occurs (values of token_count), the y-axis is how many distinct
    tokens fall into each bin.
    '''
    plt.figure(figsize=(12,8))
    plt.hist(list(token_count.values()), bins=100)
    # limit the visible x-range (original note: the plot was way too big otherwise)
    plt.xlim(0,500)
    # The histogram bins the occurrence counts, so frequency belongs on the
    # x-axis and the number of tokens on the y-axis; the labels were swapped.
    plt.xlabel("Token Frequency")
    plt.ylabel("Number of Tokens")

    plt.title("Token Frequency Distribution")
    plt.show()
plot_token_distribution()

## 3: Model

### 3.1: Hyperparameters

In [None]:
# ---------------------- hyper parameters ----------------------
embed_size = 128                # width of the token / position embeddings
batch_size = 64                 # dataframe rows per batch
block_size = max_len_of_post    # number of position-embedding slots
learning_rate = 1e-5
n_heads = 8                     # attention heads per block
n_layers = 4                    # number of stacked blocks
vocab_size = len(vocab)         # does not count the padding id 0
epochs = 170
# --------------------------------------------------------------

### 3.2: Tensor Creation & Batch Handling

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def handle_padding(post):
    '''
    post: list of tokens (each token must be a key of the encoder)
    encodes the tokens and returns a list of exactly max_len_of_post ids:
    shorter posts are right-padded with the padding id 0, longer posts are
    truncated.
    '''
    post = encode(post)

    # Posts longer than max_len_of_post used to be returned unchanged. That
    # produces ragged batches torch.tensor cannot build, and positions beyond
    # the position-embedding table. Cut them to the maximum length instead.
    post = post[:max_len_of_post]

    #add padding char (0)
    return post + [0] * (max_len_of_post - len(post))

def get_batch(batch,df=train_df):
    '''
    batch: int, positional index of the first row of the batch
    df: dataframe to read from (defaults to the training dataframe)
    reads up to batch_size rows starting at `batch` and returns
    (padding_mask, x, y): x holds the padded token ids with dims B,P=25,T,
    y holds the labels with dims B,1 and padding_mask is 1 wherever x is not
    the padding id 0.
    '''
    post_columns = [f"Top{i}" for i in range(1,26)]
    rows = df.iloc[batch:batch+batch_size]

    #encode and pad (with 0) every post of every row
    encoded_rows = []
    targets = []
    for _,row in rows.iterrows():
        encoded_rows.append([handle_padding(post) for post in row[post_columns].to_list()])
        targets.append(row["Label"])

    x = torch.tensor(encoded_rows).to(device)
    y = torch.tensor(targets).to(device).unsqueeze(1)
    #padding mask
    padding_mask = (x != 0).byte()
    return padding_mask,x,y

### 3.3: Model Implementation

In [None]:
'''
Credit to Andrej Karpathy's lecture, which essentially walked me through the code and concepts.
https://www.youtube.com/watch?v=kCc8FmEb1nY&t=3423s 
https://github.com/karpathy/ng-video-lecture
'''


'''
maybe add dropouts
'''
#for reproduction
# Seeds torch's default random number generator with a fixed value.
torch.manual_seed(19)

class WeightedSumTokenPooling(nn.Module):
    '''
    collapses the token dimension with learned weights: B,P,T,C => B,P,C.
    w: fully connected layer scoring each token (embed_size => 1)
    '''
    def __init__(self):
        super().__init__()
        self.w = nn.Linear(embed_size,1) #kind of shallow
    def forward(self,x,padding_mask):
        scores = self.w(x) # B,P,T,1
        is_pad = padding_mask.unsqueeze(3) == 0 # B,P,T,1
        scores = scores.masked_fill(is_pad, float("-inf")) # padding gets zero weight
        weights = F.softmax(scores,dim=2) # normalise over T
        return (x * weights).sum(dim=2) # weights broadcast over C, scaling each token


class WeightedSumPostPooling(nn.Module):
    '''
    collapses the post dimension with learned weights: B,P,C => B,C.
    w: fully connected layer scoring each post (embed_size => 1)
    '''
    def __init__(self):
        super().__init__()
        self.w = nn.Linear(embed_size,1)
    def forward(self,x):
        scores = self.w(x) # B,P,1
        weights = F.softmax(scores,dim=1) # normalise over P
        return (x * weights).sum(dim=1) # weights broadcast over C, scaling each post



class FeedForward(nn.Module):
    '''
    seq : fully connected layers applied to the last (embedding) dimension,
          Linear(embed_size => embed_size), ReLU, Linear(embed_size => embed_size)
    '''
    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(embed_size,embed_size),
            nn.ReLU(),
            nn.Linear(embed_size,embed_size),
        ]
        self.seq = nn.Sequential(*layers)
    #x is B,P,T,C
    def forward(self,x):
        return self.seq(x)
    
class Head(nn.Module):
    '''
    one self-attention head.
    query: linear (embed => head), what each token is looking for
    key: linear(embed => head), what each token has
    value: linear(embed => head), the actual contextualized value of each token
    forward(x, padding_mask): x is B,P,T,C and padding_mask is B,P,T (0 marks
    padding); returns B,P,T,head_size.
    '''
    def __init__(self,head_size):
        super().__init__()
        self.query = nn.Linear(embed_size,head_size,bias=False)
        self.key = nn.Linear(embed_size,head_size,bias=False)
        self.value = nn.Linear(embed_size,head_size,bias=False)
    #x is B,P,T,C
    def forward(self,x,padding_mask):
        T = x.shape[-2]

        #linear pass
        #B,P,T,C => B,P,T,head_size
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        #attention scores, scaled by 1/sqrt(head_size)
        #B,P,T,head_size @ B,P,head_size,T => B,P,T,T
        attn = (q @ k.transpose(-2,-1)) * (k.shape[-1] ** -0.5)

        #causal mask: token i may only attend to tokens j <= i.
        # Built as an explicit boolean lower-triangular mask. The previous
        # tril(attn) followed by masked_fill(attn == 0) also masked every score
        # that was genuinely 0.0, which could turn a whole row into -inf (NaN
        # after softmax).
        # The mask is created here rather than stored on the module, so the
        # module's attributes stay exactly query/key/value.
        causal = torch.ones(T,T,dtype=torch.bool,device=x.device).tril()
        attn = attn.masked_fill(~causal, float("-inf"))

        # padding mask: attn is B,P,T,T and padding_mask is B,P,T. Unsqueezing at
        # dim 2 gives B,P,1,T, which broadcasts over the query rows, so every
        # score whose key column is a padding token is masked out. The rows of
        # padding queries themselves are left in place.
        col_mask = padding_mask.unsqueeze(2) #should broadcast
        attn = attn.masked_fill(col_mask == 0, float("-inf"))

        #prob
        attn = F.softmax(attn,dim=-1)

        #feed information into value vectors
        out = attn @ v
        return out

class MultiHeadAttention(nn.Module):
    '''
    runs n_heads attention heads side by side and merges their outputs.
    head_size: output width of each head (replaces C inside a head)
    heads: module list, so every head has its own parameters
    lin: linear layer mapping the concatenated head outputs
         (head_size * n_heads) back to embed_size
    '''
    def __init__(self,head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.lin = nn.Linear(head_size * n_heads,embed_size)
    #x is B,P,T,C
    def forward(self,x,padding_mask):
        #every head sees the same input; outputs are joined on the last dimension
        per_head = [head(x,padding_mask) for head in self.heads]
        return self.lin(torch.cat(per_head,dim=-1))

class Block(nn.Module):
    '''
    one layer: self attention then feed forward, each wrapped in
    residual-add followed by layer norm.
    sa_heads: self attention heads
    feed_forward: fully connected layer
    ln1,ln2: layer norms
    '''
    def __init__(self):
        super().__init__()
        self.sa_heads = MultiHeadAttention(embed_size // n_heads)
        self.feed_forward = FeedForward()
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)
    def forward(self,x,padding_mask):
        #residuals: the input is added back so the old information is kept
        attended = self.sa_heads(x,padding_mask)
        x = self.ln1(x + attended)
        return self.ln2(x + self.feed_forward(x))

class SentimentModel(nn.Module):
    '''
    token_embedding: embed for each token in vocab (plus the padding id 0)
    position_embedding: embed for index of token in range [0-max_len_of_post]
    ln1,ln2: layer norm
    t_pool: weighted sum pooling across T
    p_pool: weighted sum pooling across P
    blocks: a simplified block. the amount of actual mha passes were going to have (n_layers)
    classifier: embed size -> 1 for classification

    forward(padding_mask, x, labels=None):
        padding_mask: B,P,T, 0 where x is padding
        x: B,P,T token ids
        labels: optional B,1 labels
        returns (logits, loss); logits is B,1 and loss is None when no labels
        are given, otherwise the binary cross entropy with logits.
    '''

    def __init__(self):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size+1,embed_size) # add one for padding
        self.position_embedding = nn.Embedding(block_size,embed_size)

        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

        self.t_pool = WeightedSumTokenPooling()

        # important! remove this if loading WeightedSumT_MeanP
        self.p_pool = WeightedSumPostPooling()

        self.blocks = nn.Sequential(*[Block() for _ in range(n_layers)])

        self.classifier = nn.Linear(embed_size,1)

    def forward(self,padding_mask,x,labels=None):
        T = x.shape[2]

        #embeddings
        # Positions are created on the input's own device instead of the global
        # `device`, so the model does not depend on a module-level variable.
        positions = torch.arange(T,device=x.device)
        x = self.token_embedding(x)
        x = x + self.position_embedding(positions)

        #layers (iterated manually because every block also needs the padding mask)
        for block in self.blocks:
            x = block(x,padding_mask)

        # At this point x is B,P,T,C and has to be reduced to two dimensions.
        # Mean pooling over the tokens did not weigh token importance ("the"
        # counted as much as "bearish"), so learned weighted-sum pooling is used
        # to let the model emphasise the tokens / posts that matter in context.

        #pool tokens
        x = self.t_pool(x,padding_mask) #B,P,T,C => B,P,C

        x = self.ln1(x) #maybe

        # important! must change to mean if loading WeightedSumT_MeanP:
        #     x = x.mean(dim=1)
        #pool posts
        x = self.p_pool(x) #B,P,C => B,C

        x = self.ln2(x) #maybe

        #logits
        logits = self.classifier(x) # B,C => B,1

        # identity check: `labels == None` relies on tensor/None comparison semantics
        if labels is None:
            loss = None
        else:
            #bce instead of ce since there are only two classes
            loss = F.binary_cross_entropy_with_logits(logits,labels.float())
        return logits,loss

In [None]:
# Build the model on the selected device, report its size and create the optimizer.
m = SentimentModel().to(device)
#print total params
n_params = sum(p.numel() for p in m.parameters())
print(f"number of params: {n_params}")
optimizer = torch.optim.AdamW(m.parameters(),lr = learning_rate)

### 3.4: Training

#### 3.4.1: Helper Functions For Visualizing/Understanding Model's Progression/Predictive Power

In [None]:
# Directory path for visualizations. NOTE(review): not referenced by any code in the visible cells.
visfp = "./visualization/"

def visualize_metrics_trend(accuracy,precision,recall,f1,losses):
    '''
    accuracy, precision, recall, f1, losses: one value per epoch each
    draws all five series as lines over the epoch index in a single figure.
    '''
    series = (
        (accuracy, "Accuracy"),
        (precision, "Precision"),
        (recall, "Recall"),
        (f1, "F1"),
        (losses, "Loss"),
    )
    for values,name in series:
        plt.plot(values, label=name)
    plt.xlabel("Epoch")
    plt.ylabel("Score")
    plt.title("Train Metrics over epochs")
    plt.legend()
    plt.show()


def handle_metrics(labels,preds):
    '''
    labels: correct labels
    preds: predictions
    calculates accuracy, precision, recall and f1, prints them and returns
    them as a tuple in that order. precision/recall/f1 use zero_division=0.
    '''
    accuracy = accuracy_score(labels,preds)
    precision,recall,f1 = (
        score(labels,preds,zero_division=0)
        for score in (precision_score,recall_score,f1_score)
    )
    print(f"accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1: {f1}")
    return accuracy,precision,recall,f1

#### 3.4.2: Training Implementation

In [None]:
fp = "./tensors/"
def train(checkpoint_name="WeightedSumT_MeanP.pth",loss_threshold=0.3):
    '''
    trains the global model `m` with `optimizer` on train_df for at most
    `epochs` epochs, in full batches of batch_size rows (rows after the last
    full batch are not used).
    checkpoint_name: file name inside `fp` the checkpoint is written to; pass
                     the name matching the pooling variant being trained.
    loss_threshold: training stops early, and the checkpoint is saved, the
                    first time the average epoch loss falls below this value.
    the per-epoch metrics are plotted when training ends, whether or not the
    threshold was reached. returns None.
    '''
    import os  # local import: only needed to create the checkpoint directory

    n_batches = len(train_df) // batch_size
    if n_batches == 0:
        raise ValueError("train_df has fewer rows than batch_size; nothing to train on")

    m.train()
    losses = []
    accuracy = []
    precision = []
    recall =[]
    f1 = []
    for epoch in range(epochs):
        avg_loss = 0
        preds = []
        labels = []
        #each row is a batch
        for i in range(n_batches):
            #forward pass
            batch = i * batch_size
            padding_mask,x,y = get_batch(batch,df=train_df)
            logits,loss = m(padding_mask,x,y)
            #manually calc predictions
            p = F.sigmoid(logits)
            p = (p>=0.5).int()
            p = p.squeeze(1).cpu().tolist()
            y = y.squeeze(1).cpu().tolist()
            #save predictions and labels
            preds.extend(p)
            labels.extend(y)
            #step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #calc avg loss
            avg_loss += loss.detach()
        avg_loss = avg_loss / n_batches
        print(f"epoch: {epoch}, avg_loss: {avg_loss}")
        #met = accuracy,precision,recall,f1
        met = handle_metrics(labels,preds)
        losses.append(avg_loss.cpu())
        #prepare arrays for visualization
        accuracy.append(met[0])
        precision.append(met[1])
        recall.append(met[2])
        f1.append(met[3])
        if avg_loss < loss_threshold:
            # Make sure the target directory exists so the save cannot fail
            # after a long training run.
            os.makedirs(fp, exist_ok=True)
            # NOTE: the whole model object (not m.state_dict()) is stored under
            # "model_state_dict"; the testing cell reads it back as a model.
            torch.save({
                "epoch": epoch,
                "model_state_dict":m,
                "optimizer_state_dict":optimizer.state_dict(),
            },fp + checkpoint_name)
            break
    # Previously the plot was only produced when the threshold was reached.
    visualize_metrics_trend(accuracy,precision,recall,f1,losses)
          

In [None]:
#train()
###
# training will take a really long time without gpu. I saved two model states:
#    -   one with weighted sum pooling on both T and P. 'WeightedSumTP.pth'
#    -   weighted sum pooling on T and mean pooling on P. 'WeightedSumT_MeanP.pth'
###


### 3.5: Testing

#### 3.5.1 Visualize Model's Result Metrics

In [None]:
def visualize_metric_score(met):
    '''
    met: metric scores in the order accuracy, precision, recall, f1
    draws one bar per metric.
    '''
    names = ["Accuracy","Precision","Recall","F1"]
    plt.bar(names, met)
    plt.title("Test Metrics")
    plt.show()

#### 3.5.2: Testing Implementation

In [None]:
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects; only load
# checkpoint files you created or trust.
checkpoint  = torch.load("./tensors/WeightedSumTP.pth",weights_only=False,map_location=device) #or WeightedSumT_MeanP.pth, make sure to change model. I left comments on what to change.
# train() stores the whole model object under this key, so it is used directly as the model.
m = checkpoint["model_state_dict"]
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
print(f"training epoch: {checkpoint['epoch']}")
@torch.no_grad()
def test():
    '''
    evaluates the global model `m` on test_df without tracking gradients.
    every test row is evaluated, including a final batch smaller than
    batch_size. prints the batch index, the mean per-row loss and the metrics,
    then plots the metrics. returns None.
    '''
    n_rows = len(test_df)
    if n_rows == 0:
        raise ValueError("test_df is empty; nothing to evaluate")

    m.eval()
    preds = []
    labels = []
    total_loss = 0
    # Step over the row offsets so the trailing partial batch is included;
    # `len(test_df) // batch_size` silently dropped those rows.
    for i,batch in enumerate(range(0,n_rows,batch_size)):
        padding_mask,x,y = get_batch(batch,df=test_df)
        logits,loss = m(padding_mask,x,y)
        #manually calculate preds
        p = F.sigmoid(logits)
        p = (p>=0.5).int()
        #save preds and labels
        p = p.squeeze(1).cpu().tolist()
        y = y.squeeze(1).cpu().tolist()
        preds.extend(p)
        labels.extend(y)
        # loss is the batch mean; weight it by the batch's row count so the
        # final value is the mean over all rows even with a short last batch.
        total_loss += loss.detach() * len(y)
        print(f"batch : {i}")
    # The original divided by the number of *training* batches.
    avg_loss = total_loss / n_rows
    print(f"avg_loss: {avg_loss}")
    #visualize metrics
    met = handle_metrics(labels,preds)
    visualize_metric_score(met)

In [None]:
# Run the evaluation on the held-out test dataframe with the loaded checkpoint.
test()

## 4: Deallocate Memory

In [None]:
# Ask PyTorch to release the cached GPU memory it is currently not using.
torch.cuda.empty_cache()