In [None]:
import collections.abc
import json
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import math
import tqdm as progressbar
import time
import enum
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
DETAILS_JSON = "/kaggle/input/subject-generation/email_thread_details.json"
# SUMMARIES_JSON = "data/email_thread_summaries.json"

kThreadId = "thread_id"
kSubject = "subject"
kTimestamp = "timestamp"
kFrom = "from"
kTo = "to"
kBody = "body"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Utils

In [None]:


class Utils():
    @staticmethod
    def load_dataset(DETAILS_FILE):
        '''
            This function loads the dataset from the file
            ARGS:
                filename: the name of the file
            RETURN:
                dataset: the dataset
        '''
        with open(DETAILS_FILE, 'r') as f:
            details = json.load(f)
        
        
        dataset = {}
        for i in range(len(details)):
            item = details[i]
            thread_id = item[kThreadId]
            body= Utils.tokenize(item)
            dataset[thread_id] = dataset.get(thread_id, []) + [body[kBody]]
            
        
        
        curr_thread = 0
        for i in range(len(details)):
            item = details[i]
            thread_id = item[kThreadId]
            if thread_id != curr_thread:
                curr_thread = thread_id
                subject = Utils.tokenize_subject(item)
                dataset[thread_id] = (dataset.get(thread_id), subject)


        return dataset
    
    @staticmethod
    def tokenize(item):
        item[kBody] = word_tokenize(item[kBody])
        item[kBody] = [re.sub(r'[^\w\s.]', '', word) for word in item[kBody]]
        item[kBody] = [word.strip() for word in item[kBody] if word.strip() and word.strip() not in ['--', '=']]
        # Lowercase the email body
        item[kBody] = [word.lower() for word in item[kBody]]
        item[kBody] = ["<BOS>"] + item[kBody] + ["<EOS>"]
        item[kBody] = " ".join(item[kBody])
        return item
        
    def tokenize_subject(item):
        # repeat with the subject which is the same for each thread id
        item[kSubject] = word_tokenize(item[kSubject])
        item[kSubject] = [re.sub(r'[^\w\s.]', '', word) for word in item[kSubject]]
        item[kSubject] = [word.strip() for word in item[kSubject] if word.strip() and word.strip() not in ['--', '=']]
        # Lowercase the email subject
        item[kSubject] = [word.lower() for word in item[kSubject]]
        item[kSubject] = ["<BOS>"] + item[kSubject] + ["<EOS>"]
        item[kSubject] = " ".join(item[kSubject])
        return item[kSubject]
    
    @staticmethod
    def build_vocab(data):
        '''
            This function builds the vocabulary from the data
            ARGS:
                data: the data to build the vocabulary from ([Email], EmailSummaries)
            RETURN:
                vocab: the vocabulary
        '''
        vocab = Vocab()
        for _, (email_list, subject) in data.items():
            for email in email_list:
#                 print(email_list)
                for word in email:
#                     print(word)
                    vocab.add(word)
                for word in subject.split():
                    vocab.add(word)
            
        
        return vocab

class Vocab(collections.abc.MutableSet):
    """
        Set-like data structure that can change words into numbers and back.
        From Prof. David Chiang Code
    """
    def __init__(self):
        words = {'<BOS>', '<EOS>', '<UNK>'}
        self.num_to_word = list(words)
        self.word_to_num = {word:num for num, word in enumerate(self.num_to_word)}
    def add(self, word):
        if word in self: return
        num = len(self.num_to_word)
        self.num_to_word.append(word)
        self.word_to_num[word] = num
    def discard(self, word):
        raise NotImplementedError()
    def update(self, words):
        self |= words
    def __contains__(self, word):
        return word in self.word_to_num
    def __len__(self):
        return len(self.num_to_word)
    def __iter__(self):
        return iter(self.num_to_word)

    def numberize(self, word):
        """Convert a word into a number."""
        if word in self.word_to_num:
            return self.word_to_num[word]
        else:
            return self.word_to_num['<UNK>']

    def denumberize(self, num):
        """Convert a number into a word."""
        return self.num_to_word[num]

## Pre Process

In [None]:
# Load the data
d = Utils.load_dataset(DETAILS_JSON)
for i, (key, value) in enumerate(d.items()):
    if key == 4:
        print(key,value)
    if key > 4:
        break
vocab = Utils.build_vocab(d)

# 4 (['<BOS> thanks for the update . pl <EOS>', '<BOS> that is ok . thanks for the update . pl <EOS>', '<BOS> i just went to look at the file and the data is yesterday s data . we need the current prices that are set each day at roughly 1210 for the gas day of the next day . please let me know if this does not make sense . thanks . pl <EOS>', '<BOS> today s file looks good . thanks for your help . pl <EOS>'], '<BOS> eol data <EOS>')

In [115]:
# Split the dictionary into train and test
data = list(d.items())
# print(data[3])
random.shuffle(data)
train, test = train_test_split(data, test_size=0.2, random_state=42)

train = [email_list for _, email_list in train]
# print(train[1])
test = [email_list for _, email_list in test]

(4, (['<BOS> thanks for the update . pl <EOS>', '<BOS> that is ok . thanks for the update . pl <EOS>', '<BOS> i just went to look at the file and the data is yesterday s data . we need the current prices that are set each day at roughly 1210 for the gas day of the next day . please let me know if this does not make sense . thanks . pl <EOS>', '<BOS> today s file looks good . thanks for your help . pl <EOS>'], '<BOS> eol data <EOS>'))
(['<BOS> paul you say 5000day plus fuel . the delivery point is in the south texas pool . where does the fuel come in and how is it determined also did we ever determine the name of the reliant energy entity that was doing the deal paul burgener enron 09252000 0445 pm to dan j hyvlhouect ect cc subject trunkline deals wreliant dan attached are the details for the enovate transaction s purchases of physical gas from reliant trunkline stx pool . i received your draft of the agreed master for the 30000 day parcel . could you draft a form for the 5000day parce

# Models

In [131]:
class SubjectGenerator(nn.Transformer):
    '''
    This class implements the SubjectGenerator

    
    '''

    def __init__(self, vocab_size, vocab, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6):
        '''
        This function initializes the model
        ARGS:
            vocab_size: the size of the vocabulary
            d_model: the dimension of the model
            nhead: the number of heads
            num_encoder_layers: the number of encoder layers
            num_decoder_layers: the number of decoder layers
        RETURN:
            None
        '''

        super(SubjectGenerator, self).__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model) # Embedding layer
        self.transformer = nn.Transformer(
            d_model=d_model, 
            nhead=nhead, 
            num_encoder_layers=num_encoder_layers, 
            num_decoder_layers=num_decoder_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.vocab = vocab

    def forward(self, src, target=None):
        '''
        This function performs the forward pass of the model
        ARGS:
            src: the source input
            target: the target input (optional, used during training)
        RETURN:
            output: the output of the model
        '''
        src = self.embedding(src)
        
        if target is not None:
            target = self.embedding(target)
            output = self.transformer(src, target)
        else:
            # In generation mode, don't use target
            output = self.transformer(src, src)  # Use src as both source and target TODO: 
        
        output = self.fc_out(output)

        return output
    
    def summarize(self, src, max_len=100, mode="greedy"):
        '''
        This function summarize the input text
            args:
                src: the source input
                max_len: the maximum length of the output
                mode: the mode of generation (greedy or beam search)
            return:
                output: the output of the model
        '''
        o = self.forward(src)
        output = None
        
        if mode == "greedy":
            output =  self.greedy_decoding(o, max_len)
        elif mode == "beam":
            output = self.beam_search(o, max_len)

        return output
        
    def greedy_decoding(self, o, max_len):
        '''
        This function performs greedy decoding
        ARGS:
            o: the output of the model
            max_len: the maximum length of the output
        RETURN:
            output: the output of the model
        '''
        output = []
        words = 0
        for i in o:
            if words >= max_len:
                break
            a = torch.argmax(i)
            a = self.vocab.denumberize(a)
            output.append(a)
            words += 1
        return output

    def beam_search(self, o, max_len):
        '''
        
        '''
        pass

## Training

In [132]:
ntokens = len(vocab) # size of vocabulary
emsize = 100 # embedding dimension
nhid = 100 # the dimension of the feedforward network model in nn.TransformerEncoder
n_encoder_layers = 6 # the number of encoder layers
n_decoder_layers = 6 # the number of decoder layers
nhead = 2 # the number of heads in the multiheadattention models
lr = 0.02 # learning rate

model = SubjectGenerator(ntokens, vocab, emsize, nhead, n_encoder_layers, n_decoder_layers)

# RECENT_MODEL = "models/model.pt-2023-12-04_00:29:43.pt"
# model.load_state_dict(torch.load(RECENT_MODEL))

In [133]:
def train_SubjectGenerator(model: SubjectGenerator, train_data, criterion, max_input_len = 150, max_output_len = 50, lr=0.001, threshold_norm=0.5):
    model.train()  # Turn on the train mode
    total_loss = 0.

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # Initialize Adam optimizer

    for item in progressbar.tqdm(train_data, desc="Thread Training", total=len(train_data)):
        optimizer.zero_grad()  # Zero the gradients
        thread = item
#         print(thread)
        bodies, subject = thread
#         print(bodies, subject)

        body = " ".join(bodies)
        email_tensor = torch.tensor([model.vocab.numberize(word) for word in body.split()]) # Convert email to tensor
        subject_tensor = torch.tensor([model.vocab.numberize(word) for word in subject.split()]) # Convert subject to tensor


        if email_tensor.nelement() > max_input_len: # Truncate email if it is too long
            email_tensor = email_tensor[:max_input_len]

        output = model(email_tensor, subject_tensor)  # Forward pass

        output = output.view(-1, ntokens) # Reshape output 
        loss = criterion(output, subject_tensor) # Calculate loss
        loss.backward() # Backward pass
        torch.nn.utils.clip_grad_norm_(model.parameters(), threshold_norm) # Clip gradients
        optimizer.step() # Update weights
        total_loss += loss.item() 
    return model


In [134]:
curr_time = time.strftime("%Y-%m-%d_%H:%M:%S")
MODEL_PATH = f"models/model.pt-{curr_time}.pt"

model = train_SubjectGenerator(model, train, nn.CrossEntropyLoss(), lr=lr)
# model.train()  # Turn on the train mode
#     total_loss = 0.

#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # Initialize Adam optimizer

#     for item in progressbar.tqdm(train_data, desc="Thread Training", total=len(train_data)):
#         optimizer.zero_grad()  # Zero the gradients
#         thread = item
# #         print(thread)
#         bodies, subject = thread
# #         print(bodies, subject)

#         body = " ".join(bodies)
#         email_tensor = torch.tensor([model.vocab.numberize(word) for word in body.split()]) # Convert email to tensor
#         subject_tensor = torch.tensor([model.vocab.numberize(word) for word in subject.split()]) # Convert subject to tensor


#         if email_tensor.nelement() > max_input_len: # Truncate email if it is too long
#             email_tensor = email_tensor[:max_input_len]

#         output = model(email_tensor, subject_tensor)  # Forward pass

#         output = output.view(-1, ntokens) # Reshape output 
#         loss = criterion(output, subject_tensor) # Calculate loss
#         loss.backward() # Backward pass
#         torch.nn.utils.clip_grad_norm_(model.parameters(), threshold_norm) # Clip gradients
#         optimizer.step() # Update weights
#         total_loss += loss.item()
# save the model
torch.save(model.state_dict(), MODEL_PATH)

Thread Training: 100%|██████████| 3333/3333 [07:55<00:00,  7.01it/s]


RuntimeError: Parent directory models does not exist.

In [136]:
torch.save(model.state_dict(), MODEL_PATH)

## Evaluation

In [138]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=025d51010b7d1abdbadabc4c3b62e465da80390f62a2055e701af510b4cde459
  Stored in directory: /root/.cache/pip/wheels/

In [140]:
import evaluate
rouge = evaluate.load('rouge')

In [142]:
# load the model
RECENT_MODEL = "/kaggle/working/models/model.pt-2023-12-06_17:51:44.pt"
model.load_state_dict(torch.load(RECENT_MODEL))

<All keys matched successfully>

In [148]:
# Evaluate the output

def evaluate(model: SubjectGenerator, test_data, criterion, rouge, max_input_len = 150, max_output_len = 50, mode="greedy"):
    model.eval()  # Turn on the evaluation mode
    
    total_loss = 0.
    evals = []
    with torch.no_grad():
        for i, item in enumerate(progressbar.tqdm(test_data, desc="Thread Evaluation", total=len(test_data))):
            thread = item
            bodies, subject = thread
#             email_tensors = []
            subject_string = subject
            
            # create tensors for bodies
            body = " ".join(bodies)
            email_tensor = torch.tensor([model.vocab.numberize(word) for word in body.split()]) # Convert email to tensor
            # trim email if it is too long
            trim_len = math.ceil(max_input_len / len(thread))
            if email_tensor.nelement() > trim_len: # Truncate email if it is too long
                email_tensor = email_tensor[:trim_len]
                
            # Concatenate email tensors
#             email_tensors.append(email_tensor)

#             email_tensor_final = torch.cat(email_tensors, dim=0)

            output = model.summarize(email_tensor)
            # loss = criterion(output, summary_string)
            # total_loss += loss.item()
            output_str = " ".join(output)
            rouge_score = rouge.compute(predictions=[output_str], references=[subject_string])

            evals.append((i+1, output_str, subject_string ,rouge_score))
    
    return evals

In [149]:
evals = evaluate(model, test, nn.CrossEntropyLoss(), rouge)

Thread Evaluation: 100%|██████████| 834/834 [03:33<00:00,  3.91it/s]


In [150]:
for i in evals:
    thread_id = i[0]
    output = i[1]
    subject = i[2]
    score = i[3]
    print(f"Thread ID: {thread_id}")
    print(f"Output: {output}")
    print(f"Subject: {subject}")
    print(f"Score: {score}")
    if thread_id > 3:
        break

Thread ID: 1
Output: <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS>
Subject: <BOS> re devon sfs <EOS>
Score: {'rouge1': 0.025, 'rouge2': 0.0, 'rougeL': 0.025, 'rougeLsum': 0.025}
Thread ID: 2
Output: <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <BOS> <