# Main File

In [1]:
#!/usr/bin/env python3

import copy
from utils import *
from models import *
import random
import torch
import tqdm
import sys

# Input files: the raw email threads and their reference summaries.
EMAIL_DETAILS = "data/email_thread_details.json"
EMAIL_SUMMARIES = "data/email_thread_summaries.json"

# Device selection. The notebook used to probe for Apple's MPS backend and then
# immediately overwrite the result with the CPU; that override is now an
# explicit switch. Set FORCE_CPU to False to use MPS when it is available.
FORCE_CPU = True
DEVICE = torch.device(
    "mps" if not FORCE_CPU and torch.backends.mps.is_available() else "cpu"
)

## Loading Dataset

In [2]:
# Load the email threads and their reference summaries. The two collections are
# paired by position below, so they must have the same length.
email_objects = Utils.read_csv(EMAIL_DETAILS, asObject=True, objectType=objects.EMAIL_DETAILS)
summaries_objects = Utils.read_csv(EMAIL_SUMMARIES, asObject=True, objectType=objects.EMAIL_SUMMARIES)

if len(email_objects) != len(summaries_objects):
    print("Error: Length of email objects and summaries objects are not equal")
    sys.exit(1)

# NOTE(review): pairing starts at index 1, so entry 0 of both collections is
# dropped - presumably a header/placeholder record; confirm against Utils.read_csv.
data_objects = [(email_objects[i], summaries_objects[i]) for i in range(1, len(email_objects))]

# Split data into train (80%), dev/validation (10%) and test (10%).
# NOTE(review): no random seed is set here, so the split differs on every run.
random.shuffle(data_objects)

train_end = int(0.8 * len(data_objects))
dev_end = int(0.9 * len(data_objects))

train_data = data_objects[:train_end]
dev_data = data_objects[train_end:dev_end]
# The final 10% was previously never bound to a name; keep it as the held-out test set.
test_data = data_objects[dev_end:]

# The vocabulary is built from the training portion only.
vocab = Utils.build_vocab(train_data)


## Training Summarizer

In [3]:
def train_model_summarizer(model: Summarizer, loss_func, train_set, dev_set, epochs=50, lr=0.0001, device="cpu"):
    """Train ``model`` on ``train_set`` with Adam and return it.

    Every item of ``train_set`` drives one optimisation step: ``item[0]`` is
    passed to ``model.summarize`` (together with the timestamp of its first
    element) and ``item[1].summary`` is whitespace-split and mapped through
    ``model.vocab.numberize`` to build the target indices.

    Args:
        model: The summarizer to train; moved to ``device`` before training.
        loss_func: Callable invoked as ``loss_func(prediction, target)`` where
            ``prediction`` is the model output reshaped to ``(-1, model.n_token)``.
        train_set: Iterable of ``(content, good_summary)`` pairs. It is not
            modified; a private copy is reshuffled at the start of every epoch.
        dev_set: Currently unused - dev evaluation is not implemented yet
            (see the TODO at the end of the epoch loop).
        epochs: Number of passes over ``train_set``.
        lr: Learning rate handed to ``torch.optim.Adam``.
        device: Device for the model and the target tensors.

    Returns:
        The same ``model`` object after training. No best-checkpoint selection
        happens yet, so this is always the model from the last epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)
    model.to(device)

    # Shuffle a private copy so the caller's list keeps its order.
    train_order = list(train_set)

    epoch_bar = tqdm.tqdm(range(epochs), desc="Epoch")
    for _epoch in epoch_bar:
        model.train()
        running_loss = 0.0
        random.shuffle(train_order)

        for batch in tqdm.tqdm(train_order, desc="Batch"):
            content = batch[0]
            good_summary = batch[1]

            optimizer.zero_grad()

            summarization = model.summarize(content=content, date=content[0].timestamp)
            # Build the target on the training device; previously it was always
            # created on the CPU, which breaks for any non-CPU ``device``.
            # (Assumes model.summarize returns a tensor on the model's device - TODO confirm.)
            good_summary_tensor = torch.tensor(
                [model.vocab.numberize(word) for word in good_summary.summary.split()],
                device=device,
            )
            loss = loss_func(summarization.view(-1, model.n_token), good_summary_tensor)

            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Surface the mean training loss of the epoch on the progress bar.
        if train_order:
            epoch_bar.set_postfix(train_loss=running_loss / len(train_order))

        # Switch to eval mode at the end of every epoch (the next epoch switches
        # back to train mode), so the returned model is left in eval mode.
        model.eval()

        # TODO: dev-set evaluation was sketched here but never finished. The plan:
        #   1. compute the summed loss over ``dev_set`` with the model in eval mode;
        #   2. when it improves on the best dev loss so far, deep-copy the model and
        #      save its state_dict to "models/summarizer.pt";
        #   3. when it is worse than the previous epoch's dev loss, halve
        #      ``optimizer.param_groups[0]['lr']``.

    return model

In [4]:
# Model hyper-parameters.
nTokens = len(vocab)
emb_size = 200
n_hidden = 200
n_layers = 2
n_heads = 2
dropout = 0.2
model = Summarizer(nTokens, emb_size, n_heads, n_hidden, n_layers, vocab, dropout)

# TODO - figure out which loss function to use
loss_func = nn.CrossEntropyLoss()

# Train on the device chosen in the configuration cell instead of a hard-coded "cpu".
best_model = train_model_summarizer(model, loss_func, train_data, dev_data, epochs=50, lr=0.0001, device=DEVICE)

# Open questions (kept from the original scratch notes, previously a bare string
# literal that the cell echoed as its output):
#   - summing over output of nn.transformer
#   - confirm that the output ... (the original note was left unfinished)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:


def test_baseline(emailList, s):
    for key, value in emailList.items():
        print('-------------------')
        print(s.summarize(value))
        break