In [1]:
import json
import torch
import time
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch
import os
from src.eventtriplet_dataset import EventTripletDataset
from src.eventtype_finetune import EventRetrieverFineTune, EventRetrieverTrainer
from src.utils.device_util import getDeviceInfo
from src.utils.data_utils import build_label_maps, build_labels
from src.eventtype_retriever import EventTypeRetriever
from src.wikievents_dataset import WikiEventsSentenceDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask the project helper which compute device to use; the same `device` value is
# later handed to the trainer and the retriever. The recorded run reports "mps".
device = getDeviceInfo()
print(f"Device info::: {device}")

Device info::: mps


In [3]:
# --- Encoder checkpoint -----------------------------------------------------
MODEL_NAME = "roberta-base"

# --- Tokenisation / optimisation hyper-parameters ---------------------------
MAX_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 10

# --- Reproducibility --------------------------------------------------------
# Seed torch's global RNG so shuffled DataLoader order and any torch-side
# randomness repeat across "Restart Kernel -> Run All".
# NOTE(review): this seeds torch only. If code under src/ draws from Python's
# `random` or numpy, those generators need seeding as well — TODO confirm.
SEED = 42
torch.manual_seed(SEED)

# --- Filesystem layout ------------------------------------------------------
# CONTEXT_PATH is prepended verbatim to every path below, so a non-empty value
# must end with a path separator.
CONTEXT_PATH = ""
CHECKPOINT_DIR = f"{CONTEXT_PATH}checkpoints"

# Raw dataset splits (JSON Lines).
TRAIN_JSON_PATH = f"{CONTEXT_PATH}data/train.jsonl"
VAL_JSON_PATH = f"{CONTEXT_PATH}data/dev.jsonl"
TEST_JSON_PATH = f"{CONTEXT_PATH}data/test.jsonl"

# Cache files handed to build_labels / WikiEventsSentenceDataset.
LABEL_CACHE_PATH = f"{CONTEXT_PATH}processing_data/event_types.json"
TRAIN_CACHE_PATH = f"{CONTEXT_PATH}processing_data/train.json"
VAL_CACHE_PATH = f"{CONTEXT_PATH}processing_data/dev.json"
TEST_CACHE_PATH = f"{CONTEXT_PATH}processing_data/test.json"

In [4]:
# Marker token(s) that the tokenizer must learn in addition to its base vocabulary.
special_tokens = ["<tgr>"]

# Event-type labels: build_labels receives the training file and a cache path
# (the recorded run reports loading them from the cache file).
event_types = build_labels(TRAIN_JSON_PATH, LABEL_CACHE_PATH)

# Tokenizer matching the pretrained encoder named in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Last expression of the cell, so its return value is displayed
# (the recorded run shows 1).
tokenizer.add_tokens(special_tokens)

Loaded labels from processing_data/event_types.json


1

In [5]:
# One sentence-level dataset per split, built in train -> dev -> test order.
# Every split shares the tokenizer and MAX_LENGTH; only the raw file and the
# cache file differ.
train_dataset, val_dataset, test_dataset = (
    WikiEventsSentenceDataset(json_path, tokenizer, MAX_LENGTH, cache_path)
    for json_path, cache_path in (
        (TRAIN_JSON_PATH, TRAIN_CACHE_PATH),
        (VAL_JSON_PATH, VAL_CACHE_PATH),
        (TEST_JSON_PATH, TEST_CACHE_PATH),
    )
)

Loading processed dataset from processing_data/train.json
Loading processed dataset from processing_data/dev.json
Loading processed dataset from processing_data/test.json


In [6]:
# Wrap each sentence dataset in an EventTripletDataset, passing the shared
# event-type list, tokenizer and MAX_LENGTH (train -> dev -> test order).
train_triplet_dataset, val_triplet_dataset, test_triplet_dataset = (
    EventTripletDataset(split_dataset, event_types, tokenizer, MAX_LENGTH)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

In [7]:
# Only the training loader reshuffles every epoch.
train_loader = DataLoader(train_triplet_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation loaders iterate in a fixed order so validation/test batches are
# identical from epoch to epoch and run to run; shuffling them only adds noise
# to the reported losses (and to any best-checkpoint selection based on them).
val_loader = DataLoader(val_triplet_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_triplet_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# Number of results requested from the retriever in the final cell (passed as `topk`).
top_k = 3

# Demo query with the trigger word wrapped in marker tokens.
# NOTE(review): the closing marker here is "</tgr>", but only "<tgr>" is added to
# the tokenizer earlier in this notebook — confirm this matches the trigger
# markup used in the training data.
sentence = "Roadside IED <tgr> kills </tgr> Russian major general in Syria"

In [9]:
# Build the fine-tuning model from MODEL_NAME and hand it to the trainer together
# with the tokenizer, the train/validation loaders, the event-type list, the
# device and the hyper-parameters from the configuration cell.
#
# NOTE(review): the recorded output of this cell ends in
#   AttributeError: 'EventRetrieverFineTune' object has no attribute 'resize_token_embeddings'
# so on a fresh kernel `trainer` is never bound and every later cell that uses
# it cannot run. The tokenizer was extended with add_tokens earlier in the
# notebook, so presumably code in src.eventtype_finetune calls
# resize_token_embeddings on the wrapper object instead of on the underlying
# Hugging Face model — TODO confirm and fix in that module.
model = EventRetrieverFineTune(MODEL_NAME)
trainer = EventRetrieverTrainer(
    model = model,
    tokenizer = tokenizer,
    train_loader=train_loader,  
    val_loader = val_loader,
    event_types = event_types,
    device = device,
    batch_size = BATCH_SIZE,
    lr = LEARNING_RATE,
    epochs = EPOCHS,
    checkpoint_dir = CHECKPOINT_DIR
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'EventRetrieverFineTune' object has no attribute 'resize_token_embeddings'

In [None]:
# Time the whole training run. perf_counter() is monotonic and high-resolution,
# so the measured duration cannot be distorted by system clock adjustments
# (NTP sync, manual changes) the way a time.time() difference can.
start = time.perf_counter()
trainer.train()
end = time.perf_counter()

# Report the duration in both seconds and minutes.
elapsed = end - start
print(f"⏱ Training finished in {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

In [None]:
# Evaluate on the held-out test loader. The result is formatted with a float
# format spec below, so evaluate() is expected to return a scalar loss —
# TODO confirm against the trainer implementation.
avg_test_loss = trainer.evaluate(test_loader)
print(f"Test Loss: {avg_test_loss:.4f}")

In [None]:
# Build the inference-time retriever from the checkpoint directory.
# NOTE(review): "retrieve_best_model" is presumably the sub-directory the
# trainer writes its best checkpoint to — verify against the trainer code.
retriever = EventTypeRetriever(
    event_types=event_types,
    device=device,
    model_name=f"{CHECKPOINT_DIR}/retrieve_best_model",
)

In [None]:
# Query the retriever with the demo sentence; as the last expression of the cell
# the returned value is displayed. Presumably yields the `top_k` best-matching
# event types — verify against EventTypeRetriever.retrieve.
retriever.retrieve(sentence, topk=top_k)