In [None]:
from collections import defaultdict
from itertools import count
import itertools
import random
import math
import json
import logging
import pathlib
import sys
import re

import pandas as pd
import numpy as np
import tqdm # progree bar

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, Sampler
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from transformers import AutoModel, AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.modeling_outputs import Seq2SeqLMOutput


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



from google.colab import drive

In [None]:
# Notebook-wide logger. "x" is a placeholder path whose final component ("x")
# becomes the logger name.
logger = logging.getLogger(pathlib.Path("x").name)
logger.setLevel(logging.DEBUG)
# The recorded output of the training cells shows every record twice (once
# in the format below, once as "DEBUG:x:..."), i.e. records also reach a
# handler further up the hierarchy. Stop propagation so only our handler emits.
logger.propagate = False
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(
    logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
)
# Replace instead of append: re-running this cell would otherwise stack one
# more handler per execution and duplicate every log line.
logger.handlers.clear()
logger.addHandler(handler)

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_json("/content/drive/MyDrive/data_for_KKSA_NLP_CH/dev.json")

In [None]:
df.head()

Unnamed: 0,id,word,pos,gloss,electra,bertseg,bertmsa
0,ar.962714,أَكْمَدَ,V,غَمَّ وَأَمْرَضَ القَلْبَ,"[0.2265725881, -0.3225077391, 0.5538389087, 0....","[0.1318395436, -0.203442499, 0.0180867836, -0....","[-1.0540555716, 0.7359393239, -0.4855850637000..."
1,ar.994971,ب,,حرف جرّ زائد في التوكيد بالنفس والعين .,"[0.1076807305, -0.0218244735, -0.3933216333000...","[0.3571605384, -0.1502435058, -0.0939490572, -...","[-0.8878438473, 1.0998017788, 1.085614562, 1.3..."
2,ar.989034,ذَكَا,V,ذَكَا الشَّاةَ ونحوَها: ذَبَحَها,"[0.6839492321, -0.0342893749, 0.5485743284, 0....","[0.0577338375, -0.1058739722, -0.1061836034, 0...","[-1.9565393925, 1.2845952511, 0.3016183376, 0...."
3,ar.994539,وَرَع,V,وَرَعَ فلانٌ: جَبُنَ.,"[-0.08947025980000001, 0.3989391923, -0.360960...","[0.06446439030000001, 0.0653506145, 0.21089071...","[0.3653689027, 1.1957067251, -0.0340094306, 0...."
4,ar.992756,قَلَص,V,قَلَصَ الظِّلُّ عنِّي: انْقَبَض ونَقَصَ.,"[-0.7175729275, -0.4579240382, 0.1205300912000...","[0.058123521500000004, -0.0278340597, 0.124133...","[-0.3557761014, 0.22549220920000002, -0.043720..."


# DATA


In [None]:
# Fetch the NLTK resources needed by the preprocessing cell: the stop-word
# corpus (read there through `stopwords.words('arabic')`) and the Punkt
# tokenizer data.
# NOTE(review): newer NLTK releases may look for a `punkt_tab` resource when
# `word_tokenize` is called -- confirm against the installed NLTK version.
nltk.download('stopwords')
# nltk.download('arabic')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

# De-duplicated Arabic stop-word list from NLTK; `preprocess_text` drops any
# token found in it.
stop_words = list(set(stopwords.words('arabic')))
# Regex alternation listing the characters to strip from glosses. The string
# is laid out in *verbose* regex style (insignificant whitespace and trailing
# `# ...` labels), so it only means what it says when compiled or applied
# with the `re.VERBOSE` flag.
a_dict = r"""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                            ـ    | # Tatwil/Kashida
                     """

def preprocess_text(text):
    """Normalise a gloss before tokenisation by the language model.

    Steps, in order:
      1. drop every character that is neither a word character nor whitespace;
      2. drop the characters listed in the module-level `a_dict` pattern;
      3. tokenise with NLTK's `word_tokenize`;
      4. discard stop words (module-level `stop_words`) and one-character tokens.

    args: `text` the raw gloss string
    returns: the surviving tokens joined by single spaces
    """
    # Remove special characters {& $ @} and punctuation {. , ? !}
    text = re.sub(r'[^\w\s]', '', text)

    # Remove Arabic diacritics. `a_dict` is written in verbose-regex layout
    # (padding whitespace plus `# name` labels on each alternative); without
    # re.VERBOSE the whitespace and labels are matched literally, so the
    # pattern never matches real text and this step silently does nothing.
    text = re.sub(a_dict, '', text, flags=re.VERBOSE)

    # Tokenize the sentence, then filter out stop words and 1-character tokens
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]

    return ' '.join(tokens)



In [None]:
def read_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return its content."""
    with open(path, mode='r', encoding='utf-8') as source:
        return json.load(source)

def write_json(path, data):
    """Serialise `data` as JSON into the file at `path` (UTF-8, overwriting)."""
    with open(path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink)


class ARDDataset(Dataset):
    """Map-style dataset over the entries of a reverse-dictionary JSON file.

    The whole file is loaded by `read_json` at construction time; the gloss
    of an entry is cleaned with `preprocess_text` every time it is fetched.
    """

    def __init__(self, path, is_test=False) -> None:
        """
        args: `path` the JSON file to load
              `is_test` when True, items carry no target embeddings
        """
        super().__init__()
        self.is_test = is_test
        self.data = read_json(path)

    def __getitem__(self, index):
        """Return `(id, word, cleaned gloss)`, followed by the `electra`,
        `bertseg` and `bertmsa` fields of the entry unless `is_test` is set."""
        entry = self.data[index]
        fields = [entry["id"], entry["word"], preprocess_text(entry["gloss"])]
        if not self.is_test:
            fields += [entry["electra"], entry["bertseg"], entry['bertmsa']]
        return tuple(fields)

    def __len__(self):
        """Number of entries loaded from the file."""
        return len(self.data)

In [None]:
# Special vocabulary symbols: sequence start / end, padding, out-of-vocabulary.
BOS = "<seq>"
EOS = "</seq>"
PAD = "<pad/>"
UNK = "<unk/>"

SUPPORTED_ARCHS = (["sgns"])

# A dataset is a container object for the actual data
class JSONDataset(Dataset):
    """Reads a CODWOE JSON dataset"""

    # Embedding fields that may accompany an entry in addition to the
    # architectures listed in SUPPORTED_ARCHS.
    EXTRA_ARCHS = ("electra", "bertseg", "bertmsa")

    def __init__(self, file, vocab=None, freeze_vocab=False, maxlen=256):
        """
        Construct a torch.utils.data.Dataset compatible with torch data API and
        codwoe data.
        args: `file` the path to the dataset file
              `vocab` a dictionary mapping strings to indices
              `freeze_vocab` whether to update vocabulary, or just replace unknown items with OOV token
              `maxlen` the maximum number of tokens per gloss
        """
        # A defaultdict whose factory hands out the next free index lets new
        # words register themselves on first lookup.
        if vocab is None:
            self.vocab = defaultdict(count().__next__)
        else:
            self.vocab = defaultdict(count(len(vocab)).__next__)
            self.vocab.update(vocab)
        # Looking the special symbols up (in this order) registers them when
        # they are not in the vocabulary yet.
        pad, eos, bos, unk = (
            self.vocab[PAD],
            self.vocab[EOS],
            self.vocab[BOS],
            self.vocab[UNK],
        )
        if freeze_vocab:
            # Freeze what we hold now rather than the `vocab` argument:
            # `dict(vocab)` raised TypeError for vocab=None and dropped any
            # special symbol that was registered just above.
            self.vocab = dict(self.vocab)
        # Explicit encoding: do not depend on the platform default when the
        # glosses contain non-ASCII text.
        with open(file, "r", encoding="utf-8") as istr:
            self.items = json.load(istr)
        # preparse data
        for json_dict in self.items:
            # in definition modeling test datasets, gloss targets are absent
            if "gloss" in json_dict:
                words = json_dict["gloss"].split()
                if freeze_vocab:
                    word_ids = [self.vocab.get(word, unk) for word in words]
                else:
                    word_ids = [self.vocab[word] for word in words]
                gloss_tensor = torch.tensor([bos] + word_ids + [eos])
                if maxlen:
                    # NOTE: truncation also cuts the trailing EOS of long glosses
                    gloss_tensor = gloss_tensor[:maxlen]
                json_dict["gloss_tensor"] = gloss_tensor
            # in reverse dictionary test datasets, vector targets are absent.
            # Every family present gets its tensor: the former if/elif chain
            # converted only the first one found, although `has_electra`,
            # `has_bertseg` and `has_bertmsa` are all advertised below and
            # the collate function stacks each advertised family.
            for arch in (*SUPPORTED_ARCHS, *self.EXTRA_ARCHS):
                if arch in json_dict:
                    json_dict[f"{arch}_tensor"] = torch.tensor(json_dict[arch])
        # Capabilities are read off the first entry; an empty file simply
        # advertises nothing instead of raising IndexError.
        first_item = self.items[0] if self.items else {}
        self.has_gloss = "gloss" in first_item
        self.has_vecs = SUPPORTED_ARCHS[0] in first_item
        self.has_electra = "electra" in first_item
        self.has_bertseg = "bertseg" in first_item
        self.has_bertmsa = "bertmsa" in first_item
        # index -> string table, ordered by vocabulary index
        self.itos = sorted(self.vocab, key=lambda w: self.vocab[w])

    def __len__(self):
        """Number of entries in the dataset."""
        return len(self.items)

    def __getitem__(self, index):
        """Return the (pre-parsed) JSON dict at `index`."""
        return self.items[index]

    # we're adding this method to simplify the code in our predictions of
    # glosses
    def decode(self, tensor):
        """Convert a sequence of indices (possibly batched) to tokens"""
        with torch.no_grad():
            if tensor.dim() == 2:
                # we have batched tensors of shape [Seq x Batch]
                decoded = []
                for tensor_ in tensor.t():
                    decoded.append(self.decode(tensor_))
                return decoded
            else:
                # padding positions are skipped, every other index is mapped
                # back to its string
                return " ".join(
                    [self.itos[i.item()] for i in tensor if i != self.vocab[PAD]]
                )

    def save(self, file):
        """Pickle the whole dataset object to `file` with torch.save."""
        torch.save(self, file)

    @staticmethod
    def load(file):
        """Load a dataset written by `save`.

        NOTE(review): torch.load unpickles arbitrary objects -- only use it
        on trusted files. Recent PyTorch releases may require an explicit
        `weights_only=False` to load full objects; confirm against the
        installed version.
        """
        return torch.load(file)


# A sampler allows you to define how to select items from your Dataset. Torch
# provides a number of default Sampler classes
class TokenSampler(Sampler):
    """Produce batches with up to `batch_size` tokens in each batch"""

    def __init__(
        self, dataset, batch_size=200, size_fn=len, drop_last=False, shuffle=True
    ):
        """
        args: `dataset` a torch.utils.data.Dataset (iterable style)
              `batch_size` the maximum number of tokens in a batch
              `size_fn` a callable that yields the number of tokens in a dataset item
              `drop_last` if True, the final (left-over) batch is not yielded
              `shuffle` if True, shuffle between every iteration
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.size_fn = size_fn
        self._len = None  # lazily computed by __len__
        self.drop_last = drop_last
        # honour the caller's choice (this was hard-coded to True, so
        # shuffle=False was silently ignored)
        self.shuffle = shuffle

    def __iter__(self):
        """Yield lists of dataset indices whose summed size stays within
        `batch_size`; an item larger than `batch_size` forms its own batch."""
        indices = range(len(self.dataset))
        if self.shuffle:
            indices = list(indices)
            random.shuffle(indices)
        selected = []
        numel = 0
        for i in indices:
            item_size = self.size_fn(self.dataset[i])
            if numel + item_size > self.batch_size:
                # adding this item would overflow: flush the current batch
                if selected:
                    yield selected
                selected = []
                numel = 0
            numel += item_size
            selected.append(i)
        if selected and not self.drop_last:
            yield selected

    def __len__(self):
        """Estimated number of batches: total token count floor-divided by
        `batch_size` (computed once, then cached). The number of batches
        actually produced by `__iter__` can differ from this estimate."""
        if self._len is None:
            self._len = (
                sum(self.size_fn(self.dataset[i]) for i in range(len(self.dataset)))
                // self.batch_size
            )
        return self._len


# DataLoaders give access to an iterator over the dataset, using a sampling
# strategy as defined through a Sampler.
def get_dataloader(dataset, batch_size=200, shuffle=True):
    """Build a DataLoader over `dataset`.

    args: `dataset` a torch.utils.data.Dataset (iterable style)
          `batch_size` the maximum number of tokens in a batch
          `shuffle` if True, shuffle between every iteration
    returns: a DataLoader batching by token count (through TokenSampler) when
             the dataset has glosses, and by item count otherwise
    """
    # dataset capabilities and padding index, captured once for the closures
    gloss_present = dataset.has_gloss
    vecs_present = dataset.has_vecs
    electra_present = dataset.has_electra
    bertseg_present = dataset.has_bertseg
    bertmsa_present = dataset.has_bertmsa
    pad_index = dataset.vocab[PAD]

    def do_collate(json_dicts):
        """Turn a list of dataset items into one dict of per-key lists, with
        the gloss tensors padded and the embedding tensors stacked."""
        batch = defaultdict(list)
        for jdict in json_dicts:
            for key, value in jdict.items():
                batch[key].append(value)
        if gloss_present:
            batch["gloss_tensor"] = pad_sequence(
                batch["gloss_tensor"], padding_value=pad_index, batch_first=False
            )
        # collect, in a fixed order, the keys that must become one tensor
        stack_keys = []
        if vecs_present:
            stack_keys.extend(f"{arch}_tensor" for arch in SUPPORTED_ARCHS)
        if electra_present:
            stack_keys.append("electra_tensor")
        if bertseg_present:
            stack_keys.append("bertseg_tensor")
        if bertmsa_present:
            stack_keys.append("bertmsa_tensor")
        for key in stack_keys:
            batch[key] = torch.stack(batch[key])
        return dict(batch)

    if not gloss_present:
        # no gloss means no gloss tokens to count: fall back to the default
        # fixed-size batching
        return DataLoader(
            dataset, collate_fn=do_collate, batch_size=batch_size, shuffle=shuffle
        )

    def do_size_item(item):
        """Size of an item = number of elements in its gloss tensor."""
        return item["gloss_tensor"].numel()

    # keep the amount of gloss tokens roughly constant across batches
    token_sampler = TokenSampler(
        dataset, batch_size=batch_size, size_fn=do_size_item, shuffle=shuffle
    )
    return DataLoader(dataset, collate_fn=do_collate, batch_sampler=token_sampler)


# MODELS


In [None]:
class AraT5RevDict(nn.Module):
    """Seq2seq model with an extra regression head.

    The wrapped seq2seq model is run on (input, labels) to obtain its loss,
    and the mask-weighted mean of its encoder states is projected by a linear
    layer to an embedding of size `args.max_len`.

    NOTE(review): the checkpoint name is hard-coded here and `args.model_name`
    is not consulted -- confirm that callers expect this.
    """

    def __init__(self, args) -> None:
        """
        args: `args` an object exposing `resume_train`, `from_pretrained` and
              `max_len` (output size of the linear head)
        raises: NotImplementedError when `args.resume_train` is truthy
        """
        super().__init__()
        if args.resume_train:
            # Resuming is not implemented. Fail right away instead of first
            # loading a complete checkpoint that would be thrown away.
            raise NotImplementedError()
        if args.from_pretrained:
            self.base_model = AutoModelForSeq2SeqLM.from_pretrained("UBC-NLP/AraT5v2-base-1024")
        else:
            # architecture only: weights are freshly initialised
            model_config = AutoConfig.from_pretrained("UBC-NLP/AraT5v2-base-1024")
            self.base_model = AutoModelForSeq2SeqLM.from_config(model_config)

        self.linear = nn.Linear(self.base_model.config.hidden_size, args.max_len)

    def forward(self, input_ids, attention_mask, labels):
        """Return `(loss, embedding)`.

        `loss` is the loss reported by the wrapped model for `labels`;
        `embedding` is the linear projection of the encoder's last hidden
        states averaged over the positions where `attention_mask` is set.
        """
        outputs:Seq2SeqLMOutput = self.base_model(input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True
        )

        # masked mean over the sequence dimension: zero out masked positions,
        # sum, then divide by the number of unmasked positions per row
        masked_states = outputs.encoder_last_hidden_state * attention_mask.unsqueeze(2)
        pooled_emb = masked_states.sum(dim=1) / attention_mask.sum(dim=1).unsqueeze(1)

        embedding = self.linear(pooled_emb)
        return outputs.loss, embedding

    def save(self, file):
        """Pickle the whole module (wrapped model and head) to `file`."""
        torch.save(self, file)
        print("\n--\nsave1\n--\n")

    @staticmethod
    def load(file):
        """Load a module written by `save`.

        NOTE(review): torch.load unpickles arbitrary objects -- only use it
        on trusted files. Recent PyTorch releases may require an explicit
        `weights_only=False` to load full modules; confirm against the
        installed version.
        """
        return torch.load(file)

class ARBERTRevDict(nn.Module):
    """Encoder model (`args.model_name`) with a linear head that projects the
    encoder's pooled output to an embedding of size `args.max_len`."""

    def __init__(self, args) -> None:
        """
        args: `args` an object exposing `resume_train`, `from_pretrained`,
              `model_name` and `max_len` (output size of the linear head)
        raises: NotImplementedError when `args.resume_train` is truthy
        """
        super().__init__()
        if args.resume_train:
            # Resuming is not implemented. Fail right away instead of first
            # loading a complete checkpoint that would be thrown away.
            raise NotImplementedError()
        if args.from_pretrained:
            self.base_model = AutoModel.from_pretrained(args.model_name)
        else:
            # architecture only: weights are freshly initialised
            model_config = AutoConfig.from_pretrained(args.model_name)
            self.base_model = AutoModel.from_config(model_config)

        self.linear = nn.Linear(self.base_model.config.hidden_size, args.max_len)

    def forward(self, input_ids, token_type_ids , attention_mask):
        """Project the base model's `pooler_output` through the linear head.

        `token_type_ids` is accepted for signature compatibility but is not
        forwarded to the base model.
        """
        feats = self.base_model(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        embedding = self.linear(feats)
        return embedding

    def save(self, file):
        """Write the base model to `file` with `save_pretrained`.

        NOTE(review): only `base_model` is written; the weights of
        `self.linear` are not persisted by this method -- confirm that this
        is intended before relying on saved checkpoints.
        """
        self.base_model.save_pretrained(file,from_pt=True)
        print("\n--\nsave_pretrained\n--\n")
        # torch.save(self, file)

    @staticmethod
    def load(file):
        """Load the encoder saved by `save`.

        NOTE(review): this returns what `AutoModel.from_pretrained` yields,
        not an `ARBERTRevDict`, so the result has no linear head.
        """
        return AutoModel.from_pretrained(file)

class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position signal to its input, then applies
    dropout (from PyTorch)."""

    def __init__(self, d_model, dropout=0.1, max_len=4096):
        """
        args: `d_model` the size of the last dimension of the inputs
              `dropout` dropout probability applied after the addition
              `max_len` the number of positions to precompute
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        frequencies = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = positions * frequencies
        table = torch.zeros(max_len, d_model)
        # sines on the even feature indices, cosines on the odd ones
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # stored as (max_len, 1, d_model) so it broadcasts over dimension 1
        # of the input; a buffer, not a parameter
        self.register_buffer("pe", table.unsqueeze(0).transpose(0, 1))

    def forward(self, x):
        """Add the first `x.size(0)` position rows to `x` and apply dropout."""
        return self.dropout(x + self.pe[: x.size(0)])



class RevdictModel(nn.Module):
    """A transformer architecture for Reverse Dictionary"""

    def __init__(
        self, vocab, d_model=256, n_head=4, n_layers=4, dropout=0.3, maxlen=512
    ):
        """
        args: `vocab` a mapping from strings to indices (must contain PAD and EOS)
              `d_model` embedding / hidden size, also the size of the output
              `n_head` number of attention heads per encoder layer
              `n_layers` number of encoder layers
              `dropout` dropout probability used throughout
              `maxlen` number of positions precomputed by the positional encoding
        """
        super().__init__()
        self.d_model = d_model
        self.padding_idx = vocab[PAD]
        self.eos_idx = vocab[EOS]
        self.maxlen = maxlen

        self.embedding = nn.Embedding(len(vocab), d_model, padding_idx=self.padding_idx)
        self.positional_encoding = PositionalEncoding(
            d_model, dropout=dropout, max_len=maxlen
        )
        layer_template = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_head, dropout=dropout, dim_feedforward=d_model * 2
        )
        self.transformer_encoder = nn.TransformerEncoder(
            layer_template, num_layers=n_layers
        )
        self.dropout = nn.Dropout(p=dropout)
        self.e_proj = nn.Linear(d_model, d_model)

        # Re-initialise every parameter: matrices with Xavier-uniform, biases
        # with zeros, and the remaining 1-d parameters (gain parameters of the
        # layer norm) with ones.
        for name, param in self.named_parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
                continue
            if "bias" in name:
                nn.init.zeros_(param)
                continue
            nn.init.ones_(param)

    def forward(self, gloss_tensor):
        """Encode a batch of glosses and return one vector per gloss.

        Padding positions are masked in the encoder, zeroed in its output,
        and the remaining states are summed over dimension 0 before the
        ReLU and the final projection.
        """
        pad_mask = gloss_tensor == self.padding_idx
        src = self.positional_encoding(self.embedding(gloss_tensor))
        encoded = self.transformer_encoder(src, src_key_padding_mask=pad_mask.t())
        encoded = self.dropout(encoded)
        summed_embs = encoded.masked_fill(pad_mask.unsqueeze(-1), 0).sum(dim=0)
        return self.e_proj(F.relu(summed_embs))

    @staticmethod
    def load(file):
        """Load a model written by `save` (plain `torch.load`)."""
        return torch.load(file)

    def save(self, file):
        """Pickle the whole module to `file` with `torch.save`."""
        torch.save(self, file)
        print("\n--\nsave2\n--\n")

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU, and report what was found.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-40GB


In [None]:
# NOTE(review): `!` hands this line to a separate shell process, so the
# assignment does not reach the notebook kernel's environment. If the goal is
# to configure the CUDA allocator, `%env PYTORCH_CUDA_ALLOC_CONF=...` with a
# value accepted by PyTorch is presumably what was meant -- confirm the value
# against the PyTorch documentation.
!PYTORCH_CUDA_ALLOC_CONF=True

In [None]:
def rank_cosine(preds, targets):
    """Normalised mean cosine rank of each prediction's own target.

    For row i, the rank is the number of targets whose cosine similarity with
    prediction i is at least that of target i (so the best rank is 1). The
    mean rank over all rows is divided by the number of rows.

    args: `preds` 2-d tensor of predicted vectors, one per row
          `targets` 2-d tensor of gold vectors, aligned with `preds`
    returns: a Python float
    """
    unit_preds = F.normalize(preds)
    unit_targets = F.normalize(targets)
    similarities = unit_preds @ unit_targets.T
    # similarity of every prediction with its own target, as a column
    own_similarity = torch.diagonal(similarities, 0).unsqueeze(1)
    ranks = (similarities >= own_similarity).sum(1).float()
    n_rows = preds.size(0)
    assert ranks.numel() == n_rows
    mean_rank = ranks.mean().item()
    return mean_rank / n_rows



In [None]:
# Embedding families train() can regress onto; each one is a field that
# ARDDataset yields for non-test items.
_TARGET_ARCHS = ("electra", "bertseg", "bertmsa")


def _select_target_embs(target_arch, electra, bertseg, bertmsa, device):
    """Return the gold embeddings of the requested family as a float tensor.

    The batch fields arrive from the DataLoader as sequences of tensors that
    are stacked along dim=1 here, exactly as the training loop always did.
    raises: ValueError for a `target_arch` outside `_TARGET_ARCHS`.
    """
    candidates = {"electra": electra, "bertseg": bertseg, "bertmsa": bertmsa}
    if target_arch not in candidates:
        raise ValueError(
            f"Unsupported target_arch {target_arch!r}; expected one of {_TARGET_ARCHS}"
        )
    return torch.stack(candidates[target_arch], dim=1).to(device).float()


def _forward_batch(model, tokenizer, word, gloss, device):
    """Tokenise a batch and run the model; returns `(ce_loss, pred_embs)`."""
    word_tokens = tokenizer(word, padding=True, return_tensors='pt').to(device)
    gloss_tokens = tokenizer(gloss, padding=True, return_tensors='pt').to(device)
    # Padded label positions must not contribute to the cross-entropy loss:
    # Hugging Face seq2seq models ignore label value -100.
    labels = word_tokens["input_ids"].masked_fill(
        word_tokens["attention_mask"] == 0, -100
    )
    return model(gloss_tokens["input_ids"], gloss_tokens["attention_mask"], labels)


def train(args):
    """Train an AraT5RevDict model and checkpoint it.

    Reads from `args`: `train_file`, `dev_file` (optional), `save_dir`
    (a pathlib.Path), `summary_logdir`, `model_name` (tokenizer), `device`,
    `target_arch` (one of "electra", "bertseg", "bertmsa"), `batch_size`,
    `num_epochs`, `ce_loss_weight`, `best_model`, `last_state_model`, plus
    whatever AraT5RevDict itself reads.

    The optimised loss is `ce_loss_weight * cross-entropy + MSE` between the
    predicted and gold embeddings. After every epoch the model is saved to
    `save_dir / "model_epoch.pt"`; when a dev file is given, the dev set is
    evaluated and the model with the highest summed dev cosine similarity is
    saved to `save_dir / best_model`. The final model is saved to
    `save_dir / last_state_model`.
    """
    assert args.train_file is not None, "Missing dataset for training"
    # fail before any expensive work if the target family cannot be served
    if args.target_arch not in _TARGET_ARCHS:
        raise ValueError(
            f"Unsupported target_arch {args.target_arch!r}; expected one of {_TARGET_ARCHS}"
        )

    # 1. get data
    logger.debug("Preloading data")
    train_dataset = ARDDataset(args.train_file)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size)
    # the dev set is optional: only build it when a file was given
    valid_dataset, valid_dataloader = None, None
    if args.dev_file:
        valid_dataset = ARDDataset(args.dev_file)
        valid_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size)

    # 2. construct model
    logger.debug("Setting up training environment")
    model = AraT5RevDict(args).to(args.device)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model.train()

    # 3. declare optimizer & loss_fn
    ## Hyperparams
    EPOCHS, LEARNING_RATE, BETA1, BETA2, WEIGHT_DECAY = args.num_epochs, 1.0e-4, 0.9, 0.999, 1.0e-6
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        betas=(BETA1, BETA2),
        weight_decay=WEIGHT_DECAY,
    )
    loss_fn = nn.MSELoss()

    # start below any reachable value so the first evaluation is always kept,
    # even when the summed cosine is negative
    best_cosine = float("-inf")

    # make sure checkpoints can be written before spending time on training
    args.save_dir.mkdir(parents=True, exist_ok=True)
    summary_writer = SummaryWriter(args.save_dir / args.summary_logdir)
    train_step = itertools.count()  # to keep track of the training steps for logging

    try:
        # 4. train model
        for epoch in tqdm.trange(EPOCHS, desc="Epochs"):
            ## train loop
            pbar = tqdm.tqdm(
                desc=f"Train {epoch}", total=len(train_dataset), disable=None, leave=False
            )
            for _ids, word, gloss, electra, bertseg, bertmsa in train_dataloader:
                optimizer.zero_grad()

                target_embs = _select_target_embs(
                    args.target_arch, electra, bertseg, bertmsa, args.device
                )
                ce_loss, pred_embs = _forward_batch(
                    model, tokenizer, word, gloss, args.device
                )

                mse_loss = loss_fn(pred_embs, target_embs)
                loss = args.ce_loss_weight * ce_loss + mse_loss
                loss.backward()

                # keep track of the train metrics for this step
                next_step = next(train_step)
                summary_writer.add_scalar(
                    "revdict-train/cos",
                    F.cosine_similarity(pred_embs, target_embs).mean().item(),
                    next_step,
                )
                # the "mse" tag now carries the MSE term only; the combined
                # objective is logged under its own tag
                summary_writer.add_scalar("revdict-train/mse", mse_loss.item(), next_step)
                summary_writer.add_scalar("revdict-train/loss", loss.item(), next_step)
                optimizer.step()
                pbar.update(target_embs.size(0))

            pbar.close()

            ## eval loop
            if valid_dataloader is not None:
                n_dev = len(valid_dataset)
                model.eval()
                with torch.no_grad():
                    sum_dev_mse, sum_cosine = 0.0, 0.0
                    pbar = tqdm.tqdm(
                        desc=f"Eval {epoch}",
                        total=n_dev,
                        disable=None,
                        leave=False,
                    )
                    pred_embs_list, target_embs_list = [], []
                    for _ids, word, gloss, electra, bertseg, bertmsa in valid_dataloader:
                        target_embs = _select_target_embs(
                            args.target_arch, electra, bertseg, bertmsa, args.device
                        )
                        _ce_loss, pred_embs = _forward_batch(
                            model, tokenizer, word, gloss, args.device
                        )

                        # accumulate per-sample sums so that dividing by the
                        # dataset size below yields true per-sample means
                        sum_dev_mse += (
                            F.mse_loss(pred_embs, target_embs, reduction="none").mean(1).sum().item()
                        )
                        sum_cosine += F.cosine_similarity(pred_embs, target_embs).sum().item()

                        pred_embs_list.append(pred_embs.cpu())
                        target_embs_list.append(target_embs.cpu())

                        pbar.update(target_embs.size(0))
                    pbar.close()

                    # rank_cosine already returns a rank normalised by the
                    # number of rows; it must not be divided again
                    dev_rnk = rank_cosine(
                        torch.cat(pred_embs_list, dim=0), torch.cat(target_embs_list, dim=0)
                    )

                dev_cos = sum_cosine / n_dev
                dev_mse = sum_dev_mse / n_dev
                logger.info(
                    "Eval %d cos: %.6f mse: %.6f rnk: %.6f len of dev: %d",
                    epoch, dev_cos, dev_mse, dev_rnk, n_dev,
                )

                if sum_cosine >= best_cosine:
                    best_cosine = sum_cosine
                    print(f"Saving Best Checkpoint at Epoch {epoch} best cosine {best_cosine} .")
                    model.save(args.save_dir / args.best_model)

                # keep track of the dev metrics for this epoch
                summary_writer.add_scalar("revdict-dev/cos", dev_cos, epoch)
                summary_writer.add_scalar("revdict-dev/mse", dev_mse, epoch)
                summary_writer.add_scalar("revdict-dev/rnk", dev_rnk, epoch)
                model.train()

            model.save(args.save_dir / "model_epoch.pt")
    finally:
        # flush pending events even when training is interrupted
        summary_writer.close()

    # 5. save result
    model.save(args.save_dir / args.last_state_model)



In [None]:
class Args(object):
  """Plain attribute bag: every keyword argument becomes an attribute."""

  def __init__(self, **kwargs):
    for key, value in kwargs.items():
      setattr(self, key, value)

# python revdict.py --do_train --train_file ../../../dev.json --dev_file ../../../dev.json  --model_name "aubmindlab/bert-base-arabertv02"

In [None]:
# Training configuration for the run that regresses onto the "electra"
# embeddings. Each field is read either by train() or by AraT5RevDict.
args_train = Args(
    train_file = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/train.json"),
    dev_file = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/dev.json"),
    save_dir = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/"),
    # train() builds the log directory as `save_dir / summary_logdir`
    summary_logdir =pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/logs/"),
    # model_name="aubmindlab/bert-base-arabertv02",
    # used by train() to load the tokenizer
    model_name="UBC-NLP/AraT5v2-base-1024",
    device="cuda",
    # device="cpu",
    # NOTE: train() only has branches for "electra", "bertseg" and "bertmsa"
    target_arch="electra", # choices=("sgns", "electra", "bertseg", "bertmsa"),

    batch_size=32,
    # resume_train="/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/model_epoch.pt",
    # a truthy resume_train makes AraT5RevDict raise NotImplementedError
    resume_train=None,
    resume_file=None,
    # NOTE(review): False makes AraT5RevDict build the model from its config
    # only (no pretrained weights are loaded) -- confirm this is intended.
    from_pretrained=False,
    # output size of AraT5RevDict's linear head, i.e. the size of the
    # predicted embedding
    max_len=256, # choices=(300, 256, 768),
    num_epochs=10,
    # weight of the cross-entropy term in the training loss
    ce_loss_weight=1,
    best_model="best_model_electra_v5.pt",
    last_state_model="model_electra_v5.pt",
)

In [None]:
train(args_train)

2024-05-11 21:44:44,835 [DEBUG] x: Preloading data


DEBUG:x:Preloading data


2024-05-11 21:45:08,494 [DEBUG] x: Setting up training environment


DEBUG:x:Setting up training environment
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Saving Best Checkpoint at Epoch 0 best cosine 1977.8435878753662 .

--
save1
--



Epochs:  10%|█         | 1/10 [03:18<29:44, 198.32s/it]


--
save1
--

Saving Best Checkpoint at Epoch 1 best cosine 1980.9522876739502 .

--
save1
--



Epochs:  20%|██        | 2/10 [06:07<24:08, 181.08s/it]


--
save1
--

Saving Best Checkpoint at Epoch 2 best cosine 1985.2531242370605 .

--
save1
--



Epochs:  30%|███       | 3/10 [08:56<20:28, 175.44s/it]


--
save1
--

Saving Best Checkpoint at Epoch 3 best cosine 2010.2019815444946 .

--
save1
--



Epochs:  40%|████      | 4/10 [11:44<17:16, 172.68s/it]


--
save1
--

Saving Best Checkpoint at Epoch 4 best cosine 2047.6730089187622 .

--
save1
--



Epochs:  50%|█████     | 5/10 [14:30<14:10, 170.19s/it]


--
save1
--

Saving Best Checkpoint at Epoch 5 best cosine 2078.3959102630615 .

--
save1
--



Epochs:  60%|██████    | 6/10 [17:16<11:14, 168.67s/it]


--
save1
--

Saving Best Checkpoint at Epoch 6 best cosine 2096.2745237350464 .

--
save1
--



Epochs:  70%|███████   | 7/10 [20:05<08:26, 168.84s/it]


--
save1
--

Saving Best Checkpoint at Epoch 7 best cosine 2115.41246509552 .

--
save1
--



Epochs:  80%|████████  | 8/10 [22:52<05:36, 168.46s/it]


--
save1
--

Saving Best Checkpoint at Epoch 8 best cosine 2128.0112295150757 .

--
save1
--



Epochs:  90%|█████████ | 9/10 [25:42<02:48, 168.69s/it]


--
save1
--

Saving Best Checkpoint at Epoch 9 best cosine 2143.3946676254272 .

--
save1
--



Epochs: 100%|██████████| 10/10 [28:28<00:00, 170.84s/it]


--
save1
--







--
save1
--



In [None]:
# Training configuration for the run that regresses onto the "bertseg"
# embeddings. Same fields as the other configurations in this notebook; it
# differs in target_arch, batch_size, max_len and the checkpoint file names.
args_train_bertseg = Args(
    train_file = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/train.json"),
    dev_file = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/dev.json"),
    save_dir = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/"),
    summary_logdir =pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/logs/"),
    # model_name="aubmindlab/bert-base-arabertv02",
    model_name="UBC-NLP/AraT5v2-base-1024",
    device="cuda",
    # device="cpu",
    # NOTE: train() only has branches for "electra", "bertseg" and "bertmsa"
    target_arch="bertseg", # choices=("sgns", "electra", "bertseg", "bertmsa"),

    batch_size=16,
    resume_train=None,#"/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/modelepoch.pt",
    resume_file=None,
    # NOTE(review): False makes AraT5RevDict build the model from its config
    # only (no pretrained weights are loaded) -- confirm this is intended.
    from_pretrained=False,
    # output size of AraT5RevDict's linear head
    max_len=768, # choices=(300, 256, 768),
    num_epochs=10,
    ce_loss_weight=1,
    best_model="best_model_bertseg_v1.pt",
    last_state_model="model_bertseg_v1.pt",
)

# Training configuration for the run that regresses onto the "bertmsa"
# embeddings; identical to the bertseg configuration except for target_arch
# and the checkpoint file names.
args_train_bertmsa = Args(
    train_file = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/train.json"),
    dev_file = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/dev.json"),
    save_dir = pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/"),
    summary_logdir =pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/logs/"),
    # model_name="aubmindlab/bert-base-arabertv02",
    model_name="UBC-NLP/AraT5v2-base-1024",
    device="cuda",
    # device="cpu",
    # NOTE: train() only has branches for "electra", "bertseg" and "bertmsa"
    target_arch="bertmsa", # choices=("sgns", "electra", "bertseg", "bertmsa"),

    batch_size=16,
    resume_train=None,#"/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/modelepoch.pt",
    resume_file=None,
    # NOTE(review): False makes AraT5RevDict build the model from its config
    # only (no pretrained weights are loaded) -- confirm this is intended.
    from_pretrained=False,
    # output size of AraT5RevDict's linear head
    max_len=768, # choices=(256, 768),
    num_epochs=10,
    ce_loss_weight=1,
    best_model="best_model_bertmsa_v1.pt",
    last_state_model="model_bertmsa_v1.pt",
)

In [None]:
train(args_train_bertmsa)

2024-05-11 16:28:21,716 [DEBUG] x: Preloading data


DEBUG:x:Preloading data


2024-05-11 16:28:43,746 [DEBUG] x: Setting up training environment


DEBUG:x:Setting up training environment
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Saving Best Checkpoint at Epoch 0 best cosine 2711.5827325582504 .

--
save1
--



Epochs:  10%|█         | 1/10 [05:51<52:43, 351.49s/it]


--
save1
--

Saving Best Checkpoint at Epoch 1 best cosine 2742.4336924552917 .

--
save1
--



Epochs:  20%|██        | 2/10 [10:41<42:04, 315.57s/it]


--
save1
--

Saving Best Checkpoint at Epoch 2 best cosine 2754.6135692596436 .

--
save1
--



Epochs:  30%|███       | 3/10 [15:30<35:23, 303.40s/it]


--
save1
--

Saving Best Checkpoint at Epoch 3 best cosine 2778.0931819677353 .

--
save1
--



Epochs:  40%|████      | 4/10 [20:18<29:43, 297.24s/it]


--
save1
--

Saving Best Checkpoint at Epoch 4 best cosine 2783.845418691635 .

--
save1
--



Epochs:  50%|█████     | 5/10 [25:06<24:28, 293.75s/it]


--
save1
--

Saving Best Checkpoint at Epoch 5 best cosine 2793.5461208224297 .

--
save1
--



Epochs:  60%|██████    | 6/10 [29:53<19:25, 291.45s/it]


--
save1
--

Saving Best Checkpoint at Epoch 6 best cosine 2806.433346271515 .

--
save1
--



Epochs:  70%|███████   | 7/10 [34:39<14:29, 289.71s/it]


--
save1
--

Saving Best Checkpoint at Epoch 7 best cosine 2814.487755358219 .

--
save1
--



Epochs:  80%|████████  | 8/10 [39:30<09:40, 290.08s/it]


--
save1
--

Saving Best Checkpoint at Epoch 8 best cosine 2815.592159807682 .

--
save1
--



Epochs:  90%|█████████ | 9/10 [44:23<04:51, 291.13s/it]


--
save1
--

Saving Best Checkpoint at Epoch 9 best cosine 2818.450243294239 .

--
save1
--



Epochs: 100%|██████████| 10/10 [49:11<00:00, 295.12s/it]


--
save1
--







--
save1
--



In [None]:
# Run training with the bertseg configuration.
# Long-running cell: the recorded run below took about 48 minutes for 10 epochs.
train(args_train_bertseg)

2024-05-11 17:18:11,207 [DEBUG] x: Preloading data


DEBUG:x:Preloading data


2024-05-11 17:18:26,784 [DEBUG] x: Setting up training environment


DEBUG:x:Setting up training environment
Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Saving Best Checkpoint at Epoch 0 best cosine 2911.631629049778 .

--
save1
--



Epochs:  10%|█         | 1/10 [04:45<42:53, 285.92s/it]


--
save1
--

Saving Best Checkpoint at Epoch 1 best cosine 2948.6491066217422 .

--
save1
--



Epochs:  20%|██        | 2/10 [09:33<38:17, 287.14s/it]


--
save1
--

Saving Best Checkpoint at Epoch 2 best cosine 2987.8024659752846 .

--
save1
--



Epochs:  30%|███       | 3/10 [14:24<33:40, 288.64s/it]


--
save1
--

Saving Best Checkpoint at Epoch 3 best cosine 3008.286734223366 .

--
save1
--



Epochs:  40%|████      | 4/10 [19:17<29:01, 290.23s/it]


--
save1
--

Saving Best Checkpoint at Epoch 4 best cosine 3013.2874339818954 .

--
save1
--



Epochs:  50%|█████     | 5/10 [24:10<24:16, 291.25s/it]


--
save1
--

Saving Best Checkpoint at Epoch 5 best cosine 3021.785058915615 .

--
save1
--



Epochs:  60%|██████    | 6/10 [28:59<19:22, 290.71s/it]


--
save1
--

Saving Best Checkpoint at Epoch 6 best cosine 3027.34661757946 .

--
save1
--



Epochs:  70%|███████   | 7/10 [33:50<14:31, 290.57s/it]


--
save1
--

Saving Best Checkpoint at Epoch 7 best cosine 3030.0560260415077 .

--
save1
--



Epochs:  80%|████████  | 8/10 [38:36<09:38, 289.36s/it]


--
save1
--

Saving Best Checkpoint at Epoch 8 best cosine 3036.8032430410385 .

--
save1
--



Epochs:  90%|█████████ | 9/10 [43:29<04:50, 290.38s/it]


--
save1
--



Epochs: 100%|██████████| 10/10 [48:13<00:00, 289.35s/it]


--
save1
--







--
save1
--



# Prediction

In [None]:
def pred(args):
    """Run a saved checkpoint over ``args.test_file`` and write the predicted vectors as JSON.

    Attributes read from ``args``: ``test_file``, ``batch_size``, ``save_dir``,
    ``best_model``, ``model_name``, ``device``, ``target_arch`` and ``pred_file``.

    The checkpoint is loaded from ``args.save_dir / args.best_model``. The output is a
    JSON list with one record per item, ``{"id": ..., "word": ..., <target_arch>: [...]}``,
    written to ``args.save_dir / args.pred_file``.

    Returns:
        None. The predictions are only written to disk.

    Raises:
        AssertionError: if ``args.test_file`` is None.
    """
    assert args.test_file is not None, "Missing dataset for test"

    # 1. Dataset and loader (DataLoader defaults: batches are served in dataset order).
    test_dataset = ARDDataset(args.test_file, is_test=True)
    test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size)

    # 2. Saved checkpoint plus the tokenizer for args.model_name.
    model = AraT5RevDict(args).load(args.save_dir / args.best_model)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model.to(args.device)
    model.eval()

    # 3. Predict batch by batch without tracking gradients. The progress bar is a
    #    context manager so it is closed even if a batch raises.
    predictions = []
    with torch.no_grad(), tqdm.tqdm(desc="Pred.", total=len(test_dataset)) as pbar:
        for ids, words, gloss in test_dataloader:
            word_tokens = tokenizer(words, padding=True, return_tensors='pt').to(args.device)
            gloss_tokens = tokenizer(gloss, padding=True, return_tensors='pt').to(args.device)

            # NOTE(review): the headword token ids are passed as the third positional
            # argument -- confirm against AraT5RevDict.forward that feeding the target
            # word at inference time is intended.
            _, vecs = model(
                gloss_tokens["input_ids"],
                gloss_tokens["attention_mask"],
                word_tokens["input_ids"],
            )

            # Move to CPU and flatten each item's output into a plain list for JSON.
            vecs = vecs.cpu()
            predictions.extend(
                {"id": item_id, "word": word, args.target_arch: vec.view(-1).tolist()}
                for item_id, word, vec in zip(ids, words, vecs.unbind())
            )

            pbar.update(vecs.size(0))

    # 4. Persist all predictions in a single JSON file.
    logger.debug("writing predction file")
    with open(args.save_dir / args.pred_file, "w") as ostr:
        json.dump(predictions, ostr)
    logger.debug("writing finished")


In [None]:
# Electra (v5 checkpoint) prediction configs for the dev and the test split.
# The two configs differ only in the input split and the output file name, so both
# are built from a single template to keep the shared settings from drifting apart.
# NOTE: the test split on Drive is "test_rd.json"; pointing test_file at "test.json"
# fails with FileNotFoundError because that file does not exist.
args_pred_electra_dev, args_pred_electra_test = (
    Args(
        test_file=split_file,
        save_dir=pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/"),
        summary_logdir=pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/logs/"),
        model_name="UBC-NLP/AraT5v2-base-1024",
        device="cuda",
        target_arch="electra",  # choices=("sgns", "electra", "bertseg", "bertmsa")
        batch_size=32,
        resume_train=None,  # e.g. "/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/modelepoch.pt"
        resume_file=None,
        from_pretrained=False,
        max_len=256,  # choices=(300, 256, 768)
        best_model="best_model_electra_v5.pt",
        last_state_model="model_electra_v5.pt",
        pred_file=pred_file,
    )
    for split_file, pred_file in (
        ("/content/drive/MyDrive/data_for_KKSA_NLP_CH/dev.json", "dev_pred_electra_v5.json"),
        ("/content/drive/MyDrive/data_for_KKSA_NLP_CH/test_rd.json", "test_pred_electra_v5.json"),
    )
)

In [None]:
# Write electra (v5) predictions for the dev split, then for the test split.
# Each call saves its own JSON file under save_dir, named by that config's pred_file.
pred(args_pred_electra_dev)
pred(args_pred_electra_test)

Pred.: 100%|██████████| 3921/3921 [00:06<00:00, 649.95it/s]

2024-05-11 22:14:11,841 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 22:14:13,362 [DEBUG] x: writing finished


DEBUG:x:writing finished


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/data_for_KKSA_NLP_CH/test.json'

## Dev

In [None]:
# Dev-split prediction configs, one per target embedding space.
# Settings common to all three are written once in the template; only the
# per-architecture overrides listed below differ.
args_pred_electra, args_pred_bertmsa, args_pred_bertseg = (
    Args(
        test_file="/content/drive/MyDrive/data_for_KKSA_NLP_CH/dev.json",
        save_dir=pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/"),
        summary_logdir=pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/logs/"),
        model_name="UBC-NLP/AraT5v2-base-1024",
        device="cuda",
        batch_size=16,
        resume_train=None,  # e.g. "/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/modelepoch.pt"
        resume_file=None,
        from_pretrained=False,
        **per_arch,
    )
    # target_arch choices: ("sgns", "electra", "bertseg", "bertmsa"); max_len choices: (300, 256, 768)
    for per_arch in (
        dict(
            target_arch="electra",
            max_len=256,
            best_model="best_model_electra_v3.pt",
            last_state_model="model_electra_v3.pt",
            pred_file="dev_pred_electra_v3.json",
        ),
        dict(
            target_arch="bertmsa",
            max_len=768,
            best_model="best_model_bertmsa_v1.pt",
            last_state_model="model_bertmsa_v1.pt",
            pred_file="dev_pred_bertmsa_v1.json",
        ),
        dict(
            target_arch="bertseg",
            max_len=768,
            best_model="best_model_bertseg_v1.pt",
            last_state_model="model_bertseg_v1.pt",
            pred_file="dev_pred_bertseg_v1.json",
        ),
    )
)

In [None]:
# Dev-split predictions for electra. NOTE: `args_pred_electra` is rebound in the
# "Testing" section below, so this cell uses the dev config only when cells run in order.
pred(args_pred_electra)

Pred.: 100%|██████████| 3921/3921 [00:09<00:00, 423.05it/s]

2024-05-11 18:17:01,022 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 18:17:02,533 [DEBUG] x: writing finished


DEBUG:x:writing finished


In [None]:
# Dev-split predictions for bertmsa. NOTE: `args_pred_bertmsa` is rebound in the
# "Testing" section below, so this cell uses the dev config only when cells run in order.
pred(args_pred_bertmsa)

Pred.: 100%|██████████| 3921/3921 [00:09<00:00, 414.92it/s]

2024-05-11 18:17:21,303 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 18:17:25,681 [DEBUG] x: writing finished


DEBUG:x:writing finished


In [None]:
# Dev-split predictions for bertseg. NOTE: `args_pred_bertseg` is rebound in the
# "Testing" section below, so this cell uses the dev config only when cells run in order.
pred(args_pred_bertseg)

Pred.: 100%|██████████| 3921/3921 [00:09<00:00, 418.87it/s]

2024-05-11 18:17:44,637 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 18:17:49,009 [DEBUG] x: writing finished


DEBUG:x:writing finished


## Testing

In [None]:
# Test-split prediction configs, one per target embedding space.
# These deliberately reuse the variable names of the dev configs, so running this
# cell rebinds args_pred_electra / args_pred_bertmsa / args_pred_bertseg.
args_pred_electra, args_pred_bertmsa, args_pred_bertseg = (
    Args(
        test_file="/content/drive/MyDrive/data_for_KKSA_NLP_CH/test_rd.json",
        save_dir=pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/"),
        summary_logdir=pathlib.Path("/content/drive/MyDrive/data_for_KKSA_NLP_CH/logs/"),
        model_name="UBC-NLP/AraT5v2-base-1024",
        device="cuda",
        batch_size=16,
        resume_train=None,  # e.g. "/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/modelepoch.pt"
        resume_file=None,
        from_pretrained=False,
        **per_arch,
    )
    # target_arch choices: ("sgns", "electra", "bertseg", "bertmsa"); max_len choices: (300, 256, 768)
    for per_arch in (
        dict(
            target_arch="electra",
            max_len=256,
            best_model="best_model_electra_v3.pt",
            last_state_model="model_electra_v3.pt",
            pred_file="test_pred_electra_v3.json",
        ),
        dict(
            target_arch="bertmsa",
            max_len=768,
            best_model="best_model_bertmsa_v1.pt",
            last_state_model="model_bertmsa_v1.pt",
            pred_file="test_pred_bertmsa_v1.json",
        ),
        dict(
            target_arch="bertseg",
            max_len=768,
            best_model="best_model_bertseg_v1.pt",
            last_state_model="model_bertseg_v1.pt",
            pred_file="test_pred_bertseg_v1.json",
        ),
    )
)


In [None]:
# Test-split predictions for electra. Relies on the "Testing" config cell above having
# been run last, since the dev config uses the same variable name.
pred(args_pred_electra)

Pred.: 100%|██████████| 3922/3922 [00:09<00:00, 402.78it/s]

2024-05-11 18:18:06,865 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 18:18:08,347 [DEBUG] x: writing finished


DEBUG:x:writing finished


In [None]:
# Test-split predictions for bertmsa. Relies on the "Testing" config cell above having
# been run last, since the dev config uses the same variable name.
pred(args_pred_bertmsa)

Pred.: 100%|██████████| 3922/3922 [00:09<00:00, 414.47it/s]

2024-05-11 18:18:25,981 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 18:18:30,377 [DEBUG] x: writing finished


DEBUG:x:writing finished


In [None]:
# Test-split predictions for bertseg. Relies on the "Testing" config cell above having
# been run last, since the dev config uses the same variable name.
pred(args_pred_bertseg)

Pred.: 100%|██████████| 3922/3922 [00:09<00:00, 413.10it/s]

2024-05-11 18:18:47,784 [DEBUG] x: writing predction file



DEBUG:x:writing predction file


2024-05-11 18:18:52,221 [DEBUG] x: writing finished


DEBUG:x:writing finished


In [None]:
# Sanity check on the saved electra test predictions: print the first record's
# headword and the length of its predicted vector.
df = pd.read_json("/content/drive/MyDrive/data_for_KKSA_NLP_CH/models/test_pred_electra_v3.json")
print(df.at[0, "word"])
print(len(df.at[0, "electra"]))

يَخْطُب
256
