<a href="https://colab.research.google.com/github/tomsabag/MSc/blob/main/HebSBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TODOs:
[Losses](https://sbert.net/docs/sentence_transformer/loss_overview.html)

softmax/cosinesimilarity/triplet losses/mnrl

add warmup

pooling

parameters importance test

write: abstract, intro, experiments, results, discussion


In [1]:
!pip install wandb -qU
!pip install sentence_transformers
!pip install accelerate datasets
# TODO: try `pip install --upgrade sentence-transformers` and check whether it then fetches the pretrained models instead of initializing a new one with max pooling.


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.1/309.1 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==

In [2]:
# Data analysis, engineering and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import wandb
from datasets import Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr, pearsonr

# Torch
import torch
import torch.nn as nn
from torch.nn import CosineEmbeddingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.functional import cosine_similarity
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR

# Sentence bert
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, SentenceTransformerModelCardData
from sentence_transformers import models, datasets, losses, evaluation, util
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction, TripletEvaluator

In [3]:
# Use gpu for training
def get_device() -> torch.device:
    """Return the preferred torch device for this machine.

    CUDA is chosen when available, otherwise Apple MPS, otherwise the CPU.
    """
    if torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    return torch.device(device_name)

In [4]:
# Resolve the compute device once; create_model() below reads this global.
device = get_device()
device

device(type='cuda')

# Datasets

## NLI Dataset

In [5]:
# Removing unlabeled data
def clean_nli_df(df : pd.DataFrame) -> pd.DataFrame:
  """Return the rows of `df` whose `original_label` is not '-'.

  The result is a new frame with a fresh 0..n-1 index; `df` is not modified.
  """
  is_labeled = df["original_label"] != '-'
  return df[is_labeled].reset_index(drop=True)

In [6]:
# Load the raw HebNLI splits (JSON Lines, one example per line).
# The trainer classes further down read `train_df_nli` and `dev_df_nli` as module-level globals.
# `test_df_nli` is loaded here but is not referenced by any other visible cell.
train_df_nli = pd.read_json("https://huggingface.co/datasets/HebArabNlpProject/HebNLI/resolve/main/HebNLI_train.jsonl", lines=True).sort_values(by="promptID")#.head(1000)
dev_df_nli = pd.read_json("https://huggingface.co/datasets/HebArabNlpProject/HebNLI/raw/main/HebNLI_val.jsonl", lines=True)
test_df_nli = pd.read_json("https://huggingface.co/datasets/HebArabNlpProject/HebNLI/raw/main/HebNLI_test.jsonl", lines=True)

In [7]:
# Create triplets data set:
# (Anchor, Positive & Negative)
def create_triplets_df(df: pd.DataFrame) -> pd.DataFrame:
  """Build (anchor, positive, negative) triplets from NLI sentence pairs.

  Rows labelled "entailment" are inner-joined with rows labelled
  "contradiction" on the shared `translation1` sentence. The shared sentence
  becomes `anchor`, its entailed `translation2` becomes `positive` and its
  contradicting `translation2` becomes `negative`.
  """
  pair_columns = ["translation1", "translation2"]
  positives = df.loc[df.original_label == "entailment", pair_columns]
  negatives = df.loc[df.original_label == "contradiction", pair_columns]
  joined = positives.merge(negatives, on=["translation1"])
  # The merge suffixes the clashing `translation2` columns with _x (left) and _y (right).
  return joined.rename(columns={
    'translation1': 'anchor',
    'translation2_x': 'positive',
    'translation2_y': 'negative'})

## STS Dataset

In [8]:
def create_sts_df() -> tuple[pd.DataFrame, pd.DataFrame]:
  """Download the STS data, shuffle it and split it into dev and test halves.

  Returns:
    A `(sts_dev_df, sts_test_df)` tuple. The test half is the first
    `len // 2` shuffled rows and the dev half is the remainder, so the dev
    half is one row longer when the row count is odd. Both halves have a
    fresh 0..n-1 index.
  """
  # Read and shuffle the data (fixed seed so the split is reproducible)
  sts_df = pd.read_csv("https://drive.google.com/uc?id=1KVbqWmGV8f54P1bzltX3RAlZJSNmMI3p")
  sts_df = sts_df.sample(frac=1, random_state=42).reset_index(drop=True)

  # Split the shuffled rows positionally into test and dev halves
  split_index = len(sts_df) // 2
  sts_test_df = sts_df.iloc[:split_index].reset_index(drop=True)
  sts_dev_df = sts_df.iloc[split_index:].reset_index(drop=True)
  return sts_dev_df, sts_test_df

## Dataset Preprocessing

In [9]:
# Preprocess all datasets.
def data_preprocessing() -> tuple[Dataset, Dataset, Dataset, Dataset]:
  """Download and prepare the NLI triplets and the STS splits.

  Returns:
    A 4-tuple of `datasets.Dataset` objects, in this order:
    NLI train triplets (90% of the triplets), STS dev set,
    NLI dev triplets (the held-out 10%), STS test set.

  The train element is the 90% split produced by `train_test_split`.
  Returning the full triplet frame instead would make the held-out dev
  triplets a subset of the training triplets.
  """
  # Nli dataset (to triplets)
  nli_raw = pd.read_json("https://huggingface.co/datasets/HebArabNlpProject/HebNLI/resolve/main/HebNLI_train.jsonl", lines=True).sort_values(by="promptID")#.head(1000)
  # Drop unlabeled rows, then pair entailments with contradictions
  nli_triplets = create_triplets_df(clean_nli_df(nli_raw))
  nli_train_triplets, nli_dev_triplets = train_test_split(nli_triplets, test_size=0.1, random_state=42)

  # Sts dataset
  sts_dev, sts_test = create_sts_df()

  # preserve_index=False keeps the shuffled pandas index from being stored as
  # an extra `__index_level_0__` column in the resulting datasets.
  return (
    Dataset.from_pandas(nli_train_triplets, preserve_index=False),
    Dataset.from_pandas(sts_dev, preserve_index=False),
    Dataset.from_pandas(nli_dev_triplets, preserve_index=False),
    Dataset.from_pandas(sts_test, preserve_index=False),
  )

In [10]:
# Build the datasets once. create_dev_evaluator() reads `sts_dev_df` and `nli_dev_df`,
# and run_sweep() reads `sts_test_df`, as module-level globals.
# Despite the `_df` suffix these are `datasets.Dataset` objects, not DataFrames.
train_df_triplets_nli, sts_dev_df, nli_dev_df, sts_test_df = data_preprocessing()

# Model Pipeline

In [11]:
# Create a sentence-embedding model on top of one of the supported base models:
# AlephBERT, DictaBERT, multilingual BERT or bert-base-nli-mean-tokens
def create_model(config) -> SentenceTransformer:
  """Build a SentenceTransformer (transformer encoder + pooling layer).

  Args:
    config: mapping-style run configuration. Keys read here:
      "bert" (one of the keys of `model_names` below),
      "pooling_method" ("mean", "cls" or "max") and
      "similarity_fn_name" (forwarded to SentenceTransformer).

  Returns:
    The assembled model, placed on the global `device`.

  Raises:
    KeyError: if a required key is missing or "bert" is not a known model.
    ValueError: if "pooling_method" is not one of the supported values.
  """
  model_names = {
    "alephbert" : "onlplab/alephbert-base",
    "dictabert" : "dicta-il/dictabert",
    "mbert" : "google-bert/bert-base-multilingual-cased",
    "bert-base-nli-mean-tokens" : "bert-base-nli-mean-tokens" #TODO: large
  }
  pooling_method = config["pooling_method"]
  if pooling_method not in ("mean", "cls", "max"):
    # Without this check every pooling flag below would be False.
    raise ValueError(f"Unsupported pooling_method {pooling_method!r}; expected 'mean', 'cls' or 'max'")

  # Hyperparameters, architecture and loss function are defined in the sweep 'config' further down.
  word_embedding_model = models.Transformer(model_names[config["bert"]]).to(device)
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=pooling_method == "mean",
                                 pooling_mode_cls_token=pooling_method == "cls",
                                 pooling_mode_max_tokens=pooling_method == "max")
  return SentenceTransformer(modules=[word_embedding_model, pooling_model],
                             device=device,
                             similarity_fn_name=config["similarity_fn_name"])

In [12]:
# Data preprocessing, Model Architecture, Evaluation, Loss function and Training
# Are done and defined within the trainer supplier objects.
def pipeline(config): #TODO: return model ->:
  """Train a model with the trainer named by `config.trainer` and return it.

  The trainer wrapper class is looked up by name, instantiated with `config`,
  asked for its trainer object, and that trainer is run to completion.
  """
  trainer_classes = {
    "softmax" : SoftmaxTrainer,
    "cosinesimilarity" : CosineSimilarityTrainer,
    "triplet" : TripletLossTrainer,
    "mnrl" : MultipleNegativesRankingTrainer
  }
  trainer_cls = trainer_classes[config.trainer]
  trainer = trainer_cls(config).get_trainer()
  trainer.train()
  return trainer.model

In [13]:
#TODO: remove?
def create_loss(config: dict, model: SentenceTransformer) -> losses.MultipleNegativesRankingLoss:
  """Return a MultipleNegativesRankingLoss bound to `model`.

  `config` is currently ignored; see the TODO below.
  """
  #TODO: return loss by config
  mnrl_loss = losses.MultipleNegativesRankingLoss(model)
  return mnrl_loss

In [14]:
#TODO: remove?
def create_optimizer(config: dict, model: SentenceTransformer) -> torch.optim.Optimizer:
  """Return an Adam optimizer over all parameters of `model`.

  Args:
    config: mapping with a "learning_rate" entry. It is read with subscript
      access so that a plain dict, as annotated, works.
    model: the model whose parameters are optimized.

  Raises:
    KeyError: if `config` has no "learning_rate" entry.
  """
  #TODO: return optimizer by config
  return Adam(model.parameters(), lr=config["learning_rate"])

In [15]:
# During the training process, evaluate performance on both the NLI triplets and the STS dev set
def create_dev_evaluator(config, model) -> SequentialEvaluator:
  """Build the evaluator that is run during training.

  Combines an STS similarity evaluator (global `sts_dev_df`) with an NLI
  triplet evaluator (global `nli_dev_df`), in that order. `model` is accepted
  but not used here.

  NOTE(review): `nli_dev_df` is carved out of the HebNLI training split in
  data_preprocessing(), while the trainers train on the global `train_df_nli`
  loaded from that same split, so the "Nli" accuracy looks like it is measured
  on sentences seen during training. Confirm this is intended.
  """
  similarity_name = config["similarity_fn_name"]
  eval_batch_size = config["batch_size"]

  # Sts dataset: half of the sts data serves as the dev set.
  # Scores sentence pairs by embedding similarity against the gold scores.
  sts_evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = sts_dev_df["sentence1"],
    sentences2 = sts_dev_df["sentence2"],
    scores     = sts_dev_df["score"],
    main_similarity = similarity_name,
    batch_size = eval_batch_size,
    #show_progress_bar = True,
    name = "Sts",
  )

  # Nli dataset: 10% of the training triplets serve as the dev set.
  # Checks (anchor, positive, negative) triplets.
  triplets_evaluator = TripletEvaluator(
    anchors   = nli_dev_df["anchor"],
    positives = nli_dev_df["positive"],
    negatives = nli_dev_df["negative"],
    main_distance_function = similarity_name,
    batch_size = eval_batch_size,
    #show_progress_bar = True,
    name = "Nli")

  return SequentialEvaluator([sts_evaluator, triplets_evaluator])

In [16]:
# Test the trained model on the sts dataset

def test(config, model, test_dataset: Dataset) -> None:
  """Evaluate a trained model on the STS test set and log the result to wandb.

  Args:
    config: mapping-style run configuration. Keys read here:
      "similarity_fn_name", "batch_size" and "trainer".
    model: the trained model to evaluate.
    test_dataset: column-indexable data with "sentence1", "sentence2" and
      "score" columns. run_sweep() passes a `datasets.Dataset`.

  The evaluator's result is printed and logged under
  `test/<trainer>/results`. Nothing is returned.
  """
  # Gold scores are divided by 5 (presumably a 0-5 annotation scale; verify against the data).
  normalized_scores = np.array(test_dataset["score"]) / 5
  test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = test_dataset["sentence1"],
    sentences2 = test_dataset["sentence2"],
    scores     = normalized_scores,
    main_similarity = config["similarity_fn_name"],
    batch_size = config["batch_size"],
    show_progress_bar = True,
    name = "Sts")

  # Test model
  print("testing now..")
  test_results = test_evaluator(model)
  print("test results:", test_results)
  wandb.log({f"test/{config['trainer']}/results": test_results})


In [17]:
# # Save the model
# def save(config, model: SentenceTransformer, run_name:str) -> None:
#   model.save_pretrained(f"models/{config.bert}/lr={config.learning_rate}/similarity_fn_name={config.similarity_fn_name}/batch_size={config.batch_size}")

In [18]:
def run_sweep(config=None):
  """Run one wandb run: train via pipeline(), then test on the STS test set.

  `config` is printed and handed to `wandb.init`. The hyperparameters used
  for training and testing are then read back from `wandb.config`.
  Returns the trained model.
  """
  print("config:", config)
  # tell wandb to get started
  with wandb.init(project="NLP-final-project", config=config) as run:
    # Hyperparameters chosen for this run
    run_config = wandb.config

    model = pipeline(run_config)

    # Testing
    test(run_config, model, sts_test_df)

    #save(model, run.name)

  return model

In [19]:
# Prepare nli dataset for classification based learning
# End up with pairs of sentences and a label: (Sentence1, Sentence2, Label)
# Where 'entailment', 'neutral' and 'contradiction' are mapped to 0, 1 and 2 respectively
def create_softmax_dataset(nli_dataframe):
  """Turn an NLI frame into a (sentence1, sentence2, label) Dataset.

  Unlabeled rows are dropped via clean_nli_df(). Labels are encoded as
  entailment=0, neutral=1, contradiction=2.
  """
  label_ids = {
    'entailment': 0,
    'neutral': 1,
    'contradiction': 2
  }
  labeled = clean_nli_df(nli_dataframe)
  labeled['original_label'] = labeled['original_label'].map(label_ids)
  pairs = labeled[["translation1", "translation2", "original_label"]]
  pairs = pairs.rename(columns={
    'translation1': 'sentence1',
    'translation2': 'sentence2',
    'original_label': 'label'}, inplace=False)
  return Dataset.from_pandas(pairs)

# A 3-way softmax based model
# Use bert as a feature extractor, and a linear layer as a classifier.
# We experimented with adding 1, 2 and 3 linear layers on top as a classifier,
# But didn't get better results
class SoftmaxTrainer():
  """Builds a trainer that fine-tunes the model with a 3-label SoftmaxLoss.

  Training data comes from the global `train_df_nli` and evaluation data from
  the global `dev_df_nli`, both converted with create_softmax_dataset().
  """

  def __init__(self, config):
    self.config = config
    self.model = create_model(config)

  def get_trainer(self):
    """Return a SentenceTransformerTrainer configured from `config["trainer_args"]`."""
    training_args = SentenceTransformerTrainingArguments(**(self.config["trainer_args"]))
    # Cross-entropy based loss over the three NLI labels
    softmax_loss = losses.SoftmaxLoss(
        self.model,
        self.model.get_sentence_embedding_dimension(),
        num_labels=3)
    return SentenceTransformerTrainer(
        model = self.model,
        args = training_args,
        train_dataset = create_softmax_dataset(train_df_nli),
        eval_dataset = create_softmax_dataset(dev_df_nli),  # TODO: change dataset
        evaluator = create_dev_evaluator(self.config, self.model),
        loss = softmax_loss
    )




In [20]:
# Prepare nli dataset for regression based learning
# End up with pairs of sentences and a label: (Sentence1, Sentence2, Score)
# Where the similarity scores are normalized in [0, 1]
def create_cosinesimilarity_dataset(nli_dataframe):
  """Turn an NLI frame into a (sentence1, sentence2, score) Dataset.

  Unlabeled rows are dropped via clean_nli_df(). Labels become similarity
  targets: entailment=1, neutral=0.5, contradiction=0.
  """
  label_scores = {
    'entailment': 1,
    'neutral': 0.5,
    'contradiction': 0
  }
  labeled = clean_nli_df(nli_dataframe)
  labeled['original_label'] = labeled['original_label'].map(label_scores)
  pairs = labeled[["translation1", "translation2", "original_label"]]
  pairs = pairs.rename(columns={
    'translation1': 'sentence1',
    'translation2': 'sentence2',
    'original_label': 'score'}, inplace=False)
  return Dataset.from_pandas(pairs)


# Use bert as a feature extractor, and finetune its embeddings.
# Cosine similarity is computed on bert's sentence embeddings,
# pulling sentences in an 'entailment' relationship closer together in the latent space
# and pushing sentences in a 'contradiction' relationship further apart,
# which is the behaviour the model is tested for on the sts dataset
class CosineSimilarityTrainer():
  """Builds a trainer that fine-tunes the model with CosineSimilarityLoss.

  Training data comes from the global `train_df_nli` and evaluation data from
  the global `dev_df_nli`, both converted with create_cosinesimilarity_dataset().
  """

  def __init__(self, config):
    self.config = config
    self.model = create_model(config)

  def get_trainer(self):
    """Return a SentenceTransformerTrainer configured from `config["trainer_args"]`."""
    training_args = SentenceTransformerTrainingArguments(**(self.config["trainer_args"]))
    # Regression-style loss between the target score and the cosine similarity of the two sentence embeddings
    cosine_loss = losses.CosineSimilarityLoss(self.model)
    return SentenceTransformerTrainer(
        model = self.model,
        args = training_args,
        train_dataset = create_cosinesimilarity_dataset(train_df_nli),
        eval_dataset = create_cosinesimilarity_dataset(dev_df_nli),
        evaluator = create_dev_evaluator(self.config, self.model),
        loss = cosine_loss
    )


In [21]:
# Prepare nli dataset for triplet loss based learning
# End up with triplets of: (Anchor, Positive, Negative)
# Where the anchor holds an 'entailment' relationship to the positive sample,
# And a 'contradiction' relationship to the negative sample
def create_contrastive_dataset(nli_dataframe):
  """Turn an NLI frame into an (anchor, positive, negative) Dataset.

  The triplet construction is delegated to create_triplets_df(), which pairs
  each entailment with each contradiction sharing the same `translation1`.
  The resulting DataFrame is converted with `Dataset.from_pandas`, the
  DataFrame entry point, rather than `Dataset.from_dict`.
  """
  triplets = create_triplets_df(nli_dataframe)
  # preserve_index=False: only the three sentence columns end up in the dataset.
  return Dataset.from_pandas(triplets, preserve_index=False)

class TripletLossTrainer():
  """Builds a trainer that fine-tunes the model with TripletLoss.

  Training data comes from the global `train_df_nli` and evaluation data from
  the global `dev_df_nli`, both converted with create_contrastive_dataset().
  """

  def __init__(self, config):
    self.config = config
    self.model = create_model(config)

  def get_trainer(self):
    """Return a SentenceTransformerTrainer configured from `config["trainer_args"]`."""
    training_args = SentenceTransformerTrainingArguments(**(self.config["trainer_args"]))
    # Triplet loss is a contrastive loss: each anchor is contrasted with
    # both a positive and a negative sample.
    triplet_loss = losses.TripletLoss(self.model)
    return SentenceTransformerTrainer(
        model = self.model,
        args = training_args,
        train_dataset = create_contrastive_dataset(train_df_nli),
        eval_dataset = create_contrastive_dataset(dev_df_nli),
        evaluator = create_dev_evaluator(self.config, self.model),
        loss = triplet_loss
    )

In [22]:
# Prepare nli dataset for multi negative ranking loss based learning
# End up with triplets of: (Anchor, Positive, Negative)
# Where the anchor holds an 'entailment' relationship to the positive sample,
# And a 'contradiction' relationship to the negative sample
def create_mlnr_dataset(nli_dataframe):
  """Turn an NLI frame into an (anchor, positive, negative) Dataset.

  The triplet construction is delegated to create_triplets_df(), which pairs
  each entailment with each contradiction sharing the same `translation1`.
  The resulting DataFrame is converted with `Dataset.from_pandas`, the
  DataFrame entry point, rather than `Dataset.from_dict`.
  """
  triplets = create_triplets_df(nli_dataframe)
  # preserve_index=False: only the three sentence columns end up in the dataset.
  return Dataset.from_pandas(triplets, preserve_index=False)

class MultipleNegativesRankingTrainer():
  """Builds a trainer that fine-tunes the model with MultipleNegativesRankingLoss.

  Training data comes from the global `train_df_nli` and evaluation data from
  the global `dev_df_nli`, both converted with create_mlnr_dataset().
  """

  def __init__(self, config):
    self.config = config
    self.model = create_model(config)

  def get_trainer(self):
    """Return a SentenceTransformerTrainer configured from `config["trainer_args"]`."""
    training_args = SentenceTransformerTrainingArguments(**(self.config["trainer_args"]))
    # Multiple negatives ranking loss is also a contrastive loss, trained here
    # on (anchor, positive, negative) triplets.
    ranking_loss = losses.MultipleNegativesRankingLoss(self.model)
    return SentenceTransformerTrainer(
        model = self.model,
        args = training_args,
        train_dataset = create_mlnr_dataset(train_df_nli),
        eval_dataset = create_mlnr_dataset(dev_df_nli),
        evaluator = create_dev_evaluator(self.config, self.model),
        loss = ranking_loss
    )


In [None]:
# Disabled exploratory sweep: the whole cell is wrapped in a string literal, so nothing here runs.
# NOTE(review): before re-enabling, check that
#   - "bert" lists "dictabert" twice (possibly "alephbert" was intended),
#   - there is no "pooling_method" parameter, which create_model() reads,
#   - the metric name differs from the "eval/Sts_spearman_cosine" used by the later sweeps.
'''config = {
    "method": "bayes", #TODO: bayesian
    "metric": {"goal": "maximize", "name": "Dev/Sts Spearman Cosine"},
    "parameters": {
        "trainer": {"values": ["softmax", "cosinesimilarity", "triplet", "mnrl"]},
        "similarity_fn_name": {"values": ["cosine", "dot", "euclidean", "manhattan"]},
        "epochs":{"value": 1},
        "batch_size": {"value": 16},
        "bert": {"values": ["mbert", "dictabert","dictabert"]},
        "trainer_args": {
            "parameters" : {
              "output_dir" : {"value": "models"},
              "learning_rate" : {"min": 1e-7, "max": 1e-4},
              "num_train_epochs": {"value": 1},
              "per_device_train_batch_size": {"values": [8, 16]},
              "per_device_eval_batch_size": {"value" : 16},
              "warmup_ratio": {"value": 0.1},
              "eval_strategy":{"value":"steps"},
              "eval_steps": {"value":500},
              "save_strategy": {"value":"steps"},
              "save_steps": {"value":500},
              "save_total_limit":{"value":1},
              "logging_steps":{"value":100}
          }
        }
    }
}

# Use weights and biases to monitor training process,
# And finetune the network experimenting with different hyperparameters
sweep_id = wandb.sweep(sweep=config, project="NLP-final-project")
#sweep_id = "qai1hafw"
wandb.agent(sweep_id, function=lambda: run_sweep(config), count=10)'''

'config = {\n    "method": "bayes", #TODO: bayesian\n    "metric": {"goal": "maximize", "name": "Dev/Sts Spearman Cosine"},\n    "parameters": {\n        "trainer": {"values": ["softmax", "cosinesimilarity", "triplet", "mnrl"]},\n        "similarity_fn_name": {"values": ["cosine", "dot", "euclidean", "manhattan"]},\n        "epochs":{"value": 1},\n        "batch_size": {"value": 16},\n        "bert": {"values": ["mbert", "dictabert","dictabert"]},\n        "trainer_args": {\n            "parameters" : {\n              "output_dir" : {"value": "models"},\n              "learning_rate" : {"min": 1e-7, "max": 1e-4},\n              "num_train_epochs": {"value": 1},\n              "per_device_train_batch_size": {"values": [8, 16]},\n              "per_device_eval_batch_size": {"value" : 16},\n              "warmup_ratio": {"value": 0.1},\n              "eval_strategy":{"value":"steps"},\n              "eval_steps": {"value":500},\n              "save_strategy": {"value":"steps"},\n         

# Training and evaluation of best model architecture for 3 epochs

**Model**: Dictabert

**Loss function**: MNRL

**Epochs**: 3

**Learning rates**: grid search over $\{2\times10^{-6},\ 7\times10^{-6},\ 2\times10^{-5}\}$


In [27]:
# Sweep definition: grid search over the learning rate for DictaBERT + MNRL, 3 epochs.
# Top-level parameters are read by the notebook's own functions:
#   "trainer" by pipeline(), "bert" / "pooling_method" / "similarity_fn_name" by create_model(),
#   "batch_size" by the evaluators. "epochs" is not read by any visible function;
#   the epoch count used for training is trainer_args.num_train_epochs.
# Everything under "trainer_args" is unpacked into SentenceTransformerTrainingArguments.
config = {
    "method": "grid",
    "metric": {"goal": "maximize", "name": "eval/Sts_spearman_cosine"},
    "parameters": {
        "trainer": {"values": ["mnrl"]},
        "similarity_fn_name": {"values": ["cosine"]},
        "epochs":{"value": 3},
        "batch_size": {"value": 16},
        "pooling_method": {"value": "mean"},
        "bert": {"values": ["dictabert"]},
        "trainer_args": {
            "parameters" : {
              "output_dir" : {"value": "models"},
              "learning_rate" : {"values" : [2e-6, 7e-6, 2e-5]},
              "num_train_epochs": {"value": 3},
              "per_device_train_batch_size": {"value": 16},
              "per_device_eval_batch_size": {"value" : 16},
              "warmup_ratio": {"value": 0.1},
              "eval_strategy":{"value":"steps"},
              "eval_steps": {"value":500},
              "save_strategy": {"value":"steps"},
              "save_steps": {"value":5000},
              "save_total_limit":{"value":1},
              "logging_steps":{"value":500}
          }
        }
    }
}

# Use weights and biases to monitor training process,
# And finetune the network experimenting with different hyperparameters
sweep_id = wandb.sweep(sweep=config, project="NLP-final-project")
#sweep_id = "qai1hafw"
# The lambda hands the sweep definition itself to run_sweep(), which forwards it to
# wandb.init(config=...) and then reads the per-run values from wandb.config.
# NOTE(review): count=1 runs a single trial, while the grid above has three learning rates.
# Confirm whether count should be 3 to cover the whole grid.
wandb.agent(sweep_id, function=lambda: run_sweep(config), count=1)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: vfmm84g6
Sweep URL: https://wandb.ai/HebSBert/NLP-final-project/sweeps/vfmm84g6


[34m[1mwandb[0m: Agent Starting Run: 4bi0om5b with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-06, 'logging_steps': 500, 'num_train_epochs': 3, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'values': ['mnrl']}, 'similarity_fn_name': {'values': ['cosine']}, 'epochs': {'value': 3}, 'batch_size': {'value': 16}, 'bert': {'values': ['dictabert']}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'values': [2e-06, 7e-06, 2e-05]}, 'num_train_epochs': {'value': 3}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}


[34m[1mwandb[0m: Currently logged in as: [33mtomsabag99[0m ([33mHebSBert[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,1.0219,0.174539,0.684056,0.661446,0.676078,0.657609,0.675861,0.657979,0.406422,0.393096,0.684056,0.661446,0.757797,0.243647,0.747016,0.743839,0.757797,0.757797
1000,0.7016,0.096945,0.720782,0.695604,0.715511,0.694475,0.715828,0.69504,0.493939,0.47325,0.720782,0.695604,0.815749,0.188872,0.810647,0.811032,0.815749,0.815749
1500,0.5105,0.073704,0.744079,0.722062,0.74418,0.725631,0.74476,0.725666,0.565678,0.546292,0.74476,0.725666,0.857817,0.142472,0.851174,0.852618,0.857817,0.857817
2000,0.4529,0.061026,0.760347,0.743256,0.758319,0.743196,0.759546,0.744346,0.619813,0.601044,0.760347,0.744346,0.87707,0.125144,0.873508,0.874759,0.87707,0.87707
2500,0.4139,0.040307,0.765225,0.750761,0.75991,0.747869,0.761685,0.750319,0.64973,0.631946,0.765225,0.750761,0.889007,0.11032,0.884001,0.885156,0.889007,0.889007
3000,0.4037,0.029605,0.762675,0.749296,0.758993,0.748063,0.761134,0.750611,0.647175,0.628716,0.762675,0.750611,0.896804,0.104255,0.892568,0.893916,0.896804,0.896804
3500,0.3942,0.022161,0.770135,0.75973,0.765453,0.757271,0.76776,0.760024,0.654239,0.637983,0.770135,0.760024,0.901617,0.099153,0.897478,0.897959,0.901617,0.901617
4000,0.3925,0.018709,0.769048,0.759393,0.764188,0.758194,0.766478,0.76111,0.658192,0.642662,0.769048,0.76111,0.905275,0.094243,0.900462,0.900943,0.905275,0.905275
4500,0.3576,0.012551,0.767613,0.758948,0.764238,0.758573,0.766492,0.761033,0.651638,0.636512,0.767613,0.761033,0.910859,0.090008,0.906142,0.907971,0.910859,0.910859
5000,0.3647,0.008383,0.773095,0.764705,0.767746,0.763,0.769886,0.765709,0.65527,0.640345,0.773095,0.765709,0.911918,0.087312,0.904313,0.905564,0.911918,0.911918


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.79561475926572, 'Sts_spearman_cosine': 0.790537848779515, 'Sts_pearson_manhattan': 0.7895269298550275, 'Sts_spearman_manhattan': 0.7820546040795404, 'Sts_pearson_euclidean': 0.7909038254191036, 'Sts_spearman_euclidean': 0.7827202382669715, 'Sts_pearson_dot': 0.7365371164293609, 'Sts_spearman_dot': 0.724092815039966, 'Sts_pearson_max': 0.79561475926572, 'Sts_spearman_max': 0.790537848779515}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████████
eval/Nli_dot_accuracy,█▆▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/Nli_euclidean_accuracy,▁▃▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████████████
eval/Nli_manhattan_accuracy,▁▃▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████████████
eval/Nli_max_accuracy,▁▃▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████████
eval/Sts_pearson_cosine,▁▄▆▇▇▇█████▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇
eval/Sts_pearson_dot,▁▃▅▆▇▇▇▇▇▇▇███████████████████████████
eval/Sts_pearson_euclidean,▁▄▆▇▇▇██████▇▇█▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇██▇██
eval/Sts_pearson_manhattan,▁▄▆▇▇▇██████▇▇█▇▇▇▇▇▇▇▇▇▇█▇▇██▇▇██████
eval/Sts_pearson_max,▁▄▆▇▇▇█████▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇

0,1
eval/Nli_cosine_accuracy,0.94118
eval/Nli_dot_accuracy,0.06151
eval/Nli_euclidean_accuracy,0.93502
eval/Nli_manhattan_accuracy,0.93483
eval/Nli_max_accuracy,0.94118
eval/Sts_pearson_cosine,0.76651
eval/Sts_pearson_dot,0.67804
eval/Sts_pearson_euclidean,0.7637
eval/Sts_pearson_manhattan,0.7625
eval/Sts_pearson_max,0.76651


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# Training and evaluation of model architecture

**Models**: grid search over Dictabert, Alephbert, Mbert

**Loss functions**: MNRL

**Epochs**: 1

**Learning rate**: $2\times10^{-5}$


In [23]:
# Sweep definition: grid search over the base model (DictaBERT, mBERT, AlephBERT)
# with MNRL loss, 1 epoch and a fixed learning rate.
# Top-level parameters are read by the notebook's own functions:
#   "trainer" by pipeline(), "bert" / "pooling_method" / "similarity_fn_name" by create_model(),
#   "batch_size" by the evaluators. "epochs" is not read by any visible function;
#   the epoch count used for training is trainer_args.num_train_epochs.
# Everything under "trainer_args" is unpacked into SentenceTransformerTrainingArguments.
config = {
    "method": "grid",
    "metric": {"goal": "maximize", "name": "eval/Sts_spearman_cosine"},
    "parameters": {
        "trainer": {"value":"mnrl"},
        "similarity_fn_name": {"value": "cosine"},
        "epochs":{"value": 1},
        "batch_size": {"value": 16},
        "pooling_method": {"value": "mean"},
        "bert": {"values": ["dictabert", "mbert", "alephbert"]},
        "trainer_args": {
            "parameters" : {
              "output_dir" : {"value": "models"},
              "learning_rate" : {"value" : 2e-5},
              "num_train_epochs": {"value": 1},
              "per_device_train_batch_size": {"value": 16},
              "per_device_eval_batch_size": {"value" : 16},
              "warmup_ratio": {"value": 0.1},
              "eval_strategy":{"value":"steps"},
              "eval_steps": {"value":500},
              "save_strategy": {"value":"steps"},
              "save_steps": {"value":5000},
              "save_total_limit":{"value":1},
              "logging_steps":{"value":500}
          }
        }
    }
}

# Use weights and biases to monitor training process,
# And finetune the network experimenting with different hyperparameters
sweep_id = wandb.sweep(sweep=config, project="NLP-final-project")
#sweep_id = "qai1hafw"
# The lambda hands the sweep definition itself to run_sweep(), which forwards it to
# wandb.init(config=...) and then reads the per-run values from wandb.config.
# count=3 matches the three "bert" values in the grid.
wandb.agent(sweep_id, function=lambda: run_sweep(config), count=3)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: aboshe7x
Sweep URL: https://wandb.ai/HebSBert/NLP-final-project/sweeps/aboshe7x


[34m[1mwandb[0m: Agent Starting Run: p9pdfxqj with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'values': ['dictabert', 'mbert', 'alephbert']}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}


[34m[1mwandb[0m: Currently logged in as: [33mtomsabag99[0m ([33mHebSBert[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.6204,0.029589,0.772318,0.758284,0.769898,0.758265,0.770952,0.759888,0.635886,0.61135,0.772318,0.759888,0.887178,0.1134,0.881787,0.882268,0.887178,0.887178
1000,0.3785,0.010804,0.748059,0.740905,0.747185,0.742693,0.748013,0.743413,0.624004,0.611402,0.748059,0.743413,0.91134,0.090393,0.90412,0.903639,0.91134,0.91134
1500,0.3495,0.002894,0.776661,0.76948,0.774855,0.770274,0.774651,0.770633,0.663498,0.647921,0.776661,0.770633,0.920196,0.083654,0.909992,0.912206,0.920196,0.920196
2000,0.3277,0.008581,0.746782,0.749493,0.763822,0.760485,0.764023,0.760893,0.598226,0.590574,0.764023,0.760893,0.935406,0.068733,0.928668,0.930786,0.935406,0.935406
2500,0.303,0.001235,0.748725,0.747811,0.754591,0.752706,0.754726,0.752868,0.656343,0.649248,0.754726,0.752868,0.941471,0.0567,0.935695,0.935695,0.941471,0.941471
3000,0.3068,0.002215,0.756437,0.751477,0.758752,0.759644,0.759212,0.759321,0.660109,0.649387,0.759212,0.759644,0.948402,0.053331,0.943011,0.944455,0.948402,0.948402
3500,0.3003,0.002628,0.761024,0.757535,0.767667,0.76793,0.7682,0.768005,0.653063,0.640279,0.7682,0.768005,0.950039,0.051502,0.945803,0.947151,0.950039,0.950039
4000,0.3089,0.006383,0.776167,0.769291,0.772383,0.773667,0.773149,0.774775,0.686037,0.673706,0.776167,0.774775,0.954082,0.046496,0.950809,0.951579,0.954082,0.954082
4500,0.2707,0.002281,0.751833,0.750649,0.760173,0.760031,0.761701,0.761602,0.65375,0.645508,0.761701,0.761602,0.959569,0.040913,0.956585,0.957836,0.959569,0.959569
5000,0.2755,0.000779,0.765117,0.760586,0.767485,0.767672,0.768534,0.768918,0.657473,0.647798,0.768534,0.768918,0.966307,0.035425,0.960531,0.961398,0.966307,0.966307


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7932420023036347, 'Sts_spearman_cosine': 0.7910961531105941, 'Sts_pearson_manhattan': 0.7871293529984481, 'Sts_spearman_manhattan': 0.7828040049951362, 'Sts_pearson_euclidean': 0.7884106120206235, 'Sts_spearman_euclidean': 0.7839322019573002, 'Sts_pearson_dot': 0.735430211604571, 'Sts_spearman_dot': 0.7244337406616848, 'Sts_pearson_max': 0.7932420023036347, 'Sts_spearman_max': 0.7910961531105941}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▃▃▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▅▆▆▇▇███
eval/Nli_manhattan_accuracy,▁▃▃▅▆▆▆▇▇███
eval/Nli_max_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Sts_pearson_cosine,▇▁█▁▁▃▄█▂▅▄▄
eval/Sts_pearson_dot,▄▃▆▁▆▆▅█▅▆▆▆
eval/Sts_pearson_euclidean,▇▁█▅▃▄▆█▅▆▅▅
eval/Sts_pearson_manhattan,▇▁█▅▃▄▆▇▄▆▅▅
eval/Sts_pearson_max,▇▁█▅▃▄▆█▄▆▅▅

0,1
eval/Nli_cosine_accuracy,0.96948
eval/Nli_dot_accuracy,0.03109
eval/Nli_euclidean_accuracy,0.96573
eval/Nli_manhattan_accuracy,0.96467
eval/Nli_max_accuracy,0.96948
eval/Sts_pearson_cosine,0.75828
eval/Sts_pearson_dot,0.65881
eval/Sts_pearson_euclidean,0.76432
eval/Sts_pearson_manhattan,0.7636
eval/Sts_pearson_max,0.76432


[34m[1mwandb[0m: Agent Starting Run: o8i2lk1j with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: mbert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'values': ['dictabert', 'mbert', 'alephbert']}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.9748,0.083686,0.654038,0.65369,0.665508,0.668005,0.663769,0.666833,0.438685,0.422864,0.665508,0.668005,0.809973,0.206681,0.806604,0.805449,0.809973,0.809973
1000,0.7187,0.024574,0.651609,0.657215,0.63912,0.639,0.640808,0.641089,0.563824,0.567047,0.651609,0.657215,0.839334,0.179245,0.842703,0.842896,0.842896,0.839334
1500,0.6367,0.002758,0.670926,0.667753,0.658863,0.660299,0.659271,0.66114,0.585953,0.582409,0.670926,0.667753,0.849538,0.165094,0.851174,0.851078,0.851174,0.849538
2000,0.5938,0.007833,0.678108,0.683334,0.666393,0.672707,0.667167,0.673585,0.59996,0.597486,0.678108,0.683334,0.869754,0.144397,0.870331,0.87216,0.87216,0.869754
2500,0.5598,0.015209,0.660846,0.658514,0.64049,0.638999,0.641311,0.640644,0.593278,0.586697,0.660846,0.658514,0.880343,0.138044,0.878706,0.879765,0.880343,0.880343
3000,0.5523,0.004357,0.695916,0.69214,0.656753,0.65729,0.657807,0.659363,0.652257,0.648592,0.695916,0.69214,0.886696,0.126396,0.885734,0.88737,0.88737,0.886696
3500,0.5265,0.005358,0.664215,0.663436,0.63518,0.633356,0.635466,0.634733,0.604762,0.604233,0.664215,0.663436,0.88968,0.126011,0.889488,0.889873,0.889873,0.88968
4000,0.5236,0.002974,0.693226,0.692168,0.648933,0.651705,0.6488,0.651063,0.643168,0.643121,0.693226,0.692168,0.899403,0.111282,0.899307,0.899692,0.899692,0.899403
4500,0.5071,0.002322,0.676627,0.674233,0.646761,0.64669,0.64692,0.647707,0.620202,0.614924,0.676627,0.674233,0.908645,0.105603,0.90903,0.90826,0.90903,0.908645
5000,0.4867,0.001627,0.692747,0.690478,0.661629,0.663699,0.662674,0.664946,0.640693,0.6348,0.692747,0.690478,0.915961,0.09896,0.914035,0.913169,0.915961,0.915961


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7082300846579794, 'Sts_spearman_cosine': 0.7101908967060202, 'Sts_pearson_manhattan': 0.6916673478952285, 'Sts_spearman_manhattan': 0.6888059211532023, 'Sts_pearson_euclidean': 0.6911977917602958, 'Sts_spearman_euclidean': 0.6884803860249976, 'Sts_pearson_dot': 0.6435393795861863, 'Sts_spearman_dot': 0.6529084157672649, 'Sts_pearson_max': 0.7082300846579794, 'Sts_spearman_max': 0.7101908967060202}


VBox(children=(Label(value='0.002 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.17403314917127072, max=1.…

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▅▆▆▇▇███
eval/Nli_dot_accuracy,█▆▅▄▄▃▃▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_manhattan_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_max_accuracy,▁▃▄▅▅▆▆▇▇███
eval/Sts_pearson_cosine,▁▁▄▅▂█▃█▅▇▆▅
eval/Sts_pearson_dot,▁▅▆▆▆█▆█▇█▇▇
eval/Sts_pearson_euclidean,▇▂▆█▂▆▁▄▄▇▄▃
eval/Sts_pearson_manhattan,█▂▆█▂▆▁▄▄▇▄▃
eval/Sts_pearson_max,▃▁▄▅▂█▃█▅▇▆▅

0,1
eval/Nli_cosine_accuracy,0.92039
eval/Nli_dot_accuracy,0.09309
eval/Nli_euclidean_accuracy,0.91846
eval/Nli_manhattan_accuracy,0.91846
eval/Nli_max_accuracy,0.92039
eval/Sts_pearson_cosine,0.67565
eval/Sts_pearson_dot,0.62254
eval/Sts_pearson_euclidean,0.64477
eval/Sts_pearson_manhattan,0.64362
eval/Sts_pearson_max,0.67565


[34m[1mwandb[0m: Agent Starting Run: 7bjv63cp with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: alephbert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'values': ['dictabert', 'mbert', 'alephbert']}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/545k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.6789,0.004202,0.727146,0.707341,0.716814,0.707746,0.717464,0.708636,0.55714,0.531969,0.727146,0.708636,0.850693,0.154409,0.842703,0.843666,0.850693,0.850693
1000,0.4894,0.00223,0.715635,0.713061,0.72155,0.718647,0.722509,0.719611,0.49585,0.479114,0.722509,0.719611,0.882075,0.117828,0.87139,0.873412,0.882075,0.882075
1500,0.453,0.017574,0.718759,0.70866,0.717271,0.71136,0.717751,0.712219,0.539724,0.527346,0.718759,0.712219,0.890932,0.110127,0.876492,0.877358,0.890932,0.890932
2000,0.4346,0.000657,0.697805,0.692436,0.716091,0.710579,0.71697,0.711589,0.488019,0.475735,0.71697,0.711589,0.91134,0.094532,0.90335,0.904409,0.91134,0.91134
2500,0.4052,0.000322,0.717135,0.709215,0.724922,0.720626,0.725434,0.72148,0.53574,0.520094,0.725434,0.72148,0.921063,0.080381,0.914517,0.914228,0.921063,0.921063
3000,0.3998,0.001161,0.709454,0.70385,0.723631,0.72129,0.724437,0.721953,0.521772,0.509638,0.724437,0.721953,0.929919,0.072969,0.922603,0.922796,0.929919,0.929919
3500,0.3955,0.001838,0.711791,0.711721,0.729433,0.73331,0.729774,0.733305,0.514345,0.505613,0.729774,0.73331,0.934251,0.067,0.92886,0.928475,0.934251,0.934251
4000,0.3878,0.00141,0.731101,0.724939,0.742204,0.741059,0.742468,0.741276,0.550916,0.540923,0.742468,0.741276,0.940605,0.062187,0.933,0.934829,0.940605,0.940605
4500,0.3718,0.002673,0.721796,0.720745,0.738682,0.738761,0.738911,0.739731,0.542918,0.53208,0.738911,0.739731,0.948595,0.056026,0.939931,0.940797,0.948595,0.948595
5000,0.3576,0.000533,0.724206,0.720872,0.735428,0.735222,0.735864,0.736095,0.547388,0.534923,0.735864,0.736095,0.952926,0.048614,0.943396,0.943396,0.952926,0.952926


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.753397687885077, 'Sts_spearman_cosine': 0.7434463045634541, 'Sts_pearson_manhattan': 0.7489888308648115, 'Sts_spearman_manhattan': 0.7385632813704458, 'Sts_pearson_euclidean': 0.7495414111212155, 'Sts_spearman_euclidean': 0.7385803432679975, 'Sts_pearson_dot': 0.6495364140091465, 'Sts_spearman_dot': 0.6405153662869684, 'Sts_pearson_max': 0.753397687885077, 'Sts_spearman_max': 0.7434463045634541}


VBox(children=(Label(value='0.002 MB of 0.013 MB uploaded\r'), FloatProgress(value=0.17024470800180153, max=1.…

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▃▂▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▃▅▆▆▇▇▇███
eval/Nli_manhattan_accuracy,▁▃▃▅▆▆▇▇▇███
eval/Nli_max_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Sts_pearson_cosine,▇▅▅▁▅▃▄█▆▇▇▆
eval/Sts_pearson_dot,█▂▆▁▆▄▄▇▇▇▆▅
eval/Sts_pearson_euclidean,▁▃▁▁▃▃▅█▇▆▇▇
eval/Sts_pearson_manhattan,▁▂▁▁▃▃▅█▇▆▇▇
eval/Sts_pearson_max,▄▃▁▁▃▃▅█▇▆▇▇

0,1
eval/Nli_cosine_accuracy,0.95861
eval/Nli_dot_accuracy,0.0438
eval/Nli_euclidean_accuracy,0.94936
eval/Nli_manhattan_accuracy,0.94888
eval/Nli_max_accuracy,0.95861
eval/Sts_pearson_cosine,0.72336
eval/Sts_pearson_dot,0.53205
eval/Sts_pearson_euclidean,0.73876
eval/Sts_pearson_manhattan,0.73865
eval/Sts_pearson_max,0.73876


# Training and evaluation of loss function

**Model**: Dictabert

**Loss functions**: grid search over Softmax, CosineSimilarity, Triplet, and MNRL

**Epochs**: 1

**Learning rate**: $2 \times 10^{-5}$


In [28]:
# Sweep definition: compare four training objectives on the DictaBERT backbone
# under otherwise identical settings (mean pooling, cosine similarity).
#
# The epoch count and the batch size each appear twice in the sweep (as a
# top-level parameter and again inside the nested trainer arguments), so they
# are defined once here to keep the two copies from drifting apart.
sweep_epochs = 1
sweep_batch_size = 16

config = {
    "method": "grid",
    "metric": {"goal": "maximize", "name": "eval/Sts_spearman_cosine"},
    "parameters": {
        # The only multi-valued parameter: one run is launched per loss.
        "trainer": {"values": ["softmax", "cosinesimilarity", "triplet", "mnrl"]},
        "similarity_fn_name": {"value": "cosine"},
        "epochs": {"value": sweep_epochs},
        "batch_size": {"value": sweep_batch_size},
        "pooling_method": {"value": "mean"},
        "bert": {"value": "dictabert"},
        # NOTE(review): presumably forwarded to the trainer's argument object
        # inside run_sweep (defined earlier in the notebook) -- confirm.
        "trainer_args": {
            "parameters": {
                "output_dir": {"value": "models"},
                "learning_rate": {"value": 2e-5},
                "num_train_epochs": {"value": sweep_epochs},
                "per_device_train_batch_size": {"value": sweep_batch_size},
                "per_device_eval_batch_size": {"value": sweep_batch_size},
                "warmup_ratio": {"value": 0.1},
                "eval_strategy": {"value": "steps"},
                "eval_steps": {"value": 500},
                "save_strategy": {"value": "steps"},
                "save_steps": {"value": 5000},
                "save_total_limit": {"value": 1},
                "logging_steps": {"value": 500},
            }
        },
    },
}

# Use Weights & Biases to monitor the training process and to fine-tune the
# network while experimenting with different hyperparameters.
sweep_id = wandb.sweep(sweep=config, project="NLP-final-project")
# To attach to an already created sweep instead, overwrite sweep_id with its ID:
#sweep_id = "qai1hafw"

# One run per grid point: derive the run count from the list of losses instead
# of hard-coding it, so adding or removing a loss cannot leave the count stale.
wandb.agent(
    sweep_id,
    function=lambda: run_sweep(config),
    count=len(config["parameters"]["trainer"]["values"]),
)

Create sweep with ID: lcnrzcaq
Sweep URL: https://wandb.ai/HebSBert/NLP-final-project/sweeps/lcnrzcaq


[34m[1mwandb[0m: Agent Starting Run: ycq3udhf with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: softmax
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'values': ['softmax', 'cosinesimilarity', 'triplet', 'mnrl']}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,1.0858,1.039603,0.63208,0.616527,0.640295,0.626614,0.638386,0.625067,0.147975,0.15803,0.640295,0.626614,0.804005,0.203023,0.790239,0.787928,0.804005,0.804005
1000,0.9632,0.913153,0.278329,0.408173,0.385904,0.441342,0.371589,0.437279,0.118936,0.123071,0.385904,0.441342,0.817385,0.188294,0.817097,0.817,0.817385,0.817385
1500,0.9027,0.86387,0.225722,0.329588,0.30925,0.352173,0.295953,0.346224,0.132802,0.152213,0.30925,0.352173,0.815653,0.187332,0.817963,0.817578,0.817963,0.815653
2000,0.866,0.881497,0.247645,0.309904,0.315287,0.332113,0.300736,0.325441,0.176164,0.18744,0.315287,0.332113,0.815556,0.198113,0.818348,0.818156,0.818348,0.815556
2500,0.8509,0.838138,0.233054,0.308747,0.30808,0.32994,0.291108,0.320171,0.183689,0.182912,0.30808,0.32994,0.831633,0.174047,0.83221,0.83144,0.83221,0.831633
3000,0.8354,0.824169,0.246414,0.349197,0.333921,0.367229,0.31618,0.358353,0.212042,0.24278,0.333921,0.367229,0.842896,0.157297,0.843184,0.844051,0.844051,0.842896
3500,0.8217,0.798265,0.25121,0.344915,0.327037,0.355856,0.310556,0.346379,0.232733,0.272263,0.327037,0.355856,0.843184,0.159318,0.842992,0.843377,0.843377,0.843184
4000,0.8182,0.798524,0.2464,0.331666,0.311374,0.333936,0.295196,0.324391,0.267145,0.306526,0.311374,0.333936,0.849538,0.148248,0.848864,0.849827,0.849827,0.849538
4500,0.8003,0.785596,0.255384,0.350962,0.327441,0.355339,0.310671,0.346871,0.269642,0.313736,0.327441,0.355339,0.850789,0.151328,0.850212,0.851656,0.851656,0.850789
5000,0.8048,0.785631,0.226496,0.336195,0.303991,0.338797,0.284677,0.329649,0.2495,0.308904,0.303991,0.338797,0.842414,0.157682,0.842222,0.843762,0.843762,0.842414


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,1.0858,1.039603,0.63208,0.616527,0.640295,0.626614,0.638386,0.625067,0.147975,0.15803,0.640295,0.626614,0.804005,0.203023,0.790239,0.787928,0.804005,0.804005
1000,0.9632,0.913153,0.278329,0.408173,0.385904,0.441342,0.371589,0.437279,0.118936,0.123071,0.385904,0.441342,0.817385,0.188294,0.817097,0.817,0.817385,0.817385
1500,0.9027,0.86387,0.225722,0.329588,0.30925,0.352173,0.295953,0.346224,0.132802,0.152213,0.30925,0.352173,0.815653,0.187332,0.817963,0.817578,0.817963,0.815653
2000,0.866,0.881497,0.247645,0.309904,0.315287,0.332113,0.300736,0.325441,0.176164,0.18744,0.315287,0.332113,0.815556,0.198113,0.818348,0.818156,0.818348,0.815556
2500,0.8509,0.838138,0.233054,0.308747,0.30808,0.32994,0.291108,0.320171,0.183689,0.182912,0.30808,0.32994,0.831633,0.174047,0.83221,0.83144,0.83221,0.831633
3000,0.8354,0.824169,0.246414,0.349197,0.333921,0.367229,0.31618,0.358353,0.212042,0.24278,0.333921,0.367229,0.842896,0.157297,0.843184,0.844051,0.844051,0.842896
3500,0.8217,0.798265,0.25121,0.344915,0.327037,0.355856,0.310556,0.346379,0.232733,0.272263,0.327037,0.355856,0.843184,0.159318,0.842992,0.843377,0.843377,0.843184
4000,0.8182,0.798524,0.2464,0.331666,0.311374,0.333936,0.295196,0.324391,0.267145,0.306526,0.311374,0.333936,0.849538,0.148248,0.848864,0.849827,0.849827,0.849538
4500,0.8003,0.785596,0.255384,0.350962,0.327441,0.355339,0.310671,0.346871,0.269642,0.313736,0.327441,0.355339,0.850789,0.151328,0.850212,0.851656,0.851656,0.850789
5000,0.8048,0.785631,0.226496,0.336195,0.303991,0.338797,0.284677,0.329649,0.2495,0.308904,0.303991,0.338797,0.842414,0.157682,0.842222,0.843762,0.843762,0.842414


testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.3304956295468385, 'Sts_spearman_cosine': 0.4335885762308867, 'Sts_pearson_manhattan': 0.42210187731523, 'Sts_spearman_manhattan': 0.44692458629508797, 'Sts_pearson_euclidean': 0.3966640123713583, 'Sts_spearman_euclidean': 0.43043262370868024, 'Sts_pearson_dot': 0.34076841885591885, 'Sts_spearman_dot': 0.41032998262576925, 'Sts_pearson_max': 0.42210187731523, 'Sts_spearman_max': 0.44692458629508797}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▂▂▂▃▄▄▅▅▄▅▅▇▆▆▆▇▇▆▆▇▇▇▇▇▇███████████
eval/Nli_dot_accuracy,█▇▇█▆▄▄▄▄▄▄▄▂▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/Nli_euclidean_accuracy,▁▃▃▃▄▅▅▅▆▅▆▅▇▆▆▇▇▇▇▇▇▇▇▇▇▇███████████
eval/Nli_manhattan_accuracy,▁▃▃▃▄▅▅▅▅▅▆▅▇▆▆▇▇▇▇▇▇▇▇▇▇▇███████████
eval/Nli_max_accuracy,▁▂▂▂▃▄▄▅▅▄▅▅▇▆▆▆▇▇▇▆▇▇▇▇▇▇███████████
eval/Sts_pearson_cosine,█▂▁▁▁▁▁▁▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
eval/Sts_pearson_dot,▂▁▁▃▃▄▅▆▆▆▅▇▆▇▇▇██▇▇▇▇▇█▇█▇█▇█▇▇▇█▇█▇
eval/Sts_pearson_euclidean,█▃▁▁▁▂▂▁▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▂▃▂▃▂▂▂▂▂▂▂▃▂
eval/Sts_pearson_manhattan,█▃▁▁▁▂▁▁▁▁▁▂▂▂▂▂▃▃▂▂▂▂▂▃▃▃▃▃▂▃▂▂▃▃▃▃▃
eval/Sts_pearson_max,█▃▁▁▁▂▁▁▁▁▁▂▂▂▂▂▃▃▂▂▂▂▂▃▃▃▃▃▂▃▂▂▃▃▃▃▃

0,1
eval/Nli_cosine_accuracy,0.88429
eval/Nli_dot_accuracy,0.116
eval/Nli_euclidean_accuracy,0.88487
eval/Nli_manhattan_accuracy,0.88352
eval/Nli_max_accuracy,0.88487
eval/Sts_pearson_cosine,0.29534
eval/Sts_pearson_dot,0.30411
eval/Sts_pearson_euclidean,0.35959
eval/Sts_pearson_manhattan,0.38196
eval/Sts_pearson_max,0.38196


[34m[1mwandb[0m: Agent Starting Run: 7kaydg8w with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: cosinesimilarity
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'values': ['softmax', 'cosinesimilarity', 'triplet', 'mnrl']}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.1717,0.120523,0.55673,0.583618,0.605231,0.603112,0.6057,0.603706,0.347453,0.344811,0.6057,0.603706,0.861956,0.138525,0.856276,0.858587,0.861956,0.861956
1000,0.1155,0.107735,0.597974,0.620139,0.634562,0.63275,0.635196,0.634288,0.439037,0.454378,0.635196,0.634288,0.89382,0.106084,0.890258,0.889777,0.89382,0.89382
1500,0.1092,0.101867,0.642551,0.662698,0.663604,0.661003,0.663154,0.660802,0.57213,0.584245,0.663604,0.662698,0.90903,0.091067,0.909415,0.909318,0.909415,0.90903
2000,0.1056,0.100739,0.60127,0.619423,0.625818,0.624339,0.627543,0.626672,0.516271,0.518117,0.627543,0.626672,0.917886,0.085387,0.915094,0.913265,0.917886,0.917886
2500,0.105,0.102155,0.608877,0.618602,0.63186,0.625447,0.633502,0.627119,0.54121,0.54143,0.633502,0.627119,0.920293,0.081825,0.920004,0.919908,0.920293,0.920293
3000,0.1,0.101768,0.566155,0.607517,0.622228,0.622281,0.623086,0.623806,0.466515,0.482366,0.623086,0.623806,0.920389,0.082788,0.918945,0.91933,0.920389,0.920389
3500,0.0964,0.100506,0.592204,0.608162,0.609362,0.604852,0.612068,0.60739,0.570147,0.580588,0.612068,0.608162,0.928571,0.072584,0.927609,0.927416,0.928571,0.928571
4000,0.0973,0.097168,0.634562,0.65936,0.667604,0.666847,0.668014,0.666032,0.556952,0.566604,0.668014,0.666847,0.930208,0.072584,0.928764,0.928571,0.930208,0.930208
4500,0.0967,0.097385,0.628461,0.638394,0.645726,0.643862,0.646566,0.64437,0.581999,0.583837,0.646566,0.64437,0.933962,0.066712,0.930304,0.931556,0.933962,0.933962
5000,0.0957,0.097546,0.617612,0.632342,0.643002,0.642903,0.643462,0.64282,0.562854,0.571132,0.643462,0.642903,0.938294,0.063342,0.935503,0.93685,0.938294,0.938294


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.686020844359803, 'Sts_spearman_cosine': 0.6943428151235032, 'Sts_pearson_manhattan': 0.7009824133976774, 'Sts_spearman_manhattan': 0.6953852054289208, 'Sts_pearson_euclidean': 0.7019539709992649, 'Sts_spearman_euclidean': 0.6966777218203524, 'Sts_pearson_dot': 0.6659620291767417, 'Sts_spearman_dot': 0.6641743518820021, 'Sts_pearson_max': 0.7019539709992649, 'Sts_spearman_max': 0.6966777218203524}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
eval/Nli_dot_accuracy,█▆▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
eval/Nli_manhattan_accuracy,▁▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
eval/Nli_max_accuracy,▁▃▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
eval/Sts_pearson_cosine,▁▄█▄▅▂▄▇▆▆█▇▅▆▆▆█▆▅▆▇▆▇▅██▆▆▆█▇████▇█
eval/Sts_pearson_dot,▁▃▇▅▆▄▇▆▇▇█▇▇▇▇▇██▇▇▇▇█▇█▇▇▇▇████████
eval/Sts_pearson_euclidean,▁▄▆▃▄▃▂▇▅▄▇▅▄▆▅▄▇▄▄▅▇▅▆▅▇█▆▆▅▇▇▇█▇▇▇▇
eval/Sts_pearson_manhattan,▁▄▆▃▃▃▁▇▅▄▇▅▄▆▅▄▇▄▄▅▇▅▆▅▇█▆▆▆▇▇▇█▇▇▇▇
eval/Sts_pearson_max,▁▄▆▃▄▃▂▇▅▄▇▅▄▆▅▄▇▄▄▅▇▅▆▅▇█▆▆▅▇▇▇█▇▇▇▇

0,1
eval/Nli_cosine_accuracy,0.96698
eval/Nli_dot_accuracy,0.03167
eval/Nli_euclidean_accuracy,0.96563
eval/Nli_manhattan_accuracy,0.96534
eval/Nli_max_accuracy,0.96698
eval/Sts_pearson_cosine,0.64498
eval/Sts_pearson_dot,0.61034
eval/Sts_pearson_euclidean,0.67304
eval/Sts_pearson_manhattan,0.67238
eval/Sts_pearson_max,0.67304


[34m[1mwandb[0m: Agent Starting Run: ydhen9m9 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: triplet
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'values': ['softmax', 'cosinesimilarity', 'triplet', 'mnrl']}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,2.6219,0.778884,0.527248,0.603195,0.591659,0.608097,0.586849,0.606517,0.490317,0.546022,0.591659,0.608097,0.889392,0.111186,0.890354,0.890451,0.890451,0.889392
1000,1.4827,0.0,0.517042,0.596188,0.591157,0.605673,0.5813,0.60019,0.4993,0.566895,0.591157,0.605673,0.908837,0.092607,0.908356,0.907971,0.908837,0.908837
1500,1.3327,0.0,0.629338,0.668389,0.675836,0.676698,0.66518,0.670292,0.607578,0.641295,0.675836,0.676698,0.920581,0.080092,0.92241,0.920774,0.92241,0.920581
2000,1.237,0.0,0.594595,0.638559,0.645243,0.649039,0.630206,0.639613,0.570072,0.609331,0.645243,0.649039,0.925298,0.076434,0.927994,0.927031,0.927994,0.925298
2500,1.1434,0.0,0.600093,0.640779,0.647541,0.651152,0.632945,0.641901,0.582903,0.611034,0.647541,0.651152,0.933385,0.068829,0.934251,0.933192,0.934251,0.933385
3000,1.1378,0.0,0.614505,0.642887,0.65376,0.652172,0.640375,0.644491,0.584909,0.60715,0.65376,0.652172,0.941952,0.058529,0.943589,0.943011,0.943589,0.941952
3500,1.0749,0.0,0.632937,0.661619,0.666591,0.667148,0.656554,0.661441,0.615195,0.634287,0.666591,0.667148,0.947343,0.053427,0.949365,0.947728,0.949365,0.947343
4000,1.0562,0.0,0.626321,0.658927,0.664496,0.666916,0.650984,0.658105,0.603347,0.629804,0.664496,0.666916,0.948595,0.051694,0.95052,0.948306,0.95052,0.948595
4500,1.0403,0.0,0.623026,0.657664,0.669156,0.669625,0.651705,0.65878,0.599118,0.621703,0.669156,0.669625,0.953023,0.047844,0.953408,0.953312,0.953408,0.953023
5000,1.0374,0.0,0.658631,0.679788,0.683893,0.684975,0.6732,0.677966,0.640695,0.649298,0.683893,0.684975,0.957162,0.042838,0.956488,0.957162,0.957162,0.957162


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.6825048073682972, 'Sts_spearman_cosine': 0.7110350108334821, 'Sts_pearson_manhattan': 0.7178648422593138, 'Sts_spearman_manhattan': 0.7148834036361065, 'Sts_pearson_euclidean': 0.7003461261109568, 'Sts_spearman_euclidean': 0.7081517066015977, 'Sts_pearson_dot': 0.6705102750955405, 'Sts_spearman_dot': 0.6883081319161176, 'Sts_pearson_max': 0.7178648422593138, 'Sts_spearman_max': 0.7148834036361065}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▅▆▇▇▇███
eval/Nli_dot_accuracy,█▆▅▅▄▃▂▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▅▆▇▇▇███
eval/Nli_manhattan_accuracy,▁▃▄▅▅▆▇▇▇███
eval/Nli_max_accuracy,▁▃▄▅▅▆▇▇▇███
eval/Sts_pearson_cosine,▂▁▇▅▅▆▇▆▆██▇
eval/Sts_pearson_dot,▁▁▆▅▅▅▇▆▆██▇
eval/Sts_pearson_euclidean,▁▁▇▅▅▅▇▆▆███
eval/Sts_pearson_manhattan,▁▁▇▅▅▆▇▇▇███
eval/Sts_pearson_max,▁▁▇▅▅▆▇▇▇███

0,1
eval/Nli_cosine_accuracy,0.95947
eval/Nli_dot_accuracy,0.04139
eval/Nli_euclidean_accuracy,0.95918
eval/Nli_manhattan_accuracy,0.95889
eval/Nli_max_accuracy,0.95947
eval/Sts_pearson_cosine,0.64742
eval/Sts_pearson_dot,0.6291
eval/Sts_pearson_euclidean,0.66932
eval/Sts_pearson_manhattan,0.68308
eval/Sts_pearson_max,0.68308


[34m[1mwandb[0m: Agent Starting Run: 86c8xhbt with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'values': ['softmax', 'cosinesimilarity', 'triplet', 'mnrl']}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.6204,0.029589,0.772318,0.758284,0.769898,0.758265,0.770952,0.759888,0.635886,0.61135,0.772318,0.759888,0.887178,0.1134,0.881787,0.882268,0.887178,0.887178
1000,0.3785,0.010804,0.748059,0.740905,0.747185,0.742693,0.748013,0.743413,0.624004,0.611402,0.748059,0.743413,0.91134,0.090393,0.90412,0.903639,0.91134,0.91134
1500,0.3495,0.002894,0.776661,0.76948,0.774855,0.770274,0.774651,0.770633,0.663498,0.647921,0.776661,0.770633,0.920196,0.083654,0.909992,0.912206,0.920196,0.920196
2000,0.3277,0.008581,0.746782,0.749493,0.763822,0.760485,0.764023,0.760893,0.598226,0.590574,0.764023,0.760893,0.935406,0.068733,0.928668,0.930786,0.935406,0.935406
2500,0.303,0.001235,0.748725,0.747811,0.754591,0.752706,0.754726,0.752868,0.656343,0.649248,0.754726,0.752868,0.941471,0.0567,0.935695,0.935695,0.941471,0.941471
3000,0.3068,0.002215,0.756437,0.751477,0.758752,0.759644,0.759212,0.759321,0.660109,0.649387,0.759212,0.759644,0.948402,0.053331,0.943011,0.944455,0.948402,0.948402
3500,0.3003,0.002628,0.761024,0.757535,0.767667,0.76793,0.7682,0.768005,0.653063,0.640279,0.7682,0.768005,0.950039,0.051502,0.945803,0.947151,0.950039,0.950039
4000,0.3089,0.006383,0.776167,0.769291,0.772383,0.773667,0.773149,0.774775,0.686037,0.673706,0.776167,0.774775,0.954082,0.046496,0.950809,0.951579,0.954082,0.954082
4500,0.2707,0.002281,0.751833,0.750649,0.760173,0.760031,0.761701,0.761602,0.65375,0.645508,0.761701,0.761602,0.959569,0.040913,0.956585,0.957836,0.959569,0.959569
5000,0.2755,0.000779,0.765117,0.760586,0.767485,0.767672,0.768534,0.768918,0.657473,0.647798,0.768534,0.768918,0.966307,0.035425,0.960531,0.961398,0.966307,0.966307


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7932420023036347, 'Sts_spearman_cosine': 0.7910961531105941, 'Sts_pearson_manhattan': 0.7871293529984481, 'Sts_spearman_manhattan': 0.7828040049951362, 'Sts_pearson_euclidean': 0.7884106120206235, 'Sts_spearman_euclidean': 0.7839322019573002, 'Sts_pearson_dot': 0.735430211604571, 'Sts_spearman_dot': 0.7244337406616848, 'Sts_pearson_max': 0.7932420023036347, 'Sts_spearman_max': 0.7910961531105941}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▃▃▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▅▆▆▇▇███
eval/Nli_manhattan_accuracy,▁▃▃▅▆▆▆▇▇███
eval/Nli_max_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Sts_pearson_cosine,▇▁█▁▁▃▄█▂▅▄▄
eval/Sts_pearson_dot,▄▃▆▁▆▆▅█▅▆▆▆
eval/Sts_pearson_euclidean,▇▁█▅▃▄▆█▅▆▅▅
eval/Sts_pearson_manhattan,▇▁█▅▃▄▆▇▄▆▅▅
eval/Sts_pearson_max,▇▁█▅▃▄▆█▄▆▅▅

0,1
eval/Nli_cosine_accuracy,0.96948
eval/Nli_dot_accuracy,0.03109
eval/Nli_euclidean_accuracy,0.96573
eval/Nli_manhattan_accuracy,0.96467
eval/Nli_max_accuracy,0.96948
eval/Sts_pearson_cosine,0.75828
eval/Sts_pearson_dot,0.65881
eval/Sts_pearson_euclidean,0.76432
eval/Sts_pearson_manhattan,0.7636
eval/Sts_pearson_max,0.76432


# Evaluating the importance of the pooling method

In [24]:
# Grid sweep over the pooling strategy: every other sweep parameter and every
# trainer argument is pinned to a single value, so "pooling_method" is the only
# setting that differs between runs.
pooling_candidates = ["mean", "cls", "max"]

# Trainer arguments shared by all runs; each one is wrapped into a fixed
# {"value": ...} sweep entry when the config is assembled below.
pooling_sweep_trainer_args = {
    "output_dir": "models",
    "learning_rate": 2e-5,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "warmup_ratio": 0.1,
    "eval_strategy": "steps",
    "eval_steps": 500,
    "save_strategy": "steps",
    "save_steps": 5000,
    "save_total_limit": 1,
    "logging_steps": 500,
}

config = {
    "method": "grid",
    # Objective declared for the sweep: maximize this evaluation metric.
    "metric": {"goal": "maximize", "name": "eval/Sts_spearman_cosine"},
    "parameters": {
        "trainer": {"value": "mnrl"},
        "similarity_fn_name": {"value": "cosine"},
        "epochs": {"value": 1},
        "batch_size": {"value": 16},
        "pooling_method": {"values": pooling_candidates},
        "bert": {"value": "dictabert"},
        "trainer_args": {
            "parameters": {
                arg_name: {"value": arg_value}
                for arg_name, arg_value in pooling_sweep_trainer_args.items()
            }
        },
    },
}

# Register the sweep with Weights & Biases, then let an agent execute one run
# per pooling candidate; each run calls run_sweep with the sweep config above.
sweep_id = wandb.sweep(project="NLP-final-project", sweep=config)
# Presumably left for re-attaching to an earlier sweep instead of creating a
# new one (verify before use): sweep_id = "qai1hafw"
wandb.agent(
    sweep_id,
    function=lambda: run_sweep(config),
    count=len(pooling_candidates),
)

Create sweep with ID: 9dj5p9vl
Sweep URL: https://wandb.ai/HebSBert/NLP-final-project/sweeps/9dj5p9vl


[34m[1mwandb[0m: Agent Starting Run: xof3t3lt with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'values': ['mean', 'cls', 'max']}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.6204,0.029589,0.772318,0.758284,0.769898,0.758265,0.770952,0.759888,0.635886,0.61135,0.772318,0.759888,0.887178,0.1134,0.881787,0.882268,0.887178,0.887178
1000,0.3785,0.010804,0.748059,0.740905,0.747185,0.742693,0.748013,0.743413,0.624004,0.611402,0.748059,0.743413,0.91134,0.090393,0.90412,0.903639,0.91134,0.91134
1500,0.3495,0.002894,0.776661,0.76948,0.774855,0.770274,0.774651,0.770633,0.663498,0.647921,0.776661,0.770633,0.920196,0.083654,0.909992,0.912206,0.920196,0.920196
2000,0.3277,0.008581,0.746782,0.749493,0.763822,0.760485,0.764023,0.760893,0.598226,0.590574,0.764023,0.760893,0.935406,0.068733,0.928668,0.930786,0.935406,0.935406
2500,0.303,0.001235,0.748725,0.747811,0.754591,0.752706,0.754726,0.752868,0.656343,0.649248,0.754726,0.752868,0.941471,0.0567,0.935695,0.935695,0.941471,0.941471
3000,0.3068,0.002215,0.756437,0.751477,0.758752,0.759644,0.759212,0.759321,0.660109,0.649387,0.759212,0.759644,0.948402,0.053331,0.943011,0.944455,0.948402,0.948402
3500,0.3003,0.002628,0.761024,0.757535,0.767667,0.76793,0.7682,0.768005,0.653063,0.640279,0.7682,0.768005,0.950039,0.051502,0.945803,0.947151,0.950039,0.950039
4000,0.3089,0.006383,0.776167,0.769291,0.772383,0.773667,0.773149,0.774775,0.686037,0.673706,0.776167,0.774775,0.954082,0.046496,0.950809,0.951579,0.954082,0.954082
4500,0.2707,0.002281,0.751833,0.750649,0.760173,0.760031,0.761701,0.761602,0.65375,0.645508,0.761701,0.761602,0.959569,0.040913,0.956585,0.957836,0.959569,0.959569
5000,0.2755,0.000779,0.765117,0.760586,0.767485,0.767672,0.768534,0.768918,0.657473,0.647798,0.768534,0.768918,0.966307,0.035425,0.960531,0.961398,0.966307,0.966307


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7932420023036347, 'Sts_spearman_cosine': 0.7910961531105941, 'Sts_pearson_manhattan': 0.7871293529984481, 'Sts_spearman_manhattan': 0.7828040049951362, 'Sts_pearson_euclidean': 0.7884106120206235, 'Sts_spearman_euclidean': 0.7839322019573002, 'Sts_pearson_dot': 0.735430211604571, 'Sts_spearman_dot': 0.7244337406616848, 'Sts_pearson_max': 0.7932420023036347, 'Sts_spearman_max': 0.7910961531105941}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▃▃▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▅▆▆▇▇███
eval/Nli_manhattan_accuracy,▁▃▃▅▆▆▆▇▇███
eval/Nli_max_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Sts_pearson_cosine,▇▁█▁▁▃▄█▂▅▄▄
eval/Sts_pearson_dot,▄▃▆▁▆▆▅█▅▆▆▆
eval/Sts_pearson_euclidean,▇▁█▅▃▄▆█▅▆▅▅
eval/Sts_pearson_manhattan,▇▁█▅▃▄▆▇▄▆▅▅
eval/Sts_pearson_max,▇▁█▅▃▄▆█▄▆▅▅

0,1
eval/Nli_cosine_accuracy,0.96948
eval/Nli_dot_accuracy,0.03109
eval/Nli_euclidean_accuracy,0.96573
eval/Nli_manhattan_accuracy,0.96467
eval/Nli_max_accuracy,0.96948
eval/Sts_pearson_cosine,0.75828
eval/Sts_pearson_dot,0.65881
eval/Sts_pearson_euclidean,0.76432
eval/Sts_pearson_manhattan,0.7636
eval/Sts_pearson_max,0.76432


[34m[1mwandb[0m: Agent Starting Run: pt0ioe9e with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: cls
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'values': ['mean', 'cls', 'max']}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.8153,0.067614,0.775649,0.758904,0.774378,0.76065,0.77564,0.761491,0.760684,0.736002,0.775649,0.761491,0.877358,0.120235,0.875722,0.877166,0.877358,0.877358
1000,0.4207,0.010133,0.748414,0.746727,0.754676,0.749037,0.754797,0.748734,0.736018,0.730774,0.754797,0.749037,0.907971,0.090296,0.905275,0.907297,0.907971,0.907971
1500,0.3764,0.001987,0.783614,0.775532,0.778946,0.774324,0.779179,0.77476,0.778613,0.769066,0.783614,0.775532,0.916731,0.081729,0.914228,0.914806,0.916731,0.916731
2000,0.3613,0.002396,0.762422,0.762199,0.765835,0.764419,0.766877,0.765481,0.745871,0.741873,0.766877,0.765481,0.928283,0.070851,0.926454,0.927513,0.928283,0.928283
2500,0.3347,0.001887,0.750131,0.746602,0.752016,0.749959,0.752369,0.750512,0.734013,0.727964,0.752369,0.750512,0.938005,0.058914,0.935214,0.935984,0.938005,0.938005
3000,0.3405,0.00032,0.761894,0.757449,0.761529,0.760179,0.762379,0.761116,0.746363,0.738157,0.762379,0.761116,0.9433,0.05516,0.94099,0.94176,0.9433,0.9433
3500,0.327,0.000657,0.765411,0.760791,0.762987,0.762826,0.764135,0.763333,0.753145,0.74629,0.765411,0.763333,0.949076,0.050347,0.947439,0.946958,0.949076,0.949076
4000,0.3325,0.004543,0.770451,0.762013,0.763813,0.762263,0.765018,0.763439,0.758549,0.748426,0.770451,0.763439,0.951097,0.04794,0.950039,0.950135,0.951097,0.951097
4500,0.3016,0.005418,0.756442,0.751649,0.754352,0.7535,0.755878,0.754067,0.744417,0.737785,0.756442,0.754067,0.955044,0.044282,0.953889,0.953985,0.955044,0.955044
5000,0.3009,0.001303,0.764797,0.760178,0.762352,0.762786,0.76337,0.763772,0.751501,0.745104,0.764797,0.763772,0.960146,0.03918,0.958317,0.959087,0.960146,0.960146


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7937795687547732, 'Sts_spearman_cosine': 0.7931615246741368, 'Sts_pearson_manhattan': 0.7953429705625565, 'Sts_spearman_manhattan': 0.7929631037566001, 'Sts_pearson_euclidean': 0.7956373378402196, 'Sts_spearman_euclidean': 0.7932811638746345, 'Sts_pearson_dot': 0.7892564954549324, 'Sts_spearman_dot': 0.7864207516176439, 'Sts_pearson_max': 0.7956373378402196, 'Sts_spearman_max': 0.7932811638746345}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▄▄▅▆▆▇▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▃▂▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▆▆▇▇▇███
eval/Nli_manhattan_accuracy,▁▃▄▅▆▆▇▇▇███
eval/Nli_max_accuracy,▁▄▄▅▆▆▇▇▇███
eval/Sts_pearson_cosine,▆▁█▄▁▄▄▅▃▄▄▄
eval/Sts_pearson_dot,▅▁█▃▁▃▄▅▃▄▃▃
eval/Sts_pearson_euclidean,▇▂█▅▁▄▄▄▂▄▃▃
eval/Sts_pearson_manhattan,▇▂█▅▁▃▄▄▂▄▃▃
eval/Sts_pearson_max,▆▂█▄▁▃▄▅▂▄▃▃

0,1
eval/Nli_cosine_accuracy,0.96226
eval/Nli_dot_accuracy,0.03543
eval/Nli_euclidean_accuracy,0.96169
eval/Nli_manhattan_accuracy,0.96024
eval/Nli_max_accuracy,0.96226
eval/Sts_pearson_cosine,0.76149
eval/Sts_pearson_dot,0.74561
eval/Sts_pearson_euclidean,0.76111
eval/Sts_pearson_manhattan,0.76035
eval/Sts_pearson_max,0.76149


[34m[1mwandb[0m: Agent Starting Run: ujg8b4ih with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: max
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'values': ['mean', 'cls', 'max']}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'value': 2e-05}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,1.0961,0.010329,0.74152,0.722266,0.738737,0.718367,0.738758,0.719242,0.736424,0.712749,0.74152,0.722266,0.86571,0.133808,0.865229,0.865133,0.86571,0.86571
1000,0.4857,0.004545,0.736864,0.728123,0.736375,0.725517,0.737031,0.725776,0.732634,0.720653,0.737031,0.728123,0.894012,0.104447,0.893435,0.894109,0.894109,0.894012
1500,0.4305,0.004999,0.775763,0.763552,0.767647,0.756753,0.76815,0.757886,0.77751,0.76291,0.77751,0.763552,0.904987,0.093858,0.902869,0.902772,0.904987,0.904987
2000,0.3936,0.001753,0.760998,0.756392,0.76,0.753491,0.760502,0.75412,0.756678,0.74956,0.760998,0.756392,0.920581,0.076627,0.917597,0.918752,0.920581,0.920581
2500,0.3606,0.000657,0.753293,0.749958,0.749306,0.745614,0.749628,0.74577,0.748283,0.742014,0.753293,0.749958,0.930978,0.067385,0.929534,0.9304,0.930978,0.930978
3000,0.3656,0.001433,0.758771,0.751903,0.753394,0.74766,0.752786,0.747334,0.758291,0.750586,0.758771,0.751903,0.939161,0.059588,0.937909,0.938487,0.939161,0.939161
3500,0.3512,0.001225,0.767586,0.760862,0.758839,0.755402,0.759079,0.75528,0.767313,0.759868,0.767586,0.760862,0.943878,0.056026,0.942915,0.94407,0.94407,0.943878
4000,0.3513,0.015988,0.78079,0.770574,0.766405,0.763747,0.766025,0.763808,0.782158,0.770597,0.782158,0.770597,0.946958,0.052946,0.945995,0.946284,0.946958,0.946958
4500,0.3193,0.005458,0.759735,0.754686,0.752502,0.749465,0.752082,0.749397,0.760436,0.753151,0.760436,0.754686,0.953119,0.046303,0.95206,0.952734,0.953119,0.953119
5000,0.323,0.001191,0.771317,0.766275,0.7623,0.762719,0.761799,0.762313,0.770338,0.763638,0.771317,0.766275,0.95774,0.042742,0.957066,0.957258,0.95774,0.95774


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7844997571545432, 'Sts_spearman_cosine': 0.7864185843262981, 'Sts_pearson_manhattan': 0.7881926441665078, 'Sts_spearman_manhattan': 0.7833781587894545, 'Sts_pearson_euclidean': 0.7875443679789712, 'Sts_spearman_euclidean': 0.7826657834656103, 'Sts_pearson_dot': 0.783667409011653, 'Sts_spearman_dot': 0.7842379130183798, 'Sts_pearson_max': 0.7881926441665078, 'Sts_spearman_max': 0.7864185843262981}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▆▆▇▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▂▂▂▁▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▆▆▇▇▇███
eval/Nli_manhattan_accuracy,▁▃▄▅▆▆▇▇▇███
eval/Nli_max_accuracy,▁▃▄▅▆▆▇▇▇███
eval/Sts_pearson_cosine,▂▁▇▅▄▄▆█▅▆▅▆
eval/Sts_pearson_dot,▂▁▇▄▃▅▆█▅▆▅▆
eval/Sts_pearson_euclidean,▁▁█▆▄▅▆█▄▇▄▆
eval/Sts_pearson_manhattan,▂▁█▆▄▅▆█▅▇▅▆
eval/Sts_pearson_max,▂▁▇▅▄▄▆█▅▆▅▆

0,1
eval/Nli_cosine_accuracy,0.96044
eval/Nli_dot_accuracy,0.03966
eval/Nli_euclidean_accuracy,0.96015
eval/Nli_manhattan_accuracy,0.95966
eval/Nli_max_accuracy,0.96044
eval/Sts_pearson_cosine,0.76721
eval/Sts_pearson_dot,0.76647
eval/Sts_pearson_euclidean,0.75835
eval/Sts_pearson_manhattan,0.7592
eval/Sts_pearson_max,0.76721


# Evaluating the importance of the learning rate

In [24]:
# Grid sweep over the learning rate: the top-level sweep parameters and all
# other trainer arguments are pinned, so "learning_rate" is the only setting
# that differs between runs.
# NOTE(review): the candidates are not evenly spaced (2e-4, 2e-5, then 2e-8);
# confirm the last value is intentional.
learning_rate_candidates = [2e-4, 2e-5, 2e-8]

# Top-level sweep parameters held constant for every run.
lr_sweep_fixed_params = {
    "trainer": "mnrl",
    "similarity_fn_name": "cosine",
    "epochs": 1,
    "batch_size": 16,
    "pooling_method": "mean",
    "bert": "dictabert",
}

# Trainer arguments held constant for every run (everything after
# "learning_rate" in the trainer_args block).
lr_sweep_fixed_trainer_args = {
    "num_train_epochs": 1,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "warmup_ratio": 0.1,
    "eval_strategy": "steps",
    "eval_steps": 500,
    "save_strategy": "steps",
    "save_steps": 5000,
    "save_total_limit": 1,
    "logging_steps": 500,
}

config = {
    "method": "grid",
    # Objective declared for the sweep: maximize this evaluation metric.
    "metric": {"goal": "maximize", "name": "eval/Sts_spearman_cosine"},
    "parameters": {
        **{
            param_name: {"value": param_value}
            for param_name, param_value in lr_sweep_fixed_params.items()
        },
        "trainer_args": {
            "parameters": {
                "output_dir": {"value": "models"},
                "learning_rate": {"values": learning_rate_candidates},
                **{
                    arg_name: {"value": arg_value}
                    for arg_name, arg_value in lr_sweep_fixed_trainer_args.items()
                },
            }
        },
    },
}

# Register the sweep with Weights & Biases, then let an agent execute one run
# per learning-rate candidate; each run calls run_sweep with the config above.
sweep_id = wandb.sweep(project="NLP-final-project", sweep=config)
# Presumably left for re-attaching to an earlier sweep instead of creating a
# new one (verify before use): sweep_id = "qai1hafw"
wandb.agent(
    sweep_id,
    function=lambda: run_sweep(config),
    count=len(learning_rate_candidates),
)

Create sweep with ID: umu2xrki
Sweep URL: https://wandb.ai/HebSBert/NLP-final-project/sweeps/umu2xrki


[34m[1mwandb[0m: Agent Starting Run: 3shq8cgo with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 0.0002, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'values': [0.0002, 2e-05, 2e-08]}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.5608,0.003299,0.699664,0.688245,0.700372,0.688602,0.699678,0.687808,0.635373,0.610354,0.700372,0.688602,0.869754,0.127744,0.860705,0.862245,0.869754,0.869754
1000,0.7127,0.014584,0.654125,0.638426,0.665968,0.646053,0.664244,0.644355,0.613317,0.587832,0.665968,0.646053,0.842126,0.155949,0.839623,0.840585,0.842126,0.842126
1500,0.738,0.042322,0.602676,0.588763,0.61982,0.603446,0.620461,0.603261,0.53822,0.517049,0.620461,0.603446,0.833365,0.163362,0.824605,0.825279,0.833365,0.833365
2000,0.7252,0.003914,0.638178,0.61827,0.638482,0.614378,0.639022,0.615194,0.630576,0.603483,0.639022,0.61827,0.863304,0.134097,0.860993,0.861379,0.863304,0.863304
2500,0.6764,0.019533,0.643465,0.630359,0.660059,0.640463,0.659404,0.639785,0.589215,0.568116,0.660059,0.640463,0.876203,0.12293,0.873508,0.873604,0.876203,0.876203
3000,0.6586,0.036264,0.631502,0.61571,0.649772,0.630283,0.650551,0.630755,0.559353,0.533567,0.650551,0.630755,0.868502,0.128032,0.863015,0.862919,0.868502,0.868502
3500,0.6278,0.012443,0.64098,0.628257,0.64929,0.632194,0.651111,0.634982,0.601531,0.583904,0.651111,0.634982,0.883423,0.111475,0.879765,0.881305,0.883423,0.883423
4000,0.6147,0.017735,0.641611,0.639311,0.639573,0.648543,0.640118,0.648927,0.575549,0.585047,0.641611,0.648927,0.893338,0.105121,0.892953,0.89382,0.89382,0.893338
4500,0.5413,0.00389,0.663412,0.66169,0.680137,0.670745,0.681319,0.672515,0.615895,0.608443,0.681319,0.672515,0.915383,0.084136,0.91442,0.913747,0.915383,0.915383
5000,0.4965,0.002404,0.67516,0.666379,0.687249,0.676848,0.688631,0.67837,0.634948,0.621628,0.688631,0.67837,0.926069,0.073161,0.922699,0.922988,0.926069,0.926069


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.705169757890988, 'Sts_spearman_cosine': 0.7032846286034432, 'Sts_pearson_manhattan': 0.7194948915911035, 'Sts_spearman_manhattan': 0.7121402590876722, 'Sts_pearson_euclidean': 0.7202124556831114, 'Sts_spearman_euclidean': 0.7122621533597242, 'Sts_pearson_dot': 0.6729893014945529, 'Sts_spearman_dot': 0.6637695130370123, 'Sts_pearson_max': 0.7202124556831114, 'Sts_spearman_max': 0.7122621533597242}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▃▂▁▃▄▃▄▅▆▇▇█
eval/Nli_dot_accuracy,▆██▆▅▆▅▄▃▂▂▁
eval/Nli_euclidean_accuracy,▃▂▁▃▄▃▄▅▆▇▇█
eval/Nli_manhattan_accuracy,▃▂▁▃▄▃▄▅▆▇▇█
eval/Nli_max_accuracy,▃▂▁▃▄▃▄▅▆▇▇█
eval/Sts_pearson_cosine,█▅▁▄▄▃▄▄▅▆▇▇
eval/Sts_pearson_dot,█▆▁█▅▃▆▄▇██▇
eval/Sts_pearson_euclidean,█▅▁▃▄▄▄▃▆▇▇▇
eval/Sts_pearson_manhattan,█▅▁▃▄▄▄▃▆▇▇▇
eval/Sts_pearson_max,█▅▁▃▄▄▄▃▆▇▇▇

0,1
eval/Nli_cosine_accuracy,0.94561
eval/Nli_dot_accuracy,0.05189
eval/Nli_euclidean_accuracy,0.94388
eval/Nli_manhattan_accuracy,0.94349
eval/Nli_max_accuracy,0.94561
eval/Sts_pearson_cosine,0.68165
eval/Sts_pearson_dot,0.62596
eval/Sts_pearson_euclidean,0.69005
eval/Sts_pearson_manhattan,0.68948
eval/Sts_pearson_max,0.69005


[34m[1mwandb[0m: Agent Starting Run: vbafwmkc with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-05, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'values': [0.0002, 2e-05, 2e-08]}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,0.6204,0.029589,0.772318,0.758284,0.769898,0.758265,0.770952,0.759888,0.635886,0.61135,0.772318,0.759888,0.887178,0.1134,0.881787,0.882268,0.887178,0.887178
1000,0.3785,0.010804,0.748059,0.740905,0.747185,0.742693,0.748013,0.743413,0.624004,0.611402,0.748059,0.743413,0.91134,0.090393,0.90412,0.903639,0.91134,0.91134
1500,0.3495,0.002894,0.776661,0.76948,0.774855,0.770274,0.774651,0.770633,0.663498,0.647921,0.776661,0.770633,0.920196,0.083654,0.909992,0.912206,0.920196,0.920196
2000,0.3277,0.008581,0.746782,0.749493,0.763822,0.760485,0.764023,0.760893,0.598226,0.590574,0.764023,0.760893,0.935406,0.068733,0.928668,0.930786,0.935406,0.935406
2500,0.303,0.001235,0.748725,0.747811,0.754591,0.752706,0.754726,0.752868,0.656343,0.649248,0.754726,0.752868,0.941471,0.0567,0.935695,0.935695,0.941471,0.941471
3000,0.3068,0.002215,0.756437,0.751477,0.758752,0.759644,0.759212,0.759321,0.660109,0.649387,0.759212,0.759644,0.948402,0.053331,0.943011,0.944455,0.948402,0.948402
3500,0.3003,0.002628,0.761024,0.757535,0.767667,0.76793,0.7682,0.768005,0.653063,0.640279,0.7682,0.768005,0.950039,0.051502,0.945803,0.947151,0.950039,0.950039
4000,0.3089,0.006383,0.776167,0.769291,0.772383,0.773667,0.773149,0.774775,0.686037,0.673706,0.776167,0.774775,0.954082,0.046496,0.950809,0.951579,0.954082,0.954082
4500,0.2707,0.002281,0.751833,0.750649,0.760173,0.760031,0.761701,0.761602,0.65375,0.645508,0.761701,0.761602,0.959569,0.040913,0.956585,0.957836,0.959569,0.959569
5000,0.2755,0.000779,0.765117,0.760586,0.767485,0.767672,0.768534,0.768918,0.657473,0.647798,0.768534,0.768918,0.966307,0.035425,0.960531,0.961398,0.966307,0.966307


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7932420023036347, 'Sts_spearman_cosine': 0.7910961531105941, 'Sts_pearson_manhattan': 0.7871293529984481, 'Sts_spearman_manhattan': 0.7828040049951362, 'Sts_pearson_euclidean': 0.7884106120206235, 'Sts_spearman_euclidean': 0.7839322019573002, 'Sts_pearson_dot': 0.735430211604571, 'Sts_spearman_dot': 0.7244337406616848, 'Sts_pearson_max': 0.7932420023036347, 'Sts_spearman_max': 0.7910961531105941}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Nli_dot_accuracy,█▆▅▄▃▃▃▂▂▁▁▁
eval/Nli_euclidean_accuracy,▁▃▄▅▅▆▆▇▇███
eval/Nli_manhattan_accuracy,▁▃▃▅▆▆▆▇▇███
eval/Nli_max_accuracy,▁▃▄▅▆▆▆▇▇███
eval/Sts_pearson_cosine,▇▁█▁▁▃▄█▂▅▄▄
eval/Sts_pearson_dot,▄▃▆▁▆▆▅█▅▆▆▆
eval/Sts_pearson_euclidean,▇▁█▅▃▄▆█▅▆▅▅
eval/Sts_pearson_manhattan,▇▁█▅▃▄▆▇▄▆▅▅
eval/Sts_pearson_max,▇▁█▅▃▄▆█▄▆▅▅

0,1
eval/Nli_cosine_accuracy,0.96948
eval/Nli_dot_accuracy,0.03109
eval/Nli_euclidean_accuracy,0.96573
eval/Nli_manhattan_accuracy,0.96467
eval/Nli_max_accuracy,0.96948
eval/Sts_pearson_cosine,0.75828
eval/Sts_pearson_dot,0.65881
eval/Sts_pearson_euclidean,0.76432
eval/Sts_pearson_manhattan,0.7636
eval/Sts_pearson_max,0.76432


[34m[1mwandb[0m: Agent Starting Run: 9eqy2fle with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	bert: dictabert
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	pooling_method: mean
[34m[1mwandb[0m: 	similarity_fn_name: cosine
[34m[1mwandb[0m: 	trainer: mnrl
[34m[1mwandb[0m: 	trainer_args: {'eval_steps': 500, 'eval_strategy': 'steps', 'learning_rate': 2e-08, 'logging_steps': 500, 'num_train_epochs': 1, 'output_dir': 'models', 'per_device_eval_batch_size': 16, 'per_device_train_batch_size': 16, 'save_steps': 5000, 'save_strategy': 'steps', 'save_total_limit': 1, 'warmup_ratio': 0.1}


config: {'method': 'grid', 'metric': {'goal': 'maximize', 'name': 'eval/Sts_spearman_cosine'}, 'parameters': {'trainer': {'value': 'mnrl'}, 'similarity_fn_name': {'value': 'cosine'}, 'epochs': {'value': 1}, 'batch_size': {'value': 16}, 'pooling_method': {'value': 'mean'}, 'bert': {'value': 'dictabert'}, 'trainer_args': {'parameters': {'output_dir': {'value': 'models'}, 'learning_rate': {'values': [0.0002, 2e-05, 2e-08]}, 'num_train_epochs': {'value': 1}, 'per_device_train_batch_size': {'value': 16}, 'per_device_eval_batch_size': {'value': 16}, 'warmup_ratio': {'value': 0.1}, 'eval_strategy': {'value': 'steps'}, 'eval_steps': {'value': 500}, 'save_strategy': {'value': 'steps'}, 'save_steps': {'value': 5000}, 'save_total_limit': {'value': 1}, 'logging_steps': {'value': 500}}}}}




VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113100688887951, max=1.0…

Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Sts Pearson Cosine,Sts Spearman Cosine,Sts Pearson Manhattan,Sts Spearman Manhattan,Sts Pearson Euclidean,Sts Spearman Euclidean,Sts Pearson Dot,Sts Spearman Dot,Sts Pearson Max,Sts Spearman Max,Nli Cosine Accuracy,Nli Dot Accuracy,Nli Manhattan Accuracy,Nli Euclidean Accuracy,Nli Max Accuracy,Sequential Score
500,1.1598,0.238276,0.644954,0.625658,0.641448,0.62664,0.640253,0.626386,0.273029,0.281478,0.644954,0.62664,0.736619,0.271948,0.733635,0.728918,0.736619,0.736619
1000,1.1162,0.230104,0.649057,0.629333,0.644744,0.629645,0.643661,0.629151,0.287443,0.293114,0.649057,0.629645,0.738833,0.267424,0.735079,0.729784,0.738833,0.738833
1500,1.0646,0.223251,0.652772,0.632673,0.647834,0.632232,0.646844,0.632216,0.300196,0.303019,0.652772,0.632673,0.740277,0.264632,0.735368,0.729881,0.740277,0.740277
2000,1.0351,0.217695,0.655967,0.63515,0.650519,0.634381,0.649607,0.634539,0.3113,0.312341,0.655967,0.63515,0.742876,0.261552,0.737004,0.730843,0.742876,0.742876
2500,1.012,0.213145,0.658704,0.637847,0.652853,0.63646,0.652015,0.636341,0.320707,0.319617,0.658704,0.637847,0.744513,0.259338,0.737101,0.731132,0.744513,0.744513
3000,0.998,0.209445,0.660934,0.63991,0.654789,0.638267,0.654004,0.638008,0.328394,0.32585,0.660934,0.63991,0.745572,0.257894,0.737197,0.731613,0.745572,0.745572
3500,0.9908,0.20638,0.662859,0.64168,0.656468,0.640218,0.655729,0.63949,0.334961,0.331309,0.662859,0.64168,0.746534,0.256835,0.737197,0.732961,0.746534,0.746534
4000,0.9708,0.203873,0.664358,0.64307,0.657792,0.641475,0.657097,0.640951,0.340096,0.33557,0.664358,0.64307,0.74692,0.255968,0.737197,0.733924,0.74692,0.74692
4500,0.9645,0.201978,0.66559,0.644088,0.658864,0.642633,0.658204,0.642122,0.344385,0.339284,0.66559,0.644088,0.748652,0.255487,0.737967,0.734501,0.748652,0.748652
5000,0.9482,0.200562,0.666498,0.644804,0.659668,0.643359,0.659034,0.643007,0.34751,0.342027,0.666498,0.644804,0.748749,0.254813,0.738448,0.735368,0.748749,0.748749


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

testing now..


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

test results: {'Sts_pearson_cosine': 0.7052239050394219, 'Sts_spearman_cosine': 0.7066636563362375, 'Sts_pearson_manhattan': 0.7001065797818615, 'Sts_spearman_manhattan': 0.6930022339359652, 'Sts_pearson_euclidean': 0.7001058661834826, 'Sts_spearman_euclidean': 0.6934368961330774, 'Sts_pearson_dot': 0.44867442507757327, 'Sts_spearman_dot': 0.4452585958751602, 'Sts_pearson_max': 0.7052239050394219, 'Sts_spearman_max': 0.7066636563362375}


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Nli_cosine_accuracy,▁▂▃▅▅▆▇▇████
eval/Nli_dot_accuracy,█▆▅▄▃▂▂▂▁▁▁▁
eval/Nli_euclidean_accuracy,▁▂▂▃▃▄▅▆▆▇▇█
eval/Nli_manhattan_accuracy,▁▃▄▆▆▆▆▆▇███
eval/Nli_max_accuracy,▁▂▃▅▅▆▇▇████
eval/Sts_pearson_cosine,▁▂▃▄▅▆▇▇▇███
eval/Sts_pearson_dot,▁▂▃▄▅▆▇▇▇███
eval/Sts_pearson_euclidean,▁▂▃▄▅▆▇▇▇███
eval/Sts_pearson_manhattan,▁▂▃▄▅▆▇▇▇███
eval/Sts_pearson_max,▁▂▃▄▅▆▇▇▇███

0,1
eval/Nli_cosine_accuracy,0.74913
eval/Nli_dot_accuracy,0.25433
eval/Nli_euclidean_accuracy,0.73643
eval/Nli_manhattan_accuracy,0.73816
eval/Nli_max_accuracy,0.74913
eval/Sts_pearson_cosine,0.66742
eval/Sts_pearson_dot,0.35068
eval/Sts_pearson_euclidean,0.65988
eval/Sts_pearson_manhattan,0.66049
eval/Sts_pearson_max,0.66742
