### Setup

In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("GPU is available and will be used.")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU.")

GPU is available and will be used.


In [2]:
!nvidia-smi

Mon Feb 17 03:49:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 6000 Ada Gene...    On  |   00000000:01:00.0 Off |                  Off |
| 30%   45C    P2             88W /  300W |    2060MiB /  49140MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os
import json
import random
import pandas as pd
import numpy as np
from transformers import (
    BertTokenizer, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling,
    TapasTokenizer, TapasForMaskedLM,
    AdamW, get_scheduler
)
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from model_complete import JSONBERT_COMPLETE
from no_cl import JSONBERT_INTERPOLATE
from no_ip_alpha_0 import JSONBERT_NEWLOSS_0
from no_ip_alpha_1 import JSONBERT_NEWLOSS_1
from dataset import JSONDataset, JSONDataCollator, create_data

import sys
sys.path.append('/root/woojun/')

from utils import (
    _serialize_vanilla,
    _serialize,
    tokenize_table,
    _find_positions,
    mask_entry,
    predict_masked_tokens,
    evaluate_masked_prediction,
    get_table_embedding,
    prepare_Xy,
    train_eval_rf
)

  from .autonotebook import tqdm as notebook_tqdm


GPU is available and will be used.


In [4]:
# import importlib
# import utils

# importlib.reload(utils)

In [5]:
# Tokenizer & config

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained('bert-base-uncased')

### Models

#### From pre-trained

* BERT
* TaPas
* TaBERT

In [6]:
# BERT
bert_base = BertForMaskedLM.from_pretrained('bert-base-uncased')
bert_base = bert_base.to(device)


# TaPas
tapas_name = "google/tapas-base-masklm"
tapas_tokenizer = TapasTokenizer.from_pretrained(tapas_name)
tapas = TapasForMaskedLM.from_pretrained(tapas_name)
tapas.to(device)


# # TaBERT
tabert = BertForMaskedLM.from_pretrained('bert-base-uncased')

model_path = './TaBERT/tabert_base_k1'
tabert_state_dict = torch.load(os.path.join(model_path, "model.bin"))

state_dict = {}
for key, value in state_dict.items():
    new_key = key.replace("_bert_model.", "")
    state_dict[new_key] = value

tabert.load_state_dict(state_dict, strict=False)
tabert = tabert.to(device)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model t

#### Domain-specific pre-trained

* Ours
* -no CL (lambda = 0)
* -no Interpolation (alpha = 0)
* -no Interpolation (alpha = 1)
* -no Header Embedding Layer (alpha = 1, lambda = 0)
---
* BERT trained with text serialization

In [7]:
# Product

ours_path_product = './models/product_complete/epoch-9'
no_cl_path_product = './models/product_no_cl/epoch-9'
alpha_0_path_product = './models/product_alpha_0/epoch-9'
alpha_1_path_product = './models/product_alpha_1/epoch-9'
no_hel_path_product = './models/product_no_hel/epoch-9'
bert_path_product = './models/product_bert/epoch-9'

ours_product = JSONBERT_COMPLETE(config, tokenizer, ours_path_product)
ours_product = ours_product.to(device)

no_cl_product = JSONBERT_INTERPOLATE(config, tokenizer, no_cl_path_product)
no_cl_product = no_cl_product.to(device)

alpha_0_product = JSONBERT_NEWLOSS_0(config, tokenizer, alpha_0_path_product)
alpha_0_product = alpha_0_product.to(device)

alpha_1_product = JSONBERT_NEWLOSS_1(config, tokenizer, alpha_1_path_product)
alpha_1_product = alpha_1_product.to(device)

no_hel_product = BertForMaskedLM.from_pretrained(no_hel_path_product, local_files_only=True)
no_hel_product = no_hel_product.to(device)

bert_product = BertForMaskedLM.from_pretrained(bert_path_product, local_files_only=True)
bert_product = bert_product.to(device)

Key embeddings are trainable!
Pre-trained JSONBERT loaded from ./models/product_complete/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_INTERPOLATE loaded from ./models/product_no_cl/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_NEWLOSS loaded from ./models/product_alpha_0/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_NEWLOSS loaded from ./models/product_alpha_1/epoch-9


In [8]:
# Movie

ours_path_movie = './models/movie_complete/epoch-9'
no_cl_path_movie = './models/movie_no_cl/epoch-9'
alpha_0_path_movie = './models/movie_alpha_0/epoch-9'
alpha_1_path_movie = './models/movie_alpha_1/epoch-9'
no_hel_path_movie = './models/movie_no_hel/epoch-9'
bert_path_movie = './models/movie_bert/epoch-9'

ours_movie = JSONBERT_COMPLETE(config, tokenizer, ours_path_movie)
ours_movie = ours_movie.to(device)

no_cl_movie = JSONBERT_INTERPOLATE(config, tokenizer, no_cl_path_movie)
no_cl_movie = no_cl_movie.to(device)

alpha_0_movie = JSONBERT_NEWLOSS_0(config, tokenizer, alpha_0_path_movie)
alpha_0_movie = alpha_0_movie.to(device)

alpha_1_movie = JSONBERT_NEWLOSS_1(config, tokenizer, alpha_1_path_movie)
alpha_1_movie = alpha_1_movie.to(device)

no_hel_movie = BertForMaskedLM.from_pretrained(no_hel_path_movie, local_files_only=True)
no_hel_movie = no_hel_movie.to(device)

bert_movie = BertForMaskedLM.from_pretrained(bert_path_movie, local_files_only=True)
bert_movie = bert_movie.to(device)

Key embeddings are trainable!
Pre-trained JSONBERT loaded from ./models/movie_complete/epoch-9


### Masked Prediction

In [None]:
# Data
pretraining_movie_path = './data/pretraining_data_movie.jsonl'
pretraining_product_path = './data/pretraining_data_product.jsonl'

movie_path = './data/Movie_top100'
product_path = './data/Product_top100'

movie = create_data(movie_path, path_is="test", sample_num=20, pretraining_path=pretraining_movie_path)
product = create_data(product_path, path_is="test", sample_num=20, pretraining_path=pretraining_product_path)

In [None]:
len(movie)

In [None]:
len(product)

#### Evaluate Masked Prediction

**In-domain MP**
* Unseen rows when pre-training
* Additional 2 columns that are unseen during pre-training
* Trained with ___ samples, tested with 1000 samples

**Cross-domain MP**
* Tested on unseen domains for BERT & Our model

In [None]:
# In-domain: Movie

# Pre-trained: BERT, TaPas, TaBERT
evaluate_masked_prediction(movie, 'Key', bert_base, tokenizer)
evaluate_masked_prediction(movie, 'Key', tapas, tapas_tokenizer)
evaluate_masked_prediction(movie, 'Key', tabert, tokenizer)

evaluate_masked_prediction(movie, 'Value', bert_base, tokenizer)
evaluate_masked_prediction(movie, 'Value', tapas, tapas_tokenizer)
evaluate_masked_prediction(movie, 'Value', tabert, tokenizer)


# Domain-specific pre-trained: Ours, No CL, No IP_a0, No IP_a1, No HEL, trained BERT
evaluate_masked_prediction(movie, 'Key', ours_movie, tokenizer)
evaluate_masked_prediction(movie, 'Key', no_cl_movie, tokenizer)
evaluate_masked_prediction(movie, 'Key', alpha_0_movie, tokenizer)
evaluate_masked_prediction(movie, 'Key', alpha_1_movie, tokenizer)
evaluate_masked_prediction(movie, 'Key', no_hel_movie, tokenizer)
evaluate_masked_prediction(movie, 'Key', bert_movie, tokenizer)

evaluate_masked_prediction(movie, 'Value', ours_movie, tokenizer)
evaluate_masked_prediction(movie, 'Value', no_cl_movie, tokenizer)
evaluate_masked_prediction(movie, 'Value', alpha_0_movie, tokenizer)
evaluate_masked_prediction(movie, 'Value', alpha_1_movie, tokenizer)
evaluate_masked_prediction(movie, 'Value', no_hel_movie, tokenizer)
evaluate_masked_prediction(movie, 'Value', bert_movie, tokenizer)

In [None]:
# In-domain: Product

# Pre-trained: BERT, TaPas, TaBERT
evaluate_masked_prediction(product, 'Key', bert_base, tokenizer)
evaluate_masked_prediction(product, 'Key', tapas, tapas_tokenizer)
evaluate_masked_prediction(product, 'Key', tabert, tokenizer)

evaluate_masked_prediction(product, 'Value', bert_base, tokenizer)
evaluate_masked_prediction(product, 'Value', tapas, tapas_tokenizer)
evaluate_masked_prediction(product, 'Value', tabert, tokenizer)


# Domain-specific pre-trained: Ours, No CL, No IP_a0, No IP_a1, No HEL, trained BERT
evaluate_masked_prediction(product, 'Key', ours_product, tokenizer)
evaluate_masked_prediction(product, 'Key', no_cl_product, tokenizer)
evaluate_masked_prediction(product, 'Key', alpha_0_product, tokenizer)
evaluate_masked_prediction(product, 'Key', alpha_1_product, tokenizer)
evaluate_masked_prediction(product, 'Key', no_hel_product, tokenizer)
evaluate_masked_prediction(product, 'Key', bert_product, tokenizer)

evaluate_masked_prediction(product, 'Value', ours_product, tokenizer)
evaluate_masked_prediction(product, 'Value', no_cl_product, tokenizer)
evaluate_masked_prediction(product, 'Value', alpha_0_product, tokenizer)
evaluate_masked_prediction(product, 'Value', alpha_1_product, tokenizer)
evaluate_masked_prediction(product, 'Value', no_hel_product, tokenizer)
evaluate_masked_prediction(product, 'Value', bert_product, tokenizer)

In [None]:
# Cross-domain

# Trained on Product -> Tested on Movie
evaluate_masked_prediction(movie, 'Key', ours_product, tokenizer)
evaluate_masked_prediction(movie, 'Value', ours_product, tokenizer)

evaluate_masked_prediction(movie, 'Key', bert_product, tokenizer)
evaluate_masked_prediction(movie, 'Value', bert_product, tokenizer)


# Trained on Movie -> Tested on Product
evaluate_masked_prediction(product, 'Key', ours_movie, tokenizer)
evaluate_masked_prediction(product, 'Value', ours_movie, tokenizer)

evaluate_masked_prediction(product, 'Key', bert_movie, tokenizer)
evaluate_masked_prediction(product, 'Value', bert_movie, tokenizer)

#### Tuning Lambda

In [None]:
# # Load models

# ours_movie_lambda_04 = JSONBERT_COMPLETE(config, tokenizer, "models/lambda_04_movie/epoch-9", lambda_align=0.4)
# ours_movie_lambda_045 = JSONBERT_COMPLETE(config, tokenizer, "models/lambda_045_movie/epoch-9", lambda_align=0.45)

# ours_movie_lambda_04 = ours_movie_lambda_04.to(device)
# ours_movie_lambda_045 = ours_movie_lambda_045.to(device)

# ours_product_lambda_04 = JSONBERT_COMPLETE(config, tokenizer, "models/lambda_04_product/epoch-9", lambda_align=0.4)
# ours_product_lambda_045 = JSONBERT_COMPLETE(config, tokenizer, "models/lambda_045_product/epoch-9", lambda_align=0.45)

# ours_product_lambda_04 = ours_product_lambda_04.to(device)
# ours_product_lambda_045 = ours_product_lambda_045.to(device)

In [None]:
# # In-domain: Movie

# evaluate_masked_prediction(movie, 'Key', ours_movie_lambda_04, tokenizer)
# evaluate_masked_prediction(movie, 'Key', ours_movie_lambda_045, tokenizer)

# evaluate_masked_prediction(movie, 'Value', ours_movie_lambda_04, tokenizer)
# evaluate_masked_prediction(movie, 'Value', ours_movie_lambda_045, tokenizer)

In [None]:
# # In-domain: Product

# evaluate_masked_prediction(product, 'Key', ours_product_lambda_04, tokenizer)
# evaluate_masked_prediction(product, 'Key', ours_product_lambda_045, tokenizer)

# evaluate_masked_prediction(product, 'Value', ours_product_lambda_04, tokenizer)
# evaluate_masked_prediction(product, 'Value', ours_product_lambda_045, tokenizer)

In [None]:
# # Cross-domain

# # Trained on Product -> Tested on Movie
# evaluate_masked_prediction(movie, 'Key', ours_product_lambda_03, tokenizer)
# evaluate_masked_prediction(movie, 'Value', ours_product_lambda_03, tokenizer)

# evaluate_masked_prediction(movie, 'Key', ours_product_lambda_05, tokenizer)
# evaluate_masked_prediction(movie, 'Value', ours_product_lambda_05, tokenizer)


# # Trained on Movie -> Tested on Product
# evaluate_masked_prediction(product, 'Key', ours_movie_lambda_03, tokenizer)
# evaluate_masked_prediction(product, 'Value', ours_movie_lambda_03, tokenizer)

# evaluate_masked_prediction(product, 'Key', ours_movie_lambda_05, tokenizer)
# evaluate_masked_prediction(product, 'Value', ours_movie_lambda_05, tokenizer)

### Classification

In [None]:
# Fine-tuning tools #

from io import StringIO

### For Ours
def fine_tune_model(model, dataloader, optimizer, scheduler, device, epochs=3):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc="Training")

        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()

            if hasattr(model, "key_embedding"):
                outputs = model(
                    input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    labels=labels, 
                    key_positions=batch["key_positions"],
                    compute_alignment_loss=True
                )
                loss = outputs["loss"]
            else:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(dataloader):.4f}")

def create_optimizer_and_scheduler(model, dataloader, learning_rate, epochs):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = len(dataloader) * epochs
    scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    return optimizer, scheduler

### For TaPas
def apply_masking(inputs, mask_prob=0.15):
    """ Apply MLM-style random masking to TAPAS input_ids. """
    labels = inputs['input_ids'].clone()
    mask = torch.full(labels.shape, mask_prob)
    
    # Apply special token mask (avoid masking special tokens)
    special_tokens_mask = tapas_tokenizer.get_special_tokens_mask(labels.tolist(), already_has_special_tokens=True)
    special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)

    # Apply masking
    mask.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(mask).bool()
    labels[~masked_indices] = -100  # Ignore unmasked tokens in loss
    inputs['input_ids'][masked_indices] = tapas_tokenizer.mask_token_id 

    return inputs, labels

def prepare_tapas_traindata(entry):
    """ Tokenize tabular data for TAPAS while handling batch structure. """
    str_entry = {k: str(v) for k, v in entry.items()}
    table = pd.DataFrame([str_entry]) 
    inputs = tapas_tokenizer(table=table, queries=["What is the missing value?"], padding="max_length", truncation=True, return_tensors="pt").to(device)
    inputs, labels = apply_masking(inputs)

    return inputs, labels

def fine_tune_tapas(tapas, path, epochs):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()
    df = pd.read_csv(StringIO(''.join(lines)))
    data = df.to_dict(orient="records")

    train_data = [prepare_tapas_traindata(row) for row in data]

    # Convert train_data to tensors
    input_tensors = {k: torch.cat([entry[0][k] for entry in train_data]) for k in train_data[0][0].keys()}
    label_tensors = torch.cat([entry[1] for entry in train_data])

    # Create a DataLoader
    batch_size = 8
    train_dataset = TensorDataset(input_tensors['input_ids'], 
                                input_tensors['attention_mask'], 
                                input_tensors['token_type_ids'], 
                                label_tensors)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    optimizer = AdamW(tapas.parameters(), lr=1e-6)
    tapas.train()

    # Training loop with batching
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, token_type_ids, labels = [x.to(device) for x in batch]
            inputs = {
                "input_ids": input_ids, 
                "attention_mask": attention_mask, 
                "token_type_ids": token_type_ids
            }

            outputs = tapas(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


In [9]:
# data
# product_for_cls_path = "./data/product_for_cls.csv"
movie_for_cls_path = "./data/movie_for_cls.csv"
adult_path = "./data/adult.csv"
bank_path = "./data/bank.csv"
heart_path = "./data/heart.csv"

In [None]:
# Fine-tuning datasets #

# product_for_cls = JSONDataset(product_for_cls_path, tokenizer, path_is='csv')
# product_for_cls_bert = JSONDataset(product_for_cls_path, tokenizer, path_is='csv', version='bert')

movie_for_cls = JSONDataset(movie_for_cls_path, tokenizer, path_is='csv')
movie_for_cls_bert = JSONDataset(movie_for_cls_path, tokenizer, path_is='csv', version='bert')

adult = JSONDataset(adult_path, tokenizer, path_is='csv')
adult_bert = JSONDataset(adult_path, tokenizer, path_is='csv', version='bert')

bank = JSONDataset(bank_path, tokenizer, path_is='csv')
bank_bert = JSONDataset(bank_path, tokenizer, path_is='csv', version='bert')

heart = JSONDataset(heart_path, tokenizer, path_is='csv')
heart_bert = JSONDataset(heart_path, tokenizer, path_is='csv', version='bert')

#### Evaluate pre-trained models

In [10]:
# Evaluate pre-trained (including domain-specific) models

models = {
    "bert_base": (bert_base, tokenizer),
    "tapas": (tapas, tapas_tokenizer),
    "tabert": (tabert, tokenizer),
    "bert_product": (bert_product, tokenizer),
    "bert_movie": (bert_movie, tokenizer),
    "ours_product": (ours_product, tokenizer),
    "ours_movie": (ours_movie, tokenizer)
}

NameError: name 'bert_movie' is not defined

In [None]:
# # Product_test
# results = {}
# for name, (model, tokenizer) in models.items():
#     X_train, X_test, y_train, y_test = prepare_Xy(product_for_cls_path, model, tokenizer, seed=42)
#     results[name] = train_eval_rf(X_train, X_test, y_train, y_test, seed=42)

# # Print results
# for model_name, metrics in results.items():
#     print(f"Metrics for {model_name} in product_for_cls:")
#     print(f"\t{metrics['precision']: .4f}")
#     print(f"\t{metrics['recall']: .4f}")
#     print(f"\t{metrics['f1_score']: .4f}")

In [None]:
# Movie_test
results = {}
for name, (model, tokenizer) in models.items():
    X_train, X_test, y_train, y_test = prepare_Xy(movie_for_cls_path, model, tokenizer, seed=42)
    results[name] = train_eval_rf(X_train, X_test, y_train, y_test, seed=42)

# Print results
for model_name, metrics in results.items():
    print(f"Metrics for {model_name} in movie_for_cls:")
    print(f"\t{metrics['precision']: .4f}")
    print(f"\t{metrics['recall']: .4f}")
    print(f"\t{metrics['f1_score']: .4f}")

In [None]:
# Adult
results = {}
for name, (model, tokenizer) in models.items():
    X_train, X_test, y_train, y_test = prepare_Xy(adult_path, model, tokenizer, seed=42)
    results[name] = train_eval_rf(X_train, X_test, y_train, y_test, seed=42)

# Print results
for model_name, metrics in results.items():
    print(f"Metrics for {model_name} in adult:")
    print(f"\t{metrics['precision']: .4f}")
    print(f"\t{metrics['recall']: .4f}")
    print(f"\t{metrics['f1_score']: .4f}")

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Metrics for bert_base in adult:
	 0.7607
	 0.8558
	 0.8054
Metrics for tapas in adult:
	 0.7615
	 0.7981
	 0.7793
Metrics for tabert in adult:
	 0.7607
	 0.8558
	 0.8054
Metrics for ours_movie in adult:
	 0.6522
	 0.7212
	 0.6849


In [None]:
# Bank
results = {}
for name, (model, tokenizer) in models.items():
    X_train, X_test, y_train, y_test = prepare_Xy(bank_path, model, tokenizer, seed=42)
    results[name] = train_eval_rf(X_train, X_test, y_train, y_test, seed=42)

# Print results
for model_name, metrics in results.items():
    print(f"Metrics for {model_name} in bank:")
    print(f"\t{metrics['precision']: .4f}")
    print(f"\t{metrics['recall']: .4f}")
    print(f"\t{metrics['f1_score']: .4f}")

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Metrics for bert_base in bank:
	 0.7788
	 0.7788
	 0.7788
Metrics for tapas in bank:
	 0.7670
	 0.7596
	 0.7633
Metrics for tabert in bank:
	 0.7788
	 0.7788
	 0.7788
Metrics for ours_movie in bank:
	 0.7551
	 0.7115
	 0.7327


In [None]:
# Heart
results = {}
for name, (model, tokenizer) in models.items():
    X_train, X_test, y_train, y_test = prepare_Xy(heart_path, model, tokenizer, seed=42)
    results[name] = train_eval_rf(X_train, X_test, y_train, y_test, seed=42)

# Print results
for model_name, metrics in results.items():
    print(f"Metrics for {model_name} in heart:")
    print(f"\t{metrics['precision']: .4f}")
    print(f"\t{metrics['recall']: .4f}")
    print(f"\t{metrics['f1_score']: .4f}")

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Metrics for bert_base in heart:
	 0.9200
	 0.8598
	 0.8889
Metrics for tapas in heart:
	 0.9000
	 0.8411
	 0.8696
Metrics for tabert in heart:
	 0.9200
	 0.8598
	 0.8889
Metrics for ours_movie in heart:
	 0.8660
	 0.7850
	 0.8235


#### Fine-tune

In [None]:
######## Set Fine-tuning Domain ##########
training_domain = movie_for_cls
training_domain_bert = movie_for_cls_bert
training_domain_path = movie_for_cls_path
##########################################

In [None]:
# Fine-tune models (BERT, TaBERT, Ours-product, Ours-movie)

# Data Collator
jsonbert_data_collator = JSONDataCollator(
    tokenizer=tokenizer,
    hybrid_epochs=4
)
bert_data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)


# Create Dataloaders
jsonbert_train_dataloader = DataLoader(
    training_domain,
    batch_size=8,
    shuffle=True,
    collate_fn=jsonbert_data_collator,
)

bert_train_dataloader = DataLoader(
    training_domain_bert,
    batch_size=8,
    shuffle=True,
    collate_fn=bert_data_collator,
)


# Hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-6
epochs = 6

# Optimizers and Schedulers for each model
optimizers = {}
schedulers = {}

# Fine-tuning models
optimizers["bert_base"], schedulers["bert_base"] = create_optimizer_and_scheduler(bert_base, bert_train_dataloader, 1e-6, 6)
print("\nFine-tuning bert_base...")
fine_tune_model(bert_base, bert_train_dataloader, optimizers["bert_base"], schedulers["bert_base"], device, epochs)

optimizers["tabert"], schedulers["tabert"] = create_optimizer_and_scheduler(tabert, bert_train_dataloader, 1e-6, 6)
print("\nFine-tuning tabert...")
fine_tune_model(tabert, bert_train_dataloader, optimizers["tabert"], schedulers["tabert"], device, epochs)

optimizers["ours_product"], schedulers["ours_product"] = create_optimizer_and_scheduler(ours_product, jsonbert_train_dataloader, 1e-6, 6)
print("\nFine-tuning ours_product...")
fine_tune_model(ours_product, jsonbert_train_dataloader, optimizers["ours_product"], schedulers["ours_product"], device, epochs)

optimizers["ours_movie"], schedulers["ours_movie"] = create_optimizer_and_scheduler(ours_movie, jsonbert_train_dataloader, 1e-6, 6)
print("\nFine-tuning ours_movie...")
fine_tune_model(ours_movie, jsonbert_train_dataloader, optimizers["ours_movie"], schedulers["ours_movie"], device, epochs)

optimizers["bert_product"], schedulers["bert_product"] = create_optimizer_and_scheduler(bert_product, jsonbert_train_dataloader, 1e-6, 6)
print("\nFine-tuning bert_product...")
fine_tune_model(bert_product, jsonbert_train_dataloader, optimizers["bert_product"], schedulers["bert_product"], device, epochs)

optimizers["bert_movie"], schedulers["bert_movie"] = create_optimizer_and_scheduler(bert_movie, jsonbert_train_dataloader, 1e-6, 6)
print("\nFine-tuning bert_movie...")
fine_tune_model(bert_movie, jsonbert_train_dataloader, optimizers["bert_movie"], schedulers["bert_movie"], device, epochs)

In [None]:
# Fine-tune TaPas

fine_tune_tapas(tapas, training_domain_path, epochs)

#### Evaluate fine-tuned models

In [None]:
models = {
    "bert_base": (bert_base, tokenizer),
    "tapas": (tapas, tapas_tokenizer),
    "tabert": (tabert, tokenizer),
    "bert_product": (bert_product, tokenizer),
    "bert_movie": (bert_movie, tokenizer),
    "ours_product": (ours_product, tokenizer),
    "ours_movie": (ours_movie, tokenizer)
}

In [None]:
# Classification on trainined domain
results = {}
for name, (model, tokenizer) in models.items():
    X_train, X_test, y_train, y_test = prepare_Xy(training_domain_path, model, tokenizer, seed=42)
    results[name] = train_eval_rf(X_train, X_test, y_train, y_test, seed=42)

# Print results
for model_name, metrics in results.items():
    print(f"Metrics for {model_name} in trained domain:")
    print(f"\t{metrics['precision']: .4f}")
    print(f"\t{metrics['recall']: .4f}")
    print(f"\t{metrics['f1_score']: .4f}")