In [1]:
# Device

import torch

# Pick the compute device once: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU. The message printed mirrors the choice that was made.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(
    "GPU is available and will be used."
    if device.type == "cuda"
    else "No GPU available, using CPU."
)

GPU is available and will be used.


In [2]:
#Imports

import os
import json
import random
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import (
    BertTokenizer, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling,
    TapasTokenizer, TapasForMaskedLM,
    AdamW, get_scheduler
)
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from model_complete import JSONBERT_COMPLETE
from dataset import JSONDataset, JSONDataCollator, create_data

import sys
sys.path.append('/root/woojun/')

from utils import (
    _serialize_vanilla,
    _serialize,
    tokenize_table,
    _find_positions,
    mask_entry,
    predict_masked_tokens,
    evaluate_masked_prediction,
    train_eval_rf
)

  from .autonotebook import tqdm as notebook_tqdm


GPU is available and will be used.


In [3]:
# Tokenizer & config
# Both are loaded from the same 'bert-base-uncased' checkpoint name.

config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Models
# Each model is moved onto `device` in the same statement that loads it.

# BERT: masked-LM head on the 'bert-base-uncased' checkpoint
bert_base = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# TaPas: masked-LM checkpoint, loaded together with its own tokenizer
tapas_name = "google/tapas-base-masklm"
tapas_tokenizer = TapasTokenizer.from_pretrained(tapas_name)
tapas = TapasForMaskedLM.from_pretrained(tapas_name).to(device)

# Ours: JSONBERT_COMPLETE built from the shared config/tokenizer and the
# movie checkpoint directory
ours_path_movie = './models/movie_complete/epoch-9'
ours_movie = JSONBERT_COMPLETE(config, tokenizer, ours_path_movie).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Key embeddings are trainable!
Pre-trained JSONBERT loaded from ./models/movie_complete/epoch-9


In [5]:
# functions

def prepare_Xy(df, model, tokenizer, target='filename', seed=42):
    """Build an embedding matrix and a label vector from a DataFrame.

    Every row of ``df`` is turned into a dict and handed to
    ``get_table_embedding`` together with ``model``, ``tokenizer`` and
    ``target``; the per-row results are stacked with ``np.array``.

    Args:
        df: DataFrame with one record per row; must contain column ``target``.
        model: Passed through unchanged to ``get_table_embedding``.
        tokenizer: Passed through unchanged to ``get_table_embedding``.
        target: Name of the column whose values are returned as labels.
        seed: Accepted for signature compatibility; not used in this function.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the array of stacked embeddings and
        ``y`` is ``df[target].values``.
    """
    # NOTE(review): get_table_embedding is neither defined nor imported in the
    # cells visible here; it has to be in scope before this function is called.
    records = df.to_dict(orient="records")
    labels = df[target].values

    embeddings = []
    for record in records:
        embeddings.append(get_table_embedding(record, model, tokenizer, target))

    return np.array(embeddings), labels

In [6]:
# Data: movie tables

movie_path = './data/Movie_top100'
pretraining_movie_path = './data/pretraining_data_movie.jsonl'

# NOTE(review): the meaning of path_is="test" and sample_num=20 is defined by
# dataset.create_data, which is not visible in this notebook — confirm there.
movie = create_data(
    movie_path,
    path_is="test",
    sample_num=20,
    pretraining_path=pretraining_movie_path,
)

Skipping non-English table: Movie_adorocinema.com_October2023.json
Skipping non-English table: Movie_afisha.ru_October2023.json
Skipping non-English table: Movie_ak.sv_October2023.json
Skipping non-English table: Movie_allcinema.net_October2023.json
Skipping non-English table: Movie_allocine.fr_October2023.json
Skipping non-English table: Movie_arte.tv_October2023.json
Skipping non-English table: Movie_cinecitta.de_October2023.json
Skipping non-English table: Movie_cinefil.com_October2023.json
Skipping non-English table: Movie_cinema-rank.net_October2023.json
Skipping non-English table: Movie_cinematoday.jp_October2023.json
Skipping non-English table: Movie_comingsoon.it_October2023.json
Skipping non-English table: Movie_cpop.it_October2023.json
Skipping non-English table: Movie_crank-in.net_October2023.json
Skipping non-English table: Movie_dok-film.net_October2023.json
Skipping non-English table: Movie_domkino.tv_October2023.json
Skipping non-English table: Movie_ecranlarge.com_Octob

In [20]:
# Data: product tables

product_path = './data/Product_top100'
pretraining_product_path = './data/pretraining_data_product.jsonl'

# NOTE(review): the meaning of path_is="test" and sample_num=20 is defined by
# dataset.create_data, which is not visible in this notebook — confirm there.
product = create_data(
    product_path,
    path_is="test",
    sample_num=20,
    pretraining_path=pretraining_product_path,
)

Skipping non-English table: Product_10x10.co.kr_October2023.json
Skipping non-English table: Product_all.biz_October2023.json
Skipping non-English table: Product_avito.ru_October2023.json
Skipping non-English table: Product_com.ru_October2023.json
Skipping non-English table: Product_docomo.ne.jp_October2023.json
Skipping non-English table: Product_elektronikai-hulladek-felvasarlas.hu_October2023.json
Skipping non-English table: Product_eltiempo.com_October2023.json
Skipping non-English table: Product_fateful.hu_October2023.json
Skipping non-English table: Product_havidijas-keresooptimalizalas.hu_October2023.json
Skipping non-English table: Product_line.me_October2023.json
Skipping non-English table: Product_made-in-china.com_October2023.json
Skipping non-English table: Product_mattel.com_October2023.json
Skipping non-English table: Product_numizmatik.ru_October2023.json
Skipping non-English table: Product_odoo.com_October2023.json
Skipping non-English table: Product_pp.ua_October2023.j

### Source Clustering

In [7]:
# def run_clustering_experiment(X, y):
#     # Use the number of distinct ground-truth labels as the cluster count
#     unique_labels = np.unique(y)
#     n_clusters = len(unique_labels)

#     # Run K-Means clustering with the number of ground truth clusters
#     kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
#     kmeans.fit(X)

#     # Predict clusters for test data
#     cluster_labels = kmeans.predict(X)

#     # Evaluate clustering performance
#     nmi = normalized_mutual_info_score(y, cluster_labels)
#     print(f"Normalized Mutual Information (NMI) on test data: {nmi:.2f}")

#     ari = adjusted_rand_score(y, cluster_labels)
#     print(f"Adjusted Rand Index (ARI) on test data: {ari:.2f}")

#     # Verify data sizes
#     print(f"X shape: {X.shape}, y length: {len(y)}")

#     # Check unique labels
#     print(f"Unique labels in y: {np.unique(y)}")


#     # Visualize clustering
#     plt.figure(figsize=(8, 6))
#     plt.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='tab20', marker='o', edgecolor='k')
#     plt.title(f"K-Means Clustering Results with {n_clusters} Clusters")
#     plt.xlabel("Feature 1")
#     plt.ylabel("Feature 2")
#     plt.legend(['Cluster ' + str(i) for i in range(n_clusters)])
#     plt.show()

In [8]:
# movie_df = pd.DataFrame(movie)
# sampled_filenames = ['Movie_telescopefilm.com_October2023.json', 'Movie_tubitv.com_October2023.json']
# sampled_data = [row for row in movie if row["filename"] in sampled_filenames]
# sampled_df = pd.DataFrame(sampled_data)
# sampled_df.head()

In [9]:
# # Export embeddings

# data = sampled_df.to_dict(orient="records")
# sampled_df['our_embeddings'] = [get_table_embedding(entry, ours_movie, tokenizer, 'filename') for entry in data]
# sampled_df['bert_embeddings'] = [get_table_embedding(entry, bert_base, tokenizer, 'filename') for entry in data]

# csv_export_path = 'embeddings_from_our_model.csv'
# sampled_df.to_csv(csv_export_path, index=False)
# print(f'DataFrame exported to {csv_export_path}')

# json_export_path = 'embeddings_from_our_model.json'
# sampled_df.to_json(json_export_path, orient='records', indent=4)
# print(f'DataFrame exported to {json_export_path}')

In [10]:
# X, y = prepare_Xy(sampled_df, notsure_movie, tokenizer, 'filename')
# run_clustering_experiment(X, y)

In [11]:
# X, y = prepare_Xy(sampled_df, ours_movie, tokenizer, 'filename')
# run_clustering_experiment(X, y)

In [12]:
# X, y = prepare_Xy(sampled_df, bert_base, tokenizer, 'filename')
# run_clustering_experiment(X, y)

In [13]:
# X, y = prepare_Xy(sampled_df, tapas, tapas_tokenizer, 'filename')
# run_clustering_experiment(X, y)

### Masked Prediction

In [14]:
from no_cl import JSONBERT_INTERPOLATE
from no_ip_alpha_0 import JSONBERT_NEWLOSS_0
from no_ip_alpha_1 import JSONBERT_NEWLOSS_1

# Checkpoint directories for the movie-domain model variants
no_cl_path_movie = './models/movie_no_cl/epoch-9'
alpha_0_path_movie = './models/movie_alpha_0/epoch-9'
alpha_1_path_movie = './models/movie_alpha_1/epoch-9'
no_hel_path_movie = './models/movie_no_hel/epoch-9'
bert_path_movie = './models/movie_bert/epoch-9'

# Custom variants: built from the shared config/tokenizer plus a checkpoint
# path, then moved onto `device` in the same statement.
no_cl_movie = JSONBERT_INTERPOLATE(config, tokenizer, no_cl_path_movie).to(device)
alpha_0_movie = JSONBERT_NEWLOSS_0(config, tokenizer, alpha_0_path_movie).to(device)
alpha_1_movie = JSONBERT_NEWLOSS_1(config, tokenizer, alpha_1_path_movie).to(device)

# These two are loaded as plain BertForMaskedLM checkpoints; local_files_only
# restricts loading to files already on disk.
no_hel_movie = BertForMaskedLM.from_pretrained(
    no_hel_path_movie, local_files_only=True
).to(device)
bert_movie = BertForMaskedLM.from_pretrained(
    bert_path_movie, local_files_only=True
).to(device)

Key embeddings are trainable!
Pre-trained JSONBERT_INTERPOLATE loaded from ./models/movie_no_cl/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_NEWLOSS loaded from ./models/movie_alpha_0/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_NEWLOSS loaded from ./models/movie_alpha_1/epoch-9


In [17]:
# Product

# Checkpoint directories for the product-domain model variants
ours_path_product = './models/product_complete/epoch-9'
no_cl_path_product = './models/product_no_cl/epoch-9'
alpha_0_path_product = './models/product_alpha_0/epoch-9'
alpha_1_path_product = './models/product_alpha_1/epoch-9'
no_hel_path_product = './models/product_no_hel/epoch-9'
bert_path_product = './models/product_bert/epoch-9'

# Custom variants: built from the shared config/tokenizer plus a checkpoint
# path, then moved onto `device` in the same statement.
ours_product = JSONBERT_COMPLETE(config, tokenizer, ours_path_product).to(device)
no_cl_product = JSONBERT_INTERPOLATE(config, tokenizer, no_cl_path_product).to(device)
alpha_0_product = JSONBERT_NEWLOSS_0(config, tokenizer, alpha_0_path_product).to(device)
alpha_1_product = JSONBERT_NEWLOSS_1(config, tokenizer, alpha_1_path_product).to(device)

# These two are loaded as plain BertForMaskedLM checkpoints; local_files_only
# restricts loading to files already on disk.
no_hel_product = BertForMaskedLM.from_pretrained(
    no_hel_path_product, local_files_only=True
).to(device)
bert_product = BertForMaskedLM.from_pretrained(
    bert_path_product, local_files_only=True
).to(device)

Key embeddings are trainable!
Pre-trained JSONBERT loaded from ./models/product_complete/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_INTERPOLATE loaded from ./models/product_no_cl/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_NEWLOSS loaded from ./models/product_alpha_0/epoch-9
Key embeddings are trainable!
Pre-trained JSONBERT_NEWLOSS loaded from ./models/product_alpha_1/epoch-9


In [15]:
# In-domain: Movie
#
# A header line is printed before every evaluation because the output of
# evaluate_masked_prediction does not say which model produced it.

# Pre-trained, used as-is: BERT, TaPas (TaPas is paired with its own tokenizer)
movie_pretrained_runs = [
    ('BERT', bert_base, tokenizer),
    ('TaPas', tapas, tapas_tokenizer),
]

# Domain-specific pre-trained: Ours, No CL, No IP_a0, No IP_a1, No HEL, trained BERT
movie_domain_runs = [
    ('Ours', ours_movie, tokenizer),
    ('No CL', no_cl_movie, tokenizer),
    ('No IP_a0', alpha_0_movie, tokenizer),
    ('No IP_a1', alpha_1_movie, tokenizer),
    ('No HEL', no_hel_movie, tokenizer),
    ('Trained BERT', bert_movie, tokenizer),
]

# Call order is unchanged: per group, every 'Key' run first, then every 'Value' run.
for run_group in (movie_pretrained_runs, movie_domain_runs):
    for masked_field in ('Key', 'Value'):
        for run_label, run_model, run_tokenizer in run_group:
            print(f"--- Movie | masked {masked_field} | {run_label} ---")
            evaluate_masked_prediction(movie, masked_field, run_model, run_tokenizer)

Correct / Total: 1843/3183
Model Accuracy on Masked Key Prediction: 0.5790%


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Correct / Total: 1093/3869
Model Accuracy on Masked Key Prediction: 0.2825%
Correct / Total: 1660/2849
Model Accuracy on Masked Value Prediction: 0.5827%
Correct / Total: 13617/32659
Model Accuracy on Masked Value Prediction: 0.4169%
Correct / Total: 3175/3183
Model Accuracy on Masked Key Prediction: 0.9975%
Correct / Total: 3171/3183
Model Accuracy on Masked Key Prediction: 0.9962%
Correct / Total: 2562/3183
Model Accuracy on Masked Key Prediction: 0.8049%
Correct / Total: 3177/3183
Model Accuracy on Masked Key Prediction: 0.9981%
Correct / Total: 3177/3183
Model Accuracy on Masked Key Prediction: 0.9981%
Correct / Total: 2552/3183
Model Accuracy on Masked Key Prediction: 0.8018%
Correct / Total: 2204/2849
Model Accuracy on Masked Value Prediction: 0.7736%
Correct / Total: 2176/2849
Model Accuracy on Masked Value Prediction: 0.7638%
Correct / Total: 2160/2849
Model Accuracy on Masked Value Prediction: 0.7582%
Correct / Total: 2172/2849
Model Accuracy on Masked Value Prediction: 0.7624

In [21]:
# In-domain: product
#
# A header line is printed before every evaluation because the output of
# evaluate_masked_prediction does not say which model produced it.

# Pre-trained, used as-is: BERT, TaPas (TaPas is paired with its own tokenizer)
product_pretrained_runs = [
    ('BERT', bert_base, tokenizer),
    ('TaPas', tapas, tapas_tokenizer),
]

# Domain-specific pre-trained: Ours, No CL, No IP_a0, No IP_a1, No HEL, trained BERT
product_domain_runs = [
    ('Ours', ours_product, tokenizer),
    ('No CL', no_cl_product, tokenizer),
    ('No IP_a0', alpha_0_product, tokenizer),
    ('No IP_a1', alpha_1_product, tokenizer),
    ('No HEL', no_hel_product, tokenizer),
    ('Trained BERT', bert_product, tokenizer),
]

# Call order is unchanged: per group, every 'Key' run first, then every 'Value' run.
for run_group in (product_pretrained_runs, product_domain_runs):
    for masked_field in ('Key', 'Value'):
        for run_label, run_model, run_tokenizer in run_group:
            print(f"--- Product | masked {masked_field} | {run_label} ---")
            evaluate_masked_prediction(product, masked_field, run_model, run_tokenizer)

Correct / Total: 1920/4140
Model Accuracy on Masked Key Prediction: 0.4638%
Correct / Total: 1053/4448
Model Accuracy on Masked Key Prediction: 0.2367%
Correct / Total: 1430/2490
Model Accuracy on Masked Value Prediction: 0.5743%
Correct / Total: 18640/48358
Model Accuracy on Masked Value Prediction: 0.3855%
Correct / Total: 4088/4140
Model Accuracy on Masked Key Prediction: 0.9874%
Correct / Total: 4090/4140
Model Accuracy on Masked Key Prediction: 0.9879%
Correct / Total: 3062/4140
Model Accuracy on Masked Key Prediction: 0.7396%
Correct / Total: 4096/4140
Model Accuracy on Masked Key Prediction: 0.9894%
Correct / Total: 4104/4140
Model Accuracy on Masked Key Prediction: 0.9913%
Correct / Total: 3171/4140
Model Accuracy on Masked Key Prediction: 0.7659%
Correct / Total: 1775/2490
Model Accuracy on Masked Value Prediction: 0.7129%
Correct / Total: 1753/2490
Model Accuracy on Masked Value Prediction: 0.7040%
Correct / Total: 1771/2490
Model Accuracy on Masked Value Prediction: 0.7112%


In [22]:
# Cross-domain
#
# A header line is printed before every evaluation because the output of
# evaluate_masked_prediction does not say which model or direction produced it.

cross_domain_runs = [
    # Trained on Product -> Tested on Movie
    ('Product -> Movie', movie, [('Ours', ours_product), ('Trained BERT', bert_product)]),
    # Trained on Movie -> Tested on Product
    ('Movie -> Product', product, [('Ours', ours_movie), ('Trained BERT', bert_movie)]),
]

# Call order is unchanged: per direction, each model runs 'Key' and then 'Value'.
for direction, test_records, direction_models in cross_domain_runs:
    for run_label, run_model in direction_models:
        for masked_field in ('Key', 'Value'):
            print(f"--- {direction} | masked {masked_field} | {run_label} ---")
            evaluate_masked_prediction(test_records, masked_field, run_model, tokenizer)

Correct / Total: 3160/3183
Model Accuracy on Masked Key Prediction: 0.9928%
Correct / Total: 2066/2849
Model Accuracy on Masked Value Prediction: 0.7252%
Correct / Total: 2647/3183
Model Accuracy on Masked Key Prediction: 0.8316%
Correct / Total: 1721/2849
Model Accuracy on Masked Value Prediction: 0.6041%
Correct / Total: 4082/4140
Model Accuracy on Masked Key Prediction: 0.9860%
Correct / Total: 1599/2490
Model Accuracy on Masked Value Prediction: 0.6422%
Correct / Total: 3130/4140
Model Accuracy on Masked Key Prediction: 0.7560%
Correct / Total: 1447/2490
Model Accuracy on Masked Value Prediction: 0.5811%
