In [None]:
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import auc
from tqdm import tqdm

# Work from the repository root so the relative paths used throughout the notebook
# (./data, ./results, ./models) resolve. The root can be overridden through the
# UNIVERSAL_BLOCKER_ROOT environment variable; the default is the original location.
os.chdir(os.environ.get("UNIVERSAL_BLOCKER_ROOT", "/home1/wangtianshu/universal-blocker"))

# Names of the dataset directories under ./data/blocking, minus the excluded ones.
# Plain files are skipped because every name is later used as a directory path, and
# the result is sorted so datasets are always processed in the same order.
data_dirs = sorted(
    d.name
    for d in Path("./data/blocking").iterdir()
    if d.is_dir() and d.name not in ("songs", "citeseer-dblp")
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import py_stringmatching as sm
import torch
import torch.nn.functional as F
# Inference only: disable autograd globally for every model call in this notebook.
torch.set_grad_enabled(False)

# q-gram tokenizer (qval=5, no padding); used by ngram_similarity.
qgram_tokenizer = sm.tokenizer.qgram_tokenizer.QgramTokenizer(qval=5, padding=False)
# Whitespace tokenizer; used by token_similarity.
# NOTE(review): the name is misspelled ("tokenzier"); token_similarity refers to it
# under this exact spelling, so both must be renamed together.
whitespace_tokenzier = sm.tokenizer.whitespace_tokenizer.WhitespaceTokenizer()
# Cosine similarity measure shared by ngram_similarity and token_similarity.
cosine = sm.similarity_measure.cosine.Cosine()

from src.models import SimCSE
from scipy.spatial import distance

def sparse_similarity(
    s1,
    s2,
    tokenizer,
    similarity,
):
    """Score two strings by tokenizing each one and comparing the token lists.

    Args:
        s1: First string.
        s2: Second string.
        tokenizer: Object exposing ``tokenize(text)``.
        similarity: Object exposing ``get_sim_score(tokens_a, tokens_b)``.

    Returns:
        Whatever ``similarity.get_sim_score`` returns for the two token lists.
    """
    left_tokens, right_tokens = (tokenizer.tokenize(text) for text in (s1, s2))
    return similarity.get_sim_score(left_tokens, right_tokens)

def ngram_similarity(s1, s2):
    """Return the cosine score of ``s1`` and ``s2`` over their q-gram tokens.

    Delegates to ``sparse_similarity`` with the module-level ``qgram_tokenizer``
    and ``cosine`` measure.
    """
    score = sparse_similarity(
        s1,
        s2,
        tokenizer=qgram_tokenizer,
        similarity=cosine,
    )
    return score

def token_similarity(s1, s2):
    """Return the cosine score of ``s1`` and ``s2`` over whitespace-split tokens.

    Delegates to ``sparse_similarity`` with the module-level whitespace tokenizer
    and ``cosine`` measure.
    """
    score = sparse_similarity(
        s1,
        s2,
        tokenizer=whitespace_tokenzier,
        similarity=cosine,
    )
    return score

def cosine_similarity(e1, e2):
    """Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance between ``e1`` and ``e2``.
    """
    cosine_distance = distance.cosine(e1, e2)
    return 1 - cosine_distance

def prepare_model():
    """Restore the fine-tuned SimCSE checkpoint and publish it as notebook globals.

    Sets three module-level names:
        model: the restored ``SimCSE`` module, switched to eval mode and moved to
            ``device``.
        tokenizer: ``model.collate_fn.tokenizer``.
        device: the device index the model was moved to.
    """
    global model, tokenizer, device
    checkpoint_path = "results/fit/simcse/gittables/1cwvyg3q/checkpoints/step=1500-AP=0.46677.ckpt"
    device = 5
    # load_from_checkpoint is a classmethod that builds a new module from the
    # checkpoint. Calling it on an instance only created a throwaway model first,
    # and PyTorch Lightning >= 2.0 rejects instance calls with a TypeError.
    # NOTE(review): this relies on the checkpoint carrying the constructor
    # hyperparameters (or SimCSE having defaults), exactly as the instance call did.
    model = SimCSE.load_from_checkpoint(checkpoint_path)
    model.eval()
    model = model.to(device)
    tokenizer = model.collate_fn.tokenizer
    
# Populate the `model`, `tokenizer` and `device` globals; later cells read
# `model` and `tokenizer` directly.
prepare_model()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import json
import random
from datasets import Dataset

from src.datamodules.blocking import dict2tuples
from pytorch_lightning.utilities import move_data_to_device
import torch.nn.functional as F


sns.set_theme()

def encode(batch):
    """Convert one columnar batch into per-row text and normalized embeddings.

    Args:
        batch: Mapping from column name to a list of values, one entry per row.

    Returns:
        Dict with ``"text"`` (one string per row: element ``[1]`` of every item
        produced by ``dict2tuples``, joined with spaces) and ``"embeddings"``
        (the model output after ``F.normalize``, as a NumPy array on the CPU).
    """
    collate_fn = getattr(model, "collate_fn", None)

    # Columnar -> row-wise: one dict per row, keyed by column name.
    column_names = list(batch)
    rows = [dict(zip(column_names, values)) for values in zip(*batch.values())]
    # presumably (field, value) tuples with the "id" field handled specially;
    # only element [1] is read here -- verify against dict2tuples.
    records = [dict2tuples(row, "id") for row in rows]

    texts = []
    for record in records:
        texts.append(" ".join(item[1] for item in record))

    model_inputs = move_data_to_device(collate_fn(records), model.device)
    outputs = model(model_inputs).detach()
    embeddings = F.normalize(outputs).to("cpu").numpy()

    return {"text": texts, "embeddings": embeddings}


def get_similarity(r1, r2):
    """Return ``(sparse, dense)`` similarity scores for two encoded records.

    Both records must support ``["text"]`` and ``["embeddings"]`` lookups. The
    sparse score is ``ngram_similarity`` of the texts; the dense score is
    ``cosine_similarity`` of the embeddings.
    """
    text_a, text_b = r1["text"], r2["text"]
    emb_a, emb_b = r1["embeddings"], r2["embeddings"]
    return ngram_similarity(text_a, text_b), cosine_similarity(emb_a, emb_b)

def check(p, dfs):
    """Print one candidate pair together with its sparse and dense similarity.

    Args:
        p: ``(left_id, right_id)`` pair; ``p[0]`` is looked up in the first table
            and ``p[1]`` in the last table.
        dfs: Sequence of tables that support ``.loc[id]`` lookups. With a single
            table, both sides come from that table.
    """
    df1, df2 = dfs[0], dfs[-1]
    r1 = df1.loc[p[0]]
    r2 = df2.loc[p[1]]

    ngram_sim, cosine_sim = get_similarity(r1, r2)

    print(p)
    # repr() keeps whitespace and escape characters visible in the output.
    print(repr(r1["text"]))
    print(repr(r2["text"]))
    print(f"sparse similarity {ngram_sim}")
    print(f"dense similarity {cosine_sim}")
    
def check_set(st, dfs, num=5):
    """Print up to ``num`` randomly chosen pairs from ``st`` using ``check``.

    Args:
        st: Collection of ``(left_id, right_id)`` pairs.
        dfs: Tables forwarded unchanged to ``check``.
        num: Maximum number of pairs to print.
    """
    pool = list(st)
    # random.sample raises ValueError when asked for more items than the pool
    # holds, which used to abort the whole run for small sets; cap the request.
    for p in random.sample(pool, min(num, len(pool))):
        check(p, dfs)
        print()

# Number of leading entries of each candidate list that are unioned into the
# candidate sets below.
K = 20
# for data_dir in data_dirs:
# for data_dir in ["walmart-amazon_homo", "imdb-dbpedia", "amazon-google", "walmart-amazon_heter", "movies"]:
for data_dir in ["imdb-dbpedia", "movies", "amazon-google", "walmart-amazon_homo", "walmart-amazon_heter"]:
# for data_dir in ["imdb-dbpedia"]:
    print(data_dir)
    
    # CSV tables whose file name starts with "1" or "2", and the matching on-disk
    # cache location for each encoded table.
    table_paths = sorted(Path(f"./data/blocking/{data_dir}").glob("[1-2]*.csv"))
    ds_paths = [Path(f"./data/blocking_map/{data_dir}/{p.stem}") for p in table_paths]
    
    # Encode each table once and cache it; an existing cache directory is reused
    # without any check of how it was produced.
    # NOTE(review): the later plotting cell writes to the same cache path with a
    # different `encode` and different preprocessing, so whichever cell runs
    # first decides what is cached -- confirm this is intended.
    for i, ds_path in enumerate(ds_paths):
        if not ds_path.exists():
            df = pd.read_csv(table_paths[i], low_memory=False)
            df = df.fillna("")
            # Cast every column except "id" to str before encoding.
            columns = list(df.columns)
            columns.remove("id")
            df[columns] = df[columns].astype(str)
            ds = Dataset.from_pandas(df)
            ds = ds.map(encode, batched=True, batch_size=32)
            ds.save_to_disk(ds_path)
    # Tables are indexed by "id" so that check() can use .loc lookups.
    dfs = [Dataset.load_from_disk(p).to_pandas().set_index('id') for p in ds_paths]
    
    # NOTE(review): pickle.load can execute arbitrary code; only load result files
    # produced by trusted runs.
    with Path(f"./results/debug/sparse_join/{data_dir}.pickle").open("rb") as f:
        sparse_candidates = pickle.load(f)

    with Path(f"./results/debug/simcse/{data_dir}.pickle").open("rb") as f:
        dense_candidates = pickle.load(f)

    # Ground truth: every row of matches.csv as a plain tuple.
    matches_path = Path(f"./data/blocking/{data_dir}/matches.csv")
    matches = set(
        pd.read_csv(matches_path).itertuples(index=False, name=None)
    )
    
    
    # Union of the first K candidate collections of each method, then split the
    # ground-truth matches into found (hits) and not found (misses) per method.
    sparse_cands = set().union(*[s for s in sparse_candidates[:K]])
    dense_cands = set().union(*[s for s in dense_candidates[:K]])
    sparse_hits = matches & sparse_cands
    dense_hits = matches & dense_cands
    sparse_misses = matches - sparse_hits
    dense_missses = matches - dense_hits
    
#     print(len(sparse_hits))
#     print(len(dense_hits))
#     print(len(sparse_misses))
#     print(len(dense_missses))
    
    # Four-way partition of the matches by which method found them.
    both_hits = sparse_hits & dense_hits
    sparse_hits_dense_misses = sparse_hits & dense_missses
    sparse_misses_dense_hits = sparse_misses & dense_hits
    both_misses = sparse_misses & dense_missses
    
#     samples = []
#     df1, df2 = dfs[0], dfs[len(dfs) - 1]
    
#     for t, s in zip(
#         ["both", "sparse", "dense", "neither"], 
#         [both_hits, sparse_hits_dense_misses, sparse_misses_dense_hits, both_misses],
#     ):
#         for p in s:
#             r1 = df1[df1["id"] == p[0]].iloc[0]
#             r2 = df2[df2["id"] == p[1]].iloc[0]
#             ngram_sim, cosine_sim = get_similarity(r1, r2)
        
#             instance = {
#                 "pair": p,
#                 "r1": r1["_text"],
#                 "r2": r2["_text"],
#                 "sparse_sim": ngram_sim,
#                 "dense_sim": cosine_sim,
#                 "type": t,
#             }
#             samples.append(instance)
            
#     df = pd.DataFrame(samples)
#     g = sns.catplot(data=df, x="sparse_sim", y="type", kind="violin", height=3, aspect=2)
#     plt.show()
    
    print("---------------------------")
#     print(f"both hits {len(both_hits)}")
#     check_set(both_hits, dfs)
    
    # Print a random sample of the matches on which the two methods disagree.
    print(f"sparse hits dense misses {len(sparse_hits_dense_misses)}")
    check_set(sparse_hits_dense_misses, dfs, num=5)
    
    print("---------------------------")
    
    print(f"sparse misses dense hits {len(sparse_misses_dense_hits)}")
    check_set(sparse_misses_dense_hits, dfs, num=5)
    
#     print(f"both misses {len(both_misses)}")
#     check_set(both_misses, dfs)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import json
import random
from datasets import Dataset

sns.set_theme()

def encode(batch):
    """Convert one columnar batch into per-row text and normalized embeddings.

    Args:
        batch: Mapping from column name to a list of values, one entry per row.

    Returns:
        Dict with ``"text"`` (one string per row, built only from the fields
        whose value has at most 20 tokens under the global ``tokenizer``) and
        ``"embeddings"`` (the model output after ``F.normalize``, as a NumPy
        array on the CPU). The embeddings are computed from every field; the
        token limit only affects the text.

    Raises:
        TypeError: If the global ``model`` has no ``collate_fn``.
    """
    collate_fn = getattr(model, "collate_fn", None)
    if collate_fn is None:
        raise TypeError("model has no collate_fn; cannot build model inputs")

    # Columnar -> row-wise: one dict per row, keyed by column name.
    rows = [dict(zip(batch, values)) for values in zip(*batch.values())]
    # presumably (field, value) tuples with the "id" field handled specially;
    # only element [1] is read here -- verify against dict2tuples.
    records = [dict2tuples(row, "id") for row in rows]
    texts = [
        " ".join(
            field[1]
            for field in record
            if len(tokenizer.tokenize(field[1])) <= 20
        )
        for record in records
    ]

    inputs = move_data_to_device(collate_fn(records), model.device)
    # One forward pass per batch. The previous version ran the model twice and
    # discarded the first (un-normalized) result.
    embeddings = F.normalize(model(inputs).detach()).to("cpu").numpy()

    return {
        "text": texts,
        "embeddings": embeddings,
    }

def get_similarity(r1, r2):
    """Return ``(sparse, dense)`` similarity scores for two encoded records.

    Both records must support ``["text"]`` and ``["embeddings"]`` lookups. The
    sparse score is ``ngram_similarity`` of the texts; the dense score is
    ``cosine_similarity`` of the embeddings.
    """
    text_a, text_b = r1["text"], r2["text"]
    emb_a, emb_b = r1["embeddings"], r2["embeddings"]
    return ngram_similarity(text_a, text_b), cosine_similarity(emb_a, emb_b)


def check(p, dfs):
    """Print one candidate pair together with its sparse and dense similarity.

    Args:
        p: ``(left_id, right_id)`` pair; ``p[0]`` is matched against the ``"id"``
            column of the first table and ``p[1]`` against that of the last.
        dfs: Sequence of DataFrames with ``"id"``, ``"text"`` and
            ``"embeddings"`` columns. With a single table, both sides come from
            that table.

    Raises:
        IndexError: If an id of the pair is not present in its table.
    """
    df1, df2 = dfs[0], dfs[-1]
    # First row whose "id" equals the requested id.
    r1 = df1[df1["id"] == p[0]].iloc[0]
    r2 = df2[df2["id"] == p[1]].iloc[0]

    ngram_sim, cosine_sim = get_similarity(r1, r2)

    print(p)
    print(r1["text"])
    print(r2["text"])
    print(f"sparse similarity {ngram_sim}")
    print(f"dense similarity {cosine_sim}")
    
def check_set(st, dfs, num=5):
    """Print up to ``num`` randomly chosen pairs from ``st`` using ``check``.

    Args:
        st: Collection of ``(left_id, right_id)`` pairs.
        dfs: Tables forwarded unchanged to ``check``.
        num: Maximum number of pairs to print.
    """
    pool = list(st)
    # random.sample raises ValueError when asked for more items than the pool
    # holds, which used to abort the whole run for small sets; cap the request.
    for p in random.sample(pool, min(num, len(pool))):
        check(p, dfs)
        print()

# Number of leading entries of each candidate list that are unioned into the
# candidate sets below.
K = 20
for data_dir in data_dirs:
# for data_dir in ["walmart-amazon_homo", "imdb-dbpedia", "amazon-google", "walmart-amazon_heter", "movies"]:
# for data_dir in ["amazon-google"]:
    print(data_dir)

    # CSV tables whose file name starts with "1" or "2", and the matching on-disk
    # cache location for each encoded table.
    table_paths = sorted(Path(f"./data/blocking/{data_dir}").glob("[1-2]*.csv"))
    ds_paths = [Path(f"./data/blocking_map/{data_dir}/{p.stem}") for p in table_paths]

    # Encode each table once and cache it; an existing cache directory is reused
    # without any check of how it was produced.
    for table_path, ds_path in zip(table_paths, ds_paths):
        if not ds_path.exists():
            df = pd.read_csv(table_path, index_col="id", low_memory=False)
            ds = Dataset.from_pandas(df)
            ds = ds.map(encode, batched=True, batch_size=32)
            ds.save_to_disk(ds_path)
    dfs = [Dataset.load_from_disk(p).to_pandas() for p in ds_paths]

    # NOTE(review): pickle.load can execute arbitrary code; only load result files
    # produced by trusted runs.
    with Path(f"./results/debug/sparse_join/{data_dir}.pickle").open("rb") as f:
        sparse_candidates = pickle.load(f)

    with Path(f"./results/debug/simcse/{data_dir}.pickle").open("rb") as f:
        dense_candidates = pickle.load(f)

    # Ground truth: every row of matches.csv as a plain tuple.
    matches_path = Path(f"./data/blocking/{data_dir}/matches.csv")
    matches = set(pd.read_csv(matches_path).itertuples(index=False, name=None))

    # Union of the first K candidate collections of each method, then split the
    # ground-truth matches into found (hits) and not found (misses) per method.
    sparse_cands = set().union(*sparse_candidates[:K])
    dense_cands = set().union(*dense_candidates[:K])
    sparse_hits = matches & sparse_cands
    dense_hits = matches & dense_cands
    sparse_misses = matches - sparse_hits
    dense_missses = matches - dense_hits

#     print(len(sparse_hits))
#     print(len(dense_hits))
#     print(len(sparse_misses))
#     print(len(dense_missses))

    # Four-way partition of the matches by which method found them.
    both_hits = sparse_hits & dense_hits
    sparse_hits_dense_misses = sparse_hits & dense_missses
    sparse_misses_dense_hits = sparse_misses & dense_hits
    both_misses = sparse_misses & dense_missses

    samples = []
    df1, df2 = dfs[0], dfs[-1]
    # Index each table by id once. The previous per-pair boolean mask scanned the
    # whole table for every match. drop_duplicates keeps the first row per id,
    # which is the row the mask followed by .iloc[0] used to select.
    df1_by_id = df1.drop_duplicates(subset="id").set_index("id")
    df2_by_id = df2.drop_duplicates(subset="id").set_index("id")

    for t, s in zip(
        ["both", "sparse", "dense", "neither"],
        [both_hits, sparse_hits_dense_misses, sparse_misses_dense_hits, both_misses],
    ):
        for p in s:
            r1 = df1_by_id.loc[p[0]]
            r2 = df2_by_id.loc[p[1]]
            ngram_sim, cosine_sim = get_similarity(r1, r2)

            instance = {
                "pair": p,
                "r1": r1["text"],
                "r2": r2["text"],
                "sparse_sim": ngram_sim,
                "dense_sim": cosine_sim,
                "type": t,
            }
            samples.append(instance)

    # One violin per partition: distribution of the sparse similarity of its pairs.
    df = pd.DataFrame(samples)
    g = sns.catplot(data=df, x="sparse_sim", y="type", kind="violin", height=3, aspect=2)
    plt.show()

#     print("---------------------------")
# #     print(f"both hits {len(both_hits)}")
# #     check_set(both_hits, dfs)

#     print(f"sparse hits dense misses {len(sparse_hits_dense_misses)}")
#     check_set(sparse_hits_dense_misses, dfs)

#     print("---------------------------")

#     print(f"sparse misses dense hits {len(sparse_misses_dense_hits)}")
#     check_set(sparse_misses_dense_hits, dfs)

# #     print(f"both misses {len(both_misses)}")
# #     check_set(both_misses, dfs)

In [None]:
import torch.nn.functional as F
torch.set_grad_enabled(False)

def get_embeddings(t):
    """Embed a single text with the global model and return a normalized vector.

    Args:
        t: Text to embed.

    Returns:
        The first row of the ``F.normalize``-d model output, on the CPU.
    """
    features = tokenizer(
        t,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    ).to(model.device)  # follow the model instead of repeating the device index
    embeddings = F.normalize(model(features)).cpu()
    return embeddings[0]

# Spot check: dense similarity of two hand-written product descriptions.
s1, s2 = (
    "toshiba transmemory u202, 128 gb 28.99 toshiba 128 gb",
    "pendrive 128gb toshiba transmemory white, usb 2.0, blanco toshiba 128 g",
)
e1, e2 = (get_embeddings(text) for text in (s1, s2))
print(cosine_similarity(e1, e2))