In [None]:
import os
os.chdir("/home1/wangtianshu/universal-blocker")
from pathlib import Path
import pandas as pd

# Names of the dataset directories under ./data/blocking that the analysis
# cells iterate over; "songs" and "citeseer-dblp" are excluded.
data_dirs = [
    d.name
    for d in Path("./data/blocking").iterdir()
    # Every entry is later used as a directory to glob tables from, so stray
    # files sitting next to the dataset folders must not be picked up.
    if d.is_dir() and d.name not in ("songs", "citeseer-dblp")
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import py_stringmatching as sm
import torch
import torch.nn.functional as F
# This notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Shared py_stringmatching objects used by the sparse similarity helpers below.
# q-gram tokenizer with q=5 and padding disabled.
qgram_tokenizer = sm.tokenizer.qgram_tokenizer.QgramTokenizer(qval=5, padding=False)
# NOTE: the name is misspelled ("tokenzier"); it is kept as-is because
# token_similarity refers to it under this spelling.
whitespace_tokenzier = sm.tokenizer.whitespace_tokenizer.WhitespaceTokenizer()
# Cosine measure applied to the token collections produced by the tokenizers.
cosine = sm.similarity_measure.cosine.Cosine()

from src.models import SimCSE
from scipy.spatial import distance

def sparse_similarity(
    s1,
    s2,
    tokenizer,
    similarity,
):
    """Score two strings by tokenizing each one and comparing the tokens.

    Args:
        s1: First string.
        s2: Second string.
        tokenizer: Object exposing ``tokenize(text)``.
        similarity: Object exposing ``get_sim_score(tokens_a, tokens_b)``.

    Returns:
        Whatever ``similarity.get_sim_score`` returns for the two token
        collections.
    """
    left_tokens, right_tokens = (tokenizer.tokenize(text) for text in (s1, s2))
    return similarity.get_sim_score(left_tokens, right_tokens)

def ngram_similarity(s1, s2):
    """Cosine score of two strings over their q-gram tokens (``qgram_tokenizer``)."""
    score = sparse_similarity(s1, s2, tokenizer=qgram_tokenizer, similarity=cosine)
    return score

def token_similarity(s1, s2):
    """Cosine score of two strings over their whitespace-separated tokens."""
    score = sparse_similarity(s1, s2, tokenizer=whitespace_tokenzier, similarity=cosine)
    return score

def cosine_similarity(e1, e2):
    """Return the cosine similarity of two vectors as one minus SciPy's cosine distance."""
    cosine_distance = distance.cosine(e1, e2)
    return 1 - cosine_distance

def prepare_model(
    model_name_or_path="./models/roberta-base",
    checkpoint_path="results/fit/simcse/gittables/12xm9v0r/checkpoints/step=2400-AP=0.45223.ckpt",
    device_id=5,
):
    """Load the SimCSE checkpoint and publish it through module-level globals.

    Calling it without arguments reproduces the original hard-coded setup.

    Args:
        model_name_or_path: Backbone passed to the ``SimCSE`` constructor.
        checkpoint_path: Checkpoint file handed to ``load_from_checkpoint``.
        device_id: Value passed to ``model.to``; also stored in the ``device``
            global that ``get_text`` uses to move the tokenized batch.

    Side effects:
        Sets the globals ``model``, ``tokenizer`` and ``device``.
    """
    global model, tokenizer, device
    device = device_id
    model = SimCSE(model_name_or_path=model_name_or_path, max_length=256)
    # NOTE(review): if SimCSE is a Lightning module, load_from_checkpoint is a
    # classmethod and the instance built above only serves to reach it --
    # TODO confirm and call it on the class instead.
    model = model.load_from_checkpoint(checkpoint_path)
    model.eval()
    model = model.to(device)
    # presumably convert_to_features is a functools.partial carrying the
    # tokenizer as a keyword argument -- verify against src.models.
    tokenizer = model.convert_to_features.keywords["tokenizer"]
    
# Load the model once so the `model`, `tokenizer` and `device` globals exist for the cells below.
prepare_model()

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import math
from random import randrange

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import seaborn as sns
import statistics

sns.set_theme()

def get_text(batch):
    """Serialize every record of a batch to text and embed it with the global model.

    Args:
        batch: Mapping from column name to a sequence of values, one per record.

    Returns:
        Dict with ``"_text"`` (one lowercased string per record) and
        ``"embeddings"`` (model outputs passed through ``F.normalize`` and
        moved to CPU).
    """
    # Any column whose name contains the substring "id" is left out of the text.
    text_columns = [name for name in batch.keys() if "id" not in name]
    n_rows = len(next(iter(batch.values())))

    texts = []
    for row in range(n_rows):
        values = [batch[name][row] for name in text_columns]
        # None cells are skipped; everything else is stringified and lowercased.
        texts.append(" ".join(str(value).lower() for value in values if value is not None))

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    outputs = model(encoded)
    embeddings = F.normalize(outputs).cpu()

    return {"_text": texts, "embeddings": embeddings}

def prepare_pair(matches, dfs):
    """Attach text and embeddings from both tables to a frame of id pairs.

    Args:
        matches: DataFrame with ``id1`` and ``id2`` columns.
        dfs: List of table DataFrames, each with ``id``, ``_text`` and
            ``embeddings`` columns. ``id1`` is looked up in the first table and
            ``id2`` in the last one.

    Returns:
        DataFrame with columns ``id1``, ``id2``, ``_text_x``, ``_text_y``,
        ``embeddings_x`` and ``embeddings_y``; the ``_x`` columns come from the
        first table and the ``_y`` columns from the last.
    """
    left_table = dfs[0]
    right_table = dfs[-1]

    with_left = pd.merge(matches, left_table, left_on="id1", right_on="id")
    with_left = with_left[["id1", "id2", "_text", "embeddings"]]

    with_both = pd.merge(with_left, right_table, left_on="id2", right_on="id")
    wanted = ["id1", "id2", "_text_x", "_text_y", "embeddings_x", "embeddings_y"]
    return with_both[wanted]

def get_similarity(batch):
    """Compute the n-gram, token and dense similarity for every pair in a batch.

    Args:
        batch: Mapping with ``_text_x`` / ``_text_y`` (record strings) and
            ``embeddings_x`` / ``embeddings_y`` (record embeddings), aligned
            by position.

    Returns:
        Dict with the keys ``"ngram"``, ``"token"`` and ``"dense"``, each a
        list holding one score per pair.
    """
    text_pairs = list(zip(batch["_text_x"], batch["_text_y"]))
    embedding_pairs = zip(batch["embeddings_x"], batch["embeddings_y"])

    return {
        "ngram": [ngram_similarity(left, right) for left, right in text_pairs],
        "token": [token_similarity(left, right) for left, right in text_pairs],
        "dense": [cosine_similarity(left, right) for left, right in embedding_pairs],
    }

# For every dataset: embed the tables, score the labelled matches and a random
# sample of non-matching pairs, then plot both similarity distributions.
# Every expensive step is cached under ./data/blocking_map/<dataset>/.
times = 5  # len(matches) * times random pairs are drawn as mismatch candidates
for data_dir in sorted(data_dirs):
    print(data_dir)

    table_paths = sorted(Path(f"./data/blocking/{data_dir}").glob("[1-2]*.csv"))
    ds_paths = [Path(f"./data/blocking_map/{data_dir}/{p.stem}") for p in table_paths]

    # Text serialization and embedding, computed once per table.
    for table_path, ds_path in zip(table_paths, ds_paths):
        if not ds_path.exists():
            df = pd.read_csv(table_path, index_col="id", low_memory=False)
            ds = Dataset.from_pandas(df)
            ds = ds.map(get_text, batched=True, batch_size=32)
            ds.save_to_disk(ds_path)
    datasets = [Dataset.load_from_disk(p) for p in ds_paths]

    for ds in datasets:
        ds.set_format("numpy")

    label_path = Path(f"./data/blocking/{data_dir}/matches.csv")

    # Similarities of the labelled matching pairs.
    matches_path = Path(f"./data/blocking_map/{data_dir}/matches_{times}")
    if not matches_path.exists():
        matches = pd.read_csv(label_path)
        dfs = [ds.to_pandas() for ds in datasets]
        matches = prepare_pair(matches, dfs)
        matches = Dataset.from_pandas(matches, preserve_index=False)
        matches = matches.map(get_similarity, batched=True, batch_size=32)
        matches.save_to_disk(matches_path)

    # Similarities of randomly drawn pairs that are not labelled as matches.
    mismatches_path = Path(f"./data/blocking_map/{data_dir}/mismatches_{times}")
    if not mismatches_path.exists():
        matches = pd.read_csv(label_path)
        dfs = [ds.to_pandas() for ds in datasets]
        # Pull the id columns out once: indexing a whole row per draw would
        # also materialize that row's text and embedding.
        left_ids = dfs[0]["id"]
        right_ids = dfs[-1]["id"]
        mismatches = set()
        for _ in tqdm(range(len(matches) * times)):
            id1 = left_ids.iloc[randrange(len(left_ids))]
            id2 = right_ids.iloc[randrange(len(right_ids))]
            mismatches.add((id1, id2))
        # Compare on (id1, id2) only; with extra columns in matches.csv the
        # row tuples would never equal a sampled pair and nothing would be removed.
        known_matches = set(matches[["id1", "id2"]].itertuples(index=False, name=None))
        mismatches -= known_matches

        mismatches = pd.DataFrame(list(mismatches), columns=["id1", "id2"])
        mismatches = prepare_pair(mismatches, dfs)
        mismatches = Dataset.from_pandas(mismatches, preserve_index=False)
        mismatches = mismatches.map(get_similarity, batched=True, batch_size=32)
        mismatches.save_to_disk(mismatches_path)

    # to_pandas() converts the cached table directly instead of iterating it
    # row by row through the DataFrame constructor.
    matches = Dataset.load_from_disk(matches_path).to_pandas()
    mismatches = Dataset.load_from_disk(mismatches_path).to_pandas()

    matches["label"] = "matched"
    mismatches["label"] = "mismatched"

    # ignore_index avoids duplicate index labels in the combined frame.
    df = pd.concat([matches, mismatches], ignore_index=True)[["label", "ngram", "token", "dense"]]
    # Long format: one row per (pair, similarity type).
    df_flatten = df.melt(
        id_vars="label",
        value_vars=["ngram", "token", "dense"],
        var_name="type",
        value_name="similarity",
    )

    sns.displot(
        data=df_flatten[df_flatten["type"] != "token"],
        x="similarity",
        hue="label",
        col="type",
        stat="density",
        bins=20,
        kde=True,
        multiple="stack",
    )
    sns.jointplot(data=df, x="dense", y="ngram", hue="label")

    plt.show()

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import math
from random import randrange

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import seaborn as sns
import statistics

sns.set_theme()

def get_text(batch):
    """Serialize every record of a batch to text and embed it with the global model.

    Args:
        batch: Mapping from column name to a sequence of values, one per record.

    Returns:
        Dict with ``"_text"`` (one lowercased string per record) and
        ``"embeddings"`` (model outputs passed through ``F.normalize`` and
        moved to CPU).
    """
    # Any column whose name contains the substring "id" is left out of the text.
    text_columns = [name for name in batch.keys() if "id" not in name]
    n_rows = len(next(iter(batch.values())))

    texts = []
    for row in range(n_rows):
        values = [batch[name][row] for name in text_columns]
        # None cells are skipped; everything else is stringified and lowercased.
        texts.append(" ".join(str(value).lower() for value in values if value is not None))

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    outputs = model(encoded)
    embeddings = F.normalize(outputs).cpu()

    return {"_text": texts, "embeddings": embeddings}

def prepare_pair(matches, dfs):
    """Attach text and embeddings from both tables to a frame of id pairs.

    Args:
        matches: DataFrame with ``id1`` and ``id2`` columns.
        dfs: List of table DataFrames, each with ``id``, ``_text`` and
            ``embeddings`` columns. ``id1`` is looked up in the first table and
            ``id2`` in the last one.

    Returns:
        DataFrame with columns ``id1``, ``id2``, ``_text_x``, ``_text_y``,
        ``embeddings_x`` and ``embeddings_y``; the ``_x`` columns come from the
        first table and the ``_y`` columns from the last.
    """
    left_table = dfs[0]
    right_table = dfs[-1]

    with_left = pd.merge(matches, left_table, left_on="id1", right_on="id")
    with_left = with_left[["id1", "id2", "_text", "embeddings"]]

    with_both = pd.merge(with_left, right_table, left_on="id2", right_on="id")
    wanted = ["id1", "id2", "_text_x", "_text_y", "embeddings_x", "embeddings_y"]
    return with_both[wanted]

def get_similarity(batch):
    """Compute the n-gram, token and dense similarity for every pair in a batch.

    Args:
        batch: Mapping with ``_text_x`` / ``_text_y`` (record strings) and
            ``embeddings_x`` / ``embeddings_y`` (record embeddings), aligned
            by position.

    Returns:
        Dict with the keys ``"ngram"``, ``"token"`` and ``"dense"``, each a
        list holding one score per pair.
    """
    text_pairs = list(zip(batch["_text_x"], batch["_text_y"]))
    embedding_pairs = zip(batch["embeddings_x"], batch["embeddings_y"])

    return {
        "ngram": [ngram_similarity(left, right) for left, right in text_pairs],
        "token": [token_similarity(left, right) for left, right in text_pairs],
        "dense": [cosine_similarity(left, right) for left, right in embedding_pairs],
    }

# Threshold analysis on the "movies" dataset: how many labelled matches stay
# above a threshold taken from the upper tail of the mismatch scores.
# Every expensive step is cached under ./data/blocking_map/<dataset>/.
times = 5  # len(matches) * times random pairs are drawn as mismatch candidates
for data_dir in ["movies"]:
    print(data_dir)

    table_paths = sorted(Path(f"./data/blocking/{data_dir}").glob("[1-2]*.csv"))
    ds_paths = [Path(f"./data/blocking_map/{data_dir}/{p.stem}") for p in table_paths]

    # Text serialization and embedding, computed once per table.
    for table_path, ds_path in zip(table_paths, ds_paths):
        if not ds_path.exists():
            df = pd.read_csv(table_path, index_col="id", low_memory=False)
            ds = Dataset.from_pandas(df)
            ds = ds.map(get_text, batched=True, batch_size=32)
            ds.save_to_disk(ds_path)
    datasets = [Dataset.load_from_disk(p) for p in ds_paths]

    for ds in datasets:
        ds.set_format("numpy")

    label_path = Path(f"./data/blocking/{data_dir}/matches.csv")

    # Similarities of the labelled matching pairs.
    matches_path = Path(f"./data/blocking_map/{data_dir}/matches_{times}")
    if not matches_path.exists():
        matches = pd.read_csv(label_path)
        dfs = [ds.to_pandas() for ds in datasets]
        matches = prepare_pair(matches, dfs)
        matches = Dataset.from_pandas(matches, preserve_index=False)
        matches = matches.map(get_similarity, batched=True, batch_size=32)
        matches.save_to_disk(matches_path)

    # Similarities of randomly drawn pairs that are not labelled as matches.
    mismatches_path = Path(f"./data/blocking_map/{data_dir}/mismatches_{times}")
    if not mismatches_path.exists():
        matches = pd.read_csv(label_path)
        dfs = [ds.to_pandas() for ds in datasets]
        # Pull the id columns out once: indexing a whole row per draw would
        # also materialize that row's text and embedding.
        left_ids = dfs[0]["id"]
        right_ids = dfs[-1]["id"]
        mismatches = set()
        for _ in tqdm(range(len(matches) * times)):
            id1 = left_ids.iloc[randrange(len(left_ids))]
            id2 = right_ids.iloc[randrange(len(right_ids))]
            mismatches.add((id1, id2))
        # Compare on (id1, id2) only; with extra columns in matches.csv the
        # row tuples would never equal a sampled pair and nothing would be removed.
        known_matches = set(matches[["id1", "id2"]].itertuples(index=False, name=None))
        mismatches -= known_matches

        mismatches = pd.DataFrame(list(mismatches), columns=["id1", "id2"])
        mismatches = prepare_pair(mismatches, dfs)
        mismatches = Dataset.from_pandas(mismatches, preserve_index=False)
        mismatches = mismatches.map(get_similarity, batched=True, batch_size=32)
        mismatches.save_to_disk(mismatches_path)

    # to_pandas() converts the cached table directly instead of iterating it
    # row by row through the DataFrame constructor.
    matches = Dataset.load_from_disk(matches_path).to_pandas()
    mismatches = Dataset.load_from_disk(mismatches_path).to_pandas()
    print(len(matches))
    print(len(mismatches))

    # Ascending score lists per similarity type, so quantiles can be read by index.
    sorted_matches = {}
    sorted_mismatches = {}
    for t in ["ngram", "dense"]:
        sorted_matches[t] = sorted(matches[t])
        sorted_mismatches[t] = sorted(mismatches[t])

    # Sweep the mismatch quantile from 1.00 down to 0.95 and report the share
    # of matches scoring at or above the corresponding threshold.
    # NOTE(review): the threshold is the q-th quantile of the mismatch scores,
    # so roughly 1 - q of the mismatches lie at or above it; the value printed
    # as "FPR" is therefore the quantile itself, not the false-positive rate.
    # The printed labels are kept unchanged so earlier outputs stay comparable.
    for percent in range(100, 94, -1):
        FPR = percent / 100
        print(f"FPR={FPR}:")
        for t in ["ngram", "dense"]:
            threshold = sorted_mismatches[t][math.ceil(len(sorted_mismatches[t]) * FPR) - 1]
            TP = sum(1 for score in sorted_matches[t] if score >= threshold)
            TPR = TP / len(sorted_matches[t])
            print(f"{t} TPR={TPR}")
    print()