# Preprocess

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
def swap_id(row, df):
    """Return the ids of a match pair ordered by their position in ``df``.

    Args:
        row: Mapping-like object exposing the keys ``"id1"`` and ``"id2"``.
        df: DataFrame with an ``"id"`` column in which each of the two ids
            occurs exactly once.

    Returns:
        A 2-tuple holding the two ids; the id whose row has the smaller
        index label in ``df`` comes first. When the labels do not differ in
        that direction the original ``(id1, id2)`` order is kept.

    Raises:
        AssertionError: If either id does not occur exactly once in ``df``.
    """
    first, second = row["id1"], row["id2"]
    hits_first = df.index[df["id"] == first].tolist()
    hits_second = df.index[df["id"] == second].tolist()
    # Both lookups must resolve to exactly one row.
    assert len(hits_first) == 1 and len(hits_second) == 1
    (pos_first,) = hits_first
    (pos_second,) = hits_second
    return (second, first) if pos_first > pos_second else (first, second)

In [None]:
# dblp-scholar
folder = Path("../data/blocking/dblp-scholar/")

# First table: read as latin-1 and re-saved without the index column.
df1 = pd.read_csv(folder / "raw" / "DBLP1.csv", encoding="latin-1")
df1.to_csv(folder / "1_dblp.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table.
df2 = pd.read_csv(folder / "raw" / "Scholar.csv")
df2.to_csv(folder / "2_scholar.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "DBLP-Scholar_perfectMapping.csv")
df = df.rename(columns={"idDBLP": "id1", "idScholar": "id2"})
# Every pair must reference an id present in the corresponding table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# dblp-acm
folder = Path("../data/blocking/dblp-acm/")

# First table: read as latin-1 and re-saved without the index column.
df1 = pd.read_csv(folder / "raw" / "DBLP2.csv", encoding="latin-1")
df1.to_csv(folder / "1_dblp.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table.
df2 = pd.read_csv(folder / "raw" / "ACM.csv")
df2.to_csv(folder / "2_acm.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "DBLP-ACM_perfectMapping.csv")
df = df.rename(columns={"idDBLP": "id1", "idACM": "id2"})
# Every pair must reference an id present in the corresponding table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# amazon-google
folder = Path("../data/blocking/amazon-google/")

# First table: the description column is removed before saving.
df1 = pd.read_csv(folder / "raw" / "Amazon.csv", encoding="latin-1")
df1 = df1.drop(columns=["description"])
df1.to_csv(folder / "1_amazon.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table: description removed and "manufacturer" renamed to "factory".
df2 = pd.read_csv(folder / "raw" / "GoogleProducts.csv", encoding="latin-1")
df2 = df2.drop(columns=["description"])
df2 = df2.rename(columns={"manufacturer": "factory"})
df2.to_csv(folder / "2_google.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "Amzon_GoogleProducts_perfectMapping.csv")
df = df.rename(columns={"idAmazon": "id1", "idGoogleBase": "id2"})
# Every pair must reference an id present in the corresponding table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# abt-buy_homo
folder = Path("../data/blocking/abt-buy_homo/")

# First table: the description column is removed before saving.
df1 = pd.read_csv(folder / "raw" / "Abt.csv", encoding="latin-1")
df1.drop(columns=["description"], inplace=True)
df1.to_csv(folder / "1_abt.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda x: len(str(x))).describe())

# Second table: both description and manufacturer are removed.
df2 = pd.read_csv(folder / "raw" / "Buy.csv")
df2.drop(columns=["description", "manufacturer"], inplace=True)  # drop manufacturer
df2.to_csv(folder / "2_buy.csv", index=False)
# Report the statistics of df2 here (the original printed df1 a second time).
print(df2.map(lambda x: len(str(x))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "abt_buy_perfectMapping.csv")
df.rename(columns={"idAbt": "id1", "idBuy": "id2"}, inplace=True)
# Every pair must reference an id present in the corresponding table.
assert df[~(df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"]))].empty
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# abt-buy_heter
folder = Path("../data/blocking/abt-buy_heter/")

# First table: description removed and "name" renamed to "title".
df1 = pd.read_csv(folder / "raw" / "Abt.csv", encoding="latin-1")
df1 = df1.drop(columns=["description"])
df1 = df1.rename(columns={"name": "title"})
df1.to_csv(folder / "1_abt.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table: only the description column is removed.
df2 = pd.read_csv(folder / "raw" / "Buy.csv")
df2 = df2.drop(columns=["description"])
df2.to_csv(folder / "2_buy.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "abt_buy_perfectMapping.csv")
df = df.rename(columns={"idAbt": "id1", "idBuy": "id2"})
# Every pair must reference an id present in the corresponding table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# cora
from xml.etree import ElementTree

folder = Path("../data/blocking/cora/")
tree = ElementTree.parse(folder / "raw" / "cora.xml")
root = tree.getroot()

# One record per child of the XML root: child tag -> child text.
entities = [{attr.tag: attr.text for attr in elem} for elem in root]

df1 = pd.DataFrame(entities)
# Ids are 1-based row positions, stored as the first column.
idx = df1.index + 1
df1.insert(0, "id", idx)
df1.to_csv(folder / "1_cora.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "cora_gold.csv", sep=";")
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Check (without modifying df) that each pair is already ordered by the
# position of its records in df1.
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# fodors-zagats_homo
folder = Path("../data/blocking/fodors-zagats_homo/")

# First table: strip the surrounding single quotes from string cells and
# turn escaped quotes (\') back into plain ones.
df1 = pd.read_csv(folder / "raw" / "fodors_zagat_raw_data" / "tableA.csv")
df1 = df1.map(lambda v: v.strip("'").replace("\\'", "'") if isinstance(v, str) else v)
df1.to_csv(folder / "1_fodors.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table: same quote clean-up.
df2 = pd.read_csv(folder / "raw" / "fodors_zagat_raw_data" / "tableB.csv")
df2 = df2.map(lambda v: v.strip("'").replace("\\'", "'") if isinstance(v, str) else v)
df2.to_csv(folder / "2_zagats.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "fodors_zagat_raw_data" / "matches.csv")
df = df.rename(columns={"fodors_id": "id1", "zagats_id": "id2"})
# Every pair must reference an id present in the corresponding table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# fodors-zagats_heter
folder = Path("../data/blocking/fodors-zagats_heter/")

# First table: quote clean-up, then reshape the schema.
df1 = pd.read_csv(folder / "raw" / "fodors_zagat_raw_data" / "tableA.csv")
df1 = df1.map(lambda v: v.strip("'").replace("\\'", "'") if isinstance(v, str) else v)
df1 = df1.rename(columns={"name": "title"})
# Merge addr + city into a single "address" column at position 2.
address = df1["addr"] + " " + df1["city"]
df1 = df1.drop(["addr", "city"], axis=1)
df1.insert(2, "address", address)
# Merge type + class into a single "category" column at position 4.
category = df1["type"] + " " + df1["class"].astype(str)
df1 = df1.drop(["type", "class"], axis=1)
df1.insert(4, "category", category)
df1.to_csv(folder / "1_fodors.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table: quote clean-up, then the name column is removed.
df2 = pd.read_csv(folder / "raw" / "fodors_zagat_raw_data" / "tableB.csv")
df2 = df2.map(lambda v: v.strip("'").replace("\\'", "'") if isinstance(v, str) else v)
df2 = df2.drop(["name"], axis=1)
df2.to_csv(folder / "2_zagats.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs, with the id columns renamed to the generic id1/id2.
df = pd.read_csv(folder / "raw" / "fodors_zagat_raw_data" / "matches.csv")
df = df.rename(columns={"fodors_id": "id1", "zagats_id": "id2"})
# Every pair must reference an id present in the corresponding table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# songs
folder = Path("../data/blocking/songs/")

df1 = pd.read_csv(folder / "raw" / "msd.csv")
df1.to_csv(folder / "1_msd.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "matches_msd_msd.csv")
# Keep only pairs of distinct ids with id1 < id2.
df = df[(df["id1"] != df["id2"]) & (df["id1"] < df["id2"])]
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Check (without modifying df) that each pair is already ordered by the
# position of its records in df1.
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# citeseer-dblp
folder = Path("../data/blocking/citeseer-dblp/")

# First table.
df1 = pd.read_csv(folder / "raw" / "citeseer.csv", low_memory=False)
df1.to_csv(folder / "1_citeseer.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table.
df2 = pd.read_csv(folder / "raw" / "dblp.csv", low_memory=False)
df2.to_csv(folder / "2_dblp.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs; every pair must reference an id present in the
# corresponding table.
df = pd.read_csv(folder / "raw" / "matches_citeseer_dblp.csv")
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
from pathlib import Path

import pandas as pd
from jnius import autoclass

# Bind the JedAI Java reader classes through pyjnius' autoclass so that
# profile2df / duplicates2df below can instantiate them from Python.
# NOTE(review): presumably requires the JedAI toolkit on the JVM classpath
# before this cell runs — confirm against the environment setup.
EntitySerializationReader = autoclass(
    "org.scify.jedai.datareader.entityreader.EntitySerializationReader"
)
GtSerializationReader = autoclass(
    "org.scify.jedai.datareader.groundtruthreader.GtSerializationReader"
)


def profile2df(path):
    """Load serialized entity profiles from ``path`` into a DataFrame.

    Each profile becomes one row. The row starts with an ``"id"`` entry equal
    to the number of profiles read before it (0, 1, 2, ...), followed by one
    entry per attribute (attribute name -> attribute value). An attribute
    named ``"id"`` would overwrite the running counter.

    Args:
        path: Location handed to ``EntitySerializationReader``.

    Returns:
        A ``pandas.DataFrame`` with one row per entity profile.
    """
    reader = EntitySerializationReader(path)
    profile_iter = reader.getEntityProfiles().iterator()
    rows = []
    while profile_iter.hasNext():
        entity = profile_iter.next()
        record = {"id": len(rows)}
        attr_iter = entity.getAttributes().iterator()
        while attr_iter.hasNext():
            attr = attr_iter.next()
            record[attr.getName()] = attr.getValue()
        rows.append(record)

    return pd.DataFrame(rows)


def duplicates2df(path):
    """Load serialized ground-truth duplicate pairs from ``path``.

    Args:
        path: Location handed to ``GtSerializationReader``.

    Returns:
        A ``pandas.DataFrame`` with one row per duplicate pair and the
        columns ``"id1"`` and ``"id2"`` taken from ``getEntityId1()`` and
        ``getEntityId2()`` respectively.
    """
    reader = GtSerializationReader(path)
    pair_iter = reader.getDuplicatePairs(None).iterator()
    rows = []
    while pair_iter.hasNext():
        pair = pair_iter.next()
        rows.append({"id1": pair.getEntityId1(), "id2": pair.getEntityId2()})

    return pd.DataFrame(rows)

In [None]:
# imdb-dbpedia
folder = Path("../data/blocking/imdb-dbpedia")

# First table: keep a fixed subset of columns in a fixed order.
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/imdbProfiles"
df1 = profile2df(file)
df1 = df1[["id", "title", "starring", "writer", "editor"]]
df1.to_csv(folder / "1_imdb.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table: keep a fixed subset of columns in a fixed order.
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/dbpediaProfiles"
df2 = profile2df(file)
df2 = df2[
    ["id", "title", "director name", "actor name", "year", "imdb_ksearch_id", "genre", "url"]
]
df2.to_csv(folder / "2_dbpedia.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Match pairs; every pair must reference an id present in the
# corresponding table.
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/moviesIdDuplicates"
df = duplicates2df(file)
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# movies
def url_suffix(s):
    """Return the part of ``s`` after its last ``"/"`` (all of ``s`` if none)."""
    _, _, tail = s.rpartition("/")
    return tail


folder = Path("../data/blocking/movies")

# Load the three profile sets and remember how many rows each one has.
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/imdbProfilesNEW"
df1 = profile2df(file)
print(len(df1.columns))
len1 = len(df1)

file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/tmdbProfiles"
df2 = profile2df(file)
len2 = len(df2)
print(len(df2.columns))

file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/tvdbProfiles"
df3 = profile2df(file)
len3 = len(df3)
print(len(df3.columns))

# Offset the ids of the second and third set by the sizes of the sets
# stacked before them.
df2["id"] = df2["id"] + len1
df3["id"] = df3["id"] + (len1 + len2)

# Stack everything into one table; column names are reduced to the part
# after their last "/".
df = pd.concat([df1, df2, df3], ignore_index=True)
df = df.rename(columns=url_suffix)
df.to_csv(folder / "1_movies.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df.map(lambda cell: len(str(cell))).describe())

# Duplicate pairs between the sets, shifted by the same offsets as the ids.
file1 = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/imdbTmdbIdDuplicates"
dp1 = duplicates2df(file1)
dp1["id2"] = dp1["id2"] + len1

file2 = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/imdbTvdbIdDuplicates"
dp2 = duplicates2df(file2)
dp2["id2"] = dp2["id2"] + (len1 + len2)

file3 = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/tmdbTvdbIdDuplicates"
dp3 = duplicates2df(file3)
dp3["id1"] = dp3["id1"] + len1
dp3["id2"] = dp3["id2"] + (len1 + len2)

dp = pd.concat([dp1, dp2, dp3], ignore_index=True)
# Every pair must reference ids present in the combined table.
assert (dp["id1"].isin(df["id"]) & dp["id2"].isin(df["id"])).all()
# Check (without modifying dp) that each pair is already ordered by the
# position of its records in df.
assert dp.equals(dp.apply(lambda pair: swap_id(pair, df), axis=1, result_type="broadcast"))
dp.to_csv(folder / "matches.csv", index=False)

In [None]:
# census
folder = Path("../data/blocking/census/")

file = "../src/vendor/JedAIToolkit/data/dirtyErDatasets/censusProfiles"
df1 = profile2df(file)
# Keep the first column in place and sort the remaining columns by name.
attrs = list(df1.columns)
attrs = attrs[:1] + sorted(attrs[1:])
df1 = df1[attrs]
df1.to_csv(folder / "1_census.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

file = "../src/vendor/JedAIToolkit/data/dirtyErDatasets/censusIdDuplicates"
df = duplicates2df(file)
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Check (without modifying df) that each pair is already ordered by the
# position of its records in df1.
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# notebook
folder = Path("../data/blocking/notebook/")

df1 = pd.read_csv(folder / "raw" / "X.csv")
df1 = df1.rename(columns={"instance_id": "id"})
# Move the "title" column to position 1.
title = df1["title"]
df1 = df1.drop(["title"], axis=1)
df1.insert(1, "title", title)
df1.to_csv(folder / "1_notebook.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "Y.csv")
# Keep only the rows labelled 1 and only their two id columns.
df = df.loc[df["label"] == 1, ["left_instance_id", "right_instance_id"]]
df = df.rename(columns={"left_instance_id": "id1", "right_instance_id": "id2"})
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Order each pair by the position of its records in df1, then confirm that
# applying the ordering again changes nothing.
df = df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast")
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# notebook2
folder = Path("../data/blocking/notebook2/")

df1 = pd.read_csv(folder / "raw" / "X.csv")
df1.to_csv(folder / "1_notebook.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "Y.csv").rename(columns={"lid": "id1", "rid": "id2"})
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Order each pair by the position of its records in df1, then confirm that
# applying the ordering again changes nothing.
df = df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast")
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# altosight
folder = Path("../data/blocking/altosight/")

df1 = pd.read_csv(folder / "raw" / "X.csv").rename(columns={"instance_id": "id"})
# Keep a fixed subset of columns in a fixed order.
df1 = df1[
    ["id", "name", "price", "brand", "size"]
]
df1.to_csv(folder / "1_altosight.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "Y.csv")
# Keep only the rows labelled 1 and only their two id columns.
df = df.loc[df["label"] == 1, ["left_instance_id", "right_instance_id"]]
df = df.rename(columns={"left_instance_id": "id1", "right_instance_id": "id2"})
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Check (without modifying df) that each pair is already ordered by the
# position of its records in df1.
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# altosight2
folder = Path("../data/blocking/altosight2/")

df1 = pd.read_csv(folder / "raw" / "X.csv")
df1.to_csv(folder / "1_altosight.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "Y.csv").rename(columns={"lid": "id1", "rid": "id2"})
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Order each pair by the position of its records in df1, then confirm that
# applying the ordering again changes nothing.
df = df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast")
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# notebook_full
folder = Path("../data/blocking/notebook_full/")

df1 = pd.read_csv(folder / "raw" / "X.csv")
df1.to_csv(folder / "1_notebook.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "Y.csv").rename(columns={"lid": "id1", "rid": "id2"})
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Order each pair by the position of its records in df1, then confirm that
# applying the ordering again changes nothing.
df = df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast")
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# altosight_full
folder = Path("../data/blocking/altosight_full/")

df1 = pd.read_csv(folder / "raw" / "X.csv")
df1.to_csv(folder / "1_altosight.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

df = pd.read_csv(folder / "raw" / "Y.csv").rename(columns={"lid": "id1", "rid": "id2"})
# Every pair must reference ids present in the table.
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
# Order each pair by the position of its records in df1, then confirm that
# applying the ordering again changes nothing.
df = df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast")
assert df.equals(df.apply(lambda pair: swap_id(pair, df1), axis=1, result_type="broadcast"))
df.to_csv(folder / "matches.csv", index=False)

# Deprecated

In [None]:
# cds
folder = Path("../data/blocking/cds/")

# Profiles are written out exactly as loaded.
file = "../src/vendor/JedAIToolkit/data/dirtyErDatasets/cddbProfiles"
df1 = profile2df(file)
df1.to_csv(folder / "1_cds.csv", index=False)

# Duplicate pairs; every pair must reference ids present in the table.
file = "../src/vendor/JedAIToolkit/data/dirtyErDatasets/cddbIdDuplicates"
df = duplicates2df(file)
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df1["id"])).all()
df.to_csv(folder / "matches.csv", index=False)

In [None]:
# restaurants
# NOTE(review): the output filenames below are spelled "resturant"; the
# spelling is kept as-is — confirm any downstream readers before renaming.
folder = Path("../data/blocking/restaurants")

# First table: column names are reduced to the part after their last "#".
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/restaurant1Profiles"
df1 = profile2df(file)
df1 = df1.rename(columns=lambda col: col.split("#")[-1])
df1.to_csv(folder / "1_resturant.csv", index=False)
# Summary statistics of the per-cell string lengths.
print(df1.map(lambda cell: len(str(cell))).describe())

# Second table: same column-name shortening.
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/restaurant2Profiles"
df2 = profile2df(file)
df2 = df2.rename(columns=lambda col: col.split("#")[-1])
df2.to_csv(folder / "2_resturant.csv", index=False)
print(df2.map(lambda cell: len(str(cell))).describe())

# Duplicate pairs; every pair must reference an id present in the
# corresponding table.
file = "../src/vendor/JedAIToolkit/data/cleanCleanErDatasets/restaurantsIdDuplicates"
df = duplicates2df(file)
assert (df["id1"].isin(df1["id"]) & df["id2"].isin(df2["id"])).all()
df.to_csv(folder / "matches.csv", index=False)