In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import random
import json

from tqdm import tqdm

# Round1_T2D_f3

In [None]:
GT_file = './data/dataset_GT/Round1_T2D_f3.csv'

chunk_size = 1000  # Adjust this based on your memory constraints
ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round1_T2D/tables/"
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [None]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R1_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R1_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R1_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [None]:
# SPLIT OVER THE QUARTILES

n = len(R1_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R1_sorted_mentions[:q1_idx]
q2 = R1_sorted_mentions[q1_idx:q2_idx]
q3 = R1_sorted_mentions[q2_idx:q3_idx]
q4 = R1_sorted_mentions[q3_idx:]
    

In [None]:
sample_size = int(len(R1_sorted_mentions)/40)  
R1_sample_keys = []
R1_sample_keys = R1_sample_keys + random.sample(q1, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q2, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q3, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R1_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R1_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

# Round3_2019

In [None]:
GT_file = './data/dataset_GT/Round3_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round3_2019/tables/"
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [None]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R3_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R3_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R3_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [None]:
# SPLIT OVER THE QUARTILES

n = len(R3_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R3_sorted_mentions[:q1_idx]
q2 = R3_sorted_mentions[q1_idx:q2_idx]
q3 = R3_sorted_mentions[q2_idx:q3_idx]
q4 = R3_sorted_mentions[q3_idx:]
    

In [None]:
sample_size = int(len(R3_sorted_mentions)/40) 
R3_sample_keys = []
R3_sample_keys = R3_sample_keys + random.sample(q1, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q2, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q3, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R3_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R3_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

# 2T_Round4

In [None]:
GT_file = './data/dataset_GT/2T-2020_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            ids[row["key"]] = {
                "id": 'https://www.wikidata.org/entity/' + row['id'],
                "name": row['name'],
                "ed_score": row['ed_score'],
                "jaccard_score": row['jaccard_score']
            }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/2T_2020/tables/"
cea_file = './data/Dataset/Dataset/2T_2020/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/2T_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [None]:
####################
# READ THE JSON
#####################

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_2T_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R4_2T_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R4_2T_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [None]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_2T_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_2T_sorted_mentions[:q1_idx]
q2 = R4_2T_sorted_mentions[q1_idx:q2_idx]
q3 = R4_2T_sorted_mentions[q2_idx:q3_idx]
q4 = R4_2T_sorted_mentions[q3_idx:]

In [None]:
sample_size = int(len(R4_2T_sorted_mentions)/40) 
R4_2T_sample_keys = []
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q1, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q2, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q3, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R4_2T_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R4_2T_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

# Round4

In [52]:
GT_file = './data/dataset_GT/Round4_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            
            if row['name'] == "imo 9528017":
                print(row)
                break

            
            ids[row["key"]] = {
                "id": 'https://www.wikidata.org/entity/' + row['id'],
                "name": row['name'],
                "ed_score": row['ed_score'],
                "jaccard_score": row['jaccard_score']
            }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

 29%|██▉       | 5302/18384 [01:06<02:43, 79.81it/s]

tableName                                                    95RMZLI4
key                                                      95RMZLI4 9 0
id                                                          Q83641013
name                                                      imo 9528017
types                      [{'id': 'Q15276', 'name': 'bulk carrier'}]
ambiguity_mention                                                 0.0
corrects_tokens                                                   1.0
ntoken_mention                                                      2
ntoken_entity                                                       2
length_mention                                                     14
length_entity                                                      11
popularity                                                        0.0
pos_score                                                       0.002
es_score                                                          1.0
ed_score            

 88%|████████▊ | 16258/18384 [03:27<00:26, 80.42it/s]

tableName                                                    RDL6Z4OQ
key                                                      RDL6Z4OQ 7 0
id                                                          Q83641013
name                                                      imo 9528017
types                      [{'id': 'Q15276', 'name': 'bulk carrier'}]
ambiguity_mention                                                 0.0
corrects_tokens                                                   1.0
ntoken_mention                                                      2
ntoken_entity                                                       2
length_mention                                                     14
length_entity                                                      11
popularity                                                        0.0
pos_score                                                       0.002
es_score                                                          1.0
ed_score            

100%|██████████| 18384/18384 [03:55<00:00, 77.97it/s]

Processing complete.





In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round4_2020/tables/"
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value


In [None]:
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round4_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [None]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R4_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R4_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [None]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_sorted_mentions[:q1_idx]
q2 = R4_sorted_mentions[q1_idx:q2_idx]
q3 = R4_sorted_mentions[q2_idx:q3_idx]
q4 = R4_sorted_mentions[q3_idx:]

In [None]:
sample_size = int(len(R4_sorted_mentions)/40) 
R4_sample_keys = []
R4_sample_keys = R4_sample_keys + random.sample(q1, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q2, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q3, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q4, sample_size)

In [None]:

# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R4_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R4_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Datasets Comparison

In [None]:
def extract_scores(data):
    return [item[1]['ed_score'] for item in data]

ed_scores_R1 = extract_scores(R1_sample_keys)
ed_scores_R3 = extract_scores(R3_sample_keys)
ed_scores_R4 = extract_scores(R4_sample_keys)
ed_scores_R4_2T = extract_scores(R4_2T_sample_keys)

# Plot the KDE plots
plt.figure(figsize=(10, 6))

sns.kdeplot(ed_scores_R1, color='skyblue', label='R1 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R3, color='green', label='R3 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R4, color='red', label='R4 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R4_2T, color='purple', label='R4_2T Edit Distance Score', fill=True)

plt.xlabel('Edit Distance Score')
plt.ylabel('Density')
plt.title('Density Plot of Edit Distance Scores for Different Rounds')
plt.legend(loc='upper left')
plt.show()