In [2]:
import re

import pandas as pd

# Read both filtered CSV exports (file 1 first, then file 2) into DataFrames
file1_df, file2_df = [
    pd.read_csv(csv_path)
    for csv_path in ('filtered_file1.csv', 'filtered_file2.csv')
]

# Pairs of (file 1 column, file 2 column) that hold the same field in each input
columns_to_compare = [
    (field, field) for field in ("Sellers", "Buyers", "Brokers", "Surgery")
]

# The same pairs under the "_x" / "_y" suffixed names used when the comparison
# is run against the merged frame
columns_to_compare2 = [
    (f"{left}_x", f"{right}_y") for left, right in columns_to_compare
]

# Mapping of common abbreviations to full country names
country_mapping = {
    'UK': 'United Kingdom',
    'US': 'United States',
    'England': 'United Kingdom',
    'USA': 'United States',
}

# One alternation over every mapping key, anchored on word boundaries so a key
# is only replaced when it stands alone as a token ("US" no longer fires inside
# "USA" or "BELARUS"). Keys are escaped, and longer keys are listed first so a
# longer key is always preferred over a key that is its prefix.
_country_pattern = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(abbr) for abbr in sorted(country_mapping, key=len, reverse=True))
    + r')\b'
)

# Function to replace abbreviations with full country names
def replace_abbreviations(cell):
    """Expand every stand-alone country abbreviation found in a cell.

    Parameters
    ----------
    cell : object
        A single cell value. Missing values (anything ``pd.isna`` reports as
        missing) are passed through untouched.

    Returns
    -------
    object
        The missing value itself, or ``str(cell)`` with each whole-word
        occurrence of a ``country_mapping`` key replaced by its full name.

    Notes
    -----
    All keys are substituted in a single pass. Replacing them one after
    another with ``str.replace`` would let a short key rewrite part of a
    longer one (turning "USA" into "United StatesA").
    """
    if pd.isna(cell):
        return cell
    return _country_pattern.sub(
        lambda match: country_mapping[match.group(0)],
        str(cell),
    )

# Normalise country names in every compared column: all file 1 columns first,
# then all file 2 columns. `side` picks which element of each pair belongs to
# the frame being processed.
for frame, side in ((file1_df, 0), (file2_df, 1)):
    for pair in columns_to_compare:
        column = pair[side]
        frame[column] = frame[column].apply(replace_abbreviations)

# Function to compare two cells
def cell_comparison(cell1, cell2):
    """Report whether two list-like cells have at least one item in common.

    Each cell is read as text holding items separated by commas and/or
    newlines. Items are stripped of surrounding whitespace and blank items are
    ignored, so "A,B", "A, B" and "A\\nB" all describe the same two items.
    A missing value (as reported by ``pd.isna``) is treated as an empty list.

    Parameters
    ----------
    cell1, cell2 : object
        The two cell values to compare.

    Returns
    -------
    bool
        True when the cells share an item, or when both cells hold no items at
        all (for example both are missing); False otherwise.
    """
    def split_items(cell):
        # Missing cells contribute no items rather than the literal text "nan".
        if pd.isna(cell):
            return set()
        pieces = str(cell).replace('\n', ',').split(',')
        return {piece.strip() for piece in pieces if piece.strip()}

    items1 = split_items(cell1)
    items2 = split_items(cell2)
    # Two empty cells agree with each other even though they share no item.
    if not items1 and not items2:
        return True
    return not items1.isdisjoint(items2)

# Merge the two dataframes on ID
merged_df = pd.merge(file1_df, file2_df, on="ID")

# Print match counts for each column pair
for col1, col2 in columns_to_compare2:
    # Walk the two columns side by side instead of materialising a Series for
    # every row; the result is still a boolean Series aligned with merged_df.
    comparison = pd.Series(
        [
            cell_comparison(left_cell, right_cell)
            for left_cell, right_cell in zip(merged_df[col1], merged_df[col2])
        ],
        index=merged_df.index,
        dtype=bool,
    )
    match_count = comparison.sum()
    total_count = len(comparison)
    # If the merge produced no rows there is nothing to match; report 0.00%
    # rather than dividing by zero.
    match_percent = (match_count / total_count) * 100 if total_count else 0.0
    print(f"{col1}-{col2}: {match_count}/{total_count} matches ({match_percent:.2f}%)")


Sellers_x-Sellers_y: 233/270 matches (86.30%)
Buyers_x-Buyers_y: 240/270 matches (88.89%)
Brokers_x-Brokers_y: 252/270 matches (93.33%)
Surgery_x-Surgery_y: 259/270 matches (95.93%)


In [8]:
# Calculate precision, recall, F1, and full/partial match accuracy
def _split_items(value):
    """Turn one cell into the set of items it lists.

    Missing values (as reported by ``pd.isna``) give an empty set. Otherwise
    the cell is read as text, newlines are treated like commas, and every
    comma-separated piece is stripped of surrounding whitespace. Blank pieces
    (left by a trailing or doubled comma) are dropped so they are never
    counted as items.
    """
    if pd.isna(value):
        return set()
    pieces = str(value).replace('\n', ',').split(',')
    return {piece.strip() for piece in pieces if piece.strip()}


def _score_items(true_set, pred_set):
    """Return ``(precision, recall, f1)`` of ``pred_set`` against ``true_set``.

    Two empty sets score 1.0 on all three metrics. Any other zero denominator
    yields 0.0 for the affected metric.
    """
    if not true_set and not pred_set:
        return 1.0, 1.0, 1.0
    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1


def _summarize_column(true_values, pred_values):
    """Aggregate per-row scores for one pair of columns.

    Parameters
    ----------
    true_values, pred_values : iterable
        Row-aligned cells; ``true_values`` is the reference side and
        ``pred_values`` the side being scored.

    Returns
    -------
    dict
        Row-averaged precision, recall and F1 plus the share of rows that
        match fully (identical item sets), partially (at least one shared
        item) and either way, all as percentages rounded to two decimals.
        Every value is 0.0 when there are no rows.
    """
    precision_sum = recall_sum = f1_sum = 0.0
    full_matches = partial_matches = 0
    row_count = 0

    for true_cell, pred_cell in zip(true_values, pred_values):
        row_count += 1
        true_set = _split_items(true_cell)
        pred_set = _split_items(pred_cell)

        precision, recall, f1 = _score_items(true_set, pred_set)
        precision_sum += precision
        recall_sum += recall
        f1_sum += f1

        if true_set == pred_set:
            full_matches += 1
        elif true_set & pred_set:
            partial_matches += 1

    def as_percent(amount):
        # Guard the average so an empty frame reports 0.0 instead of raising.
        return round(amount / row_count * 100, 2) if row_count else 0.0

    return {
        'Precision (%)': as_percent(precision_sum),
        'Recall (%)': as_percent(recall_sum),
        'F1 Score (%)': as_percent(f1_sum),
        'Full Match Accuracy (%)': as_percent(full_matches),
        'Partial Match Accuracy (%)': as_percent(partial_matches),
        'Overall Accuracy (%)': as_percent(full_matches + partial_matches)
    }


results = {}
total = len(merged_df)

# "<col>_x" is scored as the reference side and "<col>_y" as the predicted side.
for col in ['Sellers', 'Buyers', 'Brokers', 'Surgery']:
    results[col] = _summarize_column(merged_df[f"{col}_x"], merged_df[f"{col}_y"])

# Print results: one heading per column, its metrics indented beneath it, and
# a blank line after each column's section
for col, metrics in results.items():
    report_lines = [f"{col}:"]
    report_lines.extend(f"  {metric}: {value}%" for metric, value in metrics.items())
    print("\n".join(report_lines))
    print()


Sellers:
  Precision (%): 74.37%
  Recall (%): 75.65%
  F1 Score (%): 71.19%
  Full Match Accuracy (%): 51.48%
  Partial Match Accuracy (%): 34.81%
  Overall Accuracy (%): 86.3%

Buyers:
  Precision (%): 79.21%
  Recall (%): 77.5%
  F1 Score (%): 74.5%
  Full Match Accuracy (%): 54.44%
  Partial Match Accuracy (%): 34.44%
  Overall Accuracy (%): 88.89%

Brokers:
  Precision (%): 86.16%
  Recall (%): 70.42%
  F1 Score (%): 72.84%
  Full Match Accuracy (%): 46.3%
  Partial Match Accuracy (%): 47.04%
  Overall Accuracy (%): 93.33%

Surgery:
  Precision (%): 89.32%
  Recall (%): 84.25%
  F1 Score (%): 83.28%
  Full Match Accuracy (%): 65.56%
  Partial Match Accuracy (%): 30.37%
  Overall Accuracy (%): 95.93%

