<a href="https://colab.research.google.com/github/swayam305/SwayamParhi_NLP/blob/main/Assignment4_Text_Search/Assignment4_J066_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from thefuzz import process, fuzz
import re

def preprocess_name(name):
    """Clean and normalize names for better matching"""
    if pd.isna(name):
        return ""

    # Convert to lowercase
    name = str(name).lower()

    # Remove non-alphanumeric characters except spaces, apostrophes, and hyphens
    name = re.sub(r'[^a-z\'\- ]', '', name)

    # Normalize spaces - replace multiple spaces with single space
    name = re.sub(r' +', ' ', name).strip()

    return name

# Load the canonical names and the variations that need to be matched
base_df = pd.read_csv('base_names.csv')
variations_df = pd.read_csv('name_variations.csv')

# Map each normalized base name back to its original spelling. If two base
# names normalize to the same key, the one from the later row wins.
base_names_mapping = {
    preprocess_name(original): original
    for original in base_df['Base_Name']
}

# Function to find the best match for a variation
def find_best_match(variation, threshold=80):
    """Find the base name that best matches a single name variation.

    The variation is normalized with ``preprocess_name`` and compared against
    the normalized keys of ``base_names_mapping`` using
    ``fuzz.token_sort_ratio``.

    Args:
        variation: The raw name variation to look up.
        threshold: Minimum score (inclusive) for a candidate to count as a
            match. Defaults to 80.

    Returns:
        A ``(name, score)`` tuple. ``name`` is the original base name when
        the best score reaches ``threshold``; otherwise it is the string
        ``"No match found"``. ``score`` is the best candidate's score, or 0
        when no comparison was possible.
    """
    clean_variation = preprocess_name(variation)

    # A blank query carries no information; skipping the search also keeps
    # it from being paired with a base name that normalized to "".
    if not clean_variation:
        return "No match found", 0

    result = process.extractOne(
        clean_variation,
        list(base_names_mapping),
        scorer=fuzz.token_sort_ratio,
    )

    # extractOne gives back None rather than a tuple when it has no
    # candidate to return (e.g. no base names were loaded).
    if result is None:
        return "No match found", 0

    best_match, score = result
    if score >= threshold:
        return base_names_mapping[best_match], score
    return "No match found", score

# Match every variation against the base names. The two result columns are
# built from a plain list of (name, score) tuples, which avoids constructing
# a pd.Series per row and also works when there are no rows at all.
match_results = [find_best_match(v) for v in variations_df['Variation']]
variations_df['Matched_Base_Name'] = [matched for matched, _ in match_results]
variations_df['Confidence_Score'] = [score for _, score in match_results]

# Display results
print("Matching Results:")
print("=" * 50)
print(variations_df.to_string(index=False))

# Save results to CSV
variations_df.to_csv('name_matches_pandas.csv', index=False)
print("\nResults saved to 'name_matches_pandas.csv'")

# Summary statistics; the rate is reported as 0.0% when there are no rows
# instead of dividing by zero.
total_variations = len(variations_df)
matched_count = (variations_df['Matched_Base_Name'] != "No match found").sum()
match_rate = matched_count / total_variations * 100 if total_variations else 0.0
print("\nSummary Statistics:")
print(f"Total variations: {total_variations}")
print(f"Successfully matched: {matched_count} ({match_rate:.1f}%)")

# One line per variation, ticked when a base name was found
print("\nDetailed matches with confidence scores:")
for variation, matched, score in zip(
    variations_df['Variation'],
    variations_df['Matched_Base_Name'],
    variations_df['Confidence_Score'],
):
    status = "✓" if matched != "No match found" else "✗"
    print(f"{status} {variation} -> {matched} ({score})")

Matching Results:
         Variation Matches_With_Base_Name Matched_Base_Name  Confidence_Score
      Thomas  King            Thomas King       Thomas King               100
        ThomasKing            Thomas King    No match found                57
      Maria Garcia           Maria Garcia      Maria Garcia               100
         MaryLewis             Mary Lewis    No match found                53
          Nancy W.           Nancy Wright    No match found                74
      Dani3l Scott           Daniel Scott      Daniel Scott                96
       JOHN  smith             John Smith        John Smith               100
     linda johnson          Linda Johnson     Linda Johnson               100
      N@ncy Wright           Nancy Wright      Nancy Wright                96
     William Davis          William Davis     William Davis               100
      Susan  Clark            Susan Clark       Susan Clark               100
        SusanClark            Susan Clark    N