In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# -----------------------------------
# Utility Functions
# -----------------------------------
def isnan(val):
   """Return the result of comparing ``val`` with itself for inequality.

   A float NaN compares unequal to itself, so the result is truthy for NaN
   and falsy for ordinary scalars such as numbers, strings or ``None``.
   """
   differs_from_itself = val != val
   return differs_from_itself
def clean_text(text):
   """Normalise a cell value to a lower-cased, stripped string.

   Missing values (anything ``pd.isna`` reports as missing) become the empty
   string; every other value is converted with ``str`` first.
   """
   return "" if pd.isna(text) else str(text).lower().strip()
def find_best_matching_company(extracted_company, master_company_list, threshold=0.7):
   """Return the master company name most similar to ``extracted_company``.

   Similarity is the cosine similarity of character n-gram (3-4, word
   boundary aware) TF-IDF vectors fitted on the extracted name together with
   all master names.

   Parameters
   ----------
   extracted_company : Any
       Raw extracted company name; it is normalised with ``clean_text``.
   master_company_list : iterable of str or None
       Candidate company names. Any iterable is accepted; it is copied into a
       list before use.
   threshold : float, default 0.7
       Minimum cosine similarity required to accept the best candidate.

   Returns
   -------
   str or None
       The best candidate when its similarity is at least ``threshold``;
       ``None`` when there are no candidates, the extracted name is empty, or
       no candidate reaches the threshold.
   """
   # Copy into a list so tuples / arrays / Series can be concatenated and
   # indexed, and bail out early: cosine_similarity rejects a 0-row matrix.
   candidates = [] if master_company_list is None else list(master_company_list)
   if not candidates:
       return None
   extracted_company = clean_text(extracted_company)
   if not extracted_company:
       return None
   # Vectorizing: row 0 is the query, rows 1.. are the candidates.
   vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 4))
   tfidf = vectorizer.fit_transform([extracted_company] + candidates)
   sims = cosine_similarity(tfidf[0:1], tfidf[1:])[0]
   best_idx = int(np.argmax(sims))
   if sims[best_idx] >= threshold:
       return candidates[best_idx]
   return None

# -----------------------------------
# Core Matching Engine
# -----------------------------------
def get_scores_df(names, ref_names, range=(2, 2)):
   """Build a table of cosine similarities between names and reference names.

   A character n-gram TF-IDF vectorizer (accents stripped to ASCII,
   lower-cased) is fitted on ``ref_names + names``; both lists are then
   transformed with it.

   Parameters
   ----------
   names : list of str
       Strings to score; they populate the ``"names"`` column.
   ref_names : list of str
       Reference strings; each becomes a column named after the string.
   range : tuple of (int, int), default (2, 2)
       Forwarded as ``ngram_range``. The name shadows the builtin but is kept
       because callers pass it by keyword.

   Returns
   -------
   pandas.DataFrame
       One row per entry of ``names``; column ``"names"`` followed by one
       similarity column per distinct reference string.
   """
   char_tfidf = TfidfVectorizer(
       strip_accents="ascii",
       lowercase=True,
       analyzer="char",
       ngram_range=range,
   )
   char_tfidf.fit(ref_names + names)
   name_vectors = char_tfidf.transform(names)
   ref_vectors = char_tfidf.transform(ref_names)

   # Collect the columns first and build the frame once at the end. A
   # repeated key overwrites the earlier column in place, exactly as repeated
   # column assignment on a DataFrame does.
   columns = {"names": names}
   for row, ref_name in enumerate(ref_names):
       similarities = cosine_similarity(ref_vectors[row:row + 1], name_vectors)
       columns[ref_name] = similarities[0]
   return pd.DataFrame(columns)
def score_name(name, ref_name, verbose=False):
   """Score how closely an extracted person name matches a reference name.

   Both strings are split on whitespace into tokens. Tokens are paired
   greedily: each round takes the highest-similarity (name token, reference
   token) pair reported by ``get_scores_df``. A pair scoring above
   ``score_thresh`` is recorded as a match. Otherwise a single-character
   name token that equals a remaining reference initial is recorded as an
   initial, and any other token is set aside as an "extra" name.

   The final score is a weighted average of an order score, a missing-token
   score and the weighted per-token similarity, multiplied by penalties for
   extra tokens and for single-token names.

   Parameters
   ----------
   name : str
       Extracted name.
   ref_name : str
       Reference name to compare against.
   verbose : bool, default False
       Currently unused; kept so existing callers keep working.

   Returns
   -------
   float
       ``0.0`` when either input has no tokens or no reference token could
       be matched; ``0.3`` when every matched token is only an initial;
       otherwise the penalised weighted average described above.

   Notes
   -----
   Reference tokens are used as DataFrame index labels.
   NOTE(review): a reference name with repeated tokens (e.g. "john john")
   yields duplicate labels; that case is not handled here -- confirm whether
   it can occur in the master data.
   """
   score_thresh = 0.5
   middle_weight = 0.7
   first_weight = 1.0
   last_weight = 1.0
   # split() without an argument drops empty tokens; split(" ") produced ""
   # for empty input or doubled spaces, which crashed the initials lookup.
   name = name.split()
   ref_name = ref_name.split()
   if not name or not ref_name:
       return 0.0

   original_ref_name = ref_name.copy()
   ref_first_name = original_ref_name[0]
   ref_last_name = original_ref_name[-1]
   last_ref_index = len(original_ref_name) - 1
   # initials[i] is blanked once reference token i has been consumed.
   initials = [x[0] for x in ref_name]
   original_name = name.copy()
   extra_names = []

   # One row per reference token; filled in as tokens are matched.
   df = pd.DataFrame()
   df["ref"] = ref_name
   df["name"] = np.nan
   df["name_pos"] = np.nan
   df["score"] = np.nan
   df["is_initial"] = 0
   df = df.astype({"name": object, "name_pos": object, "score": float, "is_initial": int})
   df = df.set_index("ref")

   while (len(ref_name) > 0) and (len(name) > 0):
       df_scores = get_scores_df(name, ref_name, range=(1, 3)).drop_duplicates(keep="first")
       df_scores = df_scores.set_index("names")
       # Best remaining (name token, reference token) pair.
       (name_split, ref_split) = df_scores.stack().idxmax()
       score = df_scores.at[name_split, ref_split]
       original_index = original_name.index(name_split)
       original_ref_index = original_ref_name.index(ref_split)
       if score > score_thresh:
           df.loc[ref_split, "name"] = name_split
           df.loc[ref_split, "name_pos"] = original_index
           df.loc[ref_split, "score"] = score
           ref_name.remove(ref_split)
           name.remove(name_split)
           initials[original_ref_index] = ""
       elif len(name_split) > 1 or name_split not in initials:
           # Not similar enough and not a usable initial: treat as extra.
           extra_names.append(name_split)
           name.remove(name_split)
       else:
           # Single character that equals a remaining reference initial.
           if isnan(df.at[ref_first_name, "name"]) and name_split == ref_first_name[0]:
               ref_split = ref_first_name
               original_ref_index = 0
               initial_score = 0.5
           elif isnan(df.at[ref_last_name, "name"]) and name_split == ref_last_name[0]:
               ref_split = ref_last_name
               original_ref_index = last_ref_index
               initial_score = 0.6
           else:
               # NOTE(review): ref_split is still the best-similarity token,
               # which is not necessarily the token whose initial matched.
               initial_score = 0.8
           # Use .loc rather than chained df["score"][...] assignment, which
           # does not write through under pandas Copy-on-Write.
           df.loc[ref_split, "score"] = initial_score
           df.loc[ref_split, "name"] = name_split
           df.loc[ref_split, "name_pos"] = original_index
           df.loc[ref_split, "is_initial"] = 1
           ref_name.remove(ref_split)
           name.remove(name_split)
           initials[original_ref_index] = ""

   if len(name) > 0:
       extra_names.extend(name)

   df = df.reset_index()
   df["is_missing"] = 0
   df.loc[df["name"].isnull(), "is_missing"] = 1
   df["is_first_name"] = 0
   df.loc[0, "is_first_name"] = 1
   df["is_last_name"] = 0
   df.loc[last_ref_index, "is_last_name"] = 1
   df["is_middle_name"] = 1
   df.loc[(df["is_first_name"] + df["is_last_name"] >= 1), "is_middle_name"] = 0
   df["weight"] = middle_weight
   df.at[0, "weight"] = first_weight
   df.at[last_ref_index, "weight"] = last_weight

   # Rows for reference tokens that received a name token.
   pos_df = df.dropna(subset=["name"]).reset_index(drop=True)
   if pos_df.empty:
       # Nothing matched at all. Without this guard the "all matched tokens
       # are initials" check below is vacuously true and would return 0.3.
       return 0.0

   order_score = 0.5
   if pos_df["name_pos"].is_monotonic_increasing:
       order_score = 1.0
   elif pos_df.at[0, "is_first_name"] == 1 and pos_df.at[len(pos_df)-1, "is_last_name"] == 1:
       order_score = 0.8
   elif pos_df.at[0, "is_last_name"] == 1 and pos_df.at[1, "is_first_name"] == 1:
       # "Last First ..." ordering; check the remaining tokens keep order.
       if pos_df.iloc[2:, :]["name_pos"].is_monotonic_increasing:
           order_score = 0.7
       else:
           order_score = 0.6

   missing_score = 1
   if df[df["is_first_name"] == 1]["is_missing"].sum() == 1:
       missing_score *= 0.15
   if df[df["is_last_name"] == 1]["is_missing"].sum() == 1:
       missing_score *= 0.6
   middle_rows = df[df["is_middle_name"] == 1]
   missing_middle = middle_rows["is_missing"].sum()
   if missing_middle > 0:
       missing_percentage = missing_middle / len(middle_rows)
       if missing_percentage == 1:
           missing_score *= 0.6
       elif missing_percentage >= 0.5:
           missing_score *= 0.65
       else:
           missing_score *= 0.7

   splits_score = (pos_df["score"] * pos_df["weight"]).sum() / pos_df["weight"].sum()
   extra_penalty = 0.5 / len(extra_names) if len(extra_names) > 0 else 1
   single_name_penalty = 0.5 if len(original_name) == 1 else 1
   scores = [order_score, missing_score, splits_score]
   weights = [0.8, 1.0, 1.5]
   if (pos_df["is_initial"] == 1).all():
       # Every matched token is only an initial: fixed low-confidence score.
       score = 0.3
   else:
       score = single_name_penalty * extra_penalty * np.average(scores, weights=weights)
   if np.isnan(score):
       score = 0
   return score
# -----------------------------------
# Use Case Matching Function
# -----------------------------------
def is_abbreviation(abbrev, full_text):
   """Tell whether ``abbrev`` equals the initials of the words in ``full_text``.

   The comparison is case-insensitive. Words are the whitespace-separated
   pieces of ``full_text``; their first characters are joined in order.
   """
   candidate = abbrev.lower()
   first_letters = [token[0] for token in full_text.lower().split()]
   return ''.join(first_letters) == candidate
def abbreviation_overlap_score(abbrev, full_text):
   """Score how well ``abbrev`` partially matches the initials of ``full_text``.

   The initials of the whitespace-separated words of ``full_text`` form the
   expected abbreviation. ``abbrev`` is compared with it position by
   position, case-insensitively.

   Parameters
   ----------
   abbrev : str
       Candidate abbreviation.
   full_text : str
       Text whose word initials form the expected abbreviation.

   Returns
   -------
   float
       ``0.0`` when no position matches, when ``abbrev`` is empty, or when
       ``abbrev`` is longer than the expected abbreviation (it then cannot be
       a partial abbreviation of ``full_text``). Otherwise
       ``0.6 + 0.25 * matches / len(expected)``, i.e. at most 0.85.
   """
   abbrev = abbrev.lower().strip()
   expected_abbrev = ''.join(word[0] for word in full_text.lower().split())
   # Surplus characters used to be ignored, so any long text that merely
   # shared a first letter with a one-word text scored the maximum 0.85.
   if not abbrev or len(abbrev) > len(expected_abbrev):
       return 0.0
   # Count characters that agree at the same position.
   match_count = sum(1 for got, want in zip(abbrev, expected_abbrev) if got == want)
   if match_count == 0:
       return 0.0
   overlap_ratio = match_count / len(expected_abbrev)
   # Scale score between 0.6 and 0.85
   return 0.6 + (0.25 * overlap_ratio)
def score_text_similarity(text1, text2):
   """Score the similarity of two short texts such as job designations.

   Both texts are lower-cased and stripped, then checked in this order:

   1. Either text empty -> ``0.0``.
   2. Texts identical -> ``1.0``.
   3. Both texts shorter than two characters (and different) -> ``0.0``;
      neither the abbreviation checks nor 2-4 character n-grams can relate
      them.
   4. One text is exactly the initials of the other -> ``0.85``.
   5. Partial abbreviation overlap of at least 0.6 -> that overlap score.
   6. Otherwise the cosine similarity of character 2-4-gram TF-IDF vectors.

   Parameters
   ----------
   text1, text2 : str
       Texts to compare.

   Returns
   -------
   float
       Similarity score as described above.
   """
   if not text1 or not text2:
       return 0.0
   first = text1.lower().strip()
   second = text2.lower().strip()
   if not first or not second:
       return 0.0
   # Identical texts previously fell into the abbreviation heuristics and
   # could never score higher than 0.85.
   if first == second:
       return 1.0
   # Two different single characters yield no 2-grams at all, which makes
   # TfidfVectorizer fail with an empty vocabulary.
   if len(first) < 2 and len(second) < 2:
       return 0.0
   # Handling abbreviations
   if is_abbreviation(first, second) or is_abbreviation(second, first):
       return 0.85  # full match but slightly penalized
   # Handling partial abbreviation overlaps
   max_abbrev_score = max(
       abbreviation_overlap_score(first, second),
       abbreviation_overlap_score(second, first),
   )
   if max_abbrev_score >= 0.6:
       return max_abbrev_score
   # Fallback to character-level similarity
   vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
   tfidf = vectorizer.fit_transform([first, second])
   return float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])

def match_signatory(row, df_master):
   """Find the best master signatory for one extracted row.

   The row's company is matched against the distinct ``company_name`` values
   of ``df_master``. Every master row of the matched company is then scored
   on name (weight 0.7) and designation (weight 0.3), and the row with the
   highest combined score wins.

   Parameters
   ----------
   row : mapping
       Must provide ``extracted_name``, ``extracted_designation`` and
       ``company_name``.
   df_master : pandas.DataFrame
       Must provide ``company_name``, ``signatory_name`` and ``designation``.

   Returns
   -------
   pandas.Series
       Five values: matched name, matched designation, name score,
       designation score, combined score. When no company matches the values
       are ``None, None, 0.0, 0.0, 0.0``.
   """
   name = clean_text(row["extracted_name"])
   designation = clean_text(row["extracted_designation"])
   extracted_company = clean_text(row["company_name"])

   known_companies = df_master["company_name"].unique().tolist()
   matched_company = find_best_matching_company(extracted_company, known_companies)
   if not matched_company:
       return pd.Series([None, None, 0.0, 0.0, 0.0])

   df_company = df_master[df_master["company_name"] == matched_company]
   if df_company.empty:
       return pd.Series([None, None, 0.0, 0.0, 0.0])

   def _score_candidate(master_row):
       # Score one master signatory against the extracted name/designation.
       name_score = score_name(name, master_row["signatory_name"])
       designation_score = score_text_similarity(designation, master_row["designation"])
       combined_score = np.average([name_score, designation_score], weights=[0.7, 0.3])
       return (
           master_row["signatory_name"],
           master_row["designation"],
           name_score,
           designation_score,
           combined_score,
       )

   candidates = [_score_candidate(master_row) for _, master_row in df_company.iterrows()]
   # The combined score is the last tuple element; the first maximum wins ties.
   return pd.Series(max(candidates, key=lambda candidate: candidate[-1]))
# -----------------------------------
# Main Pipeline
# -----------------------------------
# Load the master list and normalise the text columns used for matching.
df_master = pd.read_csv("master_signatory_list.csv")
for column in ("company_name", "signatory_name", "designation"):
   df_master[column] = df_master[column].apply(clean_text)

# Load the extracted data and normalise its text columns the same way.
df_extracted = pd.read_csv("extracted_signatory_data.csv")
for column in ("company_name", "extracted_name", "extracted_designation"):
   df_extracted[column] = df_extracted[column].apply(clean_text)

# Match every extracted row against the master list; match_signatory returns
# five values per row, which land in the five result columns below.
result_columns = [
   "matched_name",
   "matched_designation",
   "name_match_score",
   "designation_match_score",
   "combined_score",
]
df_extracted[result_columns] = df_extracted.apply(match_signatory, axis=1, args=(df_master,))

# All rows are written out. To keep only high-confidence matches, filter on
# name_match_score >= 0.8 before saving.
df_extracted.to_csv("signatory_matching_output.csv", index=False)

In [2]:
# Preview the first rows only; the full result is in signatory_matching_output.csv.
df_extracted.head()

Unnamed: 0,company_name,extracted_name,extracted_designation,matched_name,matched_designation,name_match_score,designation_match_score,combined_score
0,abc corp,j smith,cfo,john smith,chief financial officer,0.886364,0.85,0.875455
