In [2]:
# Use the %pip magic (rather than !pip) so the package is installed into the environment
# of the running kernel; the version is pinned so a re-run resolves the same release.
%pip install rapidfuzz==3.14.0

Collecting rapidfuzz
  Obtaining dependency information for rapidfuzz from https://files.pythonhosted.org/packages/ae/b8/a79e997baf4f4467c8428feece5d7b9ac22ff0918ebf793ed247ba5a3f3a/rapidfuzz-3.14.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading rapidfuzz-3.14.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.0-cp311-cp311-macosx_11_0_arm64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.0


In [None]:
# Part A: match new query variations to pre-resolved queries (fuzzy matching + TF-IDF cosine similarity)

In [14]:
import pandas as pd
import re
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
# ----------------------
# Step 1: Load the files
# ----------------------
# resolved_queries.csv holds the already-resolved queries (Query_ID, Pre_Resolved_Query);
# new_queries.csv holds the incoming variations to be matched (Variation_Query).
resolved_df = pd.read_csv("resolved_queries.csv")
new_df = pd.read_csv("new_queries.csv")

# Fail fast with a readable message if a file lacks a column the later steps index into.
assert {"Query_ID", "Pre_Resolved_Query"} <= set(resolved_df.columns), (
    "resolved_queries.csv must contain the columns Query_ID and Pre_Resolved_Query"
)
assert "Variation_Query" in new_df.columns, (
    "new_queries.csv must contain the column Variation_Query"
)

# ----------------------
# Step 2: Preprocessing function
# ----------------------
def preprocess(text):
    """Normalize a query string so that it can be compared with other queries.

    The text is lower-cased, every character that is not an ASCII letter, a
    digit or whitespace is removed, and runs of whitespace are collapsed into
    single spaces with leading/trailing whitespace stripped.

    Args:
        text: The raw query. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string. Missing values (``None`` or a float NaN) yield
        ``""`` rather than the literal tokens ``"none"`` / ``"nan"``, so an
        empty cell can never look like a real word during matching.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Attach a normalized "clean" column to both frames; every matcher below works on it.
for frame, text_column in (
    (resolved_df, "Pre_Resolved_Query"),
    (new_df, "Variation_Query"),
):
    frame["clean"] = frame[text_column].map(preprocess)

# ----------------------
# Step 3: Fuzzy Matching
# ----------------------
def fuzzy_match(query, resolved_df, threshold=75):
    """Find the resolved query whose cleaned text best fuzzy-matches ``query``.

    Candidates are scored with ``fuzz.token_set_ratio``.

    Args:
        query: Cleaned query text to look up.
        resolved_df: Frame providing a ``clean`` column (candidate texts) and a
            ``Query_ID`` column (identifier returned for the winning row).
        threshold: Minimum score the best candidate must reach to be accepted.

    Returns:
        A ``(match_id, score)`` tuple. ``match_id`` is ``None`` when the best
        score is below ``threshold``. When there are no candidates at all the
        result is ``(None, 0.0)``.
    """
    choices = resolved_df["clean"].tolist()
    # extractOne returns None when it has nothing to choose from; unpacking that
    # result directly would raise, so the empty case is handled explicitly.
    best = (
        process.extractOne(query, choices, scorer=fuzz.token_set_ratio)
        if choices
        else None
    )
    if best is None:
        return None, 0.0

    _, score, idx = best
    if score >= threshold:
        # idx is the position inside `choices`, which mirrors the row order of
        # resolved_df, hence the positional lookup on the ID column.
        return resolved_df["Query_ID"].iloc[idx], score
    return None, score

# ----------------------
# Step 4: TF-IDF + Cosine Similarity
# ----------------------
# Fit the TF-IDF model once on the cleaned resolved queries; cosine_match() below
# reads both of these module-level objects.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(resolved_df["clean"])

def cosine_match(query, resolved_df, threshold=0.6):
    """Find the resolved query most similar to ``query`` by TF-IDF cosine similarity.

    Note: the similarity is computed against the module-level ``tfidf_matrix``
    using the module-level ``vectorizer``; ``resolved_df`` is only used to look
    up the ``Query_ID`` of the winning row by position.

    Args:
        query: Cleaned query text to look up.
        resolved_df: Frame providing the ``Query_ID`` column.
        threshold: Minimum cosine similarity for the best row to be accepted.

    Returns:
        A ``(match_id, score)`` tuple; ``match_id`` is ``None`` when the best
        similarity is below ``threshold``.
    """
    similarities = cosine_similarity(
        vectorizer.transform([query]), tfidf_matrix
    ).flatten()
    top_idx = similarities.argmax()
    top_score = similarities[top_idx]
    matched_id = (
        resolved_df.iloc[top_idx]["Query_ID"] if top_score >= threshold else None
    )
    return matched_id, top_score

# ----------------------
# Step 5: Match new queries
# ----------------------
# Run both matchers on every cleaned new query and collect one record per query.
results = []
for query in new_df["clean"]:
    fuzzy_id, fuzzy_score = fuzzy_match(query, resolved_df)
    cosine_id, cosine_score = cosine_match(query, resolved_df)

    results.append(
        {
            "Variation_Query": query,
            "Fuzzy_Match_ID": fuzzy_id,
            "Fuzzy_Score": fuzzy_score,
            # rapidfuzz scores are on a 0-100 scale; rescale to 0-1 to sit next to the cosine score.
            "Fuzzy_Score_Norm": fuzzy_score / 100.0,
            "Cosine_Match_ID": cosine_id,
            "Cosine_Score": cosine_score,
            # The cosine match wins whenever it exists; the fuzzy match is the fallback.
            "Matches_With_Query_ID": fuzzy_id if cosine_id is None else cosine_id,
        }
    )

results_df = pd.DataFrame(results)



In [12]:
results_df

Unnamed: 0,Variation_Query,Fuzzy_Match_ID,Fuzzy_Score,Fuzzy_Score_Norm,Cosine_Match_ID,Cosine_Score,Matches_With_Query_ID
0,unabel to conect to the internet,1.0,94.915254,0.949153,1.0,0.839042,1.0
1,cant connect to internet,1.0,88.372093,0.883721,1.0,0.836936,1.0
2,intenet not working,,48.979592,0.489796,,0.0,
3,payment failed while chekout,2.0,82.758621,0.827586,2.0,0.707107,2.0
4,payment did not go through during chckout,,70.422535,0.704225,2.0,0.707107,2.0
5,payment issue at check out,,57.142857,0.571429,,0.5,
6,application crashes when opening setings,3.0,87.671233,0.876712,3.0,0.774597,3.0
7,app crash when going to settings,3.0,86.153846,0.861538,3.0,0.722471,3.0
8,settings cause the app to chrash,,64.615385,0.646154,,0.508047,
9,forgot passwrd and cant reset,4.0,75.0,0.75,4.0,0.782698,4.0


In [None]:
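# Interpreting cosine similarity: a higher score means the two queries are more similar.
# Range: 0 → 1 (TF-IDF weights are non-negative, so the score cannot drop below 0)
# 0 = the queries share no vocabulary terms (completely different)
# 1 = the vectors point in the same direction (perfect match)

# Rules of thumb for query matching:
# >0.7 → very strong similarity
# 0.5–0.7 → moderate, possible match (cosine_match accepts a match from 0.6 upwards)
# <0.5 → usually weak or irrelevant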

In [None]:
# Part B: fuzzy-match base names against name variations

In [18]:
import pandas as pd

# Preview the first rows of both input files using rich display (rendered tables)
# instead of print(), which falls back to a plain-text dump of each frame.
display(
    pd.read_csv("base_names.csv").head(),
    pd.read_csv("name_variations.csv").head(),
)


   Base_Name_ID         Base_Name
0             1        John Smith
1             2    Jennifer Brown
2             3  Michael O'Connor
3             4      Maria Garcia
4             5        Robert Lee
      Variation Matches_With_Base_Name
0  Thomas  King            Thomas King
1    ThomasKing            Thomas King
2  Maria Garcia           Maria Garcia
3     MaryLewis             Mary Lewis
4      Nancy W.           Nancy Wright


In [19]:
import pandas as pd
from rapidfuzz import process, fuzz

# Read each CSV, keep only the column needed here, and drop missing entries.
base_frame = pd.read_csv("base_names.csv")
base_names = base_frame["Base_Name"].dropna().tolist()

variations_frame = pd.read_csv("name_variations.csv")
name_variations = variations_frame["Variation"].dropna().tolist()

# Preprocessing helper function
def clean_name(name):
    """Return a normalized form of ``name`` for comparison.

    The value is converted to ``str``, lower-cased, stripped of commas, and its
    whitespace is normalized: no leading/trailing whitespace and exactly one
    space between words.
    """
    lowered = str(name).lower()
    without_commas = lowered.replace(",", "")
    # split() with no arguments drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with single spaces normalizes the spacing.
    return " ".join(without_commas.split())

base_names_clean = [clean_name(n) for n in base_names]
name_variations_clean = [clean_name(n) for n in name_variations]

# Perform fuzzy matching: for every base name, find the closest cleaned variation.
# NOTE(review): name_variations.csv also carries a Matches_With_Base_Name column, which
# suggests the intended direction may be variation -> base name; confirm with the task owner.
matches = []
for base, base_clean in zip(base_names, base_names_clean):
    best = process.extractOne(
        base_clean, name_variations_clean, scorer=fuzz.token_sort_ratio
    )
    if best is None:
        # extractOne returns None when there are no variations to compare against.
        matches.append([base, None, 0.0])
        continue
    _, score, match_idx = best
    # name_variations_clean is index-aligned with name_variations, so the position
    # returned by extractOne maps straight back to the original (uncleaned) spelling;
    # no second search through the list is needed.
    matches.append([base, name_variations[match_idx], score])

# NOTE(review): this rebinds results_df, which Part A also uses for its own results.
results_df = pd.DataFrame(matches, columns=["Base_Name", "Matched_Variation", "Similarity_Score"])
results_df

Unnamed: 0,Base_Name,Matched_Variation,Similarity_Score
0,John Smith,JOHN smith,100.0
1,Jennifer Brown,Jennifer Brown,100.0
2,Michael O'Connor,Michael O'Connor,100.0
3,Maria Garcia,Maria Garcia,100.0
4,Robert Lee,Robert Lee,100.0
5,Linda Johnson,linda johnson,100.0
6,William Davis,William Davis,100.0
7,Elizabeth Wilson,elizabeth wilson,100.0
8,David Martinez,DAVID martinez,100.0
9,Susan Clark,Susan Clark,100.0
