In [2]:
!pip install fuzzywuzzy[speedup] scikit-learn nltk

Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.0 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.0-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Do

In [3]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# Ensure the NLTK resources used below are available.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases, while older releases load 'punkt'; requesting both keeps the notebook
# runnable regardless of which NLTK version the unpinned install resolves to.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Preprocessing Function
# The stopword set and lemmatizer are built lazily on first use and then reused,
# instead of being rebuilt for every row passed through preprocess_text.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a query string for matching.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, tokenize, drop English stopwords, lemmatize each remaining
    token, and join the tokens back together with single spaces.

    Args:
        text: The raw query. ``None`` and float NaN (how pandas represents a
            missing CSV cell) are treated as an empty query; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The space-joined, normalized tokens (``''`` for missing input).
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercase, then remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize the text
    tokens = word_tokenize(text)
    stop_words, lemmatizer = _get_preprocess_resources()
    # Remove stopwords, lemmatize what is left, and join back to a string
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

# Load Data: previously resolved queries and the new variations to match against them
resolved_queries = pd.read_csv('resolved_queries.csv')
variation_queries = pd.read_csv('new_queries.csv')

# Preprocess both datasets: each frame gets a 'processed' column derived from
# its own raw-text column.
for frame, text_column in (
    (resolved_queries, 'Pre_Resolved_Query'),
    (variation_queries, 'Variation_Query'),
):
    frame['processed'] = frame[text_column].apply(preprocess_text)

# Step 1: Fuzzy Search Method
def fuzzy_match(unresolved_query, resolved_queries):
    """Find the resolved query closest to ``unresolved_query`` by ``fuzz.ratio``.

    Args:
        unresolved_query: The (preprocessed) query string to look up.
        resolved_queries: Candidate strings to match against.

    Returns:
        tuple: ``(best_match, score)``. When there are no candidates the
        result is ``(None, 0)`` instead of an unpacking error.
    """
    # Guard against an empty candidate collection; unsized iterables are
    # passed straight through to extractOne.
    try:
        if len(resolved_queries) == 0:
            return None, 0
    except TypeError:
        pass

    result = process.extractOne(unresolved_query, resolved_queries, scorer=fuzz.ratio)
    # extractOne yields None when nothing could be scored.
    if result is None:
        return None, 0
    # Index rather than unpack: mapping-like choices produce a third element
    # (the key), which would break a two-name unpacking.
    return result[0], result[1]

# Applying Fuzzy Search for each unresolved query
# The candidate list is identical for every query, so it is built once up
# front rather than calling .tolist() on every loop iteration.
resolved_choices = resolved_queries['processed'].tolist()
fuzzy_results = []
for query in variation_queries['processed']:
    match, score = fuzzy_match(query, resolved_choices)
    fuzzy_results.append((query, match, score))

# Convert Fuzzy Results to DataFrame
fuzzy_df = pd.DataFrame(fuzzy_results, columns=['Unresolved Query', 'Resolved Query', 'Score'])

# Step 2: BoW/TF-IDF + Cosine Similarity
# Vectorizing queries
def vectorize_and_match(unresolved_queries, resolved_queries, method='tfidf'):
    """Compute pairwise cosine similarity between two collections of queries.

    Both collections are vectorized together so they share one vocabulary,
    then split back apart before the similarity is computed.

    Args:
        unresolved_queries: Iterable of query strings (list, pandas Series, ...).
        resolved_queries: Iterable of candidate strings (list, pandas Series, ...).
        method: ``'tfidf'`` selects ``TfidfVectorizer``; any other value
            (e.g. ``'bow'``) selects ``CountVectorizer``.

    Returns:
        A 2-D similarity matrix with one row per unresolved query and one
        column per resolved query. If either collection is empty, an all-zero
        array of the matching (possibly zero-sized) shape is returned.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    # list() accepts plain lists as well as Series/arrays, matching what
    # get_best_matches is called with.
    unresolved_texts = list(unresolved_queries)
    resolved_texts = list(resolved_queries)

    # With nothing on one side there is nothing to compare; return an empty
    # matrix rather than failing inside the vectorizer/similarity call.
    if not unresolved_texts or not resolved_texts:
        return np.zeros((len(unresolved_texts), len(resolved_texts)))

    vectorizer = TfidfVectorizer() if method == 'tfidf' else CountVectorizer()

    # Combine both collections for vectorization (unresolved first).
    vectorized_queries = vectorizer.fit_transform(unresolved_texts + resolved_texts)

    # Split vectorized data back into unresolved and resolved queries.
    split_at = len(unresolved_texts)
    unresolved_vecs = vectorized_queries[:split_at]
    resolved_vecs = vectorized_queries[split_at:]

    # Compute cosine similarity between unresolved and resolved queries.
    return cosine_similarity(unresolved_vecs, resolved_vecs)

# Compute Cosine Similarity for BoW and TF-IDF
# Same inputs for both runs; only the vectorization method differs.
bow_cosine_sim, tfidf_cosine_sim = (
    vectorize_and_match(
        variation_queries['processed'],
        resolved_queries['processed'],
        method=vectorization_method,
    )
    for vectorization_method in ('bow', 'tfidf')
)

# Getting best matches using cosine similarity
def get_best_matches(cosine_sim_matrix, unresolved_queries, resolved_queries):
    """Pick the highest-similarity resolved query for each unresolved query.

    Args:
        cosine_sim_matrix: Iterable of similarity rows; row ``i`` holds the
            scores of unresolved query ``i`` against every resolved query.
            Each row must support ``.argmax()`` and integer indexing.
        unresolved_queries: Sequence indexed by row position.
        resolved_queries: Sequence indexed by column position.

    Returns:
        pandas.DataFrame with columns 'Unresolved Query', 'Resolved Query'
        and 'Similarity', one row per row of the matrix.
    """
    records = []
    for row_number, similarities in enumerate(cosine_sim_matrix):
        top_column = similarities.argmax()
        record = (
            unresolved_queries[row_number],
            resolved_queries[top_column],
            similarities[top_column],
        )
        records.append(record)

    column_names = ['Unresolved Query', 'Resolved Query', 'Similarity']
    return pd.DataFrame(records, columns=column_names)

# Apply the best match extraction for both BoW and TF-IDF
# Each similarity matrix is paired with the same two lists of processed text.
bow_results, tfidf_results = (
    get_best_matches(
        similarity_matrix,
        variation_queries['processed'].tolist(),
        resolved_queries['processed'].tolist(),
    )
    for similarity_matrix in (bow_cosine_sim, tfidf_cosine_sim)
)

# Display Results: a heading followed by the first 20 rows of each method's matches
for heading, result_frame in (
    ("Fuzzy Match Results:", fuzzy_df),
    ("\nBoW + Cosine Similarity Results:", bow_results),
    ("\nTF-IDF + Cosine Similarity Results:", tfidf_results),
):
    print(heading)
    print(result_frame.head(20))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Fuzzy Match Results:
                      Unresolved Query                Resolved Query  Score
0               unabel conect internet       unable connect internet     93
1                cant connect internet       unable connect internet     82
2                      intenet working       unable connect internet     42
3               payment failed chekout       payment failed checkout     98
4                   payment go chckout       payment failed checkout     78
5                  payment issue check       payment failed checkout     76
6    application crash opening setings     app crash opening setting     83
7              app crash going setting     app crash opening setting     92
8             setting cause app chrash     app crash opening setting     37
9            forgot passwrd cant reset  forgot password unable reset     83
10     forgotten password unable reset  forgot password unable reset     95
11                 cant reset password  forgot password unable rese