<a href="https://colab.research.google.com/github/swayam305/SwayamParhi_NLP/blob/main/Assignment4_Text_Search/Assignment4_J066_Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from thefuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from collections import defaultdict

# Make sure the NLTK English stopword corpus is available locally
nltk.download('stopwords')

# Read the reference (already resolved) queries and the new query variations
resolved_df, new_df = map(pd.read_csv, ('resolved_queries.csv', 'new_queries.csv'))

# Quick look at the column names and the first rows of each frame
print("Resolved queries columns:", list(resolved_df.columns))
print("New queries columns:", list(new_df.columns))
print("\nResolved queries sample:", resolved_df.head(), sep="\n")
print("\nNew queries sample:", new_df.head(), sep="\n")

# Shared text-normalisation resources used by preprocess()
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Normalise a raw query into a space-separated string of stemmed tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, the result is split on whitespace,
    tokens found in the module-level ``stop_words`` set are dropped and the
    remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw query. Anything that is not a ``str`` (for example a
            missing value read from a CSV) is treated as empty.

    Returns:
        The stemmed tokens joined by single spaces, or ``""`` when ``text``
        is not a string or no tokens survive filtering.
    """
    if not isinstance(text, str):
        return ""

    # Punctuation is stripped before the stopword check, so the check sees
    # the already-cleaned tokens.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    stems = []
    for word in cleaned.split():
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

# Add a normalised ('processed') text column to both frames
resolved_df['processed'] = resolved_df['Pre_Resolved_Query'].map(preprocess)
new_df['processed'] = new_df['Variation_Query'].map(preprocess)

# Lookup tables: Query_ID -> processed text, Query_ID -> original text
id_to_processed = {
    query_id: text
    for query_id, text in zip(resolved_df['Query_ID'], resolved_df['processed'])
}
id_to_original = {
    query_id: text
    for query_id, text in zip(resolved_df['Query_ID'], resolved_df['Pre_Resolved_Query'])
}

# Reverse lookup: processed text -> Query_ID.
# NOTE: if two resolved queries normalise to the same processed text, the
# later Query_ID overwrites the earlier one in this dict.
processed_to_id = {
    text: query_id
    for text, query_id in zip(resolved_df['processed'], resolved_df['Query_ID'])
}

# Candidate strings for fuzzy matching, in the same row order as resolved_df
resolved_processed_list = list(resolved_df['processed'])

# Fit TF-IDF on the resolved queries; row i of the matrix corresponds to
# resolved_processed_list[i]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(resolved_processed_list)

# Function to get best match using all fuzzy methods
def get_fuzzy_match(query, choices, threshold=85):
    """Find the best fuzzy match for ``query`` across several scorers.

    Each scorer is run through ``process.extractOne`` and the candidate with
    the strictly highest score wins; on a tie the scorer listed first keeps
    the win.

    Args:
        query: The (preprocessed) query string to look up.
        choices: Candidate strings to match against.
        threshold: Minimum score the winner must reach to be accepted.

    Returns:
        A ``(match, score, method_name)`` tuple. ``match`` is ``None`` when
        the best score is below ``threshold``; ``score`` and ``method_name``
        still describe the best candidate seen (``0`` and ``""`` if no
        scorer produced a score above zero).
    """
    scorers = {
        'Simple Ratio': fuzz.ratio,
        'Partial Ratio': fuzz.partial_ratio,
        'Token Sort Ratio': fuzz.token_sort_ratio,
        'Token Set Ratio': fuzz.token_set_ratio,
        'Partial Token Sort Ratio': fuzz.partial_token_sort_ratio,
    }

    top_text, top_score, top_label = None, 0, ""
    for label, scorer in scorers.items():
        candidate = process.extractOne(query, choices, scorer=scorer)
        if not candidate:
            # extractOne found nothing for this scorer
            continue
        if candidate[1] > top_score:
            top_text, top_score, top_label = candidate[0], candidate[1], label

    accepted = top_text if top_score >= threshold else None
    return accepted, top_score, top_label

# Function to get best cosine similarity match
def get_cosine_match(query, tfidf_matrix, vectorizer, threshold=0.7, choices=None):
    """Find the resolved query most similar to ``query`` by TF-IDF cosine.

    Args:
        query: The (preprocessed) query string to look up.
        tfidf_matrix: TF-IDF matrix of the candidate texts; row ``i`` must
            correspond to ``choices[i]``.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
        threshold: Minimum cosine similarity required to accept the match.
        choices: Candidate texts aligned with the rows of ``tfidf_matrix``.
            Defaults to the module-level ``resolved_processed_list`` so
            existing callers keep working.

    Returns:
        A ``(match, score)`` tuple. ``match`` is ``None`` when the best
        similarity is below ``threshold`` (or when there are no candidates);
        ``score`` is the best similarity found.
    """
    if choices is None:
        # Backward-compatible fallback to the list the matrix was fitted on
        choices = resolved_processed_list

    # argmax() raises on an empty array, so bail out when there are no rows
    if tfidf_matrix.shape[0] == 0:
        return None, 0.0

    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    best_index = int(similarities.argmax())
    best_score = similarities[best_index]

    if best_score >= threshold:
        return choices[best_index], best_score
    return None, best_score

# Apply matching to new queries and evaluate against ground truth.
# For each method, "accuracy" is computed only over the queries for which that
# method returned a match at or above its threshold, and "coverage" is the
# share of all new queries for which it returned such a match.
results = []
method_performance = defaultdict(lambda: {'correct': 0, 'total': 0, 'scores': []})

# Iterate the needed columns directly instead of iterrows(): no per-row Series
# is built and the column dtypes are not coerced to a common type.
for new_query, original_query, ground_truth_id in zip(
    new_df['processed'], new_df['Variation_Query'], new_df['Matches_With_Query_ID']
):
    # Best fuzzy match across all fuzzy scorers
    fuzzy_match, fuzzy_score, fuzzy_method = get_fuzzy_match(
        new_query, resolved_processed_list, threshold=85
    )

    # Best TF-IDF cosine match
    cosine_match, cosine_score = get_cosine_match(
        new_query, tfidf_matrix, vectorizer, threshold=0.7
    )

    # Compare against None explicitly: a Query_ID of 0 (or an empty processed
    # string) is a legitimate value and must not be mistaken for "no match".
    fuzzy_found = fuzzy_match is not None
    cosine_found = cosine_match is not None

    # Determine predicted ID for each method
    fuzzy_predicted_id = processed_to_id.get(fuzzy_match) if fuzzy_found else None
    cosine_predicted_id = processed_to_id.get(cosine_match) if cosine_found else None

    # Check if predictions are correct
    fuzzy_correct = fuzzy_predicted_id is not None and bool(fuzzy_predicted_id == ground_truth_id)
    cosine_correct = cosine_predicted_id is not None and bool(cosine_predicted_id == ground_truth_id)

    # Update performance metrics; fuzzy results are credited to whichever
    # scorer produced the winning match for this query.
    if fuzzy_found:
        fuzzy_perf = method_performance[fuzzy_method]
        fuzzy_perf['correct'] += int(fuzzy_correct)
        fuzzy_perf['total'] += 1
        fuzzy_perf['scores'].append(fuzzy_score)

    if cosine_found:
        cosine_perf = method_performance['TF-IDF Cosine']
        cosine_perf['correct'] += int(cosine_correct)
        cosine_perf['total'] += 1
        cosine_perf['scores'].append(cosine_score)

    # Store results for each method separately
    results.append({
        'Variation_Query': original_query,
        'Ground_Truth_ID': ground_truth_id,
        'Ground_Truth_Query': id_to_original.get(ground_truth_id, ""),
        'Fuzzy_Predicted_ID': fuzzy_predicted_id,
        'Fuzzy_Predicted_Query': id_to_original.get(fuzzy_predicted_id, "") if fuzzy_predicted_id is not None else "",
        'Fuzzy_Method': fuzzy_method if fuzzy_found else "None",
        'Fuzzy_Score': fuzzy_score,
        'Fuzzy_Correct': fuzzy_correct,
        'TFIDF_Predicted_ID': cosine_predicted_id,
        'TFIDF_Predicted_Query': id_to_original.get(cosine_predicted_id, "") if cosine_predicted_id is not None else "",
        'TFIDF_Score': cosine_score,
        'TFIDF_Correct': cosine_correct
    })

# Create results dataframe
results_df = pd.DataFrame(results)

# Calculate performance metrics for each method
total_queries = len(new_df)
performance_data = []
for method, stats in method_performance.items():
    if stats['total'] > 0:
        performance_data.append({
            'Method': method,
            'Accuracy': stats['correct'] / stats['total'],
            'Coverage': stats['total'] / total_queries,
            'Average_Score': np.mean(stats['scores']) if stats['scores'] else 0,
            'Correct_Matches': stats['correct'],
            'Total_Matches': stats['total']
        })

# Pass the columns explicitly so the frame keeps its schema even when no
# method produced a single match (otherwise the lookups below raise KeyError).
performance_df = pd.DataFrame(
    performance_data,
    columns=['Method', 'Accuracy', 'Coverage', 'Average_Score',
             'Correct_Matches', 'Total_Matches']
)

# Save results
results_df.to_csv('matched_queries_results_individual.csv', index=False)
performance_df.to_csv('method_performance_comparison_individual.csv', index=False)

# Display results
print(f"\nProcessed {total_queries} queries")

ranked_performance = performance_df.sort_values(['Accuracy', 'Coverage'], ascending=[False, False])
print("\nMethod Performance Comparison (Individual Methods Only):")
print(ranked_performance)

print("\nSample matches:")
print(results_df[['Variation_Query', 'Ground_Truth_Query',
                 'Fuzzy_Predicted_Query', 'Fuzzy_Method', 'Fuzzy_Correct',
                 'TFIDF_Predicted_Query', 'TFIDF_Correct']].head(10))

# Detailed analysis by method
print("\n\nDetailed analysis by method:")
for method in performance_df['Method'].unique():
    method_data = performance_df[performance_df['Method'] == method].iloc[0]
    print(f"\n{method}:")
    print(f"  Accuracy: {method_data['Accuracy']:.2%}")
    print(f"  Coverage: {method_data['Coverage']:.2%}")
    print(f"  Average Score: {method_data['Average_Score']:.2f}")
    print(f"  Correct Matches: {method_data['Correct_Matches']}/{method_data['Total_Matches']}")

# Find the best individual method (highest accuracy, ties broken by coverage)
if performance_df.empty:
    best_method = None
    print("\nBest individual method: none (no method produced a match above its threshold)")
else:
    best_method = ranked_performance.iloc[0]
    print(f"\nBest individual method: {best_method['Method']} with accuracy {best_method['Accuracy']:.2%} and coverage {best_method['Coverage']:.2%}")

# Calculate overall statistics for each method type
fuzzy_methods = [m for m in performance_df['Method'] if m != 'TF-IDF Cosine']
fuzzy_stats = performance_df[performance_df['Method'].isin(fuzzy_methods)]
fuzzy_correct_total = fuzzy_stats['Correct_Matches'].sum()
fuzzy_match_total = fuzzy_stats['Total_Matches'].sum()

print("\nFuzzy Methods Combined:")
print(f"  Total Correct: {fuzzy_correct_total}")
print(f"  Total Matches: {fuzzy_match_total}")
# Guard both ratios: there may be no fuzzy matches, or no queries at all
if fuzzy_match_total > 0:
    print(f"  Overall Accuracy: {fuzzy_correct_total / fuzzy_match_total:.2%}")
else:
    print("  Overall Accuracy: n/a (no fuzzy matches)")
fuzzy_coverage = fuzzy_match_total / total_queries if total_queries else 0.0
print(f"  Overall Coverage: {fuzzy_coverage:.2%}")

tfidf_stats = performance_df[performance_df['Method'] == 'TF-IDF Cosine']
print("\nTF-IDF Cosine:")
if tfidf_stats.empty:
    # No query cleared the cosine threshold, so there is no row to report
    print("  No matches met the similarity threshold")
else:
    print(f"  Correct Matches: {tfidf_stats['Correct_Matches'].iloc[0]}")
    print(f"  Total Matches: {tfidf_stats['Total_Matches'].iloc[0]}")
    print(f"  Accuracy: {tfidf_stats['Accuracy'].iloc[0]:.2%}")
    print(f"  Coverage: {tfidf_stats['Coverage'].iloc[0]:.2%}")

Resolved queries columns: ['Query_ID', 'Pre_Resolved_Query']
New queries columns: ['Variation_Query', 'Matches_With_Query_ID']

Resolved queries sample:
   Query_ID                    Pre_Resolved_Query
0         1     Unable to connect to the internet
1         2        Payment failed during checkout
2         3     App crashes when opening settings
3         4   Forgot password and unable to reset
4         5  Unable to upload files to the server

New queries sample:
                             Variation_Query  Matches_With_Query_ID
0           Unabel to conect to the internet                      1
1                  Can’t connect to internet                      1
2                        Intenet not working                      1
3               Payment failed while chekout                      2
4  Payment did not go through during chckout                      2

Processed 20 queries

Method Performance Comparison (Individual Methods Only):
                     Method  Accuracy 

[nltk_data] Downloading package stopwords to C:\Users\Ryan
[nltk_data]     Matthew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
