# IS4200 Project
## Query suggestion
Ai Hsiao, Kaito Minami, Nadezhda Shiroglazova, Zonne Smit

In [1]:
import pandas as pd

df = pd.read_csv('60_queries.csv')

## TF-IDF Query suggestion

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

# Ensure required columns exist
if not {'Query', 'Suggested_Followup', 'Annotation'}.issubset(df.columns):
    raise ValueError("The CSV file must contain 'Query', 'Suggested_Followup', and 'Annotation' columns.")

# Step 2: Prepare Corpus and TF-IDF Model
corpus = df["Query"].tolist() + df["Suggested_Followup"].tolist()
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(corpus)

# Step 3: Generate Top-5 Suggestions
def generate_suggestions(query, tfidf_matrix, corpus, top_k=5):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked_indices = cosine_similarities.argsort()[::-1][1:top_k+1]  # Skip the query itself
    return [corpus[i] for i in ranked_indices]

# Step 4: Assign Relevance Scores Based on Word Matches
def assign_relevance_score(annotations, generated_query):
    # Convert to lowercase and split into sets of words
    annotation_words = set(annotations.lower().replace(",", "").split())  # Lowercase, remove commas, split
    query_words = set(generated_query.lower().split())  # Lowercase and split query into words

    # Check for matches
    matched_words = annotation_words.intersection(query_words)

    # Assign relevance scores based on matches
    if len(matched_words) >= 2:
        return 2  # Highly relevant
    elif len(matched_words) == 1:
        return 1  # Somewhat relevant
    else:
        return 0  # Irrelevant

# Step 5: Compute nDCG@5
def compute_ndcg_at_k(annotations, generated_suggestions, k=5):
    # Calculate relevance scores for each generated suggestion
    relevance_scores = [assign_relevance_score(annotations, gen) for gen in generated_suggestions[:k]]

    # Calculate DCG@5
    dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores))

    # Calculate IDCG@5 (Ideal DCG)
    idcg = sum(sorted(relevance_scores, reverse=True)[i] / np.log2(i + 2) for i in range(len(relevance_scores)))

    return dcg / idcg if idcg > 0 else 0, relevance_scores

# Step 6: Test with Random Queries
random_queries = random.sample(df["Query"].tolist(), 5)
results = []

for query in random_queries:
    # Generate top-5 suggestions
    generated_suggestions = generate_suggestions(query, tfidf_matrix, corpus, top_k=5)

    # Get the actual annotations and suggested follow-up for the query
    row = df[df["Query"] == query].iloc[0]
    annotations = row["Annotation"]
    actual_suggested_followup = row["Suggested_Followup"]

    # Calculate nDCG@5 and relevance scores
    ndcg_score, relevance_scores = compute_ndcg_at_k(annotations, generated_suggestions)

    results.append({
        "Query": query,
        "Generated Suggestions": generated_suggestions,
        "Relevance Scores": relevance_scores,
        "Actual Suggested Follow-up": actual_suggested_followup,
        "Annotations": annotations,
        "nDCG@5": ndcg_score
    })

# Step 7: Display Results
for result in results:
    print("Query:", result["Query"])
    print("Generated Suggestions:")
    for suggestion, relevance in zip(result["Generated Suggestions"], result["Relevance Scores"]):
        print(f"  - {suggestion} (Relevance: {relevance})")
    print("Actual Suggested Follow-up:", result["Actual Suggested Follow-up"])
    print("Annotations:", result["Annotations"])
    print("nDCG@5:", result["nDCG@5"])
    print("-" * 50)


Query: What is machine learning?
Generated Suggestions:
  - What is machine learning? (Relevance: 1)
  - How is machine learning different from Artificial Intelligence? (Relevance: 2)
  - How does machine learning differ from traditional programming? (Relevance: 2)
  - What are the benefits of learning a second language? (Relevance: 1)
  - How does e-learning improve access to education? (Relevance: 0)
Actual Suggested Follow-up: How is machine learning different from Artificial Intelligence?
Annotations: Machine Learning, Artificial Intelligence, Algorithms, Data Science
nDCG@5: 0.8807404415317287
--------------------------------------------------
Query: How does 5G technology work?
Generated Suggestions:
  - What is blockchain technology? (Relevance: 0)
  - What is the role of technology in education? (Relevance: 1)
  - How does renewable energy work? (Relevance: 0)
  - How does solar energy work? (Relevance: 0)
  - How does blockchain work in cryptocurrency? (Relevance: 0)
Actual Su

## Simple Keyword Extraction

In [3]:
def extract_keywords(tfidf_matrix, vectorizer, top_k=5):
    """
    Extract top-k keywords (including n-grams) for each document in the TF-IDF matrix.

    Args:
    - tfidf_matrix: TF-IDF matrix.
    - vectorizer: Fitted TfidfVectorizer with n-grams enabled.
    - top_k: Number of top keywords/phrases to extract.

    Returns:
    - List of top-k keywords or n-grams for each document.
    """
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        sorted_indices = row.toarray().flatten().argsort()[-top_k:][::-1]
        top_keywords = [feature_names[i] for i in sorted_indices if row.toarray()[0, i] > 0]
        keywords.append(top_keywords)
    return keywords


# Prepare the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(df["Query"].tolist() + df["Suggested_Followup"].tolist())

# Extract keywords for each query and response
df["Query_Keywords"] = extract_keywords(vectorizer.transform(df["Query"]), vectorizer)
df["Suggested_Followup_Keywords"] = extract_keywords(vectorizer.transform(df["Suggested_Followup"]), vectorizer)

# Display the updated dataset with extracted keywords
df[["Query", "Query_Keywords", "Suggested_Followup_Keywords"]].head()


Unnamed: 0,Query,Query_Keywords,Suggested_Followup_Keywords
0,What are the benefits of regular exercise?,"[benefits regular exercise, regular, regular e...","[help weight, weight, management, exercise hel..."
1,What is a balanced diet?,"[balanced, balanced diet, diet]","[balanced diet improve, does balanced diet, di..."
2,What is meditation?,[meditation],"[improve sleep quality, sleep quality, meditat..."
3,What are the benefits of a plant-based diet?,"[benefits plant, benefits plant based, plant, ...","[diet help reducing, does plant based, help re..."
4,What are some ways to manage stress?,"[manage stress, ways, ways manage, ways manage...","[reduce stress, help reduce stress, help reduc..."


## Keyword-based Suggestion

In [4]:
def generate_keyword_based_suggestions(query_keywords, all_responses, top_k=5):
    """
    Generate suggestions based on multi-keyword overlap.

    Args:
    - query_keywords: List of keywords or n-grams for the input query.
    - all_responses: List of all responses in the dataset.
    - top_k: Number of suggestions to return.

    Returns:
    - List of top-k suggestions ranked by n-gram overlap.
    """
    response_scores = []
    for response in all_responses:
        response_keywords = set(response.lower().split())  # Split response into words
        overlap = len(set(query_keywords).intersection(response_keywords))  # Count overlapping n-grams
        response_scores.append((response, overlap))

    # Rank responses by overlap
    response_scores.sort(key=lambda x: x[1], reverse=True)
    return [resp[0] for resp in response_scores[:top_k]]


# Example: Generate suggestions for a query
query = "What is machine learning?"
query_keywords = extract_keywords(vectorizer.transform([query]), vectorizer)[0]
all_responses = df["Suggested_Followup"].tolist()
suggestions = generate_keyword_based_suggestions(query_keywords, all_responses)
print("Query Keywords:", query_keywords)
print("Suggestions:", suggestions)


Query Keywords: ['machine', 'machine learning', 'learning']
Suggestions: ['How is machine learning different from Artificial Intelligence?', 'How does machine learning differ from traditional programming?', 'How does exercise help with weight management?', 'How does a balanced diet improve mental health?', 'How can meditation improve sleep quality?']


In [None]:
# analysis code to compare manual vs computer generated relevance judgement

'''
from scipy.stats import pearsonr

# Assuming you have manual scores in a DataFrame `df_manual`
# with columns: `Query`, `Generated_Suggestions`, `Manual_Relevance`, `Automatic_Relevance`

# Calculate correlation between manual and automatic scores
manual_scores = df_manual["Manual_Relevance"]
automatic_scores = df_manual["Automatic_Relevance"]
correlation, p_value = pearsonr(manual_scores, automatic_scores)
print(f"Correlation between manual and automatic scores: {correlation}")'''


'\nfrom scipy.stats import pearsonr\n\n# Assuming you have manual scores in a DataFrame `df_manual`\n# with columns: `Query`, `Generated_Suggestions`, `Manual_Relevance`, `Automatic_Relevance`\n\n# Calculate correlation between manual and automatic scores\nmanual_scores = df_manual["Manual_Relevance"]\nautomatic_scores = df_manual["Automatic_Relevance"]\ncorrelation, p_value = pearsonr(manual_scores, automatic_scores)\nprint(f"Correlation between manual and automatic scores: {correlation}")'

## Multi-word Keyword Extraction

In [2]:
!pip install rake-nltk



In [3]:
from rake_nltk import Rake

def rake_keyword(query):
    # Use RAKE to extract keywords
    r = Rake()
    r.extract_keywords_from_text(query)
    keywords = r.get_ranked_phrases()
    return set(keywords)

# rake_keyword(input_query)

In [13]:
for i in range(len(df)):
    try:
        df.loc[i,'Query_Multi'] = ', '.join(rake_keyword(df.loc[i,'Query']))
    except:
        continue

    try:
        df.loc[i,'Suggested_Followup_Multi'] = ', '.join(rake_keyword(df.loc[i,'Suggested_Followup']))
    except:
        continue

## Context Analysis Suggestion

In [12]:
from fuzzywuzzy import process
import re

# Text Preprocessing Function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Preprocess the dataset columns
df['Query'] = df['Query'].apply(preprocess_text)
df['Suggested_Followup'] = df['Suggested_Followup'].apply(preprocess_text)
df['Query_Multi'] = df['Query_Multi'].apply(preprocess_text)
df['Suggested_Followup_Multi'] = df['Suggested_Followup_Multi'].apply(preprocess_text)

# Function to find the best match and its follow-ups
def find_best_match_and_followups(input_query, data, threshold=70, top_k=5):
    """
    Find the best matching query in the dataset for the input query and return its follow-ups.

    Args:
        input_query (str): The user's input query.
        data (DataFrame): The dataframe containing 'Query', 'Suggested_Followup', and keywords.
        threshold (int): Minimum score for a match to be considered valid (0-100).

    Returns:
        tuple: The best match from the dataset and a list of its suggested follow-ups.
    """
    # Preprocess the input query
    input_query_processed = preprocess_text(input_query)
    
    # Combine all keywords into a single search field
    data['combined_keywords'] = data['Query_Multi'] + " " + data['Suggested_Followup_Multi']
    
    # Search for the best match across queries and keywords
    all_options = data['combined_keywords'].tolist()
    matches = [[k,v] for k, v in process.extract(input_query_processed, all_options, limit=top_k)]

    for i in range(len(matches)-1,-1,-1):
        matches[i][0] = data[data['combined_keywords'] == matches[i][0]]['Query'].iloc[0]
    
        # Check if the match score meets the threshold
        if matches[i][1] < threshold:
            matches.pop(i)
    
        followups = data[data['Query'] == matches[i][0]]['Suggested_Followup'].tolist()
        
        # Format suggestions: capitalize and ensure ending question marks
        formatted_followups = [
            suggestion.capitalize() + "?"
            if not suggestion.endswith("?") else suggestion.capitalize()
            for suggestion in followups
        ]
        matches[i][1] = formatted_followups
        
        # Capitalize and punctuate the matched query
        matches[i][0] = matches[i][0].capitalize() + "?"

    if matches:
        return matches
    else:
        return None, ["No matching query found in the dataset."]

# Test with an input query
input_query = "yoga benefits"
matches = find_best_match_and_followups(input_query, df)

print("Input Query:", input_query)
for match in matches:
    print(f"Match: {match[0]}\nSuggested Follow-up: {match[1][0]}\n")

Input Query: yoga benefits
Match: What are the benefits of regular exercise?
Suggested Follow-up: How does exercise help with weight management?

Match: What are the benefits of a plantbased diet?
Suggested Follow-up: How does a plantbased diet help in reducing cholesterol?

Match: How does yoga benefit your health?
Suggested Follow-up: What are the mental health benefits of yoga?

Match: What is cloud storage?
Suggested Follow-up: What are the benefits of using cloud storage?

Match: What is ecofriendly living?
Suggested Follow-up: What are the benefits of using renewable resources?



## BM25 and Word Embedding Re-ranking
reference: 
- https://dev.to/mage_ai/how-to-build-a-search-engine-with-word-embeddings-56jd
- https://github.com/czhu12/semantic-search/blob/master/search.py

In [None]:
# !pip install rank_bm25
%pip install rank_bm25
%pip install gensim

In [None]:
from rank_bm25 import BM25Okapi as BM25
import gensim
from gensim import corpora
import gensim.downloader as api
import numpy as np
import logging
logging.basicConfig(level=logging.DEBUG)

class Retriever(object):
    def __init__(self, documents):
        self.corpus = documents
        self.bm25 = BM25(self.corpus)

    def query(self, tokenized_query, n=100):
        scores = self.bm25.get_scores(tokenized_query)
        best_docs = sorted(range(len(scores)), key=lambda i: -scores[i])[:n]
        return best_docs, [scores[i] for i in best_docs]


class Ranker(object):
    def __init__(self, query_embedding, document_embedding):
        self.query_embedding = query_embedding
        self.document_embedding = document_embedding

    def _create_mean_embedding(self, word_embeddings):
        return np.mean(
            word_embeddings,
            axis=0,
        )

    def _create_max_embedding(self, word_embeddings):
        return np.amax(
            word_embeddings,
            axis=0,
        )

    def _embed(self, tokens, embedding):
        word_embeddings = np.array([embedding[token] for token in tokens if token in embedding])
        mean_embedding = self._create_mean_embedding(word_embeddings)
        max_embedding = self._create_max_embedding(word_embeddings)
        embedding = np.concatenate([mean_embedding, max_embedding])
        unit_embedding = embedding / (embedding**2).sum()**0.5
        return unit_embedding

    def rank(self, tokenized_query, tokenized_documents):
        """
        Re-ranks a set of documents according to embedding distance
        """
        query_embedding = self._embed(tokenized_query, self.query_embedding) # (E,)
        document_embeddings = np.array([self._embed(document, self.document_embedding) for document in tokenized_documents]) # (N, E)
        scores = document_embeddings.dot(query_embedding)
        index_rankings = np.argsort(scores)[::-1]
        return index_rankings, np.sort(scores)[::-1]


def tokenize(document):
    return list(gensim.utils.tokenize(document.lower()))


def show_scores(documents, scores, n=10):
    for i in range(n):
        print("======== RANK: {} | SCORE: {} =======".format(i + 1, scores[i]))
        print(documents[i])
        print("")
    print("\n")

In [None]:
input_query = "quantum computing"

print("Input Query: {}".format(input_query))

# Tokenize
documents = df['Query'].tolist() + df['Suggested_Followup'].tolist()
corpus = [list(gensim.utils.tokenize(doc.lower())) for doc in documents]
tokenized_query = tokenize(input_query)

# Retrieval
retriever = Retriever(corpus)
retrieval_indexes, retrieval_scores = retriever.query(tokenized_query)

# Deduplicate documents during retrieval
retrieved_documents = []
for idx in retrieval_indexes:
    if documents[idx] not in retrieved_documents:
        retrieved_documents.append(documents[idx])

print("======== BM25 ========")
show_scores(retrieved_documents, retrieval_scores, 20)

# Deduplicate tokenized retrieved documents during retrieval
tokenzed_retrieved_documents = []
for idx in retrieval_indexes:
    if corpus[idx] not in tokenzed_retrieved_documents:
        tokenzed_retrieved_documents.append(corpus[idx])

# Re-ranking
print("Loading glove embeddings...", end="")
query_embedding = api.load('glove-wiki-gigaword-50')
print(" [DONE]")

ranker = Ranker(query_embedding=query_embedding, document_embedding=query_embedding)
ranker_indexes, ranker_scores = ranker.rank(tokenized_query, tokenzed_retrieved_documents)

# Deduplicate reranked documents
reranked_documents = []
for idx in ranker_indexes:
    if retrieved_documents[idx] not in reranked_documents:
        reranked_documents.append(retrieved_documents[idx])

print("======== Embedding ========")
show_scores(reranked_documents, ranker_scores, 20)


## Keyword-based Sentence-BERT

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load a pre-trained semantic similarity model
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_refined_suggestions(query, all_responses, query_keywords, top_k=5):
    """
    Generate suggestions based on semantic similarity and keyword overlap.

    Args:
    - query: Input query.
    - all_responses: List of all responses in the dataset.
    - query_keywords: Extracted keywords for the query.
    - top_k: Number of suggestions to return.

    Returns:
    - List of top-k suggestions ranked by combined score.
    """
    query_embedding = model.encode([query])
    response_embeddings = model.encode(all_responses)
    semantic_scores = cosine_similarity(query_embedding, response_embeddings)[0]

    keyword_scores = [
        len(set(query_keywords).intersection(set(resp.lower().split())))
        for resp in all_responses
    ]

    # Combine scores with weighting
    combined_scores = 0.7 * semantic_scores + 0.3 * np.array(keyword_scores)
    ranked_indices = combined_scores.argsort()[::-1][:top_k]
    return [all_responses[i] for i in ranked_indices]

# Example: Generate refined suggestions
refined_suggestions = generate_refined_suggestions(query, all_responses, query_keywords)
print("Refined Suggestions:", refined_suggestions)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Refined Suggestions: ['How is machine learning different from Artificial Intelligence?', 'How does machine learning differ from traditional programming?', 'How is AI used in everyday applications?', 'What are the types of Artificial Intelligence systems?', 'What are the benefits of AI in medical diagnosis?']


## TF-IDF Manual Judgment

In [None]:
# Generate five query suggestions for each query using the TF-IDF-based system
def generate_suggestions(query, tfidf_matrix, corpus, top_k=5):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked_indices = cosine_similarities.argsort()[::-1][1:top_k+1]  # Skip the query itself
    return [corpus[i] for i in ranked_indices]

# Prepare the TF-IDF corpus
corpus = df["Query"].tolist() + df["Suggested_Followup"].tolist()
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Generate suggestions for each query in the dataset
query_suggestions = []
for query in df["Query"].tolist():
    suggestions = generate_suggestions(query, tfidf_matrix, corpus, top_k=5)
    for suggestion in suggestions:
        query_suggestions.append({
            "Query": query,
            "Suggested_Follow-Up": suggestion
        })

# Create a DataFrame for manual relevance judgment
manual_relevance_df = pd.DataFrame(query_suggestions)
manual_relevance_df["Manual Relevance"] = ""  # Placeholder for manual ratings

# Save the new Excel file for manual relevance judgment
manual_relevance_file_path = 'manual_relevance_judgment.xlsx'
manual_relevance_df.to_excel(manual_relevance_file_path, index=False)

# Download the file using the 'files' UI:
from google.colab import files
files.download('manual_relevance_judgment.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Sentence-BERT

In [None]:
pip install sentence-transformers




In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Step 2: Load Pre-trained Sentence-BERT Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Encode All Responses in the Dataset
response_embeddings = model.encode(df["Suggested_Followup"].tolist(), convert_to_tensor=True)

# Step 4: Function to Generate Suggestions with BERT
def generate_bert_suggestions(query, df, response_embeddings, top_k=5):
    """
    Generate suggestions using BERT-based embeddings and semantic similarity.
    Args:
        query: The input query string.
        df: The DataFrame containing the responses.
        response_embeddings: Precomputed embeddings for all responses.
        top_k: Number of suggestions to return.

    Returns:
        List of top-k suggestions ranked by semantic similarity.
    """
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity with all responses
    similarity_scores = util.pytorch_cos_sim(query_embedding, response_embeddings)[0]

    # Rank responses by similarity
    top_k_indices = similarity_scores.argsort(descending=True)[:top_k]

    # Retrieve suggestions
    suggestions = df.iloc[top_k_indices.cpu().numpy()]["Suggested_Followup"].tolist()

    # Include similarity scores for debugging
    scores = similarity_scores[top_k_indices].cpu().numpy()

    return suggestions, scores



Query: What is machine learning?
1. How is machine learning different from Artificial Intelligence? (Score: 0.7131)
2. How does machine learning differ from traditional programming? (Score: 0.6987)
3. How is AI used in everyday applications? (Score: 0.4849)
4. What are the types of Artificial Intelligence systems? (Score: 0.4574)
5. What are the benefits of AI in medical diagnosis? (Score: 0.3846)


In [None]:

# Example: Generate suggestions for a query
query = "What is machine learning?"
suggestions, scores = generate_bert_suggestions(query, df, response_embeddings, top_k=5)

# Print the results
print(f"Query: {query}")
for i, (suggestion, score) in enumerate(zip(suggestions, scores), start=1):
    print(f"{i}. {suggestion} (Score: {score:.4f})")

Query: What is machine learning?
1. How is machine learning different from Artificial Intelligence? (Score: 0.7131)
2. How does machine learning differ from traditional programming? (Score: 0.6987)
3. How is AI used in everyday applications? (Score: 0.4849)
4. What are the types of Artificial Intelligence systems? (Score: 0.4574)
5. What are the benefits of AI in medical diagnosis? (Score: 0.3846)


In [None]:
# Test the system on queries from the dataset
test_results = []
for query in df["Query"].tolist():
    suggestions, scores = generate_bert_suggestions(query, df, response_embeddings, top_k=5)
    test_results.append({
        "Query": query,
        "Suggestions": suggestions,
        "Scores": scores.tolist()  # Convert tensor to list for easier storage
    })

# Save the test results for review
test_results_path = 'bert_results.xlsx'
test_results_df = pd.DataFrame(test_results)
test_results_df.to_excel(test_results_path, index=False)

# Download the file using the 'files' UI:
from google.colab import files
files.download('bert_results.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Word-Embedding KNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load a pre-trained semantic similarity model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for corpus
def generate_embeddings(corpus):
    return model.encode(corpus)

# Prepare the TF-IDF corpus
corpus = df["Query"].tolist() + df["Suggested_Followup"].tolist()
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Generate sentence embeddings
embeddings = generate_embeddings(corpus)

# Build the k-NN model
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(embeddings)

# Function to generate refined suggestions using k-NN and keyword overlap
def generate_refined_suggestions(query, all_responses, query_keywords, top_k=5):
    """
    Generate suggestions based on semantic similarity, k-NN, and keyword overlap.

    Args:
    - query: Input query.
    - all_responses: List of all responses in the dataset.
    - query_keywords: Extracted keywords for the query.
    - top_k: Number of suggestions to return.

    Returns:
    - List of top-k suggestions ranked by combined score.
    """
    # Encode the query
    query_embedding = model.encode([query])

    # Get k-NN suggestions
    knn_distances, knn_indices = knn.kneighbors(query_embedding)

    # Ensure knn_indices is valid
    if knn_indices.shape[0] == 0:
        return []

    knn_suggestions = [all_responses[i] for i in knn_indices[0] if i < len(all_responses)]

    # Compute semantic similarity scores
    response_embeddings = model.encode(all_responses)
    semantic_scores = cosine_similarity(query_embedding, response_embeddings)[0]

    # Compute keyword overlap scores
    keyword_scores = [
        len(set(query_keywords).intersection(set(resp.lower().split())))
        for resp in all_responses
    ]

    # Combine scores with weighting
    combined_scores = 0.5 * semantic_scores + 0.3 * np.array(keyword_scores)

    # Include k-NN suggestions in ranking
    for knn_suggestion in knn_suggestions:
        if knn_suggestion in all_responses:
            index = all_responses.index(knn_suggestion)
            combined_scores[index] += 0.2  # Boost k-NN suggestions

    ranked_indices = np.argsort(combined_scores)[::-1][:top_k]
    return [all_responses[i] for i in ranked_indices]

# Example: Generate refined suggestions
query = "What are the benefits of yoga?"
query_keywords = ["benefits", "yoga"]  # Example extracted keywords
all_responses = df["Suggested_Followup"].tolist()
refined_suggestions = generate_refined_suggestions(query, all_responses, query_keywords)

print("Refined Suggestions:", refined_suggestions)

# Generate suggestions for each query in the dataset
query_suggestions = []
for query in df["Query"].tolist():
    suggestions = generate_refined_suggestions(query, all_responses, query_keywords)
    for suggestion in suggestions:
        query_suggestions.append({
            "Query": query,
            "Suggested Follow-Up": suggestion
        })

# Create a DataFrame for manual relevance judgment
manual_relevance_df = pd.DataFrame(query_suggestions)
manual_relevance_df["Manual Relevance"] = ""  # Placeholder for manual ratings

# Save the new Excel file for manual relevance judgment
manual_relevance_file_path = 'manual_relevance_judgment_knn.xlsx'
manual_relevance_df.to_excel(manual_relevance_file_path, index=False)

# Download the file using the 'files' UI:
from google.colab import files
files.download(manual_relevance_file_path)

Refined Suggestions: ['What are the mental health benefits of yoga?', 'What are the benefits of solar power?', 'What are the benefits of using renewable resources?', 'What are the benefits of using cloud storage?', 'What are the benefits of AI in medical diagnosis?']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Evaluation

In [None]:
import pandas as pd

manual_relevance_df = pd.read_excel('manual_relevance_judgment.xlsx')
print(manual_relevance_df.columns)
if not {'Query', 'Suggested_Follow-Up', 'Manual Relevance'}.issubset(manual_relevance_df.columns):
    raise ValueError("The file must contain 'Query', 'Suggested Follow-Up', and 'Manual Relevance' columns.")

manual_relevance_df['Manual Relevance'] = manual_relevance_df['Manual Relevance'].astype(int)


Index(['Query', 'Suggested_Follow-Up', 'Manual Relevance'], dtype='object')


In [None]:
def precision_at_k(relevance_scores, k):
    """
    Calculate Precision@k.

    Args:
    - relevance_scores (list): List of relevance scores for the top-k results.
    - k (int): Number of top results to consider.

    Returns:
    - float: Precision@k score.
    """
    return sum(1 for score in relevance_scores[:k] if score > 0) / k

# Example usage
precision_scores = []
k = 5

for query in manual_relevance_df['Query'].unique():
    query_data = manual_relevance_df[manual_relevance_df['Query'] == query]
    relevance_scores = query_data['Manual Relevance'].tolist()
    precision = precision_at_k(relevance_scores, k)
    precision_scores.append(precision)

average_precision = sum(precision_scores) / len(precision_scores)
print("Average Precision@5:", average_precision)


Average Precision@5: 0.7018181818181817


In [None]:
def recall_at_k(relevance_scores, total_relevant, k):
    """
    Calculate Recall@k.

    Args:
    - relevance_scores (list): List of relevance scores for the top-k results.
    - total_relevant (int): Total number of relevant items for the query.
    - k (int): Number of top results to consider.

    Returns:
    - float: Recall@k score.
    """
    if total_relevant == 0:
        return 0.0
    else:
        return sum(1 for score in relevance_scores[:k] if score > 0) / total_relevant

recall_scores = []

for query in manual_relevance_df['Query'].unique():
    query_data = manual_relevance_df[manual_relevance_df['Query'] == query]
    relevance_scores = query_data['Manual Relevance'].tolist()
    total_relevant = sum(1 for score in relevance_scores if score > 0)
    recall = recall_at_k(relevance_scores, total_relevant, k)
    recall_scores.append(recall)

average_recall = sum(recall_scores) / len(recall_scores)
print("Average Recall@5:", average_recall)


Average Recall@5: 0.9350649350649352


In [None]:
def ndcg_at_k(relevance_scores, k):
    """
    Calculate nDCG@k.

    Args:
    - relevance_scores (list): List of relevance scores for the top-k results.
    - k (int): Number of top results to consider.

    Returns:
    - float: nDCG@k score.
    """
    dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores[:k]))
    idcg = sum(sorted(relevance_scores, reverse=True)[:k][i] / np.log2(i + 2) for i in range(k))
    return dcg / idcg if idcg > 0 else 0

ndcg_scores = []

for query in manual_relevance_df['Query'].unique():
    query_data = manual_relevance_df[manual_relevance_df['Query'] == query]
    relevance_scores = query_data['Manual Relevance'].tolist()
    ndcg = ndcg_at_k(relevance_scores, k)
    ndcg_scores.append(ndcg)

average_ndcg = sum(ndcg_scores) / len(ndcg_scores)
print("Average nDCG@5:", average_ndcg)


Average nDCG@5: 0.9307821128181221
