In [1]:
import pandas as pd

In [2]:
# Step 1: Load Dataset
# Replace 'your_dataset.csv' with the path to your dataset
df = pd.read_csv('60_queries.csv')

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

# Ensure required columns exist
if not {'Query', 'Suggested_Followup', 'Annotation'}.issubset(df.columns):
    raise ValueError("The CSV file must contain 'Query', 'Suggested_Followup', and 'Annotation' columns.")

# Step 2: Prepare Corpus and TF-IDF Model
corpus = df["Query"].tolist() + df["Suggested_Followup"].tolist()
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Step 3: Generate Top-5 Suggestions
def generate_suggestions(query, tfidf_matrix, corpus, top_k=5):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked_indices = cosine_similarities.argsort()[::-1][1:top_k+1]  # Skip the query itself
    return [corpus[i] for i in ranked_indices]

# Step 4: Assign Relevance Scores Based on Word Matches
def assign_relevance_score(annotations, generated_query):
    # Convert to lowercase and split into sets of words
    annotation_words = set(annotations.lower().replace(",", "").split())  # Lowercase, remove commas, split
    query_words = set(generated_query.lower().split())  # Lowercase and split query into words

    # Check for matches
    matched_words = annotation_words.intersection(query_words)

    # Assign relevance scores based on matches
    if len(matched_words) >= 2:
        return 2  # Highly relevant
    elif len(matched_words) == 1:
        return 1  # Somewhat relevant
    else:
        return 0  # Irrelevant

# Step 5: Compute nDCG@5
def compute_ndcg_at_k(annotations, generated_suggestions, k=5):
    # Calculate relevance scores for each generated suggestion
    relevance_scores = [assign_relevance_score(annotations, gen) for gen in generated_suggestions[:k]]

    # Calculate DCG@5
    dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores))

    # Calculate IDCG@5 (Ideal DCG)
    idcg = sum(sorted(relevance_scores, reverse=True)[i] / np.log2(i + 2) for i in range(len(relevance_scores)))

    return dcg / idcg if idcg > 0 else 0, relevance_scores

# Step 6: Test with Random Queries
random_queries = random.sample(df["Query"].tolist(), 5)
results = []

for query in random_queries:
    # Generate top-5 suggestions
    generated_suggestions = generate_suggestions(query, tfidf_matrix, corpus, top_k=5)

    # Get the actual annotations and suggested follow-up for the query
    row = df[df["Query"] == query].iloc[0]
    annotations = row["Annotation"]
    actual_suggested_followup = row["Suggested_Followup"]

    # Calculate nDCG@5 and relevance scores
    ndcg_score, relevance_scores = compute_ndcg_at_k(annotations, generated_suggestions)

    results.append({
        "Query": query,
        "Generated Suggestions": generated_suggestions,
        "Relevance Scores": relevance_scores,
        "Actual Suggested Follow-up": actual_suggested_followup,
        "Annotations": annotations,
        "nDCG@5": ndcg_score
    })

# Step 7: Display Results
for result in results:
    print("Query:", result["Query"])
    print("Generated Suggestions:")
    for suggestion, relevance in zip(result["Generated Suggestions"], result["Relevance Scores"]):
        print(f"  - {suggestion} (Relevance: {relevance})")
    print("Actual Suggested Follow-up:", result["Actual Suggested Follow-up"])
    print("Annotations:", result["Annotations"])
    print("nDCG@5:", result["nDCG@5"])
    print("-" * 50)


Query: What is genetic engineering?
Generated Suggestions:
  - How is CRISPR used in genetic engineering? (Relevance: 2)
  - How is AI used in everyday applications? (Relevance: 0)
  - What is eco-friendly living? (Relevance: 0)
  - How does renewable energy work? (Relevance: 0)
  - What is quantum computing? (Relevance: 0)
Actual Suggested Follow-up: How is CRISPR used in genetic engineering?
Annotations: Genetic Engineering, CRISPR, Science, Biotechnology
nDCG@5: 1.0
--------------------------------------------------
Query: What is the importance of recycling?
Generated Suggestions:
  - What is recycling? (Relevance: 0)
  - What is the importance of biodiversity? (Relevance: 0)
  - What is the importance of forests? (Relevance: 0)
  - What is the importance of vaccination? (Relevance: 0)
  - How does recycling reduce environmental pollution? (Relevance: 1)
Actual Suggested Follow-up: How does recycling reduce environmental pollution?
Annotations: Recycling, Environment, Pollution, Su

In [4]:
def extract_keywords(tfidf_matrix, vectorizer, top_k=5):
    """
    Extract top-k keywords for each document in the TF-IDF matrix.

    Args:
    - tfidf_matrix: TF-IDF matrix.
    - vectorizer: Fitted TfidfVectorizer.
    - top_k: Number of top keywords to extract.

    Returns:
    - List of top-k keywords for each document.
    """
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        # Get the top-k indices with the highest TF-IDF scores
        sorted_indices = row.toarray().flatten().argsort()[-top_k:][::-1]
        # Map indices to words
        top_keywords = [feature_names[i] for i in sorted_indices if row.toarray()[0, i] > 0]
        keywords.append(top_keywords)
    return keywords

# Prepare the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(df["Query"].tolist() + df["Suggested_Followup"].tolist())

# Extract keywords for each query and response
df["Query_Keywords"] = extract_keywords(vectorizer.transform(df["Query"]), vectorizer)
df["Suggested_Followup_Keywords"] = extract_keywords(vectorizer.transform(df["Suggested_Followup"]), vectorizer)

# Display the updated dataset with extracted keywords
df[["Query", "Query_Keywords", "Suggested_Followup_Keywords"]].head()


Unnamed: 0,Query,Query_Keywords,Suggested_Followup_Keywords
0,What are the benefits of regular exercise?,"[regular, exercise, benefits]","[management, weight, exercise, help, does]"
1,What is a balanced diet?,"[balanced, diet]","[balanced, diet, mental, health, improve]"
2,What is meditation?,[meditation],"[quality, meditation, sleep, improve]"
3,What are the benefits of a plant-based diet?,"[plant, based, diet, benefits]","[reducing, cholesterol, plant, based, diet]"
4,What are some ways to manage stress?,"[ways, stress, manage]","[stress, reduce, exercise, help, does]"


In [5]:
def generate_keyword_based_suggestions(query_keywords, all_responses, top_k=5):
    """
    Generate suggestions based on keyword overlap.

    Args:
    - query_keywords: List of keywords for the input query.
    - all_responses: List of all responses in the dataset.
    - top_k: Number of suggestions to return.

    Returns:
    - List of top-k suggestions ranked by keyword overlap.
    """
    response_scores = []
    for response in all_responses:
        response_keywords = set(response.lower().split())
        overlap = len(set(query_keywords).intersection(response_keywords))
        response_scores.append((response, overlap))

    # Rank responses by overlap score
    response_scores.sort(key=lambda x: x[1], reverse=True)
    return [resp[0] for resp in response_scores[:top_k]]

# Example: Generate suggestions for a query
query = "What is machine learning?"
query_keywords = extract_keywords(vectorizer.transform([query]), vectorizer)[0]
all_responses = df["Suggested_Followup"].tolist()
suggestions = generate_keyword_based_suggestions(query_keywords, all_responses)
print("Query Keywords:", query_keywords)
print("Suggestions:", suggestions)


Query Keywords: ['machine', 'learning']
Suggestions: ['How is machine learning different from Artificial Intelligence?', 'How does machine learning differ from traditional programming?', 'How does exercise help with weight management?', 'How does a balanced diet improve mental health?', 'How can meditation improve sleep quality?']


In [6]:
# analysis code to compare manual vs computer generated relevance judgement

'''
from scipy.stats import pearsonr

# Assuming you have manual scores in a DataFrame `df_manual`
# with columns: `Query`, `Generated_Suggestions`, `Manual_Relevance`, `Automatic_Relevance`

# Calculate correlation between manual and automatic scores
manual_scores = df_manual["Manual_Relevance"]
automatic_scores = df_manual["Automatic_Relevance"]
correlation, p_value = pearsonr(manual_scores, automatic_scores)
print(f"Correlation between manual and automatic scores: {correlation}")'''


'\nfrom scipy.stats import pearsonr\n\n# Assuming you have manual scores in a DataFrame `df_manual`\n# with columns: `Query`, `Generated_Suggestions`, `Manual_Relevance`, `Automatic_Relevance`\n\n# Calculate correlation between manual and automatic scores\nmanual_scores = df_manual["Manual_Relevance"]\nautomatic_scores = df_manual["Automatic_Relevance"]\ncorrelation, p_value = pearsonr(manual_scores, automatic_scores)\nprint(f"Correlation between manual and automatic scores: {correlation}")'

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load a pre-trained semantic similarity model
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_refined_suggestions(query, all_responses, query_keywords, top_k=5):
    """
    Generate suggestions based on semantic similarity and keyword overlap.

    Args:
    - query: Input query.
    - all_responses: List of all responses in the dataset.
    - query_keywords: Extracted keywords for the query.
    - top_k: Number of suggestions to return.

    Returns:
    - List of top-k suggestions ranked by combined score.
    """
    query_embedding = model.encode([query])
    response_embeddings = model.encode(all_responses)
    semantic_scores = cosine_similarity(query_embedding, response_embeddings)[0]

    keyword_scores = [
        len(set(query_keywords).intersection(set(resp.lower().split())))
        for resp in all_responses
    ]

    # Combine scores with weighting
    combined_scores = 0.7 * semantic_scores + 0.3 * np.array(keyword_scores)
    ranked_indices = combined_scores.argsort()[::-1][:top_k]
    return [all_responses[i] for i in ranked_indices]

# Example: Generate refined suggestions
refined_suggestions = generate_refined_suggestions(query, all_responses, query_keywords)
print("Refined Suggestions:", refined_suggestions)


2024-12-10 21:23:41.798471: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [8]:
import pandas as pd

# Generate five query suggestions for each query using the TF-IDF-based system
def generate_suggestions(query, tfidf_matrix, corpus, top_k=5):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked_indices = cosine_similarities.argsort()[::-1][1:top_k+1]  # Skip the query itself
    return [corpus[i] for i in ranked_indices]

# Prepare the TF-IDF corpus
corpus = df["Query"].tolist() + df["Suggested_Followup"].tolist()
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Generate suggestions for each query in the dataset
query_suggestions = []
for query in df["Query"].tolist():
    suggestions = generate_suggestions(query, tfidf_matrix, corpus, top_k=5)
    for suggestion in suggestions:
        query_suggestions.append({
            "Query": query,
            "Suggested_Follow-Up": suggestion
        })

# Create a DataFrame for manual relevance judgment
manual_relevance_df = pd.DataFrame(query_suggestions)
manual_relevance_df["Manual Relevance"] = ""  # Placeholder for manual ratings

# Save the new Excel file for manual relevance judgment
manual_relevance_file_path = 'manual_relevance_judgment.xlsx'
manual_relevance_df.to_excel(manual_relevance_file_path, index=False)

# Download the file using the 'files' UI:
from google.colab import files
files.download('manual_relevance_judgment.xlsx')

ModuleNotFoundError: No module named 'google.colab'

In [9]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Step 2: Load Pre-trained Sentence-BERT Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Encode All Responses in the Dataset
response_embeddings = model.encode(df["Suggested_Followup"].tolist(), convert_to_tensor=True)

# Step 4: Function to Generate Suggestions with BERT
def generate_bert_suggestions(query, df, response_embeddings, top_k=5):
    """
    Generate suggestions using BERT-based embeddings and semantic similarity.
    Args:
        query: The input query string.
        df: The DataFrame containing the responses.
        response_embeddings: Precomputed embeddings for all responses.
        top_k: Number of suggestions to return.

    Returns:
        List of top-k suggestions ranked by semantic similarity.
    """
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity with all responses
    similarity_scores = util.pytorch_cos_sim(query_embedding, response_embeddings)[0]

    # Rank responses by similarity
    top_k_indices = similarity_scores.argsort(descending=True)[:top_k]

    # Retrieve suggestions
    suggestions = df.iloc[top_k_indices.cpu().numpy()]["Suggested_Followup"].tolist()

    # Include similarity scores for debugging
    scores = similarity_scores[top_k_indices].cpu().numpy()

    return suggestions, scores



RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [10]:

# Example: Generate suggestions for a query
query = "What is machine learning?"
suggestions, scores = generate_bert_suggestions(query, df, response_embeddings, top_k=5)

# Print the results
print(f"Query: {query}")
for i, (suggestion, score) in enumerate(zip(suggestions, scores), start=1):
    print(f"{i}. {suggestion} (Score: {score:.4f})")

NameError: name 'generate_bert_suggestions' is not defined

In [None]:
# Test the system on queries from the dataset
test_results = []
for query in df["Query"].tolist():
    suggestions, scores = generate_bert_suggestions(query, df, response_embeddings, top_k=5)
    test_results.append({
        "Query": query,
        "Suggestions": suggestions,
        "Scores": scores.tolist()  # Convert tensor to list for easier storage
    })

# Save the test results for review
test_results_path = 'bert_results.xlsx'
test_results_df = pd.DataFrame(test_results)
test_results_df.to_excel(test_results_path, index=False)

# Download the file using the 'files' UI:
from google.colab import files
files.download('bert_results.xlsx')

## Evaluation

In [11]:
import pandas as pd

manual_relevance_df = pd.read_excel('manual_relevance_judgment.xlsx')
print(manual_relevance_df.columns)
if not {'Query', 'Suggested_Follow-Up', 'Manual Relevance'}.issubset(manual_relevance_df.columns):
    raise ValueError("The file must contain 'Query', 'Suggested Follow-Up', and 'Manual Relevance' columns.")

manual_relevance_df['Manual Relevance'] = manual_relevance_df['Manual Relevance'].astype(int)


Index(['Query', 'Suggested_Follow-Up', 'Manual Relevance'], dtype='object')


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
def precision_at_k(relevance_scores, k):
    """
    Calculate Precision@k.

    Args:
    - relevance_scores (list): List of relevance scores for the top-k results.
    - k (int): Number of top results to consider.

    Returns:
    - float: Precision@k score.
    """
    return sum(1 for score in relevance_scores[:k] if score > 0) / k

# Example usage
precision_scores = []
k = 5

for query in manual_relevance_df['Query'].unique():
    query_data = manual_relevance_df[manual_relevance_df['Query'] == query]
    relevance_scores = query_data['Manual Relevance'].tolist()
    precision = precision_at_k(relevance_scores, k)
    precision_scores.append(precision)

average_precision = sum(precision_scores) / len(precision_scores)
print("Average Precision@5:", average_precision)


In [None]:
def recall_at_k(relevance_scores, total_relevant, k):
    """
    Calculate Recall@k.

    Args:
    - relevance_scores (list): List of relevance scores for the top-k results.
    - total_relevant (int): Total number of relevant items for the query.
    - k (int): Number of top results to consider.

    Returns:
    - float: Recall@k score.
    """
    if total_relevant == 0:
        return 0.0
    else:
        return sum(1 for score in relevance_scores[:k] if score > 0) / total_relevant

recall_scores = []

for query in manual_relevance_df['Query'].unique():
    query_data = manual_relevance_df[manual_relevance_df['Query'] == query]
    relevance_scores = query_data['Manual Relevance'].tolist()
    total_relevant = sum(1 for score in relevance_scores if score > 0)
    recall = recall_at_k(relevance_scores, total_relevant, k)
    recall_scores.append(recall)

average_recall = sum(recall_scores) / len(recall_scores)
print("Average Recall@5:", average_recall)


In [None]:
def ndcg_at_k(relevance_scores, k):
    """
    Calculate nDCG@k.

    Args:
    - relevance_scores (list): List of relevance scores for the top-k results.
    - k (int): Number of top results to consider.

    Returns:
    - float: nDCG@k score.
    """
    dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores[:k]))
    idcg = sum(sorted(relevance_scores, reverse=True)[:k][i] / np.log2(i + 2) for i in range(k))
    return dcg / idcg if idcg > 0 else 0

ndcg_scores = []

for query in manual_relevance_df['Query'].unique():
    query_data = manual_relevance_df[manual_relevance_df['Query'] == query]
    relevance_scores = query_data['Manual Relevance'].tolist()
    ndcg = ndcg_at_k(relevance_scores, k)
    ndcg_scores.append(ndcg)

average_ndcg = sum(ndcg_scores) / len(ndcg_scores)
print("Average nDCG@5:", average_ndcg)
