In [None]:
import pandas as pd

# Load the raw Quora pairs and keep only the rows labelled as duplicates.
data = pd.read_csv('quora_question_pairs.csv')
duplicate_mask = data['is_duplicate'] == 1
duplicates = data.loc[duplicate_mask]

# Only the two question-text columns are used from here on.
filtered_data = duplicates.loc[:, ['question1', 'question2']]


In [None]:
exploratory_starters = ['What', 'How', 'Why', 'Explain']

def is_exploratory(question):
    """Return True when `question` begins with one of `exploratory_starters`.

    Surrounding whitespace is ignored and the prefix comparison is
    case-sensitive.

    Args:
        question: The question text. Non-string values (for example the float
            NaN that pandas produces for an empty CSV cell) are accepted and
            treated as not exploratory rather than raising AttributeError.

    Returns:
        bool: True if the stripped text starts with any configured starter.
    """
    if not isinstance(question, str):
        return False
    # str.startswith accepts a tuple of prefixes and checks them in one call.
    return question.strip().startswith(tuple(exploratory_starters))

# Keep a pair only when BOTH of its questions look exploratory.
first_is_exploratory = filtered_data['question1'].apply(is_exploratory)
second_is_exploratory = filtered_data['question2'].apply(is_exploratory)
filtered_data = filtered_data[first_is_exploratory & second_is_exploratory]


In [None]:
# Rename the question columns to the conversational schema and add an
# empty 'Context' column, then persist the result for the TF-IDF section.
annotated_data = (
    filtered_data
    .rename(columns={'question1': 'Query', 'question2': 'Suggested_Followup'})
    .assign(Context="")
)

annotated_data.to_csv('annotated_conversational_data.csv', index=False)


In [None]:
# Draw a fixed-seed sample of 50 pairs and write it out as a small dataset.
small_dataset = annotated_data.sample(n=50, random_state=42)
small_dataset.to_csv(
    'small_annotated_data.csv',
    index=False,
)


# TF-IDF
### Start from here if `annotated_conversational_data.csv` already exists (the cells below only read that file)

In [None]:
import pandas as pd

# Reload the annotated pairs written by the preprocessing section.
data = pd.read_csv('annotated_conversational_data.csv')

# Pull both text columns out as plain Python lists.
queries, suggestions = (
    data[column].tolist() for column in ('Query', 'Suggested_Followup')
)

# The TF-IDF corpus is every query followed by every suggested follow-up.
documents = [*queries, *suggestions]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF model over the whole corpus: text is lowercased and
# English stop words are dropped before weighting.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
# One row per entry of `documents`, in the same order.
tfidf_matrix = vectorizer.fit_transform(documents)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def suggest_queries(input_query, vectorizer, tfidf_matrix, documents, top_n=3):
    """
    Suggest follow-up queries based on an input query.

    Documents are ranked by cosine similarity between their TF-IDF row and
    the TF-IDF vector of `input_query`. The ranking is walked from most to
    least similar, skipping any document whose text equals `input_query` and
    any text that has already been suggested, until `top_n` suggestions have
    been collected or the corpus is exhausted.

    Args:
        input_query (str): The initial query input by the user.
        vectorizer (TfidfVectorizer): The fitted TF-IDF vectorizer.
        tfidf_matrix (sparse matrix): TF-IDF matrix of the documents; row i
            must correspond to documents[i].
        documents (list): List of documents (queries and suggestions).
        top_n (int): Number of suggestions to return.

    Returns:
        list: Ranked list of at most `top_n` distinct suggested queries.
            Empty when `top_n` is not positive.
    """
    if top_n <= 0:
        return []

    input_vector = vectorizer.transform([input_query])
    similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()

    # Seeding `seen` with the input query excludes exact echoes of it.
    seen = {input_query}
    suggestions = []

    # Filter while walking the FULL ranking; truncating to top_n first would
    # return too few results whenever the best matches are the query itself
    # or repeated documents.
    for index in similarities.argsort()[::-1]:
        candidate = documents[index]
        if candidate in seen:
            continue
        seen.add(candidate)
        suggestions.append(candidate)
        if len(suggestions) == top_n:
            break

    return suggestions


# Test follow-up queries

In [None]:
# Run the recommender on a sample query and list what it returns.
input_query = "What are the benefits of yoga?"
suggested_queries = suggest_queries(
    input_query,
    vectorizer,
    tfidf_matrix,
    documents,
)

print(f"Input Query: {input_query}")
print("Suggested Follow-up Queries:")
for rank, suggestion in enumerate(suggested_queries, start=1):
    print(str(rank) + ". " + str(suggestion))


Input Query: What are the benefits of yoga?
Suggested Follow-up Queries:
1. How do you do yoga?
2. What is yoga all about?
3. What is yoga for?


In [None]:
def precision_at_k(relevant, retrieved, k):
    """Compute precision@k: the share of the top-k retrieved items that are relevant.

    Items are compared by exact equality, and duplicates within the first `k`
    retrieved items are counted once. The denominator is always `k`, even
    when fewer than `k` items were retrieved.

    Args:
        relevant: Iterable of items considered relevant.
        retrieved: Ranked sequence of retrieved items (best first).
        k (int): Cut-off rank; must be a positive integer.

    Returns:
        float: Number of distinct relevant items among retrieved[:k], divided by k.

    Raises:
        ValueError: If `k` is not positive. A zero `k` would divide by zero,
            and a negative `k` would slice from the end of `retrieved` and
            yield a negative score.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    hits = set(relevant) & set(retrieved[:k])
    return len(hits) / k

# Hand-picked follow-ups regarded as relevant for the sample query; they are
# matched against the suggestions by exact string equality.
relevant_suggestions = [
    "How does yoga improve health?",
    "What are the mental benefits of yoga?",
]
precision = precision_at_k(relevant_suggestions, suggested_queries, k=3)
print(f"Precision@3: {precision}")


Precision@3: 0.0
