In [2]:
import pandas as pd

data = pd.read_csv('quora_question_pairs.csv')
duplicates = data[data['is_duplicate'] == 1]
filtered_data = duplicates[['question1', 'question2']]


FileNotFoundError: [Errno 2] No such file or directory: 'quora_question_pairs.csv'

In [None]:
exploratory_starters = ['What', 'How', 'Why', 'Explain']

def is_exploratory(question):
    return any(question.strip().startswith(starter) for starter in exploratory_starters)

filtered_data = filtered_data[
    filtered_data['question1'].apply(is_exploratory) &
    filtered_data['question2'].apply(is_exploratory)
]


In [None]:
annotated_data = filtered_data.rename(columns={
    'question1': 'Query',
    'question2': 'Suggested_Followup'
})
annotated_data['Context'] = ""

annotated_data.to_csv('annotated_conversational_data.csv', index=False)


In [None]:
small_dataset = annotated_data.sample(50, random_state=42)
small_dataset.to_csv('small_annotated_data.csv', index=False)


# TF-IDF
### Start from here

In [None]:
import pandas as pd

data = pd.read_csv('annotated_conversational_data.csv')
queries = data['Query'].tolist()
suggestions = data['Suggested_Followup'].tolist()

documents = queries + suggestions


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_matrix = vectorizer.fit_transform(documents)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def suggest_queries(input_query, vectorizer, tfidf_matrix, documents, top_n=3):
    """
    Suggest follow-up queries based on an input query.

    Args:
        input_query (str): The initial query input by the user.
        vectorizer (TfidfVectorizer): The fitted TF-IDF vectorizer.
        tfidf_matrix (sparse matrix): TF-IDF matrix of the documents.
        documents (list): List of documents (queries and suggestions).
        top_n (int): Number of suggestions to return.

    Returns:
        list: Ranked list of suggested queries.
    """
    input_vector = vectorizer.transform([input_query])
    similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    ranked_indices = similarities.argsort()[::-1][:top_n]

    return [documents[i] for i in ranked_indices if documents[i] != input_query][:top_n]


# Test follow-up queries

In [None]:
input_query = "What are the benefits of yoga?"
suggested_queries = suggest_queries(input_query, vectorizer, tfidf_matrix, documents)

print("Input Query:", input_query)
print("Suggested Follow-up Queries:")
for i, suggestion in enumerate(suggested_queries, 1):
    print(f"{i}. {suggestion}")


Input Query: What are the benefits of yoga?
Suggested Follow-up Queries:
1. How do you do yoga?
2. What is yoga all about?
3. What is yoga for?


In [None]:
def precision_at_k(relevant, retrieved, k):
    return len(set(relevant) & set(retrieved[:k])) / k

relevant_suggestions = ["How does yoga improve health?", "What are the mental benefits of yoga?"]
precision = precision_at_k(relevant_suggestions, suggested_queries, k=3)
print("Precision@3:", precision)


Precision@3: 0.0


# Simple Keyword Extraction

In [None]:
def simple_keyword(query, n=10):
    corpus = [query]
    # Compute TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus)
    
    # Get keywords from the query
    keywords = vectorizer.get_feature_names_out()
    scores = X.toarray()[0]
    keyword_scores = list(zip(keywords, scores))
    sorted_keywords = sorted(keyword_scores, key=lambda x: x[1], reverse=True)
    
    return sorted_keywords[:n]

simple_keyword(input_query)

[('benefits', 0.7071067811865475), ('yoga', 0.7071067811865475)]

In [None]:
data['keyword_1'] = [''] * len(data)

for i in range(len(data)):
    try:
        data.loc[i,'keyword_1'] = ', '.join([k for k, v in simple_keyword(data.loc[i,'Suggested_Followup'])]) # + " " + data.loc[i,'Query'])])
    except:
        continue

data.head(5)

Unnamed: 0,Query,Suggested_Followup,Context,keyword_1
0,How can I be a good geologist?,What should I do to be a great geologist?,,"geologist, great"
1,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,,"comments, youtube"
2,What can make Physics easy to learn?,How can you make physics easy to learn?,,"easy, learn, make, physics"
3,What was your first sexual experience like?,What was your first sexual experience?,,"experience, sexual"
4,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,,"affect, planning, presently, presidency, stude..."


# Intermediate Keyword Extraction

In [None]:
!pip install rake-nltk



In [None]:
from rake_nltk import Rake

def rake_keyword(query):
    # Use RAKE to extract keywords
    r = Rake()
    r.extract_keywords_from_text(query)
    keywords = r.get_ranked_phrases()
    return set(keywords)

rake_keyword(input_query)

{'benefits', 'yoga'}

In [None]:
data['keyword_2'] = [''] * len(data)

for i in range(len(data)):
    try:
        data.loc[i,'keyword_2'] = ', '.join(rake_keyword(data.loc[i,'Suggested_Followup'])) # + " " + data.loc[i,'Query']))
    except:
        continue

data.head(5)

Unnamed: 0,Query,Suggested_Followup,Context,keyword_1,keyword_2
0,How can I be a good geologist?,What should I do to be a great geologist?,,"geologist, great",great geologist
1,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,,"comments, youtube","youtube comments, see"
2,What can make Physics easy to learn?,How can you make physics easy to learn?,,"easy, learn, make, physics","make physics easy, learn"
3,What was your first sexual experience like?,What was your first sexual experience?,,"experience, sexual",first sexual experience
4,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,,"affect, planning, presently, presidency, stude...","trump presidency affect, study, planning, stud..."


In [None]:
import pandas as pd
from fuzzywuzzy import process
import re

# Load annotated data
# data = pd.read_csv('annotated_conversational_data.csv')

# Text Preprocessing Function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Preprocess the dataset columns
data['Query'] = data['Query'].apply(preprocess_text)
data['Suggested_Followup'] = data['Suggested_Followup'].apply(preprocess_text)
data['keyword_1'] = data['keyword_1'].apply(preprocess_text)
data['keyword_2'] = data['keyword_2'].apply(preprocess_text)

# Function to find the best match and its follow-ups
def find_best_match_and_followups(input_query, data, threshold=70):
    """
    Find the best matching query in the dataset for the input query and return its follow-ups.

    Args:
        input_query (str): The user's input query.
        data (DataFrame): The dataframe containing 'Query', 'Suggested_Followup', and keywords.
        threshold (int): Minimum score for a match to be considered valid (0-100).

    Returns:
        tuple: The best match from the dataset and a list of its suggested follow-ups.
    """
    # Preprocess the input query
    input_query_processed = preprocess_text(input_query)
    
    # Combine all keywords into a single search field
    data['combined_keywords'] = data['keyword_1'] + " " + data['keyword_2']
    
    # Search for the best match across queries and keywords
    all_options = data['Query'].tolist() + data['combined_keywords'].tolist()
    best_match, score = process.extractOne(input_query_processed, all_options)
    
    # Determine if the match is a query or a keyword
    if best_match in data['Query'].tolist():
        matched_query = best_match
    else:
        matched_query = data[data['combined_keywords'] == best_match]['Query'].iloc[0]
    
    # Check if the match score meets the threshold
    if score >= threshold:
        followups = data[data['Query'] == matched_query]['Suggested_Followup'].tolist()
        
        # Format suggestions: capitalize and ensure ending question marks
        formatted_followups = [
            suggestion.capitalize() + "?"
            if not suggestion.endswith("?") else suggestion.capitalize()
            for suggestion in followups
        ]
        
        # Capitalize and punctuate the matched query
        matched_query_formatted = matched_query.capitalize() + "?"
        
        return matched_query_formatted, formatted_followups
    else:
        return None, ["No matching query found in the dataset."]

# Test with an input query
input_query = "yoga benefits"
best_match, suggested_followups = find_best_match_and_followups(input_query, data)

print("Input Query:", input_query)
print("Best Match in Dataset:", best_match)
print("Suggested Follow-up Queries:")
for i, suggestion in enumerate(suggested_followups, 1):
    print(f"{i}. {suggestion}")



KeyError: 'keyword_1'