# TF-IDF
### Start from here

In [1]:
import pandas as pd

data = pd.read_csv('annotated_conversational_data.csv')
queries = data['Query'].tolist()
suggestions = data['Suggested_Followup'].tolist()

documents = queries + suggestions


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_matrix = vectorizer.fit_transform(documents)


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def suggest_queries(input_query, vectorizer, tfidf_matrix, documents, top_n=3):
    """
    Suggest follow-up queries based on an input query.

    Args:
        input_query (str): The initial query input by the user.
        vectorizer (TfidfVectorizer): The fitted TF-IDF vectorizer.
        tfidf_matrix (sparse matrix): TF-IDF matrix of the documents.
        documents (list): List of documents (queries and suggestions).
        top_n (int): Number of suggestions to return.

    Returns:
        list: Ranked list of suggested queries.
    """
    input_vector = vectorizer.transform([input_query])
    similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    ranked_indices = similarities.argsort()[::-1][:top_n]

    return [documents[i] for i in ranked_indices if documents[i] != input_query][:top_n]


# Test follow-up queries

In [4]:
input_query = "What are the benefits of yoga?"
suggested_queries = suggest_queries(input_query, vectorizer, tfidf_matrix, documents)

print("Input Query:", input_query)
print("Suggested Follow-up Queries:")
for i, suggestion in enumerate(suggested_queries, 1):
    print(f"{i}. {suggestion}")


Input Query: What are the benefits of yoga?
Suggested Follow-up Queries:
1. How do you do yoga?
2. What is yoga all about?
3. What is yoga for?


In [5]:
def precision_at_k(relevant, retrieved, k):
    return len(set(relevant) & set(retrieved[:k])) / k

relevant_suggestions = ["How does yoga improve health?", "What are the mental benefits of yoga?"]
precision = precision_at_k(relevant_suggestions, suggested_queries, k=3)
print("Precision@3:", precision)


Precision@3: 0.0


# Simple Keyword Extraction

In [6]:
def simple_keyword(query, n=10):
    corpus = [query]
    # Compute TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus)
    
    # Get keywords from the query
    keywords = vectorizer.get_feature_names_out()
    scores = X.toarray()[0]
    keyword_scores = list(zip(keywords, scores))
    sorted_keywords = sorted(keyword_scores, key=lambda x: x[1], reverse=True)
    
    return sorted_keywords[:n]

simple_keyword(input_query)

[('benefits', 0.7071067811865475), ('yoga', 0.7071067811865475)]

In [7]:
data['keyword_1'] = [''] * len(data)

for i in range(len(data)):
    try:
        data.loc[i,'keyword_1'] = ', '.join([k for k, v in simple_keyword(data.loc[i,'Suggested_Followup'])]) # + " " + data.loc[i,'Query'])])
    except:
        continue

data.head(5)

Unnamed: 0,Query,Suggested_Followup,Context,keyword_1
0,How can I be a good geologist?,What should I do to be a great geologist?,,"geologist, great"
1,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,,"comments, youtube"
2,What can make Physics easy to learn?,How can you make physics easy to learn?,,"easy, learn, make, physics"
3,What was your first sexual experience like?,What was your first sexual experience?,,"experience, sexual"
4,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,,"affect, planning, presently, presidency, stude..."


# Intermediate Keyword Extraction

In [8]:
!pip install rake-nltk



In [9]:
from rake_nltk import Rake

def rake_keyword(query):
    # Use RAKE to extract keywords
    r = Rake()
    r.extract_keywords_from_text(query)
    keywords = r.get_ranked_phrases()
    return set(keywords)

rake_keyword(input_query)

{'benefits', 'yoga'}

In [10]:
data['keyword_2'] = [''] * len(data)

for i in range(len(data)):
    try:
        data.loc[i,'keyword_2'] = ', '.join(rake_keyword(data.loc[i,'Suggested_Followup'])) # + " " + data.loc[i,'Query']))
    except:
        continue

data.head(5)

Unnamed: 0,Query,Suggested_Followup,Context,keyword_1,keyword_2
0,How can I be a good geologist?,What should I do to be a great geologist?,,"geologist, great",great geologist
1,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,,"comments, youtube","see, youtube comments"
2,What can make Physics easy to learn?,How can you make physics easy to learn?,,"easy, learn, make, physics","learn, make physics easy"
3,What was your first sexual experience like?,What was your first sexual experience?,,"experience, sexual",first sexual experience
4,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,,"affect, planning, presently, presidency, stude...","planning, trump presidency affect, study, us, ..."


# Query Retrieval by Keywords

In [11]:
import pandas as pd
from fuzzywuzzy import process
import re

# Load annotated data
# data = pd.read_csv('annotated_conversational_data.csv')

# Text Preprocessing Function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Preprocess the dataset columns
data['Query'] = data['Query'].apply(preprocess_text)
data['Suggested_Followup'] = data['Suggested_Followup'].apply(preprocess_text)
data['keyword_1'] = data['keyword_1'].apply(preprocess_text)
data['keyword_2'] = data['keyword_2'].apply(preprocess_text)

# Function to find the best match and its follow-ups
def find_best_match_and_followups(input_query, data, threshold=70):
    """
    Find the best matching query in the dataset for the input query and return its follow-ups.

    Args:
        input_query (str): The user's input query.
        data (DataFrame): The dataframe containing 'Query', 'Suggested_Followup', and keywords.
        threshold (int): Minimum score for a match to be considered valid (0-100).

    Returns:
        tuple: The best match from the dataset and a list of its suggested follow-ups.
    """
    # Preprocess the input query
    input_query_processed = preprocess_text(input_query)
    
    # Combine all keywords into a single search field
    data['combined_keywords'] = data['keyword_1'] + " " + data['keyword_2']
    
    # Search for the best match across queries and keywords
    all_options = data['Query'].tolist() + data['combined_keywords'].tolist()
    best_match, score = process.extractOne(input_query_processed, all_options)
    
    # Determine if the match is a query or a keyword
    if best_match in data['Query'].tolist():
        matched_query = best_match
    else:
        matched_query = data[data['combined_keywords'] == best_match]['Query'].iloc[0]
    
    # Check if the match score meets the threshold
    if score >= threshold:
        followups = data[data['Query'] == matched_query]['Suggested_Followup'].tolist()
        
        # Format suggestions: capitalize and ensure ending question marks
        formatted_followups = [
            suggestion.capitalize() + "?"
            if not suggestion.endswith("?") else suggestion.capitalize()
            for suggestion in followups
        ]
        
        # Capitalize and punctuate the matched query
        matched_query_formatted = matched_query.capitalize() + "?"
        
        return matched_query_formatted, formatted_followups
    else:
        return None, ["No matching query found in the dataset."]

# Test with an input query
input_query = "yoga benefits"
best_match, suggested_followups = find_best_match_and_followups(input_query, data)

print("Input Query:", input_query)
print("Best Match in Dataset:", best_match)
print("Suggested Follow-up Queries:")
for i, suggestion in enumerate(suggested_followups, 1):
    print(f"{i}. {suggestion}")

Input Query: yoga benefits
Best Match in Dataset: How do you even do yoga?
Suggested Follow-up Queries:
1. How do you do yoga?


# BM25 Query Retrieval and Re-ranking
reference: 
- https://dev.to/mage_ai/how-to-build-a-search-engine-with-word-embeddings-56jd
- https://github.com/czhu12/semantic-search/blob/master/search.py

In [12]:
!pip install rank_bm25



In [13]:
from rank_bm25 import BM25Okapi as BM25
import gensim
from gensim import corpora
import gensim.downloader as api
import numpy as np
import logging
logging.basicConfig(level=logging.DEBUG)

class Retriever(object):
    def __init__(self, documents):
        self.corpus = documents
        self.bm25 = BM25(self.corpus)

    def query(self, tokenized_query, n=100):
        scores = self.bm25.get_scores(tokenized_query)
        best_docs = sorted(range(len(scores)), key=lambda i: -scores[i])[:n]
        return best_docs, [scores[i] for i in best_docs]


class Ranker(object):
    def __init__(self, query_embedding, document_embedding):
        self.query_embedding = query_embedding
        self.document_embedding = document_embedding

    def _create_mean_embedding(self, word_embeddings):
        return np.mean(
            word_embeddings,
            axis=0,
        )

    def _create_max_embedding(self, word_embeddings):
        return np.amax(
            word_embeddings,
            axis=0,
        )

    def _embed(self, tokens, embedding):
        word_embeddings = np.array([embedding[token] for token in tokens if token in embedding])
        mean_embedding = self._create_mean_embedding(word_embeddings)
        max_embedding = self._create_max_embedding(word_embeddings)
        embedding = np.concatenate([mean_embedding, max_embedding])
        unit_embedding = embedding / (embedding**2).sum()**0.5
        return unit_embedding

    def rank(self, tokenized_query, tokenized_documents):
        """
        Re-ranks a set of documents according to embedding distance
        """
        query_embedding = self._embed(tokenized_query, self.query_embedding) # (E,)
        document_embeddings = np.array([self._embed(document, self.document_embedding) for document in tokenized_documents]) # (N, E)
        scores = document_embeddings.dot(query_embedding)
        index_rankings = np.argsort(scores)[::-1]
        return index_rankings, np.sort(scores)[::-1]


def tokenize(document):
    return list(gensim.utils.tokenize(document.lower()))


def show_scores(documents, scores, n=10):
    for i in range(n):
        print("======== RANK: {} | SCORE: {} =======".format(i + 1, scores[i]))
        print(documents[i])
        print("")
    print("\n")

In [14]:
print("Input Query: {}".format(input_query))

# tokenize
corpus = [list(gensim.utils.tokenize(doc.lower())) for doc in data['Query'].tolist() + data['Suggested_Followup'].tolist()]
tokenized_query = tokenize(input_query)

# Retrieval
retriever = Retriever(corpus)
retrieval_indexes, retrieval_scores = retriever.query(tokenized_query)

retrieved_documents = [documents[idx] for idx in retrieval_indexes]
print("======== BM25 ========")
show_scores(retrieved_documents, retrieval_scores, 20)

tokenzed_retrieved_documents = [corpus[idx] for idx in retrieval_indexes]

# re-ranking
print("Loading glove embeddings...", end="")
query_embedding = api.load('glove-wiki-gigaword-50')
print(" [DONE]")
ranker = Ranker(query_embedding=query_embedding, document_embedding=query_embedding)
ranker_indexes, ranker_scores = ranker.rank(tokenized_query, tokenzed_retrieved_documents)
reranked_documents = [retrieved_documents[idx] for idx in ranker_indexes]

print("======== Embedding ========")
show_scores(reranked_documents, ranker_scores, 20)

Input Query: yoga benefits


INFO:gensim.models.keyedvectors:loading projection weights from /Users/Kaito.01/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz
DEBUG:smart_open.smart_open_lib:{'uri': '/Users/Kaito.01/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}


What are the benefits of hot yoga vs regular yoga?

What is Kriya yoga and what are the benefits of Kriya yoga?

What are the benefits of Kriya yoga?

What are the benefits of yoga therapy?

How beneficial is hot yoga compared to regular yoga?

How does yoga work?

What is yoga for?

What is yoga all about?

How do you do yoga?

How do you even do yoga?

What are the reasons why you wear yoga pants outside of yoga class or working out?

What are best website for learning yoga?

What are Yoga poses to lose weight?

How do I income in yoga field?

What are Yoga poses to lose weight?

What are the advantage of yoga therapy?

How can I become a yoga teacher?

Why do women wear yoga pants in public?

What is needed to become a yoga teacher?

What is yoga and how does it work?



Loading glove embeddings...

DEBUG:gensim.utils:starting a new internal lifecycle event log for KeyedVectors
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 50) matrix of type float32 from /Users/Kaito.01/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-12-08T14:16:51.415736', 'gensim': '4.3.3', 'python': '3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:53:33) \n[Clang 16.0.6 ]', 'platform': 'macOS-14.6.1-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


 [DONE]
What are the benefits of Kriya yoga?

What are transcendental meditation benefits?

What is Kriya yoga and what are the benefits of Kriya yoga?

What are the benefits of yoga therapy?

How does yoga work?

What is yoga for?

What are the advantage of yoga therapy?

What are Yoga poses to lose weight?

What are Yoga poses to lose weight?

What is yoga all about?

What are some yoga poses to help me lose weight?

What are some yoga poses to help me lose weight?

How do I income in yoga field?

How can I become a yoga teacher?

What is needed to become a yoga teacher?

What are some good yoga techniques for weight loss?

How much time does it take to lose weight doing yoga?

What are some tips for practicing yoga at home?

What is yoga and how does it work?

How can I earn high income in yoga field?



