In [75]:
from sklearn.metrics.pairwise import cosine_similarity
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding

In [3]:
class TrieNode:
    """One node of a character trie.

    children maps a single character to the child TrieNode reached through it.
    is_end_of_word is True when the path from the root to this node spells a
    word that was passed to Trie.insert.
    """

    def __init__(self):
        self.children = {}
        self.is_end_of_word = False


class Trie:
    """Prefix tree over strings supporting insertion and prefix completion."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie, creating nodes for characters not yet present."""
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    def autocomplete(self, prefix):
        """Return every inserted word that starts with ``prefix``.

        Words are listed in depth-first pre-order, visiting child characters
        in the order they were first inserted. A prefix that does not occur in
        the trie yields an empty list; an empty prefix yields every word.
        """
        node = self.root
        for char in prefix:
            node = node.children.get(char)
            if node is None:
                return []  # No autocomplete suggestions

        suggestions = []
        self._find_words_with_prefix(node, prefix, suggestions)
        return suggestions

    def _find_words_with_prefix(self, node, current_prefix, suggestions):
        """Append to ``suggestions`` every word stored at or below ``node``.

        ``current_prefix`` is the text spelled by the path from the root to
        ``node``. The walk uses an explicit stack rather than recursion so a
        very long stored word cannot exhaust the interpreter's recursion limit.
        """
        pending = [(node, current_prefix)]
        while pending:
            current, text = pending.pop()
            if current.is_end_of_word:
                suggestions.append(text)
            # Push children in reverse so the first-inserted child is popped
            # first, giving the same order a recursive pre-order walk produces.
            for char, child_node in reversed(list(current.children.items())):
                pending.append((child_node, text + char))


In [4]:
# Seed the module-level trie with the product keywords that partially typed
# query words should be completed to.
words = ["roarmoney", "instacash", "credit", "membership", "wow", "builder", "build"]
trie = Trie()
for word in words:
    trie.insert(word)


In [28]:
def autocomplete_sentence(query, word_trie=None, min_prefix_length=4):
    """Expand partially typed words in ``query`` using a trie of known words.

    query: sentence whose words are separated by whitespace.
    word_trie: object exposing ``autocomplete(prefix)`` that returns a list of
        candidate words; when omitted, the module-level ``trie`` is used.
    min_prefix_length: words shorter than this many characters are kept
        unchanged and never looked up (default 4, i.e. words of length <= 3
        are skipped).

    Returns the words joined by single spaces, where each looked-up word is
    replaced by the first suggestion, or left unchanged if there is none.
    """
    if word_trie is None:
        word_trie = trie  # module-level trie built earlier in the notebook

    completed_words = []
    for word in query.split():
        if len(word) >= min_prefix_length:
            suggestions = word_trie.autocomplete(word)
        else:
            suggestions = []
        completed_words.append(suggestions[0] if suggestions else word)
    return " ".join(completed_words)

In [29]:
# Load the raw search queries; the preview rendered below this cell shows an
# INPUT (query text) column and a CATEGORY column.
data_df = pd.read_csv('../input_files/search-query.csv')
data_df.head()

Unnamed: 0,INPUT,CATEGORY
0,want to put in my new debit c,Other
1,want to,Other
2,virtual wallet,roarmoney
3,vie,Other
4,switch contact method to,Other


In [34]:
# Keep only rows whose INPUT is longer than 3 characters. .str.len() yields
# NaN for a missing value, which compares False and drops the row, whereas
# calling len() on a NaN float raises TypeError. .copy() gives an independent
# frame so the column assignment in a later cell does not write to a slice.
data_df = data_df.loc[data_df['INPUT'].str.len() > 3].copy()
data_df.head()

Unnamed: 0,INPUT,CATEGORY,autocomplete
0,want to put in my new debit c,Other,want to put in my new debit c
1,want to,Other,want to
2,virtual wallet,roarmoney,virtual wallet
4,switch contact method to,Other,switch contact method to
6,stoc,Other,stoc


In [38]:
# Expand partially typed words in every query, storing the result next to the
# original text in a new AUTOCOMPLETE column.
data_df['AUTOCOMPLETE'] = data_df['INPUT'].apply(autocomplete_sentence)
data_df.head()

Unnamed: 0,INPUT,CATEGORY,AUTOCOMPLETE
0,want to put in my new debit c,Other,want to put in my new debit c
1,want to,Other,want to
2,virtual wallet,roarmoney,virtual wallet
4,switch contact method to,Other,switch contact method to
6,stoc,Other,stoc


In [43]:
# Replace the raw INPUT column with the completed text: drop the original
# column, then give AUTOCOMPLETE the INPUT name.
data_df = (
    data_df
    .drop(columns=['INPUT'])
    .rename(columns={'AUTOCOMPLETE': 'INPUT'})
)

In [44]:
# Save the frame holding the completed queries; index=False leaves the row
# index out of the written file.
data_df.to_csv('../result_files/autocomplete.csv', index=False)

### Convert documents to embeddings

In [125]:
# Load the per-product summaries; the preview rendered below this cell shows
# a product_name column and a product_summary text column.
product_summary_df = pd.read_csv('../result_files/product_summary.csv')
product_summary_df.head()

Unnamed: 0,product_name,product_summary
0,roarmoney.txt,RoarMoney is a mobile bank account designed to...
1,wow_membership.txt,MoneyLion WOW is a membership that offers user...
2,crypto.txt,The MoneyLion crypto product is designed to ma...
3,credit_builder_plus.txt,Credit Builder Plus (CB+) is a membership serv...
4,instacash.txt,Instacash is a service that provides customers...


In [126]:
# Embed each product_summary text with the "text-embedding-ada-002" engine;
# get_embedding is called once per row and its result is stored in a column
# named after the engine.
product_summary_df['text-embedding-ada-002'] = product_summary_df['product_summary'].apply(
    lambda summary: get_embedding(summary, engine = "text-embedding-ada-002")
)

In [143]:
# Save the summaries together with their embedding column; index=False leaves
# the row index out of the written file.
product_summary_df.to_csv('../result_files/product_summary_embedded.csv', index=False)

### Perform semantic search

In [141]:
def semantic_search(query, df, threshold=0.8, fallback='finance'):
    '''
    Return the product summary in df that is most similar to query.

    query: string to search for
    df: dataframe with a 'product_summary' text column and a
        'text-embedding-ada-002' column holding one embedding per row
    threshold: minimum cosine similarity the best match must reach
    fallback: value returned when df is empty or no row reaches threshold

    Returns the 'product_summary' of the highest-scoring row, or fallback.
    '''
    # Nothing to rank; also avoids the ValueError np.argmax raises on an
    # empty sequence.
    if len(df) == 0:
        return fallback

    query_embedding = get_embedding(
        query, engine="text-embedding-ada-002"
    )

    # Score against the frame that was passed in (not a module-level frame).
    # Stacking the per-row embeddings lets a single call produce a flat
    # vector of scores, one per row of df.
    document_embeddings = np.vstack(df['text-embedding-ada-002'].to_list())
    similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

    top_document_index = int(np.argmax(similarity_scores))
    if similarity_scores[top_document_index] >= threshold:
        return df.iloc[top_document_index]['product_summary']
    return fallback
    
    

In [142]:
# Read a query from the user, expand its partially typed words, then look up
# the closest product summary.
search_query = autocomplete_sentence(input("Enter your search query: "))
print("completed search query: ", search_query)
semantic_search(search_query, product_summary_df)

completed search query:  pizza


'finance'