In [57]:
import os
from dotenv import load_dotenv

# Load the environment variables from the .env file
# Loads the COHERE_API_KEY into the environment
load_dotenv()

# If your environment already has the API key loaded, the above step can be ignored


True

In [58]:
import cohere
import hnswlib
import json
import pandas as pd
import numpy as np
from typing import List, Dict
import uuid


In [59]:
#Cohere API test
co = cohere.Client(os.getenv('COHERE_API_KEY'))

response = co.generate(
  prompt='Please explain to me how LLMs work',
  max_tokens=100
)


In [60]:
print(response.generations[0].text)

 LLMs, or Large Language Models, are a type of neural network architecture designed to understand and generate text based on the patterns and relationships within language. They are trained on massive amounts of text data from the internet, learning to predict the next word in a sequence based on the words that came before it.

Here is a simplified breakdown of how LLMs work:

1. Data Collection - Millions of text documents from the internet are gathered, preprocessed, and cleaned, removing any inappropriate


In [61]:
df = pd.read_csv('forum_post_details_with_type.csv')
df.head()

Unnamed: 0,uid,title,url,author,content,date,order,Type
0,c897f16d-d8a3-554a-aec7-733500b736bf,Welcome to Distill.io,https://forums.distill.io/t/welcome-to-distill...,system,A community for discussing topics for web moni...,,1.0,Answer
1,e96b341f-ff13-5c3e-98b0-6fd41d2b2cf9,How do I know my macro is working?,https://forums.distill.io/t/how-do-i-know-my-m...,j.hapgood,I got a paid subscription so I could use a mac...,,1.0,Question
2,4697d49a-8260-5429-85e7-3df5f70af338,How do I know my macro is working?,https://forums.distill.io/t/how-do-i-know-my-m...,j.hapgood,This is a DUMMY statement,"February 23, 2024, 5:13pm",2.0,Question
3,3c484f6c-8f40-5065-9779-90639461a4ba,This account password cannot be initialized. P...,https://forums.distill.io/t/this-account-passw...,yt123190,This account password cannot be initialized. P...,,1.0,Statement
4,581ee545-91df-526f-bd3f-0943d85fe27c,No export possible using the Desktop App,https://forums.distill.io/t/no-export-possible...,gchampeau,"Hi there,\nThanks for your tool, it’s very han...",,1.0,Answer


In [62]:
df.rename(columns={'content': 'text'}, inplace=True)

res = df.groupby(['title','url']).apply(lambda x: x.drop(['title', 'url', 'uid','author'], axis=1).to_dict('records')).reset_index(name='data')
print(res.iloc[1]['data'])


[{'text': 'I’ve looked everywhere I an think of and can’t find a way to change the timeout setting when checking for changes. It seems like this would be a common setting as some website can take a while to completely load, especially if the internet connection is very slow.\nThanks if advance for any help y’all can provide!', 'date': nan, 'order': 1.0, 'Type': 'Question'}, {'text': 'I have the same issue. Also when investigating I can see that the page I am trying to monitor doesn’t load fully when it tries to check for changes, but when I click on the tab all content is loaded (in another container for example)', 'date': 'August 17, 2023,  8:42am', 'order': 2.0, 'Type': 'Answer'}, {'text': '@SPDurkee Welcome to Distill community!\nHere are the options that you can try based on your page:\n\n\nIf you’re dealing with static content that requires monitoring, consider setting the dynamic parameter to FALSE in the config. This adjustment can enhance the loading speed of static pages, part

  res = df.groupby(['title','url']).apply(lambda x: x.drop(['title', 'url', 'uid','author'], axis=1).to_dict('records')).reset_index(name='data')


In [63]:
class ForumDocuments:
    """
    A basic class construct representing the Forum Posts

    Parameters:
    sources (pandas dataframe): A dataframe representing the details of the various forum posts.
    Each row of the DF will have various attributes associated with the forum post details.

    Attributes:
    sources (list): List of dictionaries representing each forum post page

    docs (list): A list of dictionaries representing the documents with the following keys
     - title
     - content/text
     - url
     - order
     - type: Question / Answer / Statement
     - date of publication
    
    docs_embs (list): List of associated document embeddings
    retrieve_top_k (int): The number of documents to retrieve during search (defaults to 10)
    rerank_top_k (int): Number of docs to rerank after retrieval
    docs_len (int): Number of documents in collection
    index (hnswlib.index): Index used for document retrieval

    Methods:
    load(): Loads the data from the sources + post details dataset
    embed(): embeds the documents using the Cohere API
    index(): Indexes the documents for efficient retrieval
    retrieve(query): Retrieves documents based on the given query

    """

    def __init__(self, sources):
        self.sources = sources
        self.docs = []
        self.docs_embs = []
        self.retrieve_top_k = 10
        self.rerank_top_k = 3
        self.load()
        self.embed()
        self.index()
    
    def load(self) -> None:
        """
        Loads the documents from the dataframe and updates docs
        """
        print('Loading documents...')

        def custom_aggregation(x):
            x['date'] = x['date'].fillna('') # Excluding NaN values
            x['text'] = x['text'].fillna('')
            return x.drop(['title', 'url', 'uid','author'], axis=1).to_dict('records')
        

        res = self.sources.groupby(['title','url']).apply(custom_aggregation).reset_index(name='data')

        for index, row in res.iterrows():
            self.docs.append(
                {
                    'title':row['title'],
                    'text':str(row['data']),
                    'url':row['url'],
                }
            )
    def embed(self) -> None:
        """ 
        Embeds the documents using Cohere's API
        """
        print('Embedding documents...')
        # Custom encoder function
        def replace_non_compliant(value):
            if isinstance(value, float):
                if np.isnan(value):
                    return "NaN"  # or use None
                if np.isinf(value):
                    return "Infinity" if value > 0 else "-Infinity"
            return value

        batch_size = 90
        self.docs_len = len(self.docs)
        for i in range(0, self.docs_len, batch_size):
            batch = self.docs[i: min(i+batch_size, self.docs_len)]
            #texts = [json.dumps(item['text'], default=replace_non_compliant) for item in batch] # converting to json string before passing into embedding function
            texts = [replace_non_compliant(item['text']) for item in batch]
            docs_embs_batch = co.embed(
                texts=texts, model="embed-english-v3.0", input_type="search_document"
            ).embeddings
            self.docs_embs.extend(docs_embs_batch)

    def index(self) -> None:
        """
        Indexes the documents for efficient retrieval.
        """
        print("Indexing documents...")

        self.idx = hnswlib.Index(space="ip", dim=1024) # using inner product for indexing
        self.idx.init_index(max_elements=self.docs_len, ef_construction=512, M=64)
        self.idx.add_items(self.docs_embs, list(range(len(self.docs_embs))))

        print(f"Indexing complete with {self.idx.get_current_count()} documents.")
    
    def retrieve(self, query: str) -> List[Dict[str, str]]:
        """
        Retrieves documents based on the given query.

        Parameters:
        query (str): The query to retrieve documents for.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved documents, with 'title', 'text', and 'url' keys.
        """
        docs_retrieved = []
        query_emb = co.embed(
            texts=[query], model="embed-english-v3.0", input_type="search_query"
        ).embeddings

        doc_ids = self.idx.knn_query(query_emb, k=self.retrieve_top_k)[0][0]

        docs_to_rerank = []
        for doc_id in doc_ids:
            docs_to_rerank.append(self.docs[doc_id]["text"])

        rerank_results = co.rerank(
            query=query,
            documents=docs_to_rerank,
            top_n=self.rerank_top_k,
            model="rerank-english-v2.0",
        )

        doc_ids_reranked = []
        for result in rerank_results:
            doc_ids_reranked.append(doc_ids[result.index])

        for doc_id in doc_ids_reranked:
            docs_retrieved.append(
                {
                    "title": self.docs[doc_id]["title"],
                    "text": self.docs[doc_id]["text"],
                    "url": self.docs[doc_id]["url"],
                }
            )

        return docs_retrieved


In [64]:
class Chatbot:
    """
    A class representing a chatbot.

    Parameters:
    docs (Documents): An instance of the Documents class representing the collection of documents.

    Attributes:
    conversation_id (str): The unique ID for the conversation.
    docs (Documents): An instance of the Documents class representing the collection of documents.

    Methods:
    generate_response(message): Generates a response to the user's message.
    retrieve_docs(response): Retrieves documents based on the search queries in the response.

    """

    def __init__(self, docs: ForumDocuments):
        self.docs = docs
        self.conversation_id = str(uuid.uuid4())

    def generate_response(self, message: str):
        """
        Generates a response to the user's message.

        Parameters:
        message (str): The user's message.

        Yields:
        Event: A response event generated by the chatbot.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved documents.

        """
        # Generate search queries (if any)
        response = co.chat(message=message, search_queries_only=True)

        # If there are search queries, retrieve documents and respond
        preamble_override = "You only answer questions using on the documents you have provided with"
        
        if response.search_queries:
            print("Retrieving information...")
            documents = self.retrieve_docs(response)
            print('Retrieved docs: ', len(documents))
            response = co.chat(
                message=message,
                preamble_override = preamble_override,
                documents=documents,
                conversation_id=self.conversation_id,
                stream=True,
            )
            for event in response:
                yield event
            yield response

        # If there is no search query, directly respond
        else:
            response = co.chat(
                message=message,
                preamble_override = preamble_override,
                conversation_id=self.conversation_id, 
                stream=True
            )
            for event in response:
                yield event

    def retrieve_docs(self, response) -> List[Dict[str, str]]:
        """
        Retrieves documents based on the search queries in the response.

        Parameters:
        response: The response object containing search queries.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved documents.

        """
        # Get the query(s)
        queries = []
        for search_query in response.search_queries:
            queries.append(search_query["text"])

        # Retrieve documents for each query
        retrieved_docs = []
        for query in queries:
            retrieved_docs.extend(self.docs.retrieve(query))

        return retrieved_docs

In [65]:
class App:
    def __init__(self, chatbot: Chatbot):
        """
        Initializes an instance of the App class.

        Parameters:
        chatbot (Chatbot): An instance of the Chatbot class.

        """
        self.chatbot = chatbot
    
    def run(self):
        """
        Runs the chatbot application.

        """
        while True:
            # Get the user message
            message = input("User: ")

            # Typing "quit" ends the conversation
            if message.lower() == "quit":
                print("Ending chat.")
                break
            else:
                print(f"User: {message}")

            # Get the chatbot response
            response = self.chatbot.generate_response(message)

            # Print the chatbot response
            print("Chatbot:")
            
            citations_flag = False
            
            for event in response:
                stream_type = type(event).__name__
                
                # Text
                if stream_type == "StreamTextGeneration":
                    print(event.text, end="")

                # Citations
                if stream_type == "StreamCitationGeneration":
                    if not citations_flag:
                        print("\n\nCITATIONS:")
                        citations_flag = True
                    print(event.citations[0])
                
                # Documents
                if citations_flag:
                    if stream_type == "StreamingChat":
                        print("\n\nDOCUMENTS:")
                        documents = [{'id': doc['id'],
                                      'text': doc['text'][:50] + '...',
                                      'title': doc['title'],
                                      'url': doc['url']} 
                                      for doc in event.documents]
                        for doc in documents:
                            print(doc)

            print(f"\n{'-'*100}\n")


In [66]:
docs = ForumDocuments(df.head(100))



Loading documents...
Embedding documents...


  res = self.sources.groupby(['title','url']).apply(custom_aggregation).reset_index(name='data')


Indexing documents...
Indexing complete with 24 documents.


In [67]:
# Create an instance of the Chatbot class with the Documents instance
chatbot = Chatbot(docs)

# Create an instance of the App class with the Chatbot instance
app = App(chatbot)

# Run the chatbot
app.run()

User: My macro gives me an error. What do I do?
Chatbot:
Retrieving information...
Retrieved docs:  3

----------------------------------------------------------------------------------------------------

Ending chat.
