# **Importing Libraries**
We begin by importing the necessary libraries:

**langchain:** Used to manage prompt templates, chains, and memory for conversational contexts.

**OpenAI:** Provides the language model for response generation.

In [1]:
%%writefile requirements.txt
langchain
langchain-community
langchain-openai
pypdf
langchain-chroma
gradio

Writing requirements.txt


In [2]:
!pip install -q -r  requirements.txt

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.2/320.2 kB[0m [31m20.6 MB/s[0m eta [36m0:00:

In [3]:
from langchain_openai import OpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
from google.colab import userdata
import os
import pandas as pd
import numpy as np

# **Setting Up Environment Variables**
To interact with the OpenAI API, you need an API key. The code snippet below fetches the API key stored as an environment variable. Ensure the variable OPENAI_API_KEY is set in your environment for the chatbot to function correctly.

In [4]:
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# **Indexing**
We start by preparing our documents for retrieval using embeddings and vector storage.

## **Load**
In this step, we load news data. The data used here is from the reuters-21578 dataset.

You can upload this or other documents to provide a robust foundation for the chatbot's responses.

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
file_path = '/content/drive/MyDrive/topics_classification_dataset.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,LEWISSPLIT,Text,Topics
0,TRAIN,JAGUAR SEES STRONG GROWTH IN NEW MODEL SALES J...,earn
1,TRAIN,NORD RESOURCES CORP <NRD> 4TH QTR NET Shr 19 c...,earn
2,TRAIN,FIVE GROUPS APPLY TO BUY FRENCH TELEPHONE GROU...,acq
3,TRAIN,BLIZZARD CLOSES BOSPHORUS Blizzard conditions ...,ship
4,TRAIN,JAPAN FUND <JPN> SEEKERS CONFIDENT OF FINANCIN...,acq
...,...,...,...
7052,TRAIN,BAKER INTERNATIONAL CORP SUES HUGHES TOOL SEEK...,acq
7053,TRAIN,USAIR GROUP REJECTS TRANS WORLD AIRLINES TAKEO...,acq
7054,TRAIN,BAKER <BKO> SUES TO FORCE HUGHES <HT> MERGER B...,acq
7055,TRAIN,SPAIN DEREGULATES BANK DEPOSIT INTEREST RATES ...,interest


In [7]:
column_name = 'Text'
text_data = df[column_name].dropna().tolist()

## **Store**
We create embeddings of our text chunks and store them in a vector database. This allows us to search for similar content efficiently.

In [8]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma.from_texts(text_data, embeddings)

TypeError: Client.__init__() got an unexpected keyword argument 'proxies'

# **Retrieval and Generation**
With the indexed documents, we can now retrieve relevant information and generate responses based on user questions.

## **Retrieve**
We set up a retriever to find content related to a user's query based on similarity with stored embeddings.

In [None]:
retriever = db.as_retriever(search_type= "similarity")

In [None]:
question = retriever.invoke("Who did Irving Bank Corp acquire a division from?")

In [None]:
text_data[0]

In [None]:
for i in range(len(question)):
  print(f"the {i+1}th similar content :\n \n {question[i].page_content}\n \n")

## **Multi Query**
we create multiple versions of a question to capture different perspectives or possible interpretations. This improves retrieval diversity.

In [None]:
llm = OpenAI(model_name="gpt-3.5-turbo-instruct")

In [None]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

In [None]:
generate_queries.invoke("Who did Irving Bank Corp acquire a division from?")

In [None]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "Who did Irving Bank Corp acquire a division from?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})

In [None]:
for i in range(len(docs)):
  print(f"{i+1}. {docs[1].page_content}\n \n")

## **Generate**
We process the retrieved information to generate a response using a language model, enhancing the chatbot's ability to answer complex queries.

In [None]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers.string import StrOutputParser

# RAG template
template = """Use the given context and conversation history to directly answer the question concisely:

Context:
{context}

Chat history:
{chat_history}

New human question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Initialize model and memory
llm = ChatOpenAI(temperature=1)
memory = ConversationBufferWindowMemory(k = 3, memory_key="chat_history")

# Define RAG pipeline without memory directly
final_rag_chain = (
    {"context": itemgetter("context"),
     "question": itemgetter("question"),
     "chat_history": itemgetter("chat_history")}  # Include chat_history as a key here
    | prompt
    | llm
    | StrOutputParser()
)

# Retrieve the chat history from memory and include it in the input
def invoke_with_memory(question, retrieval_chain):
    # Load current chat history from memory
    chat_history = memory.load_memory_variables({}).get("chat_history", "")
    result = final_rag_chain.invoke({
        "question": question,
        "context": retrieval_chain,  # Assuming retrieval_chain is defined elsewhere
        "chat_history": chat_history,
    })
    # Update memory with the new interaction
    memory.save_context({"question": question}, {"answer": result})
    return result

## **UI**
Finally, we build a simple interface with Gradio to interact with the news chatbot. Users can type questions, and the chatbot will respond with relevant information.

In [None]:
import gradio as gr

chat_history = []

def news_chatbot(query):
    global chat_history

    response = invoke_with_memory(query, retrieval_chain)
    response = response.replace("Based on the context and previous conversation, ", "")
    response = response.replace("Based on the conversation and context provided, ", "")
    response = response.replace("Based on our previous conversation,", "")
    response = response.replace("Based on the context provided and our previous conversation,", "")


    chat_history.append((query, response))

    return chat_history

def reset_conversation():
    global chat_history
    chat_history = []
    memory.aclear()
    return chat_history

with gr.Blocks() as interface:
    gr.Markdown("News chatbot")
    gr.Markdown("Ask me any News question, and I'll try to provide helpful information based on the provided data.")

    chatbot = gr.Chatbot()
    query = gr.Textbox(label="Your Question", placeholder="Type your medical question here...")

    submit_button = gr.Button("Get Answer")
    reset_button = gr.Button("Start New Conversation")

    submit_button.click(fn=news_chatbot, inputs=query, outputs=chatbot)
    reset_button.click(fn=reset_conversation, inputs=None, outputs=chatbot)

    submit_button.click(lambda: "", None, query)

interface.launch()