<a href="https://colab.research.google.com/github/vperng/AAI520-NPL-Chatbot/blob/main/Chatbot_Project_Team6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Advanced Generative Chatbot Design

Rene Ortiz, Vivian Perng, Karthink Raghavan

## Project Overview

- Goal: Build a chatbot that can carry out multi-turn conversations, adapt to context, and handle a variety of topics.
- Output: A web or app interface where users can converse with the chatbot.


In [1]:
from fastapi import FastAPI, HTTPException
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.llms import Ollama
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.load import dumpd, dumps, load, loads
from langchain.chains import load_chain
from langserve import add_routes
from langchain_core.runnables import RunnableBinding, RunnableLambda
from pathlib import Path
from dotenv import load_dotenv
import uvicorn
import pandas as pd
import json
import os
import json
import torch
import numpy as np
import logging  

  from tqdm.autonotebook import tqdm, trange


In [12]:
def squad1_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers']):
    """
    Convert a SQuAD-style JSON file into a flat pandas DataFrame.

    Each output row is one question, paired with the paragraph it was asked
    about and the text of that question's first answer.

    Args:
        file_path (str): Path to the JSON file.
        record_path (list): Keys leading to the deepest level of the JSON
            structure (default is ['data', 'paragraphs', 'qas', 'answers']).
            The last two entries name the nested question list and the
            nested answer list.

    Returns:
        pandas.DataFrame: Columns 'id', 'question', 'context' and 'answers'
        with a fresh 0..n-1 index. 'answers' is None for a question whose
        answer list is empty.
    """
    record_path = list(record_path)
    questions_key, answers_key = record_path[-2], record_path[-1]

    # Read explicitly as UTF-8 so non-ASCII text in the dataset does not
    # depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        file_data = json.load(f)

    # One row per question, and one row per paragraph.
    questions_df = pd.json_normalize(file_data, record_path[:-1])
    paragraphs_df = pd.json_normalize(file_data, record_path[:-2])

    # Repeat each paragraph's context once for every question it contains so
    # the contexts line up row-for-row with questions_df.
    questions_df['context'] = np.repeat(
        paragraphs_df['context'].values,
        paragraphs_df[questions_key].str.len(),
    )

    # Take the answer from the question's OWN nested answer list. Joining a
    # separately flattened answers table by position only lines up when every
    # question has exactly one answer.
    questions_df['answers'] = [
        answer_list[0]['text'] if answer_list else None
        for answer_list in questions_df[answers_key]
    ]

    # Keep only the columns downstream cells use.
    data = questions_df[['id', 'question', 'context', 'answers']].copy()

    return data.reset_index(drop=True)

In [6]:
# Pick the best available torch backend for the embedding model:
# CUDA (e.g. a Colab GPU runtime) first, then Apple-silicon MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
# Older torch builds have no `torch.backends.mps`, so probe for it first.
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [13]:
# Parse the SQuAD training file into one row per question, then discard the
# question ids, which are not needed for retrieval.
file_path = "train-v1.1.json"
df = squad1_json_to_dataframe(
    file_path,
    record_path=['data', 'paragraphs', 'qas', 'answers'],
).drop(columns=['id'])
df.head()

Unnamed: 0,question,context,answers
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",Saint Bernadette Soubirous
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",a copper statue of Christ
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",the Main Building
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",a Marian place of prayer and reflection
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",a golden statue of the Virgin Mary


In [15]:
# Load data into Langchain: one Document per DataFrame row, with the paragraph
# text as page content and the remaining columns as metadata.
loader = DataFrameLoader(df, page_content_column="context")
docs = loader.load()

# Normalise metadata BEFORE splitting. The splitter copies each parent
# document's metadata onto its chunks at split time, so changes made to `docs`
# afterwards never reach the `documents` that get indexed.
for doc in docs:
    doc.metadata["question"] = doc.metadata.get("question", "Unknown")
    # The DataFrame column is named "answers"; expose it as "answer_text" too.
    doc.metadata["answer_text"] = doc.metadata.get(
        "answer_text", doc.metadata.get("answers", "No answer")
    )

# Filter out any documents with None in metadata, so that no chunk derived
# from them is passed on to the vector store.
docs = [doc for doc in docs if all(value is not None for value in doc.metadata.values())]

logging.info("split documents into chunks.")
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

logging.info("Creating embedding.")

# Sentence-transformers model and the options forwarded to it.
model_name = 'all-MiniLM-L6-v2'
model_kwargs = {'device': device}
encode_kwargs = {'normalize_embeddings': True}

# Wrap the SentenceTransformer model with LangChain's HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


  embeddings = HuggingFaceEmbeddings(


In [22]:
# Sanity check: embed a single throwaway sentence and report the length of
# the resulting vector.
sample_text = "This is a sample document to check embedding dimensions."
sample_embedding = embeddings.embed_documents([sample_text])

# embed_documents returns one vector per input text; measure the first one.
first_vector = sample_embedding[0]
embedding_dimension = len(first_vector)
print(f"Embedding dimensions: {embedding_dimension}")

Embedding dimensions: 384


In [44]:
logging.info("Loading to vector db")

# Load to Chroma vector store
max_batch_size = 41666  # Maximum batch size allowed (presumably Chroma's per-call insert limit; confirm)

# Open (or create) the persisted collection once, then append to it batch by
# batch. This avoids rebuilding the store wrapper on every iteration and keeps
# `db` defined even when `documents` is empty.
# NOTE(review): re-running this cell likely appends the same chunks again
# (no explicit ids are passed) - clear the collection first if re-indexing.
db = Chroma(persist_directory="./", embedding_function=embeddings, collection_name="squadembedding")

for i in range(0, len(documents), max_batch_size):
    batch_docs = documents[i:i + max_batch_size]
    # Add each batch to the Chroma vector store
    db.add_documents(batch_docs)

    print(f"Added batch {i // max_batch_size + 1} to Chroma vector store.")


Added batch 1 to Chroma vector store.
Added batch 2 to Chroma vector store.
Added batch 3 to Chroma vector store.


In [45]:
# --- Retriever over the freshly populated vector store ---------------------
logging.info("initialize retriever")
retriever = db.as_retriever()

# --- LLM and prompt ---------------------------------------------------------
logging.info("initialize model and prompt.")

llm = Ollama(model="llama2")

# The template is spelled out line by line so its trailing spaces are visible;
# the retrieved passages are substituted for {context} and the user question
# for {input}.
prompt = ChatPromptTemplate.from_template(
    "\n"
    "Answer the following question based only on the provided context. \n"
    "Think step by step before providing a detailed answer. \n"
    "<context>\n"
    "{context} \n"
    "</context>\n"
    "Question: {input}"
)

logging.info(prompt)

# --- Chain assembly ---------------------------------------------------------
# The document chain is built from the LLM and the prompt.
logging.info("creating document chains.")
document_chain = create_stuff_documents_chain(llm, prompt)

# The retrieval chain is built from the retriever and the document chain.
logging.info("creating retrieval chains.")
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [52]:
# Ask a question and show only the generated answer.
query = "What was one of the major differences between the Bosniaks, Croats and Serbs?"
response = retrieval_chain.invoke(dict(input=query))
response["answer"]

'Based on the provided context, one of the major differences between the Bosniaks, Croats, and Serbs is their geographical identification. The context highlights that Slavs often identify themselves with the local geographical region in which they live, and this is especially true for the Bosniaks, Croats, and Serbs.\n\nThe Bosniaks are identified as living in southern Bosnia, while the Croats are identified as living in westernmost Croatia. The Serbs are identified as descendants of the Grenzers who continued to live in the area known as the Military Frontier until the Croatian war of independence.\n\nTherefore, one of the major differences between these ethnic groups is their geographical identification and association with specific regions within the broader South Slavic region.'

In [47]:
# --- Retriever over the persisted collection --------------------------------
# Reopen the collection using the same directory and collection name that
# were used when indexing, instead of reusing the in-memory `db` object.
logging.info("initialize retriever")
db1 = Chroma(
    persist_directory="./",
    embedding_function=embeddings,
    collection_name="squadembedding",
)

retriever1 = db1.as_retriever()

# --- LLM and prompt ---------------------------------------------------------
logging.info("initialize model and prompt.")

llm1 = Ollama(model="llama2")

# The template is spelled out line by line so its trailing spaces are visible;
# the retrieved passages are substituted for {context} and the user question
# for {input}.
prompt1 = ChatPromptTemplate.from_template(
    "\n"
    "Answer the following question based only on the provided context. \n"
    "Think step by step before providing a detailed answer. \n"
    "<context>\n"
    "{context} \n"
    "</context>\n"
    "Question: {input}"
)

logging.info(prompt1)

# --- Chain assembly ---------------------------------------------------------
# The document chain is built from the LLM and the prompt.
logging.info("creating document chains.")
document_chain1 = create_stuff_documents_chain(llm1, prompt1)

# The retrieval chain is built from the retriever and the document chain.
logging.info("creating retrieval chains.")
retrieval_chain1 = create_retrieval_chain(retriever1, document_chain1)

In [53]:
# Ask a question through the chain built on the reloaded store and show only
# the generated answer.
query1 = "what is the capital of india?"
response1 = retrieval_chain1.invoke(dict(input=query1))
response1["answer"]

'Based on the provided context, the answer to the question "What is the capital of India?" is New Delhi.'