In [27]:
# LangChain
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# NLTK
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from langchain.text_splitter import NLTKTextSplitter

import openai

# Pinecone
import pinecone

import os
import streamlit as st
import requests
from dotenv import load_dotenv
import json
import pandas as pd
import numpy as np

load_dotenv()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/trevormcgirr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
# PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_API_KEY = ""
PINECONE_INDEX_NAME = "gcp-starter"

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

pinecone.init(       
 api_key=PINECONE_API_KEY,
 environment=PINECONE_INDEX_NAME 
)      
index = pinecone.Index('chatbot')

# OpenAI Embeddings
embeddings = OpenAIEmbeddings()

In [6]:
# https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main
  
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
#     ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
#     js['q_idx'] = ndx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [9]:
# training data
# input_file_path = '../input/train-v1.1.json'
input_file_path = '../train-v1.1.json'
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

# dev data
# input_file_path = '../input/dev-v1.1.json'
input_file_path = '../dev-v1.1.json'
record_path = ['data','paragraphs','qas','answers']
verbose = 0
dev = squad_json_to_dataframe_dev(input_file_path=input_file_path,record_path=record_path)

Reading the json file
processing...


  js = pd.io.json.json_normalize(file , record_path )
  m = pd.io.json.json_normalize(file, record_path[:-1] )
  r = pd.io.json.json_normalize(file,record_path[:-2])
  main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()


shape of the dataframe is (87599, 6)
Done
Reading the json file
processing...
shape of the dataframe is (10570, 5)
Done


  js = pd.io.json.json_normalize(file , record_path )
  m = pd.io.json.json_normalize(file, record_path[:-1] )
  r = pd.io.json.json_normalize(file,record_path[:-2])


In [10]:
train.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [11]:
dev.head()

Unnamed: 0,id,question,context,answers,c_id
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'Carolina Panth...",0
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'Santa Clara, C...",0
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",0


In [15]:
# Number of unique contexts
# train['context'].nunique()
print("Number of unique contexts in training data: ", train['context'].nunique())

# Number of unique questions
# train['question'].nunique()
print("Number of unique questions in training data: ", train['question'].nunique())

Number of unique contexts in training data:  18891
Number of unique questions in training data:  87355


In [17]:
# Create text corpus of all contexts from training data and dev data (unique contexts)
train_context_corpus = train['context'].unique()
dev_context_corpus = dev['context'].unique()
context_corpus_combined = np.concatenate((train_context_corpus, dev_context_corpus), axis=0)
context_corpus = np.unique(context_corpus_combined)

# Show size of each corpus
print("Size of training context corpus: ", len(train_context_corpus))
print("Size of dev context corpus: ", len(dev_context_corpus))
print("Size of combined context corpus: ", len(context_corpus_combined))
print("Size of unique context corpus: ", len(context_corpus))


Size of training context corpus:  18891
Size of dev context corpus:  2067
Size of combined context corpus:  20958
Size of unique context corpus:  20958


In [18]:
# Snapshots of the context corpus
print("First 5 contexts: ", context_corpus[:5])

First 5 contexts:  ["\n Australia: The event was held in Canberra, Australian Capital Territory on April 24, and covered around 16 km of Canberra's central areas, from Reconciliation Place to Commonwealth Park. Upon its arrival in Canberra, the Olympic flame was presented by Chinese officials to local Aboriginal elder Agnes Shea, of the Ngunnawal people. She, in turn, offered them a message stick, as a gift of peace and welcome. Hundreds of pro-Tibet protesters and thousands of Chinese students reportedly attended. Demonstrators and counter-demonstrators were kept apart by the Australian Federal Police. Preparations for the event were marred by a disagreement over the role of the Chinese flame attendants, with Australian and Chinese officials arguing publicly over their function and prerogatives during a press conference."
 '\n China: In China, the torch was first welcomed by Politburo Standing Committee member Zhou Yongkang and State Councilor Liu Yandong. It was subsequently passed o

In [24]:
# Split context corpus
# NLTK Splitter
text_splitter = NLTKTextSplitter(chunk_size=500)

# Split context corpus into chunks of 500 words and keep track of the indices
context_corpus_split = [text_splitter.split_text(context) for context in context_corpus]

Created a chunk of size 618, which is longer than the specified 500
Created a chunk of size 660, which is longer than the specified 500
Created a chunk of size 768, which is longer than the specified 500
Created a chunk of size 583, which is longer than the specified 500
Created a chunk of size 569, which is longer than the specified 500
Created a chunk of size 625, which is longer than the specified 500
Created a chunk of size 628, which is longer than the specified 500
Created a chunk of size 581, which is longer than the specified 500
Created a chunk of size 527, which is longer than the specified 500
Created a chunk of size 503, which is longer than the specified 500
Created a chunk of size 511, which is longer than the specified 500
Created a chunk of size 738, which is longer than the specified 500
Created a chunk of size 595, which is longer than the specified 500
Created a chunk of size 592, which is longer than the specified 500


Created a chunk of size 611, which is longer than the specified 500
Created a chunk of size 596, which is longer than the specified 500
Created a chunk of size 565, which is longer than the specified 500
Created a chunk of size 584, which is longer than the specified 500
Created a chunk of size 565, which is longer than the specified 500
Created a chunk of size 744, which is longer than the specified 500
Created a chunk of size 530, which is longer than the specified 500
Created a chunk of size 564, which is longer than the specified 500
Created a chunk of size 542, which is longer than the specified 500
Created a chunk of size 807, which is longer than the specified 500
Created a chunk of size 512, which is longer than the specified 500
Created a chunk of size 511, which is longer than the specified 500
Created a chunk of size 517, which is longer than the specified 500
Created a chunk of size 515, which is longer than the specified 500
Created a chunk of size 521, which is longer tha

In [26]:
# Show first 5 chunks
context_corpus_split[:5]

[["Australia: The event was held in Canberra, Australian Capital Territory on April 24, and covered around 16 km of Canberra's central areas, from Reconciliation Place to Commonwealth Park.\n\nUpon its arrival in Canberra, the Olympic flame was presented by Chinese officials to local Aboriginal elder Agnes Shea, of the Ngunnawal people.\n\nShe, in turn, offered them a message stick, as a gift of peace and welcome.",
  'She, in turn, offered them a message stick, as a gift of peace and welcome.\n\nHundreds of pro-Tibet protesters and thousands of Chinese students reportedly attended.\n\nDemonstrators and counter-demonstrators were kept apart by the Australian Federal Police.\n\nPreparations for the event were marred by a disagreement over the role of the Chinese flame attendants, with Australian and Chinese officials arguing publicly over their function and prerogatives during a press conference.'],
 ["China: In China, the torch was first welcomed by Politburo Standing Committee member 

In [44]:
import time

def embed_chunks(context_corpus_split, batch_size=100, retries=3, delay=5):
    to_upsert = []

    for i, context_chunks in enumerate(context_corpus_split):
        print(f"Processing context {i+1} of {len(context_corpus_split)}")  # Print current context index
        for j, chunk in enumerate(context_chunks):
            for attempt in range(retries):
                try:
                    # Create an embedding for the chunk text
                    res = openai.Embedding.create(
                        input=[chunk],
                        engine="text-embedding-ada-002"
                    )
                    embedding = res['data'][0]['embedding'] 

                    # Prepare the data for upserting
                    id = f"context_{i}_{j}"
                    meta = {'text': chunk}
                    to_upsert.append((id, embedding, meta))

                    # Upsert in batches
                    if len(to_upsert) >= batch_size:
                        index.upsert(vectors=to_upsert)
                        to_upsert = []  # Reset the list

                    # If the request was successful, break the loop
                    break
                except openai.ApiError as e:
                    if attempt < retries - 1:  # If this is not the last attempt
                        print(f"Error: {e}. Retrying in {delay} seconds...")
                        time.sleep(delay)  # Wait for a while before retrying
                    else:
                        raise  # If this was the last attempt, re-raise the exception

    # Upsert any remaining embeddings
    if to_upsert:
        index.upsert(vectors=to_upsert)

    print(f"{len(to_upsert)} chunks embedded successfully!")

# Call the function
embed_chunks(context_corpus_split)

Processing context 1 of 20958
Processing context 2 of 20958
Processing context 3 of 20958
Processing context 4 of 20958
Processing context 5 of 20958
Processing context 6 of 20958
Processing context 7 of 20958
Processing context 8 of 20958
Processing context 9 of 20958
Processing context 10 of 20958
Processing context 11 of 20958
Processing context 12 of 20958
Processing context 13 of 20958
Processing context 14 of 20958
Processing context 15 of 20958
Processing context 16 of 20958
Processing context 17 of 20958
Processing context 18 of 20958
Processing context 19 of 20958
Processing context 20 of 20958
Processing context 21 of 20958
Processing context 22 of 20958
Processing context 23 of 20958
Processing context 24 of 20958
Processing context 25 of 20958
Processing context 26 of 20958
Processing context 27 of 20958
Processing context 28 of 20958
Processing context 29 of 20958
Processing context 30 of 20958
Processing context 31 of 20958
Processing context 32 of 20958
Processing contex