In [1]:
#!pip install -r './requirements.txt'

In [2]:
#!pip install python_dotenv

In [11]:
import os
from dotenv import load_dotenv

# Load secrets from the project-level .env file into the process environment.
load_dotenv('../.env')

# Read the keys from the environment instead of embedding them in the notebook,
# so the notebook can be shared or committed without leaking credentials.
# A missing variable raises KeyError here rather than failing later inside an API call.
# NOTE(review): assumes the .env file uses exactly these variable names — confirm.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']

# Name of the Pinecone index this notebook creates and queries.
PINECONE_INDEX_NAME = 'sky'

In [3]:
# Echo the configured index name as a quick sanity check before any index is touched.
PINECONE_INDEX_NAME

'aisensy'

#### Web Crawling
Load data from websites

In [6]:
from selenium import webdriver
from bs4 import BeautifulSoup

# URL of the AWS documentation page you want to retrieve
url = "https://docs.aws.amazon.com/index.html"

# Use Selenium to load the page and execute JavaScript
driver = webdriver.Chrome()  # You need to have the ChromeDriver installed and in your PATH
try:
    driver.get(url)

    # Get the page source after JavaScript execution
    page_source = driver.page_source
finally:
    # Always close the browser, even when loading the page fails; otherwise
    # every failed run leaves a Chrome/ChromeDriver process behind.
    driver.quit()

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, "html.parser")

# Extract the visible text, print it, and save it for the embedding step
textual_content = soup.get_text()
print(textual_content)
with open("out.txt", 'w', encoding="utf-8") as fp:
    fp.write(textual_content)

Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)




















Welcome to AWS Documentation
Select your cookie preferencesWe use essential cookies and similar tools that are necessary to provide our site and services. We use performance cookies to collect anonymous statistics so we can understand how customers use our site and make improvements. Essential cookies cannot be deactivated, but you can click “Customize cookies” to decline performance cookies.  If you agree, AWS and approved third parties will also use cookies to provide useful site features, remember your preferences, and display relevant content, including relevant advertising. To continue without accepting these cookies, click “Continue without accepting.” To make more detailed choices or learn more, click “Customize cookies.”Accept all cookiesContinue without acceptingCustomize cookiesCustomize cookie preferencesWe use cookies and similar tools (collectively, "cookies") for the following purposes.EssentialEssential cookies are necessary to provide our site and serv

#### Set up OpenAI Embedding process

In [7]:
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type
from typing import List
from uuid import uuid4
import textwrap

# NOTE(review): despite its name this holds a model identifier, not an API key, and
# nothing in the visible notebook reads it — consider renaming or removing it.
api_key = 'gpt-3.5-turbo'
# Default embedding model used by get_embedding().
EMBEDDING_MODEL = 'text-embedding-ada-002'
# NOTE(review): the next two constants are not referenced in the visible notebook;
# presumably the model's maximum input tokens and its tokenizer name — confirm.
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'
# OpenAI client used for the embedding requests in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    """Return the embedding of the first item in an OpenAI embeddings response.

    Args:
        text_or_tokens: Value forwarded unchanged as ``input`` to the embeddings endpoint.
        model: Embedding model name; defaults to ``EMBEDDING_MODEL``.

    Returns:
        The ``embedding`` attribute of the first element of the response's ``data``.
    """
    response = client.embeddings.create(input=text_or_tokens, model=model)
    first_item = response.data[0]
    return first_item.embedding

def chunk_text(text: str, max_chunk_size: int, overlap_size: int) -> List[str]:
    """Split ``text`` into consecutive chunks that overlap by ``overlap_size`` characters.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of characters per chunk; must be positive.
        overlap_size: Number of characters each chunk shares with the previous one;
            must satisfy ``0 <= overlap_size < max_chunk_size``.

    Returns:
        The chunks in order. An empty ``text`` yields an empty list. The last chunk
        ends exactly at the end of ``text``; no extra chunk is produced that would
        lie entirely inside the previous one.

    Raises:
        ValueError: If the sizes would make the window fail to advance
            (previously this looped forever) or are otherwise out of range.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if not 0 <= overlap_size < max_chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < max_chunk_size")

    step = max_chunk_size - overlap_size
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + max_chunk_size])
        # Once a chunk reaches the end of the text, any further window would only
        # repeat the overlap region, so stop here.
        if start + max_chunk_size >= len(text):
            break
    return chunks

def transform_record(record: str, max_chunk_size: int = 500, overlap_size: int = 100) -> List[dict]:
    """Split a text record into overlapping chunks and embed each chunk.

    Args:
        record: The full text to process (passed straight to ``chunk_text``).
        max_chunk_size: Maximum characters per chunk. Defaults to 500.
        overlap_size: Characters shared between neighbouring chunks. Defaults to 100.

    Returns:
        One dict per chunk with the keys ``chunk_id`` (``"<record id>-<1-based position>"``),
        ``chunk_parent_id`` (a fresh UUID shared by all chunks of this record),
        ``chunk_text`` and ``vector`` (the result of ``get_embedding`` for the chunk).
    """
    chunks = chunk_text(record, max_chunk_size, overlap_size)
    record_id = str(uuid4())
    # One embedding request is issued per chunk, in chunk order.
    return [
        {
            'chunk_id': f"{record_id}-{position}",
            'chunk_parent_id': record_id,
            'chunk_text': chunk,
            'vector': get_embedding(chunk),
        }
        for position, chunk in enumerate(chunks, start=1)
    ]

#### Generate Pinecone Index

In [12]:
import pinecone
import time
from pinecone import Pinecone, ServerlessSpec
from pinecone import ServerlessSpec


index_name = PINECONE_INDEX_NAME

# initialize connection to pinecone (get API key at app.pinecone.io)
pc = Pinecone(api_key=PINECONE_API_KEY)
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# list_indexes() returns index descriptions rather than bare names, so compare
# against .names(); testing the string against the raw result never matches and
# makes every re-run try to create an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        # NOTE(review): must equal the length of the vectors being upserted;
        # 1536 is presumably the output size of the embedding model — confirm.
        dimension=1536,
        metric='cosine',
        spec=spec
    )
    # Index creation is asynchronous: wait until it reports ready before using it.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

#### Prepare and load data from file

In [13]:
# out.txt is written as UTF-8 by the crawling cell, so decode it the same way;
# reading it as ISO-8859-1 turns every non-ASCII character (e.g. curly quotes)
# into mojibake before the text is embedded.
with open('out.txt', 'r', encoding='utf-8') as f:
    file = f.read()

#### Generate embeddings (kept in memory; pickle the results to avoid paying OpenAI again on re-runs)

In [14]:
# Chunk the loaded text and embed every chunk.
chunk_array = transform_record(file)
# Keep a separate list object holding the same records for the later cells.
chunked_data = list(chunk_array)

#### Format data to load to Pinecone

In [15]:
def prepare_entries_for_pinecone(entries):
    """
    Convert chunk records into the payload shape used for a Pinecone upsert.

    Every entry must provide a 'vector' (an iterable of floats). The id comes
    from 'chunk_id' (empty string if absent). An explicit 'metadata' value is
    used when present; otherwise metadata is assembled from the entry's
    'chunk_id', 'chunk_parent_id' and 'chunk_text' fields.

    Returns a dict with the converted 'vectors' and an empty 'namespace'.
    """
    def _to_pinecone_vector(entry):
        # Copy the vector into a plain list, whatever iterable it was.
        dense_values = list(entry['vector'])
        chunk_id = entry.get('chunk_id', '')
        fallback_metadata = {
            'chunk_id': chunk_id,
            'parent_id': entry.get('chunk_parent_id', ''),
            'chunk_text': entry.get('chunk_text', ''),
        }
        return {
            'id': chunk_id,
            'metadata': entry.get('metadata', fallback_metadata),
            'values': dense_values,
        }

    return {
        'vectors': [_to_pinecone_vector(entry) for entry in entries],
        'namespace': '',
    }


In [16]:
# Reshape the embedded chunks into {'vectors': [...], 'namespace': ''} for the upsert loop.
vectors = prepare_entries_for_pinecone(chunked_data)

#### Upsert dense vectors and metadata to Pinecone (sparse values are currently disabled)

In [17]:
from tqdm.auto import tqdm  # this is our progress bar

batch_size = 32  # process everything in batches of 32
all_vectors = vectors['vectors']
for batch_start in tqdm(range(0, len(all_vectors), batch_size)):
    batch = all_vectors[batch_start:batch_start + batch_size]
    # Build the upload records for this batch: id, dense values and metadata only.
    upserts = [
        {
            'id': item['id'],
            'values': item['values'],
            'metadata': item['metadata'],
        }
        for item in batch
    ]
    # Upload the batch to the Pinecone index.
    index.upsert(upserts)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.50s/it]


#### Query Pinecone and OpenAI

In [18]:
# Maximum combined length, in characters, of the context section of the prompt.
limit = 8000

def retrieve(query):
    """Build a context-augmented prompt for ``query`` from the closest Pinecone matches.

    The query is embedded with ``get_embedding``, the five nearest vectors are
    fetched from ``index``, and their ``chunk_text`` metadata is joined into the
    prompt in match order until adding another context would reach ``limit``
    characters.

    Args:
        query: The user's question.

    Returns:
        The full prompt string. When no context fits (or nothing matched), the
        context section is empty but a valid prompt is still returned.
    """
    # Embed the query with the same helper (and model) used for the stored chunks.
    xq = get_embedding(query)

    # Pinecone expects the query vector as a flat list of floats, not a nested list.
    res = index.query(vector=xq, top_k=5, include_metadata=True)
    contexts = [
        x['metadata']['chunk_text'] for x in res['matches']
    ]

    # NOTE(review): the instruction mentions "Wells Fargo" although this notebook
    # indexes AWS documentation — looks like a leftover; confirm and reword.
    prompt_start = (
        "Answer the question based on the context below. If you cannot answer based on the context or general knowledge about Wells Fargo, truthfully answer that you don't know.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )

    # Keep the longest prefix of contexts whose joined length stays below the
    # limit. This also covers zero or one match (previously `prompt` was never
    # assigned in those cases) and checks the final context against the limit too.
    separator = "\n\n---\n\n"
    selected = []
    joined_length = 0
    for context in contexts:
        added_length = len(context) + (len(separator) if selected else 0)
        if joined_length + added_length >= limit:
            break
        selected.append(context)
        joined_length += added_length

    return prompt_start + separator.join(selected) + prompt_end

def complete(prompt):
    """Send ``prompt`` to gpt-3.5-turbo and return the stripped answer text.

    Uses the notebook's OpenAI ``client`` and the chat completions endpoint:
    the ``openai`` module name is never imported in this notebook, and
    'gpt-3.5-turbo' is a chat model rather than a legacy completions engine.

    Args:
        prompt: The complete prompt, sent as a single user message.

    Returns:
        The first choice's message content with surrounding whitespace removed
        (an empty string if the model returned no content).
    """
    res = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0,
        max_tokens=512,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return (res.choices[0].message.content or '').strip()

#### LangChain memory for chat-style conversation

In [20]:
from langchain.llms import OpenAIChat
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain_openai import OpenAI

# Completion-style LLM (LangChain wrapper) that drives both the chain and its memory.
llm = OpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo-instruct")

# Summary-buffer memory; max_token_limit is deliberately very low for testing.
summary_memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=650)

# Conversation chain that answers with the LLM while carrying the memory along.
conversation_with_summary = ConversationChain(llm=llm, memory=summary_memory)
#conversation_with_summary.predict(input="Hi, what's up?")

  conversation_with_summary = ConversationChain(


#### Sample query to Pinecone and OpenAI

In [21]:
query = "What is AWS"
# Build the context-augmented prompt from the closest Pinecone matches
query_with_contexts = retrieve(query)
# Ask the conversation chain and print the answer wrapped to the default width
answer = conversation_with_summary.predict(input=query_with_contexts)
print(textwrap.fill(str(answer)))

 AWS stands for Amazon Web Services. It is a cloud computing platform
that offers a variety of services such as virtual servers, storage,
databases, and more. It also provides resources for developers,
tutorials, and code examples to help users launch their applications.
Additionally, AWS offers a command line interface and software
development kits for various programming languages. It is a popular
choice for businesses looking to accelerate cloud adoption and
modernization.


#### Clear conversation memory if desired

In [22]:
# Reset the chain's memory so earlier questions do not carry over into the next conversation.
conversation_with_summary.memory.clear()

#### Loop to ask multiple questions and get answers

In [23]:
# Interactive question loop: each question is augmented with Pinecone context
# and answered by the conversation chain until the user types 'quit'.
while True:
    # Prompt user for input; strip whitespace so " quit " still exits
    user_input = input("Enter your input (type 'quit' to exit): ").strip()

    # Check if user wants to quit
    if user_input.lower() == "quit":
        print("Exiting program...")
        break

    # Skip blank input instead of sending an empty string to the embeddings
    # endpoint, which would abort the loop with an API error
    if not user_input:
        continue

    # Process user input
    processed_input = user_input.upper()  # Convert to all uppercase letters
    print("Processed input: ", processed_input)

    query = user_input

    # first we retrieve relevant items from Pinecone
    query_with_contexts = retrieve(query)

    # then we send the context and the query to OpenAI
    print(textwrap.fill(str(conversation_with_summary.predict(input=query_with_contexts))) + '\n')



Enter your input (type 'quit' to exit):  aws


Processed input:  AWS
 AWS stands for Amazon Web Services. It is a cloud computing platform
that offers a wide range of services such as virtual servers, object
storage, managed databases, and more. It also provides tools and
resources for developers to build and deploy applications. Some
examples of AWS services include Amazon API Gateway, Amazon S3, and
AWS Lambda. If you have a specific question about AWS, I'd be happy to
help.



Enter your input (type 'quit' to exit):  quit


Exiting program...
