In [1]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import openai
import os
import dotenv
import json

In [2]:
urls = {
    "start_urls": ["https://www.amazon.in/s?k=laptop"]
}

with open("scraper/config.json", "w") as f:
    json.dump(urls, f)

In [3]:
! cd scraper && scrapy crawl main
! cd ..

2023-11-28 10:15:26 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scraper)
2023-11-28 10:15:26 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.11.4 (main, Jun  9 2023, 07:59:55) [GCC 12.3.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Linux-6.2.0-37-generic-x86_64-with-glibc2.37
2023-11-28 10:15:26 [scrapy.addons] INFO: Enabled addons:
[]
2023-11-28 10:15:26 [asyncio] DEBUG: Using selector: EpollSelector
2023-11-28 10:15:26 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2023-11-28 10:15:26 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop
2023-11-28 10:15:26 [scrapy.extensions.telnet] INFO: Telnet Password: 6d42639523d05f78
2023-11-28 10:15:26 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsol

In [4]:
dotenv.load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
text_loader = TextLoader("scraper/output.json")
pages = text_loader.load()

In [6]:
persist_directory = 'docs/chroma/'

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100
)

splits = text_splitter.split_documents(pages)

len(splits)

embedding = OpenAIEmbeddings()

vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory
)

len(splits)

vectordb.add_documents(documents=splits[0:500])
print('ok')

vectordb.persist()

print(vectordb._collection.count())


ok
1550


In [7]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Build prompt

with open("template.tpl", "r") as f:
    template = f.read()

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"], template=template)


In [8]:
# Run chain

question = "name, cost, description, rating out of 5"
qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

result = qa_chain({"query": question})
result["result"]


'{\n  "name": "Acer Aspire 3 Thin and Light Laptop Intel Core i5 12th Generation (Windows 11 Home/8 GB/512 GB SSD) A315-59 with 15.6-inch (39.6 cms) Full HD Display, 1.7 KG, Pure Silver",\n  "cost": "NA",\n  "description": "NA",\n  "rating out of 5": "3.9 out of 5 stars"\n}'