In [316]:
from langchain_community.document_loaders import AsyncChromiumLoader
from collections import deque
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings  
from langchain.vectorstores import Chroma
from crawl4ai import DefaultMarkdownGenerator
import asyncio
import nest_asyncio
import random
nest_asyncio.apply()

In [317]:
startpage = "http://www.partselect.com/Refrigerator-Parts.htm"
seenPages = [startpage]
pagesToVisit = deque()
pagesToVisit.append(startpage)
data = dict()

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"

In [347]:
def get_urls(soup):
    '''
    description: returns a list of urls that contain "Refrigerator" or "Dishwasher" given BeautifulSoup object
    '''
    
    hrefs = [a['href'] for a in soup.find_all('a', href=True)]
    filtered_urls = list()
    for href in hrefs:
        if ("Refrigerator" in href or "Dishwasher" in href) and ("#" not in href) and ("gtcd" not in href) and (" " not in href):
            if href[:4] == "http":
                filtered_urls.append(href.replace("https","http"))
            elif href[0] == "/":
                filtered_urls.append("http://www.partselect.com" + href)
    return list(set(filtered_urls))

In [348]:
from zenrows import ZenRowsClient

def crawl():
    '''
    description: crawls through the partselect.com website. .
        We save the urls in a list so we don't revisit links, and have a queue to decide which is the next url to visit.
        To save time, we only go to urls with "Dishwasher" or "Refrigerator" in the url.
    '''
    markdown_generator=DefaultMarkdownGenerator()
    client = ZenRowsClient("301f23c56d97a6cbec9c2168d11ff6dc835c2733")
    params = {"js_render":"true","premium_proxy":"true"}

    while len(pagesToVisit):
        ## pop off the next url to visit
        current_url = pagesToVisit.pop()
        print(current_url)
        
        #loader = AsyncChromiumLoader([current_url], user_agent=user_agent, headless=False)
        ## get its html content
        #html = loader.load()[0].page_content
        
        ## get html data and convert to soup
        html = client.get(current_url, params=params).text
        soup = BeautifulSoup(html, 'html.parser')

        ## remove the header and footer data
        for tag in ['header', 'footer']:
            for match in soup.find_all(tag):
                match.decompose()  # Removes the tag and its content
                
        ## extract urls on the page and add to queue if we haven't seen it
        urls = get_urls(soup)
        for url in urls:
            if url not in seenPages:
                seenPages.append(url)
                pagesToVisit.append(url)
                
        ## save markdown information for later conversion into embeddings
        data[current_url] = markdown_generator.generate_markdown(str(soup)).raw_markdown
        if len(data[current_url]) < 1000:
            break
        
        print(len(data[current_url]))
        print(len(pagesToVisit))

In [364]:
crawl()

http://www.partselect.com/Refrigerator-Fans-and-Blowers.htm
21036
59
http://www.partselect.com/Hoover-Refrigerator-Parts.htm
24381
63
http://www.partselect.com/Hoover-Refrigerator-Seals-and-Gaskets.htm
17094
62
http://www.partselect.com/Hoover-Refrigerator-Hinges.htm
7464
61
http://www.partselect.com/Hoover-Refrigerator-Ice-Makers.htm
22733
60
http://www.partselect.com/Hoover-Refrigerator-Hardware.htm
14031
59
http://www.partselect.com/Hoover-Refrigerator-Models.htm
9802
58
http://www.partselect.com/Refrigerator-Starters.htm
9557
57
http://www.partselect.com/PS11701542-Whirlpool-EDR1RXD1-Refrigerator-Ice-and-Water-Filter.htm?SourceCode=18
27609
56
http://www.partselect.com/Refrigerator-Manuals-and-Literature.htm
9738
55
http://www.partselect.com/Haier-Refrigerator-Parts.htm
12314
54
http://www.partselect.com/Refrigerator-Thermostats.htm
26672
53
http://www.partselect.com/Refrigerator-Hardware.htm
19539
56
http://www.partselect.com/Litton-Refrigerator-Hardware.htm
16424
56
http://www.pa

In [365]:
import pickle

#with open("data.pkl", 'wb') as file:
#    pickle.dump(data, file)
#with open("data.pkl", 'rb') as file:
#    data = pickle.load(file)

In [367]:
len(data)

1882

In [431]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300,
    add_start_index=True,
)

vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=openai_key), persist_directory="./chroma_db_chunksize1000"
)
count = 0
for key in data.keys():
    print(count)
    document = Document(page_content=data[key], metadata={"url":key})
    docs = text_splitter.split_documents([document])
    vector_store.add_documents(documents=docs)
    count += 1

0
1


KeyboardInterrupt: 

In [433]:
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=openai_key), persist_directory="./chroma_db_chunksize1000"
)

for text in vector_store.similarity_search("How can I install part number PS11752778?", k=100):
    print(text)

page_content='[Need help finding your model number?](/Find-Your-Model-Number/?SourceCode=42)
Search
PartSelect Number PS4174247
Manufacturer Part Number DA97-06419C
Manufactured by  Samsung for Samsung 
Jump to:
  * [Product Description](#ProductDescription)
  * [Troubleshooting](#Troubleshooting)
  * [Customer Reviews](#CustomerReviews)
  * [Customer Repair Stories](#RepairStories)
  * [Questions and Answers](#QuestionsAndAnswers)
  * [Related Parts](#RelatedParts)
  * [Model Cross Reference](#ModelCrossReference)' metadata={'start_index': 2939, 'url': 'http://www.partselect.com/PS4174247-Samsung-DA97-06419C-Refrigerator-Door-Bin.htm?SourceCode=14'}
page_content='[Need help finding your model number?](/Find-Your-Model-Number/?SourceCode=42)
Search
PartSelect Number PS4174247
Manufacturer Part Number DA97-06419C
Manufactured by  Samsung for Samsung 
Jump to:
  * [Product Description](#ProductDescription)
  * [Troubleshooting](#Troubleshooting)
  * [Customer Reviews](#CustomerReviews)
 

In [424]:
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=openai_key), persist_directory="./chroma_db_chunksize10000"
)

for text in vector_store.similarity_search("How can I install part number PS11752778?", k=10):
    print(text)

page_content='✖
Your coupon for  will be reflected when you check out! 
✖
Your coupon for
has been applied and will be  
reflected when you check out! 
✖
Hello!
You're visiting the PartSelect site in U.S.
Would you like to shop on the Canadian site?
U.S. flag Stay on this site
Canada flag Go to Canadian site
![EasyApplianceParts is Now PartSelect!](https://partselectcom-gtcdcddbene3cpes.z01.azurefd.net/images/eap-to-ps-modal-hero.jpg)
✖
#### EasyApplianceParts is Now PartSelect!
On April 3, EasyApplianceParts merged with our long-time sister site, PartSelect
  * Same trusted genuine parts, service and expert team.
  * All existing EasyApplianceParts orders will ship as planned.
  * Shop with confidence on PartSelect for all your genuine parts needs!


Start Shopping
![to Canada site](https://partselectcom-gtcdcddbene3cpes.z01.azur

In [379]:
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=openai_key), persist_directory="./chroma_db_chunksize500"
)

for text in vector_store.similarity_search("How can I install part number PS11752778?", k=30):
    print(text)

page_content='For model number 59669280011
![PartSelect logo](https://partselectcom-gtcdcddbene3cpes.z01.azurefd.net/images/ps-logo-mobile.svg)
Hi Melanie, Thank you for your question. The part number listed under your model number for the support is PS11743468. If you need help placing an order for it, customer service is open 7 days a week and anyone will be happy to assist you. Please feel free to give us a call. We look forward to hearing from you!
Was this helpful?
Thank you for voting!
Related Parts:' metadata={'start_index': 21542, 'url': 'http://www.partselect.com/PS11756423-Whirlpool-WPW10568041-Refrigerator-Crisper-Frame.htm?SourceCode=10'}
page_content='For model number 59669280011
![PartSelect logo](https://partselectcom-gtcdcddbene3cpes.z01.azurefd.net/images/ps-logo-mobile.svg)
Hi Melanie, Thank you for your question. The part number listed under your model number for the support is PS11743468. If you need help placing an order for it, customer service is open 7 days a we

In [416]:
## there are urls of the part in the database
for url in seenPages:
    if "PS11752778" in url:
        print(url)

http://www.partselect.com/PS11752778-Whirlpool-WPW10321304-Refrigerator-Door-Shelf-Bin.htm?SourceCode=18
http://www.partselect.com/PS11752778-Whirlpool-WPW10321304-Refrigerator-Door-Shelf-Bin.htm?SourceCode=10


In [418]:
## took raising k to 1000 to find docs that actually had the right parts
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(api_key=openai_key), persist_directory="./chroma_db_chunksize500"
)

retrieved_docs = vector_store.similarity_search("How can I install part number PS11752778?", k=1000)
for doc in retrieved_docs:
    if "PS11752778" in str(doc):
        print(doc)

page_content='Rated by 29 customers 
Really Easy 
Less than 15 mins 
Ratings submitted by customers like you who bought this part. 
Really Easy 
Less than 15 mins 
Rated by verified customers 
?
$ 36.08
In Stock
1 2 3 4 5 6 7 8 9 10+ Add to cart 
Get this part fast! Average delivery time for in-stock parts via standard shipping: 1.8 days.
Does this part fit my model?
[Need help finding your model number?](/Find-Your-Model-Number/?SourceCode=42)
Search
PartSelect Number PS11752778' metadata={'url': 'http://www.partselect.com/PS11752778-Whirlpool-WPW10321304-Refrigerator-Door-Shelf-Bin.htm?SourceCode=10', 'start_index': 3051}
page_content='Rated by 29 customers 
Really Easy 
Less than 15 mins 
Ratings submitted by customers like you who bought this part. 
Really Easy 
Less than 15 mins 
Rated by verified customers 
?
$ 36.08
In Stock
1 2 3 4 5 6 7 8 9 10+ Add to cart 
Get this part fast! Average delivery time for in-stock parts via standard shipping: 1.8 days.
Does this part fit my model

In [419]:
## not very insightful information here
retrieved_docs = vector_store.similarity_search("The ice maker on my Whirlpool fridge is not working. How can I fix it?", k=10)
for doc in retrieved_docs:
    print(doc)

page_content='Fixes these symptoms
  * Ice maker not making ice
  * [See more...](/PS11722126-Whirlpool-EDR2RXD1-Water-Filter.htm?SourceCode=18#Troubleshooting)' metadata={'start_index': 9632, 'url': 'http://www.partselect.com/KitchenAid-Refrigerator-Filters.htm'}
page_content='Fixes these symptoms
  * Ice maker not making ice
  * [See more...](/PS11722126-Whirlpool-EDR2RXD1-Water-Filter.htm?SourceCode=18#Troubleshooting)' metadata={'start_index': 9613, 'url': 'http://www.partselect.com/Whirlpool-Refrigerator-Filters.htm'}
page_content='Fixes these symptoms
  * Ice maker not making ice
  * [See more...](/PS11722126-Whirlpool-EDR2RXD1-Water-Filter.htm?SourceCode=18#Troubleshooting)' metadata={'start_index': 17219, 'url': 'http://www.partselect.com/Admiral-Refrigerator-parts.htm'}
page_content='Fixes these symptoms
  * Ice maker not making ice
  * [See more...](/PS11722126-Whirlpool-EDR2RXD1-Water-Filter.htm?SourceCode=18#Troubleshooting)' metadata={'url': 'http://www.partselect.com/Mayt