In [None]:
!pip install requests beautifulsoup4



In [None]:
# Sample site-relative symptom-page path, in the form scrape_symptom() takes as `href`.
example_href = "/Models/PDW7880G00SS/Symptoms/Not-cleaning-dishes-properly/"
import re
import requests
from bs4 import BeautifulSoup


def remove_initial_digit(text):
    """Return ``text`` without the run of digits at its very start, if any.

    Digits anywhere else in the string are left untouched.
    """
    leading_digits = r'^\d+'
    stripped = re.sub(leading_digits, '', text)
    return stripped


def separate_text(text):
    """Split a string that starts with a ``$``-prefixed token from the text after it.

    The string must begin with ``$`` followed by word characters; the second
    piece starts at an uppercase letter. Because the first piece is matched
    greedily, the cut falls at the last uppercase letter of that initial run.

    Returns:
    tuple: ``(dollar_token, remainder)`` when the pattern matches, else None.
    """
    found = re.match(r'(\$\w+)([A-Z].*)', text)
    return found.groups() if found else None

def split_difficulty(text):
    """Cut ``text`` right after the first "Easy" or "Difficult" keyword.

    Args:
    text (str): Rating text such as "Really Easy15 - 30 mins".

    Returns:
    list: ``[text_up_to_and_including_keyword, rest]`` when a keyword is
          found (``rest`` may be empty), otherwise ``[text]``.
    """
    # re.search + slicing gives the same result as a one-shot re.split with a
    # capturing group, without passing maxsplit positionally (deprecated
    # since Python 3.13).
    keyword = re.search(r'Easy|Difficult', text)
    if keyword is None:
        return [text]
    cut = keyword.end()
    return [text[:cut], text[cut:]]

def extract_number(text):
    """Return the first run of digits found in ``text``, as a string.

    Raises:
    IndexError: If ``text`` contains no digits at all.
    """
    digit_runs = re.findall(r'\d+', text)
    first_run = digit_runs[0]
    return first_run


def parse_part_details(text_content):
    """
    Split a "<name> – Part Number: <number>" string into its two fields.

    The separator is an en dash, and the part number must be purely
    alphanumeric and end the string.

    Args:
    text_content (str): The text content containing the part name and number.

    Returns:
    dict: A dictionary with keys 'Part Name' and 'Part Number' if the format is correct,
          None if the format does not match.
    """
    found = re.match(r'^(.*?)\s*–\s*Part Number:\s*([A-Za-z0-9]+)$', text_content)

    # Guard clause: anything that is not in the expected shape yields None.
    if not found:
        return None

    name, number = found.groups()
    return {'Part Name': name, 'Part Number': number}


def scrape_symptom(href, timeout=30):
    """
    Scrape the parts recommended on a PartSelect symptom page.

    Args:
    href (str): Site-relative path of the symptom page, e.g.
        "/Models/PDW7880G00SS/Symptoms/Not-cleaning-dishes-properly/".
    timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
    list: One dict per part block that yielded a 'Part Name'. Depending on
          what the block exposes, a dict may also carry 'PartSelect Number',
          'Manufacturer Number', 'Price', 'Fix Percent', 'Difficulty' (the
          list returned by split_difficulty) and 'How To'. Empty when the
          page has no main container.

    Raises:
    requests.HTTPError: If the server answers with an error status.
    """
    # hrefs taken from the site start with "/"; strip it so the URL does not
    # end up with a double slash after the host.
    page_url = f"https://www.partselect.com/{href.lstrip('/')}"
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    main_container = soup.select_one('body > main.container div#main')
    if main_container is None:
        return []

    def is_solution_div(tag):
        # A part block is a <div> carrying both the "symptoms" and
        # "align-items-center" classes; extra classes (the redesign / hidden
        # variants) are allowed. The previous filter contained a bare string
        # literal in an `or` chain, so it accepted every classed <div>.
        classes = tag.get('class') or []
        return (
            tag.name == 'div'
            and 'symptoms' in classes
            and 'align-items-center' in classes
        )

    solutions_list = []

    for div in main_container.find_all(is_solution_div):
        solutions_detail = {}
        for child_div in div.find_all('div'):
            # Children are recognised by their exact, space-joined class list.
            class_name = ' '.join(child_div.get('class', []))
            text_content = child_div.get_text(strip=True)

            if class_name == 'header bold d-flex justify-content-start':
                solutions_detail['Part Name'] = remove_initial_digit(text_content)

            elif class_name == 'mt-3 mb-2 bold':
                solutions_detail['PartSelect Number'] = text_content.split('Number')[-1]

            elif class_name == 'mb-2 bold':
                solutions_detail['Manufacturer Number'] = text_content.split('Number')[-1]

            elif class_name == 'd-flex bold':
                # Keep only what precedes the stock notice.
                if 'In Stock' in text_content:
                    text_content = text_content.split('In Stock')[0]
                solutions_detail['Price'] = text_content

            elif class_name == 'symptoms__percent':
                solutions_detail['Fix Percent'] = extract_number(text_content)

            elif class_name == 'js-RepairRating mt-3 mb-3 mb-lg-1':
                # Drop everything up to the "customer(s)" word, then separate
                # the difficulty keyword from the text that follows it.
                if 'customers' in text_content:
                    text_content = text_content.split('customers')[-1]
                else:
                    text_content = text_content.split('customer')[-1]
                solutions_detail['Difficulty'] = split_difficulty(text_content)

            elif class_name in (
                'd-flex flex-grow-1 align-items-sm-center',
                'd-sm-flex flex-grow-1 align-items-center',
            ):
                target_div = child_div.find('div', class_='flex-grow-1')
                if target_div:
                    # The <a> title has the form "<name> – Part Number: <number>".
                    anchor = target_div.find('a')
                    if anchor and 'title' in anchor.attrs:
                        part_details = parse_part_details(anchor['title'])
                        if part_details:
                            solutions_detail.update({
                                'Part Name': part_details['Part Name'],
                                'PartSelect Number': part_details['Part Number']
                            })
                    else:
                        print('title missing')
                else:
                    print('could not find flex-grow-1')

            elif class_name == 'symptoms__buy-part':
                # Prefer the price element; fall back to the availability element.
                price_div = child_div.find('div', class_='mega-m__part__price')
                if not price_div:
                    price_div = child_div.find('div', class_='mega-m__part__avlbl bold mt-1 mb-1 js-tooltip')
                if price_div:
                    solutions_detail['Price'] = price_div.get_text(strip=True)

            elif class_name == 'col-12 col-lg-6 mt-4 mb-4 align-self-start':
                # Trim the trailing price / availability text and the leading
                # ratings blurb so only the remaining description is stored.
                if '$' in text_content:
                    text_content = text_content.split('$')[0]
                if 'No Longer Available' in text_content:
                    text_content = text_content.split('No Longer Available')[0]
                if 'Ratings submitted by customers like you who bought this part.' in text_content:
                    text_content = text_content.split('Ratings submitted by customers like you who bought this part.')[-1]
                if text_content:
                    solutions_detail['How To'] = text_content

        # Blocks that never produced a part name are not usable downstream.
        if 'Part Name' in solutions_detail:
            solutions_list.append(solutions_detail)

    return solutions_list


def scrape_instructions(main_container):
    """
    Collect the repair stories found inside a parsed page element.

    Args:
    main_container: Parsed element to search; only its ``find_all`` method is
        used (callers in this file pass the BeautifulSoup ``div#main`` tag).

    Returns:
    list: One dict per story that has an 'instruction' text. 'problem' and
          'parts used' are included only when the matching elements exist.
    """
    instructions_list = []

    # Substring test: any <div> with a class containing "repair-story" is
    # treated as a story candidate; candidates without an instruction are
    # dropped at the end of the loop.
    story_divs = main_container.find_all('div', class_=lambda class_: class_ and 'repair-story' in class_)

    for div in story_divs:
        instructions_detail = {}
        for child in div.find_all(['div', 'a']):
            class_name = ' '.join(child.get('class', []))

            if class_name == 'repair-story__title mb-3 mb-lg-4':
                instructions_detail['problem'] = child.get_text(strip=True)

            elif class_name == 'd-lg-flex':
                target_div = child.find('div', class_='repair-story__instruction')
                if target_div:
                    target_div = target_div.find('div', class_='repair-story__instruction__content')
                    if target_div:
                        # Remove the expand/collapse link captions from the text.
                        target_text = target_div.get_text(strip=True)
                        target_text = target_text.replace('...Read more', '')
                        target_text = target_text.replace('Read less', '')
                        instructions_detail['instruction'] = target_text

            elif class_name == 'repair-story__parts mt-2':
                anchor = child.find('a')
                # The <a> may lack a <span>; skip instead of raising AttributeError.
                span = anchor.find('span') if anchor is not None else None
                if span is not None:
                    instructions_detail['parts used'] = span.get_text(strip=True)

        if 'instruction' in instructions_detail:
            instructions_list.append(instructions_detail)

    return instructions_list








# RUN ON PARTSELECT WEBSITES

In [None]:
# Model index pages for the two appliance categories; both are passed to
# parse_models_page() to build the list of model links.
dish_url = 'https://www.partselect.com/Dishwasher-Models.htm'
fridge_url = 'https://www.partselect.com/Refrigerator-Models.htm'

def parse_models_page(url, timeout=30):
    """
    Collect the model links listed on a PartSelect model index page.

    Args:
    url (str): Absolute URL of the index page.
    timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
    list: ``{'href': ..., 'text': ...}`` dicts, one per ``<li>`` inside
          ``<main>`` whose first link has an href containing "/Models/".
          Empty when the page has no ``<main>`` element.

    Raises:
    requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled connection would block the run indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure the request was successful

    soup = BeautifulSoup(response.text, 'html.parser')

    # The listing lives inside the page's <main> element (looked up by tag).
    main_container = soup.find('main')
    if main_container is None:
        return []

    extracted_data = []
    for item in main_container.find_all('li'):
        link = item.find('a')
        if link is None or not link.has_attr('href'):
            continue
        href = link['href']
        # Only model pages are of interest; skip other links in the list.
        if '/Models/' in href:
            extracted_data.append({'href': href, 'text': link.text.strip()})

    return extracted_data

# Dishwasher models first, then refrigerator models, each in page order.
extracted_data = [
    entry
    for models_url in (dish_url, fridge_url)
    for entry in parse_models_page(models_url)
]

# Echo every collected model link, one dict per line.
for data in extracted_data:
    print(data)

{'href': '/Models/004621710A/', 'text': '004621710A Frigidaire Dishwasher'}
{'href': '/Models/004621711A/', 'text': '004621711A Frigidaire Dishwasher'}
{'href': '/Models/100/Manufacturer/72/', 'text': '100 Thermador Dishwasher'}
{'href': '/Models/1005/Manufacturer/4/', 'text': '1005 Frigidaire Dishwasher'}
{'href': '/Models/1006/Manufacturer/4/', 'text': '1006 Frigidaire Dishwasher'}
{'href': '/Models/1026/Manufacturer/4/', 'text': '1026 Frigidaire Dishwasher'}
{'href': '/Models/1031-005A/', 'text': '1031-005A Frigidaire Dishwasher'}
{'href': '/Models/1032-005A/', 'text': '1032-005A Frigidaire Dishwasher'}
{'href': '/Models/1035B/', 'text': '1035B Frigidaire Dishwasher'}
{'href': '/Models/1036/Manufacturer/4/', 'text': '1036 Frigidaire Dishwasher'}
{'href': '/Models/1037B/', 'text': '1037B Frigidaire Dishwasher'}
{'href': '/Models/1041-002A/', 'text': '1041-002A Frigidaire Dishwasher'}
{'href': '/Models/1046/Manufacturer/4/', 'text': '1046 Frigidaire Dishwasher'}
{'href': '/Models/1066

In [None]:
def _solution_answer(model_name, description, solution):
    """Compose the completion text for one symptom from a single solution dict."""
    name = solution['Part Name']
    sentences = [
        f"In order to fix {description} for Model {model_name}, you can use {name}."
    ]

    # 'Difficulty' is a [rating, repair time] pair; a one-element list means
    # no rating keyword was found, so there is no repair time to quote.
    difficulty = solution.get('Difficulty')
    if difficulty and len(difficulty) > 1:
        sentences.append(
            f"Customers have found it {difficulty[0]} to use. It took them {difficulty[1]} "
            f"to fix {description} for Model {model_name} with {name}."
        )

    fix_ptg = solution.get('Fix Percent')
    if fix_ptg is not None:
        sentences.append(f"In fact, {name} has fixed {description} in {fix_ptg}% of our cases.")

    how_to = solution.get('How To')
    if how_to:
        sentences.append(how_to.strip())

    price = solution.get('Price')
    if price == 'No Longer Available':
        sentences.append(f"However, {name} is no longer available for purchase.")
    elif price:
        sentences.append(f"Luckily, {name} is available for purchase at just {price}.")

    ps_no = solution.get('PartSelect Number')
    m_no = solution.get('Manufacturer Number')
    if ps_no and m_no:
        sentences.append(
            f"The PartSelect Number for {name} is {ps_no} and its Manufacturer Number is {m_no}."
        )
    elif ps_no:
        sentences.append(f"The PartSelect Number for {name} is {ps_no}.")
    elif m_no:
        sentences.append(f"The Manufacturer Number for {name} is {m_no}.")

    # Joining with a single space keeps sentence spacing uniform regardless
    # of which optional pieces are present.
    return ' '.join(sentences)


def parse_model_overview(href, text, timeout=30):
    """
    Scrape a model overview page and turn it into prompt/completion pairs.

    Args:
    href (str): Site-relative path of the model page, e.g. "/Models/1035B/".
    text (str): Display name of the model; used in prompts and log lines.
    timeout (float): Seconds to wait for the overview page response.

    Returns:
    dict: ``'title'`` (the page's <h1> text, or ``text`` when there is no
          <h1>), ``'common_symptoms'`` and ``'instructions'``; the latter two
          are lists of ``{'prompt': ..., 'completion': ...}`` dicts. Symptoms
          without any scraped solution, and repair stories without a problem
          title, are skipped.

    Raises:
    requests.HTTPError: If the server answers with an error status.
    """
    # hrefs taken from the site start with "/"; avoid a double slash.
    url = f"https://www.partselect.com/{href.lstrip('/')}"
    model_name = text

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    scraped_data = {}

    heading = soup.find('h1')
    scraped_data['title'] = heading.text.strip() if heading is not None else model_name

    main_container = soup.find('div', {'id': 'main'})
    symptoms_section = (
        main_container.find('div', {'class': 'section-title'})
        if main_container is not None else None
    )
    symptoms = []
    instructions = []

    # The presence of a section title gates all further scraping of the page.
    if symptoms_section is not None:
        for anchor in main_container.find_all('a', class_='symptoms'):
            descr = anchor.find('div', class_='symptoms__descr')
            symptom_href = anchor.get('href')
            if descr is None or not symptom_href:
                continue
            symptoms.append({
                'description': descr.text.strip().lower(),
                'solutions': scrape_symptom(symptom_href.strip()),
            })
        instructions = scrape_instructions(main_container)

    print(f"Scraped symptoms for Model {model_name}")

    # One Q&A pair per symptom, built from the first solution listed for it.
    qa_list = []
    for symptom in symptoms:
        solutions = symptom['solutions']
        if not solutions or 'Part Name' not in solutions[0]:
            continue
        description = symptom['description']
        qa_list.append({
            'prompt': f"Help me. Model {model_name} {description}",
            'completion': _solution_answer(model_name, description, solutions[0]),
        })

    scraped_data['common_symptoms'] = qa_list

    # One Q&A pair per repair story that has a problem title.
    qa_list = []
    for instruction in instructions:
        problem = instruction.get('problem')
        steps = instruction.get('instruction')
        if not problem or not steps:
            continue
        parts_used = instruction.get('parts used')
        if parts_used:
            answer = f'To fix this problem, our customers have used {parts_used}. {steps}'
        else:
            answer = steps
        qa_list.append({'prompt': problem, 'completion': answer})

    scraped_data['instructions'] = qa_list

    print(f'Returned scraped data for model {model_name}')
    return scraped_data



In [None]:
# Smoke-test the scraper on one model (entry 40 of the combined list).
# Unpacking .values() relies on each entry being built as {'href', 'text'} in that order.
example_href, example_name = extracted_data[40].values()
parse_model_overview(example_href, example_name)

Scraped symptomps for Model 14307 LG Dishwasher
Returned scraped data for model 14307 LG Dishwasher


{'title': '14307  ((ABDESEU))  LG Dishwasher - Overview',
 'common_symptoms': [{'prompt': 'Help me. Model 14307 LG Dishwasher not cleaning dishes properly',
   'completion': 'In order to fix not cleaning dishes properly for Model 14307 LG Dishwasher, you can use Diverter Motor - 120V 60Hz. In fact, Diverter Motor - 120V 60Hz has fixed not cleaning dishes properly in 90% of our cases. Luckily, Diverter Motor - 120V 60Hz is available for purchase at just $40.42. The PartSelect Number for Diverter Motor - 120V 60Hz is 4681ED3001D'},
  {'prompt': 'Help me. Model 14307 LG Dishwasher leaking',
   'completion': 'In order to fix leaking for Model 14307 LG Dishwasher, you can use HOSE ASSY, DRAIN. In fact, HOSE ASSY, DRAIN has fixed leaking in 40% of our cases. Luckily, HOSE ASSY, DRAIN is available for purchase at just $68.23. The PartSelect Number for HOSE ASSY, DRAIN is AEM74333104'},
  {'prompt': 'Help me. Model 14307 LG Dishwasher will not fill with water',
   'completion': 'In order to fi

In [176]:
# Scrape every collected model. A failure on one model is reported and
# skipped so the rest of the run continues.
all_data = []
n = 0  # models attempted
k = 0  # models scraped without an exception
for line in extracted_data:
    href, name = line.values()
    n = n + 1
    try:
        scraped_data = parse_model_overview(href, name)
        if len(scraped_data) < 1:
            continue
        all_data.append(scraped_data)
        k = k + 1
    except Exception as e:
        print(f"Unexpected {e}, {type(e)}")


print('--'*100)
print(f'Success rate of {k} out of {n}!')
all_data

Scraped symptomps for Model 004621710A Frigidaire Dishwasher
Returned scraped data for model 004621710A Frigidaire Dishwasher
Scraped symptomps for Model 004621711A Frigidaire Dishwasher
Returned scraped data for model 004621711A Frigidaire Dishwasher
Scraped symptomps for Model 100 Thermador Dishwasher
Returned scraped data for model 100 Thermador Dishwasher
Scraped symptomps for Model 1005 Frigidaire Dishwasher
Returned scraped data for model 1005 Frigidaire Dishwasher
Scraped symptomps for Model 1006 Frigidaire Dishwasher
Returned scraped data for model 1006 Frigidaire Dishwasher
Scraped symptomps for Model 1026 Frigidaire Dishwasher
Returned scraped data for model 1026 Frigidaire Dishwasher
Scraped symptomps for Model 1031-005A Frigidaire Dishwasher
Returned scraped data for model 1031-005A Frigidaire Dishwasher
Scraped symptomps for Model 1032-005A Frigidaire Dishwasher
Returned scraped data for model 1032-005A Frigidaire Dishwasher
Scraped symptomps for Model 1035B Frigidaire Dis

[{'title': '004621710A  Frigidaire Dishwasher - Overview',
  'common_symptoms': [{'prompt': 'Help me. Model 004621710A Frigidaire Dishwasher leaking',
    'completion': 'In order to fix leaking for Model 004621710A Frigidaire Dishwasher, you can use Dishwasher Tub Gasket - Gray. In fact, Dishwasher Tub Gasket - Gray has fixed leaking in 67% of our cases. Luckily, Dishwasher Tub Gasket - Gray is available for purchase at just $32.38. The PartSelect Number for Dishwasher Tub Gasket - Gray is 154827601'},
   {'prompt': 'Help me. Model 004621710A Frigidaire Dishwasher not cleaning dishes properly',
    'completion': 'In order to fix not cleaning dishes properly for Model 004621710A Frigidaire Dishwasher, you can use Upper Spray Arm. In fact, Upper Spray Arm has fixed not cleaning dishes properly in 49% of our cases. Luckily, Upper Spray Arm is available for purchase at just $36.14. The PartSelect Number for Upper Spray Arm is 5304506516'},
   {'prompt': 'Help me. Model 004621710A Frigidair

In [None]:
import json

# Flatten the per-model Q&A lists into one list of prompt/completion records.
combined_symptoms = [
    qa_pair
    for entry in all_data
    for qa_pair in entry['common_symptoms']
]

# Serialise as JSON Lines: one compact JSON object per line, no enclosing
# array and no pretty-printing, which is what the ".jsonl" extension promises.
symptoms_json = ''.join(json.dumps(qa_pair) + '\n' for qa_pair in combined_symptoms)

# NOTE: the file name says "fridge", but all_data is built from both the
# dishwasher and the refrigerator model lists.
with open('fridge_common_symptoms.jsonl', 'w', encoding='utf-8') as file:
    file.write(symptoms_json)