In [83]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import json

In [36]:
# Listing pages to scrape, in page order.
urls = [
    "https://www.verizon.com/smartphones/",
    "https://www.verizon.com/smartphones/page-2/",
    "https://www.verizon.com/smartphones/page-3/",
    "https://www.verizon.com/smartphones/page-4/",
]

Get links to each smartphone

In [37]:
# Collect the absolute https links found inside <div id="col"> on every
# listing page. A set is used so links that appear more than once are
# stored only once.
smartphone_links = set()
for url in urls:

    # Fetch the listing page. The timeout keeps a stalled connection from
    # hanging the notebook indefinitely.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Report the failure instead of skipping the page silently.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.find returns None when nothing matches; guard against that so a
    # changed page layout does not end in an AttributeError.
    grid = soup.find("div", id="col")
    if grid is None:
        print(f"Skipping {url}: no div with id 'col' found")
        continue

    # Keep only anchors whose href is an absolute https URL.
    smartphone_links.update(
        link['href']
        for link in grid.find_all("a", href=True)
        if link['href'].startswith("https://")
    )

Get specs from each smartphone link

In [None]:
# Visit every collected link with Selenium and build `phones`, a list with
# one dict per phone holding its name, colours, images, url and the specs
# picked out of the specs tab. After each phone the list collected so far is
# written to phones_<count>.json as a checkpoint.

# Spec titles that map one-to-one onto an output key.
spec_title_to_key = {
    "Width": "width",
    "Depth": "depth",
    "Weight": "weight",
    "Height": "height",
    "Usage Time": "battery",
    "Operating System": "operating_system",
}

phones = []
smartphone_links = sorted(smartphone_links)
driver = webdriver.Chrome()
try:
    for spec_url in smartphone_links:

        # Navigate to the webpage
        driver.get(spec_url)

        # Skip pages that show the site's error heading. A page without any
        # <h1> is not treated as an error here (same as before).
        # `except Exception` rather than a bare except so Ctrl-C still works.
        try:
            heading = driver.find_element(By.TAG_NAME, "h1").text
        except Exception:
            heading = None
        if heading == "something went wrong, please try after some time":
            continue

        driver.execute_script("window.  scrollTo(0, document.body.scrollHeight / 2);")
        # Wait for the page to load completely
        time.sleep(2)

        curr_phone = {}
        curr_phone["name"] = driver.find_element(By.TAG_NAME, "h1").find_element(By.TAG_NAME, "span").text

        # Locate the inline line that starts with window.APP_STATE. Looking
        # it up with a None default avoids silently reusing the line from
        # the previous phone when the current page has none.
        html = driver.page_source
        script = next(
            (line for line in html.split('\n') if line.startswith("window.APP_STATE")),
            None,
        )

        colors = []
        images = []
        if script is None:
            print(f"No window.APP_STATE line found on {spec_url}; colors/images left empty")
        else:
            index = 0
            # Read at most three label/images pairs, stopping as soon as a
            # marker is missing (str.find returns -1 in that case).
            # NOTE(review): the slice offsets below are unchanged; with a
            # compact `"label":"X"` layout the colour slice would start on
            # the value's opening quote. Confirm against real page data.
            for _ in range(3):
                index = script.find('"label"', index)
                if index == -1:
                    break
                colors.append(script[index+8:script.find('"', index+10)])
                index = script.find('"images"', index)
                if index == -1:
                    break
                images.append(script[index+11:script.find('"', index+11)])

        curr_phone["colors"] = colors
        curr_phone["images"] = images
        curr_phone["url"] = spec_url

        # Find the 'Specs' button and click it. The click is retried once per
        # second because it can fail while the element is not yet clickable.
        wait = WebDriverWait(driver, 10)
        specs_button = wait.until(EC.visibility_of_element_located((By.ID, 'FEATURESTAB')))
        retries = 100
        for _ in range(retries):
            try:
                specs_button.click()
                break
            except Exception:
                time.sleep(1)
        else:
            # All retries failed: keep going (best effort) but say so.
            print(f"Could not click the specs tab on {spec_url}; specs may be missing")

        # Wait for the specs to load
        time.sleep(1)

        for spec in driver.find_elements(By.CLASS_NAME, "pb-8"):
            for section in spec.find_elements(By.CLASS_NAME, "py-8"):
                subsections = section.find_elements(By.TAG_NAME, "p")
                # A title/description pair needs two <p> elements.
                if len(subsections) < 2:
                    continue
                title = subsections[0].text
                description = subsections[1].text
                if title in spec_title_to_key:
                    curr_phone[spec_title_to_key[title]] = description
                elif "camera" in title.lower():
                    curr_phone["camera"] = description
                elif "storage" in title.lower():
                    curr_phone["storage"] = description

        phones.append(curr_phone)

        # Checkpoint: write every phone collected so far to a numbered file.
        df = pd.DataFrame(phones)
        df.to_json(f"phones_{len(phones)}.json", orient="records")
finally:
    # Always close the browser, even when a page raises part-way through.
    driver.quit()

Get Phone Prices

In [18]:
# Build `phone_price`, a dict mapping the product name shown on each listing
# tile to its price string (digits and dots only).
#
# This cell deliberately does not rebind `smartphone_links`: the set of
# product links is still needed by the description-scraping cell further
# down, and resetting it here would leave that cell with nothing to visit.
urls = ["https://www.verizon.com/smartphones/","https://www.verizon.com/smartphones/page-2/", "https://www.verizon.com/smartphones/page-3/", "https://www.verizon.com/smartphones/page-4/"]
phone_price = {}
for url in urls:
    # Fetch the listing page. The timeout keeps a stalled connection from
    # hanging the notebook indefinitely.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Report the failure instead of skipping the page silently.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all product tiles
    for tile in soup.find_all("div", attrs={'data-testid': 'product-tile'}):
        name_tag = tile.find("span", id="tileProductText")
        price_tag = tile.find("p", attrs={'data-testid': 'dpp-frp'})
        # tile.find returns None when a tile lacks the element; skip such
        # tiles instead of failing on `.text`.
        if name_tag is None or price_tag is None:
            continue
        # Strip everything except digits and dots from the price text.
        phone_price[name_tag.text] = re.sub(r'[^\d.]+', "", price_tag.text)

In [32]:
# The spec-scraping cell builds `phones` as a list of dicts; column access
# needs a DataFrame, so convert it once here (no-op if already converted).
if not isinstance(phones, pd.DataFrame):
    phones = pd.DataFrame(phones)

# Look each phone's price up by name. dict.get yields None for names that
# have no entry in phone_price instead of raising KeyError.
phones["price"] = phones["name"].map(phone_price.get)

Scraping descriptions and extra camera info

In [88]:
# Visit product pages with Selenium and fill two dicts keyed by page URL:
#   phone_description[url] -> text of the first <span> in the overview block
#   phone_cameras[url]     -> the 'Camera' entry of the page's APP_STATE JSON
#                             ({} when it is missing)
phone_description = {}
phone_cameras = {}
smartphone_links = sorted(smartphone_links)
if not smartphone_links:
    print("smartphone_links is empty; re-run the link-collection cell first")

driver = webdriver.Chrome()
try:
    # NOTE(review): the slice skips the first 29 links. It looks like a
    # resume offset left over from an interrupted run; confirm before
    # relying on a full re-run.
    for spec_url in smartphone_links[29::]:

        # Navigate to the webpage
        driver.get(spec_url)

        # Skip pages that show the site's error heading. A page without any
        # <h1> is not treated as an error here (same as before).
        # `except Exception` rather than a bare except so Ctrl-C still works.
        try:
            heading = driver.find_element(By.TAG_NAME, "h1").text
        except Exception:
            heading = None
        if heading == "something went wrong, please try after some time":
            continue

        driver.execute_script("window.  scrollTo(0, document.body.scrollHeight / 2);")
        time.sleep(2)
        overview = driver.find_element(By.XPATH, "//div[@data-testid='overviewid']")
        phone_description[spec_url] = overview.find_element(By.TAG_NAME, "span").text

        # Locate the inline line that starts with window.APP_STATE. Looking
        # it up with a None default avoids silently reusing the line from
        # the previous phone when the current page has none.
        html = driver.page_source
        script = next(
            (line for line in html.split('\n') if line.startswith("window.APP_STATE")),
            None,
        )
        if script is None:
            print(f"No window.APP_STATE line found on {spec_url}")
            phone_cameras[spec_url] = {}
            continue

        # Take everything after the first '=' and drop surrounding
        # whitespace and the trailing ';'. For a line of the form
        # `window.APP_STATE = {...};` this is the same text as script[19:-1],
        # but it does not depend on exact spacing or line endings.
        payload = script.partition("=")[2].strip().rstrip(";").strip()
        res = json.loads(payload)

        # A missing key or unexpected structure means no camera data.
        try:
            camera = res['pdp']['productDetails']['productSpecification']['Camera']
        except (LookupError, TypeError):
            camera = {}
        phone_cameras[spec_url] = camera
finally:
    # Always close the browser, even when a page raises part-way through.
    driver.quit()

In [54]:
# Attach the scraped description to each row by url; rows whose url has no
# entry in phone_description get None.
phones["description"] = phones["url"].map(phone_description.get)

In [91]:
# Overwrite the camera column with the scraped camera data, matched by url;
# rows whose url has no entry in phone_cameras get None.
phones["camera"] = phones["url"].map(phone_cameras.get)

In [93]:
# Write the final table to disk as a JSON array of row records.
output_path = "phones_final.json"
phones.to_json(output_path, orient="records")