# Code to extract TV info from Good Guys

In [1]:
# Dependencies
#import requests
from pathlib import Path
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium.webdriver import ActionChains
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from webdriver_manager.chrome import ChromeDriverManager

As at 11.08.2020, the Good Guys website lists 136 televisions spread across 3 result pages (60 per page), so we use Splinter to automate navigating through the pages.

In [2]:
# Start a visible (non-headless) Chrome session through Splinter, using the
# chromedriver binary located by webdriver_manager, then open the Good Guys
# "all TVs" listing page.
url = "https://www.thegoodguys.com.au/televisions/all-tvs"

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147


 


[WDM] - Driver [C:\Users\foong\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


## Automate Browser Navigation

In [3]:
# Create function to automate browser navigation for loop

def scroll_and_next(page_no):
    """Scroll down the current listing page and click its "Next" link.

    Acts on the module-level Splinter ``browser``.

    Args:
        page_no: Accepted so the call site can pass its page counter; the
            body does not use it.

    Returns:
        None.
    """
    # Scroll first so the pagination link is in view before it is clicked
    browser.execute_script('window.scrollTo(0, 10000);')

    next_link = browser.find_by_text("Next").first
    next_link.click()

    # Fixed pause after the click before handing control back to the caller
    pause_seconds = 5
    sleep(pause_seconds)

## Use Beautiful Soup To Get Web Results

In [4]:
# Create function to retrieve url page, create Beautiful Soup object, 
# parse and get results for loop.

def get_results(page_no):
    """Parse the page currently shown in the browser and return its product tiles.

    Args:
        page_no: Accepted so the call site can pass its page counter; the
            body does not use it and always reads the browser's current HTML.

    Returns:
        Every ``<li>`` found inside the first element that matches
        ``#product_listing_tab``.

    Raises:
        IndexError: If no element matches ``#product_listing_tab``.
    """
    # Build the soup from whatever HTML the browser holds right now
    page_soup = bs(browser.html, "html.parser")

    listing = page_soup.select("#product_listing_tab")[0]
    return listing.find_all("li")
      
# Example: show the number of results in first page
print(f"There are {len(get_results(0))} results on this page")

There are 60 results on this page


In [5]:
# Retrieve the results for the first page (60 televisions). They are used
# to illustrate the example calls of the finder functions defined below.

page_no = 1
# The finder functions below index into this module-level "results" list,
# so it has to be assigned before any of them is called.
results = get_results(page_no) # Store results in variable "results"
len(results)

60

## Find Product Category

In [6]:
# Create function to get product category for loop

def category_finder(x):
    """Return the level-1 product category of result ``x``.

    The category is cut out of the ``onclick`` attribute of the tile's ``<a>``
    tag: it is the text between the last "Product Category L1 :" marker and
    the "Product Category L2 :" action marker that follows it.

    Args:
        x: Index into the module-level ``results`` list.

    Returns:
        The category text, or "" when either marker is missing, so callers
        can treat the value as unavailable instead of receiving an arbitrary
        slice of the attribute.
    """
    start_marker = "Product Category L1 :"
    end_marker = "','TGGCATLPEventAction':'Product Category L2 :"

    onclick = results[x].a["onclick"]

    marker_at = onclick.rfind(start_marker)
    if marker_at == -1:
        # rfind() reports "not found" as -1; using that as an offset would
        # silently slice from the wrong position.
        return ""
    start = marker_at + len(start_marker)

    # Only an end marker located after the category text is meaningful
    end = onclick.rfind(end_marker, start)
    if end == -1:
        return ""

    return onclick[start:end]

# Example: to find the Product Category for the first result
print(category_finder(0))

Televisions


## Find Currency Code

In [7]:
# Create function to get currency code for loop

def currency_finder(x):
    """Return the three-character currency code of result ``x``.

    The code is read from the text of the ``<script>`` tag that follows the
    tile's first ``<script>`` tag, at a fixed offset after the last
    "currencyCode" key.

    Args:
        x: Index into the module-level ``results`` list.

    Returns:
        The three characters that follow the key and its separator, or ""
        when the script text has no "currencyCode" key, so callers can treat
        the value as unavailable instead of receiving an arbitrary slice.
    """
    key = "currencyCode"
    script_text = results[x].script.find_next("script").string

    key_at = script_text.rfind(key)
    if key_at == -1:
        # rfind() reports "not found" as -1; using that as an offset would
        # silently return three unrelated characters.
        return ""

    # Skip the 4 characters between the key and the code -- presumably the
    # closing quote, colon, space and opening quote; confirm against the page.
    start = key_at + len(key) + 4
    return script_text[start:start + 3]

# Example: to find the Currency Code for the first result
print(currency_finder(0))

AUD


## Find Brand

In [8]:
# Create function to get brand for loop

def brand_finder(x):
    """Return the brand of result ``x``.

    The brand is the ``value`` attribute of the hidden ``<input>`` at
    index 3 inside the product tile.

    Args:
        x: Index into the module-level ``results`` list.
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    brand_input = hidden_inputs[3]
    return brand_input["value"]

# Example: to find the Brand for the first result
print(brand_finder(0))

Sony


## Find Model Number

In [9]:
# Create function to get model number for loop

def model_finder(x):
    """Return the model number of result ``x``.

    The model number is the text of the tile's ``<div>`` with class
    ``product-tile-model``, returned exactly as found.

    Args:
        x: Index into the module-level ``results`` list.
    """
    model_tag = results[x].find("div", class_="product-tile-model")
    return model_tag.text

# Example: to find the Model Number for the first result
print(model_finder(0))

KDL32W660E


## Find Name

In [10]:
# Create function to get name for loop

def name_finder(x):
    """Return the full product name of result ``x``.

    The name is the brand (hidden ``<input>`` at index 3) followed by a
    space and the product description (hidden ``<input>`` at index 4).

    Args:
        x: Index into the module-level ``results`` list.
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    brand = hidden_inputs[3]["value"]
    description = hidden_inputs[4]["value"]
    return " ".join((brand, description))

# Example: to find the Product Name for the first result
print(name_finder(0))

Sony 32"(81cm) FHD LED LCD Smart TV


## Find Screen Size

In [11]:
# Create function to get screen size for loop

def size_finder(x):
    """Return the screen size of result ``x``.

    The size is the first whitespace-separated word of the product
    description held in the hidden ``<input>`` at index 4.

    Args:
        x: Index into the module-level ``results`` list.
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    description = hidden_inputs[4]["value"]
    # Only the leading word is needed, so one split is enough
    return description.split(maxsplit=1)[0]

# Example: to find the Screen Size for the first result
print(size_finder(0))

32"(81cm)


## Find Price

In [12]:
# Create function to get price for loop

def price_finder(x):
    """Return the price of result ``x`` as the page presents it.

    The price is the ``value`` attribute of the hidden ``<input>`` at
    index 5 inside the product tile; it is returned unchanged as text.

    Args:
        x: Index into the module-level ``results`` list.
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    price_input = hidden_inputs[5]
    return price_input["value"]

# Example: to find the Price for the first result
print(price_finder(0))

$595.00


## Find Link to Product Image

In [13]:
# Create function to get image link for loop

def image_finder(x):
    """Return the product image link of result ``x``.

    The link is the ``data-src`` attribute of the tile's ``<img>`` tag,
    returned unchanged.

    Args:
        x: Index into the module-level ``results`` list.
    """
    image_tag = results[x].img
    return image_tag["data-src"]

# Example: to find the link to Product Image for the first result
print(image_finder(0))

//thegoodguys.sirv.com/products/50048259/50048259_511653.PNG?scale.height=215&scale.width=215&canvas.height=215&canvas.width=215&canvas.opacity=0


## The Loop

In [14]:
# One list per scraped field. The scraping loop appends to all eight lists
# together, so position i in every list describes the same product.

categories = []
currencies = []
brands = []
models = []
names = []
sizes = []
prices = []
images = []

In [15]:
# Loop inside a loop
# Outer loop: Splinter navigates through the result pages
# Inner loop: the finder functions pull the values out of each returned result

TOTAL_PAGES = 3  # Only 3 web pages (136 televisions at 60 per page, as at 11.08.2020)
PAGE_LOAD_WAIT = 15  # Seconds to allow the new page's results to load

# Errors the finder functions raise when a result lacks an expected tag,
# attribute or hidden input (namely, price is not available when the product
# is out of stock). Anything else is a real problem and is left to propagate.
MISSING_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError)

# Target lists, in the same order as the values gathered for each result
field_lists = (categories, currencies, brands, models, names, sizes, prices, images)

for page_no in range(1, TOTAL_PAGES + 1):

    # Page 1 is already open from browser.visit(); every later page is
    # reached by clicking "Next" on the page before it.
    if page_no > 1:
        scroll_and_next(page_no)
        sleep(PAGE_LOAD_WAIT)

    # The finder functions read the module-level "results", so it must be
    # refreshed here before they are called for this page.
    results = get_results(page_no)

    for x in range(len(results)):

        try:
            values = (
                category_finder(x),
                currency_finder(x),
                brand_finder(x),
                model_finder(x),
                name_finder(x),
                size_finder(x),
                price_finder(x),
                image_finder(x),
            )
        except MISSING_FIELD_ERRORS:
            # Report the product so we can view items with no pricing. The
            # name lookup is guarded because it can fail for the same reason.
            try:
                label = name_finder(x)
            except MISSING_FIELD_ERRORS:
                label = f"(unnamed result {x} on page {page_no})"
            print("Price not available: ", label)
            continue

        if all(values):
            # Append only complete rows so all lists stay the same length
            for field_list, value in zip(field_lists, values):
                field_list.append(value)
        else:
            # A field came back empty: skip the row, but say so instead of
            # dropping it silently.
            label = values[4] or f"(unnamed result {x} on page {page_no})"
            print("Incomplete details, skipped: ", label)

Price not available:  Hitachi 55"(140cm) UHD HDR LED LCD Smart TV
Price not available:  Sony 85" Z9G 8K UHD ANDROID LCD LED TV
Price not available:  FFALCON 32" F1 HD LED TV
Price not available:  Sony 65" A8H 4K UHD ANDROID BRAVIA OLED TV
Price not available:  Sony 55" A8H 4K UHD ANDROID BRAVIA OLED TV
Price not available:  Hisense 24"(60cm) HD LED LCD TV


In [16]:
# Check number of results - site lists 136 products in total (as at 11.08.2020)
# NOTE(review): "models" is not part of this check even though it becomes a
# DataFrame column further down.

for scraped_values in (categories, currencies, brands, names, sizes, prices, images):
    print(len(scraped_values))

129
129
129
129
129
129
129


In [17]:
# Assemble the scraped lists into one table. "retailer" and "category" are
# constant strings, which are repeated for every row.
# NOTE(review): the scraped "categories" list is not used here.
table_columns = {
    "retailer": "Good Guys",
    "category": "Televisions",
    "currency": currencies,
    "brand": brands,
    "model": models,
    "name": names,
    "size": sizes,
    "price": prices,
    "image": images,
}
df = pd.DataFrame(table_columns)

df.head()

Unnamed: 0,retailer,category,currency,brand,model,name,size,price,image
0,Good Guys,Televisions,AUD,Sony,KDL32W660E,"Sony 32""(81cm) FHD LED LCD Smart TV","32""(81cm)",$595.00,//thegoodguys.sirv.com/products/50048259/50048...
1,Good Guys,Televisions,AUD,Hitachi,50UHDSM8,"Hitachi 50""(127cm) UHD LED LCD Smart TV","50""(127cm)",$699.00,//thegoodguys.sirv.com/products/50063086/50063...
2,Good Guys,Televisions,AUD,Hitachi,40FHDSM8,"Hitachi 40""(101cm) FHD LED LCD Smart TV","40""(101cm)",$399.00,//thegoodguys.sirv.com/products/50063088/50063...
3,Good Guys,Televisions,AUD,Philips,50PUT6103/79,"Philips 50""(126cm) UHD LED LCD Smart TV","50""(126cm)",$599.00,//thegoodguys.sirv.com/products/50063014/50063...
4,Good Guys,Televisions,AUD,TCL,40D3000F,"TCL 40""(101cm) FHD LED LCD TV","40""(101cm)",$445.00,//thegoodguys.sirv.com/products/50052593/50052...


In [18]:
# Summarise every column (count, unique, top and freq) as a quick sanity check
df.describe()

Unnamed: 0,retailer,category,currency,brand,model,name,size,price,image
count,129,129,129,129,129,129,129,129,129
unique,1,1,1,11,129,129,20,69,129
top,Good Guys,Televisions,AUD,Samsung,VG-SCFT65BW/RU,"Hisense 65"" S8 4K UHD SMART LED TV","65""",$5995.00,//thegoodguys.sirv.com/products/50070885/50070...
freq,129,129,129,40,1,1,29,5,1


In [19]:
# Count the null values in each column (all zeros means nothing is missing)
null_mask = df.isnull()
null_mask.sum()

retailer    0
category    0
currency    0
brand       0
model       0
name        0
size        0
price       0
image       0
dtype: int64

In [20]:
# Count the rows that repeat an earlier row exactly (all zeros means no duplicates)
duplicate_rows = df[df.duplicated()]
duplicate_rows.count()

retailer    0
category    0
currency    0
brand       0
model       0
name        0
size        0
price       0
image       0
dtype: int64

In [21]:
# Save the scraped table. to_csv() does not create missing folders, so make
# sure "output/" exists first; otherwise a fresh checkout fails on this cell.
output_path = Path("output/good_guys_rerun.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)