In [1]:
"""
bestbuy.py scrapes product information as well as product reviews relevant to the "keyword" that the user specifies.

The main function of interest is "bestbuy_scrape_to_df", which will allow for the data scraped to be returned as a pandas DataFrame.

Multiprocessing is NOT implemented for walmart.py.
"""

# imports
from bs4 import BeautifulSoup as soup
from lxml import html  
import requests

import pandas as pd
import math
from time import sleep

In [2]:
def bestbuy_product_id(keyword, search_page, ua):
    
    """
    Function:
    ---------''
        (1) bestbuy_product_id pulls all the product id for products which have reviews

    Args:
    -----
        (1) keyword (str): search term defined by the user

    Returns:
    --------
        (1) product_id_list (list): list of product id
        (2) product_id_list_name (list): list of product id with name
    """
    
    url = 'https://www.bestbuy.com/site/searchpage.jsp?cp=1&intl=nosplash&st='  + keyword.replace(" ", "%20")

    # find the maximum page of search page
    if search_page == None:
        headers = {'User-Agent': ua}
        page = requests.get(url, headers = headers)
        page_soup = soup(page.content, 'html.parser')

        max_page = int(page_soup.findAll("li",{"class":"page-item"})[-1].a.text)
        search_page = list(range(1,max_page+1))
        print('Found {} search pages'.format(max_page))
    
    # get all the search pages url
    p = 1
    search_url = []
    while p <= max_page:
        url = 'https://www.bestbuy.com/site/searchpage.jsp?cp=1&intl=nosplash&st='  + keyword.replace(" ", "%20")
        search_url.append(url)
        p += 1
        
    search_url = []
    for p in search_page:
        url_temp = url.replace('?cp=1', '?cp='+ str(p))
        search_url.append(url_temp)
    
    # get all the product id
    product_id_list = []
    product_id_list_name =[]
    product_id_list_review_count = []
    
    for u in range(len(search_url)):
        headers = {'User-Agent': ua}
        page = requests.get(search_url[u], headers = headers)
        page_soup = soup(page.content, 'html.parser')
        
        product_id_raw = page_soup.findAll("div",{"class":"list-item lv "})
        product_id_review_count_raw = page_soup.findAll("div",{"class":"information"})

        length = 0
        for j in range(len(product_id_raw)):
            product_id_temp = product_id_raw[j].get('data-sku-id')
            product_id_review_count_temp = product_id_review_count_raw[j].find("p",{"class":"sr-only"}).text
            product_id_name_temp = product_id_review_count_raw[j].a.get('href').split('.p?skuId')[0][6:]

            flag = True
            # check if same product (bestbuy has same product different color same reviews)
            for i in range(len(product_id_list_review_count)):
                if product_id_review_count_temp == product_id_list_review_count[i]:
                    if " ".join(product_id_name_temp.split('-')[:6]) == " ".join(product_id_list_name[i].split('-')[:6]):
                        flag = False

            if flag == False:
                continue

            # remove product without reviews
            if product_id_review_count_temp != "Not Yet Reviewed": 
                product_id_list.append(product_id_temp)
                product_id_list_name.append(product_id_name_temp)
                product_id_list_review_count.append(product_id_review_count_temp)
                length += 1
        print('Found {} products in search page {} of {}'.format(length, u+1, len(search_url)))
    
    print('Found {} products in total'.format(len(product_id_list)))
    print(product_id_list) ##############
    print(product_id_list_name) ##############

    return product_id_list, product_id_list_name

In [3]:
def bestbuy_review_scraper(product_id, product_id_name, ua):
    """
    Function:
    ---------
        (1) bestbuy_review_scraper pulls the relevant information for a specific product   

    Args:
    -----
        (1) product_id (str): unique identifier for the product 

    Returns:
    --------
        (1) reviews_df (pandas DataFrame): pandas DataFrame with the following columns:
            (a) Source
            (b) Brand
            (c) Product ID
            (d) Name
            (e) Date
            (f) Rating
            (g) Usefulness
            (h) Review Title
            (i) Review
    """
    
    url = 'https://www.bestbuy.com/site/reviews/' + product_id_name + '?sort=MOST_RECENT&page=1'
    headers = {'User-Agent': ua}
    page = requests.get(url, headers = headers)
    page_soup = soup(page.content, 'html.parser')
    
    # get product name
    product_name = page_soup.find('h2', {'class':"product-title"}).a.text
                      
    # get brand
    brand = product_name.split(' - ')[0]
    
    # get all review pages url
    max_page_raw = page_soup.find('span', {'class':"message-text"}).text.split()[-2].replace(',', '')
    max_page = math.ceil(int(max_page_raw)/20)
    print('{} review pages'.format(max_page))
    p = 2
    review_url = []
    while p <= max_page:
        url_temp = url.replace('page=1', 'page='+ str(p))
        review_url.append(url_temp)
        p += 1    
    
    date = []
    rating = []
    usefulness = []
    review_title = []
    review_details = []
    
    for u in range(len(review_url)+1):    
        
        if u>0:
            headers = {'User-Agent': ua}
            page = requests.get(review_url[u-1], headers = headers)
            page_soup = soup(page.content, 'html.parser')    
            sleep(1)
                    
        # get dates
        date_raw = page_soup.findAll("div", {"class":"col-xs-12 col-md-9"})
        for l in range(len(date_raw)):
            date.append(" ".join(date_raw[l].find("div", {"class":"disclaimer"}).time.get('title').split()[:3]))

        # get star ratings
        rating_raw = page_soup.findAll("span",{"class":"c-reviews"})
        for j in range(len(rating_raw)):
            rating.append(rating_raw[j].span.text) 

        # get usefulness
        usefulness_raw = page_soup.findAll('div', {'class':"feedback-display"})
        for k in range(len(usefulness_raw)):
            usefulness.append(usefulness_raw[k].button.get('aria-label').split()[5]) 

        # get review titles
        review_title_raw = page_soup.findAll("h3",{"class":"ugc-review-title c-section-title heading-5 v-fw-medium "})
        for i in range(len(review_title_raw)):
            review_title.append(review_title_raw[i].text)

        # get review details
        review_details_raw = page_soup.findAll("div",{"class":"ugc-review-body body-copy-lg"})
        for k in range(len(review_details_raw)):
            review_details.append(review_details_raw[k].p.text.replace('\r\n', ' '))
        
        print(u+1, end=" ")
    print('\n {} reviews in total'.format(len(date)))

    # Consolidate information into a pandas DataFrame
    reviews_df = pd.DataFrame()
    for m in range(len(date)):
        review_dict = { 'Source': "Bestbuy",
                       'Brand': brand,
                       'Product ID': product_id,
                       'Name': product_name,
                       'Date': date[m],
                       'Rating': rating[m],
                       'Usefulness': usefulness[m],
                       'Review Title': review_title[m],
                       'Review': review_details[m]}
        reviews_df = reviews_df.append(review_dict, ignore_index=True)
        
    return reviews_df

In [7]:
def bestbuy_scrape_to_df(keyword, search_page, ua, product_id_list = [], product_id_list_name = []):
    """
    Function:
    ---------
        (1) bestbuy_scrape_to_df iterates through all the product id to pulls the relevant information

    Args:
    -----
        (1) keyword (str): search term defined by the user

    Returns:
    --------
        output_df (pandas DataFrame): pandas DataFrame with the following columns:
            (a) Source
            (b) Brand
            (c) Product ID
            (d) Name
            (e) Date
            (f) Rating
            (g) Usefulness
            (h) Review Title
            (i) Review
    """
    
    # gather all the product id
    if product_id_list == []:
        product_id_list, product_id_list_name = bestbuy_product_id(keyword, search_page, ua)
    
    output_df = pd.DataFrame()
    for p in range(len(product_id_list)):
        print("Scraping reviews for {}, {} of {} products.".format(product_id_list[p], p+1, str(len(product_id_list))))
        reviews_df = bestbuy_review_scraper(product_id_list[p], product_id_list_name[p], ua)
        output_df = output_df.append(reviews_df, ignore_index = True)
        output_df.to_csv('bestbuy {}.csv'.format(product_id_list[p]))
    
    return output_df

In [8]:
bestbuy_scrape_to_df('coffee machine', None, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36')

Found 14 search pages
Found 21 products in search page 1 of 14
Found 22 products in search page 2 of 14
Found 18 products in search page 3 of 14
Found 11 products in search page 4 of 14
Found 21 products in search page 5 of 14
Found 14 products in search page 6 of 14
Found 19 products in search page 7 of 14
Found 11 products in search page 8 of 14
Found 15 products in search page 9 of 14
Found 13 products in search page 10 of 14
Found 9 products in search page 11 of 14
Found 19 products in search page 12 of 14
Found 15 products in search page 13 of 14
Found 1 products in search page 14 of 14
Found 209 products in total
['4835800', '6296683', '6272730', '7678017', '2613145', '5715715', '5316921', '6291169', '6265164', '5836500', '5781003', '5578138', '5726800', '4372106', '5890804', '5723355', '5715716', '5721504', '7421017', '6203569', '5838228', '6265163', '5204102', '6260442', '6258429', '6287389', '1208131', '5855432', '6203022', '6168212', '5712252', '6269237', '5396812', '5857408'

139 review pages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 
 2762 reviews in total
Scraping reviews for 6296683, 2 of 209 products.
1 review pages
1 
 15 reviews in total
Scraping reviews for 6272730, 3 of 209 products.
3 review pages
1 2 3 
 54 reviews in total
Scraping reviews for 7678017, 4 of 209 products.
203 review pages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 9

Scraping reviews for 4871212, 65 of 209 products.
1 review pages
1 
 7 reviews in total
Scraping reviews for 4902011, 66 of 209 products.
3 review pages
1 2 3 
 41 reviews in total
Scraping reviews for 7180029, 67 of 209 products.
9 review pages
1 2 3 4 5 6 7 8 9 
 176 reviews in total
Scraping reviews for 8446202, 68 of 209 products.
67 review pages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 
 1323 reviews in total
Scraping reviews for 6997814, 69 of 209 products.
28 review pages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 
 558 reviews in total
Scraping reviews for 5621700, 70 of 209 products.
2 review pages
1 2 
 33 reviews in total
Scraping reviews for 6261477, 71 of 209 products.
1 review pages
1 
 15 reviews in total
Scraping reviews for 4760666, 72 of 209 products.
7 review pages
1 2 3 4 5 6 7 
 136 re

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 
 1126 reviews in total
Scraping reviews for 4434200, 148 of 209 products.
5 review pages
1 2 3 4 5 
 99 reviews in total
Scraping reviews for 5855445, 149 of 209 products.
1 review pages
1 
 8 reviews in total
Scraping reviews for 6261481, 150 of 209 products.
1 review pages
1 
 8 reviews in total
Scraping reviews for 4434000, 151 of 209 products.
3 review pages
1 2 3 
 44 reviews in total
Scraping reviews for 5855430, 152 of 209 products.
1 review pages
1 
 4 reviews in total
Scraping reviews for 5857403, 153 of 209 products.
1 review pages
1 
 9 reviews in total
Scraping reviews for 6026129, 154 of 209 products.
1 review pages
1 
 1 reviews in total
Scraping reviews for 6261480, 155 of 209 products.
1 review pages
1 
 11 reviews in total
Scraping reviews for 6026128, 156 of 209 products.
1 review pages
1 
 1 reviews in tota

Unnamed: 0,Brand,Date,Name,Product ID,Rating,Review,Review Title,Source,Usefulness
0,Keurig,"Jan 23, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,5,I love this product. It makes a good cup of co...,Great coffee maker,Bestbuy,0
1,Keurig,"Jan 22, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,5,Great value for the price to add a Keurig to t...,Great Coffee Maker,Bestbuy,0
2,Keurig,"Jan 20, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,4,Got this as a Christmas gift and wonderful ess...,Easy coffeemaker,Bestbuy,0
3,Keurig,"Jan 20, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,2,Bought for a Christmas present and January 20t...,Disappointed,Bestbuy,0
4,Keurig,"Jan 20, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,5,This is a great item for people who love coffe...,Great item,Bestbuy,0
5,Keurig,"Jan 19, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,5,Kuerig coffee makers like the K50 delivers a g...,Perfect cup everytime,Bestbuy,0
6,Keurig,"Jan 19, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,5,we have the touch screen kuerig and itm starte...,works as intended,Bestbuy,0
7,Keurig,"Jan 19, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,5,Have used this model for years with no issues ...,Great,Bestbuy,0
8,Keurig,"Jan 19, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,4,This model is big enough to make all sizes of ...,Keurig Quality,Bestbuy,0
9,Keurig,"Jan 18, 2019",Keurig - K- Classic K50 Single Serve K-Cup Pod...,4835800,4,"Great machine, easy to use and operate, lasts ...",Great Machine,Bestbuy,0
