In [1]:
"""
amazon.py scrapes product information as well as product reviews relevant to the "keyword" that the user specifies.

The main function of interest is "amazon_scrape_to_df", which will allow for the data scraped to be returned as a pandas DataFrame.

Multiprocessing is NOT implemented for amzon.py.
"""

# imports
from lxml import html  
import requests
from fake_useragent import UserAgent

from random import randint
import pandas as pd
import math
from time import sleep


In [2]:
def amazon_product_id(keyword, search_page, ua):
    
    """
    Function:
    ---------
        (1) amazon_product_id pulls all the product id for products which have reviews

    Args:
    -----
        (1) keyword (str): search term defined by the user

    Returns:
    --------
        (1) product_id_list (list): list of product id
    """
    
    # get all the search pages url
    url = 'https://www.amazon.com/s/ref=sr_pg_1/133-8018566-5677454?fst=as%3Aon&rh=n%3A1055398%2Cn%3A284507%2Cn%3A915194%2Cn%3A7740213011%2Ck%3A' + keyword.replace(" ", "+") + '&page=1'
    search_url = []
    for p in search_page:
        url_temp = url.replace('page=1', 'page='+ str(p)).replace('pg_1', 'pg_'+ str(p))
        search_url.append(url_temp)
    
    # get all the product id
    product_id_list = []
    for u in range(len(search_url)):
        headers = {'User-Agent': ua}
        page = requests.get(search_url[u], headers = headers)
        parser = html.fromstring(page.content)
        
        product_id_raw = parser.xpath('.//li[@class="s-result-item  celwidget  "]')
        product_id_raw2 = parser.xpath('.//li[@class="s-result-item  celwidget  AdHolder"]')

        for j in range(len(product_id_raw)):
            product_id_list.append(product_id_raw[j].get('data-asin'))
        for j in range(len(product_id_raw2)):
            product_id_list.append(product_id_raw2[j].get('data-asin'))
        print('Found {} products in search page {} of {}'.format(len(product_id_raw)+len(product_id_raw2), u+1, len(search_url)))
    
    print('Found {} products with in total'.format(len(product_id_list)))
    print(product_id_list) ##############
    
    return product_id_list

In [3]:
def amazon_review_scraper(product_id, ua):
    """
    Function:
    ---------
        (1) amazon_review_scraper pulls the relevant information for a specific product   

    Args:
    -----
        (1) product_id (str): unique identifier for the product 

    Returns:
    --------
        (1) reviews_df (pandas DataFrame): pandas DataFrame with the following columns:
            (a) Source
            (b) Brand
            (c) Product ID
            (d) Name
            (e) Date
            (f) Rating
            (g) Usefulness
            (h) Review Title
            (i) Review
    """
    
    url = 'https://www.amazon.com/product-reviews/' + product_id + '?pageNumber=1&sortBy=recent'
    headers = {'User-Agent': ua}
    page = requests.get(url, headers = headers)
    parser = html.fromstring(page.content)

    if parser.xpath('.//span[@data-hook="total-review-count"]//text()')[0] == '0':
        return pd.DataFrame()
    
    # get product name
    product_name = parser.xpath('.//h1[@class="a-size-large a-text-ellipsis"]//a//text()')[0]
                      
    # get brand
    brand = parser.xpath('.//div[@class="a-row product-by-line"]//a//text()')[0]
    
    # get all review pages url
    max_page_raw = parser.xpath('.//span[@data-hook="total-review-count"]//text()')[0]
    max_page = math.ceil(int(max_page_raw.replace(',', ''))/8)

    print('{} review pages'.format(max_page))
    p = 2
    review_url = []
    while p <= max_page:
        url_raw = url.replace('pageNumber=1', 'pageNumber='+ str(p))
        review_url.append(url_raw)
        p += 1    
    
    date = []
    rating = []
    usefulness = []
    review_title = []
    review_details = []
    
    for u in range(len(review_url)+1):    
        
        if u>0:
            headers = {'User-Agent': ua}
            page = requests.get(review_url[u-1], headers = headers)
            parser = html.fromstring(page.content)
            sleep(1)
        
        # get dates
        date.extend(parser.xpath('.//span[@data-hook="review-date"]//text()'))

        # get star ratings
        rating_raw = parser.xpath('.//i[@data-hook="review-star-rating"]//text()')
        for j in range(len(rating_raw)):
            rating.append(rating_raw[j][0]) 

        # get usefulness
        usefulness_raw = parser.xpath('.//span[@data-hook="review-voting-widget"]')
        for k in range(len(usefulness_raw)):
            if usefulness_raw[k].xpath('.//span[@data-hook="helpful-vote-statement"]') == []:
                usefulness.append('0')
            else:
                usefulness.append(usefulness_raw[k].xpath('.//div//span//text()')[0].split()[0].replace('One', '1')) 

        # get review titles
        review_title.extend(parser.xpath('.//a[@data-hook="review-title"]//text()'))

        # get review details
        review_details_raw = parser.xpath('.//span[@data-hook="review-body"]')
        for j in range(len(review_details_raw)):
            review_details.append(' '.join(review_details_raw[j].xpath('.//text()')).replace("\'", '\"').replace("\"","\'"))
        
        print(u+1, end = " ")
    print('\n {} reviews in total'.format(len(date)))

    # Consolidate information into a pandas DataFrame
    reviews_df = pd.DataFrame()
    for m in range(len(date)):
        review_dict = { 'Source': "Amazon",
                       'Brand': brand,
                       'Product ID': product_id,
                       'Name': product_name,
                       'Date': date[m],
                       'Rating': rating[m],
                       'Usefulness': usefulness[m],
                       'Review Title': review_title[m],
                       'Review': review_details[m]}
        reviews_df = reviews_df.append(review_dict, ignore_index=True)
        
    return reviews_df

In [4]:
def amazon_scrape_to_df(keyword, search_page, product_id_list = []):
    """
    Function:
    ---------
        (1) amazon_scrape_to_df iterates through all the product id to pulls the relevant information

    Args:
    -----
        (1) keyword (str): search term defined by the user

    Returns:
    --------
        output_df (pandas DataFrame): pandas DataFrame with the following columns:
            (a) Source
            (b) Brand
            (c) Product ID
            (d) Name
            (e) Date
            (f) Rating
            (g) Usefulness
            (h) Review Title
            (i) Review
    """
    
    # gather all the product id
    # UserAgent(verify_ssl=False).random
    ua_list = ['Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0', 
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0', 
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0', 
               'Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4', 
               'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
               'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
               'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
               'Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0']

    if product_id_list == []:
        product_id_list = amazon_product_id(keyword, search_page, ua_list[randint(1, len(ua_list))-1])
    
    output_df = pd.DataFrame()
    for p in range(len(product_id_list)):
        print("Scraping reviews for {}, {} of {} products.".format(product_id_list[p], p+1, str(len(product_id_list))))
        reviews_df = amazon_review_scraper(product_id_list[p], ua_list[randint(1, len(ua_list))-1])
        output_df = output_df.append(reviews_df, ignore_index = True)
        output_df.to_csv('amazon {}.csv'.format(product_id_list[p]))
    
    return output_df

In [5]:
amazon_scrape_to_df('coffee machine', [1,2,3,4,5])

Found 35 products in search page 1 of 5
Found 36 products in search page 2 of 5
Found 36 products in search page 3 of 5
Found 36 products in search page 4 of 5
Found 36 products in search page 5 of 5
Found 179 products with in total
['B018UQ5AMS', 'B00MVWGQX0', 'B00EI7DPPI', 'B07C1XC3GF', 'B005MLB2S6', 'B002YI2IG0', 'B01KA7VWWC', 'B07684BPLB', 'B06Y2WCHDV', 'B01KIG4YNO', 'B002LAREDS', 'B0055P70MQ', 'B018UQ5VEK', 'B00005IBX9', 'B00LU2I3V0', 'B016UO0EL4', 'B07BFB27FM', 'B003KYSLMC', 'B01FJPSLZQ', 'B000T9SCZ2', 'B005IR4W7W', 'B00YEYKK8U', 'B01FUGGBWE', 'B06Y3Q79W1', 'B002S4DI2S', 'B014I5OWCO', 'B07DDMSS7Z', 'B07FP43C9H', 'B079M4J2HV', 'B01KA7VW40', 'B078X9X8WP', 'B0746DV91S', 'B073QWDRZX', 'B079NKGZL9', 'B07CPMQ9L5', 'B01N7VIDWY', 'B008J8MJ9K', 'B074WH5S44', 'B07DR89BR6', 'B000N22JPE', 'B07L6PWR83', 'B076DP5TQD', 'B000T9XPHC', 'B010SN80UK', 'B07KW5CYQ7', 'B008YS1WXE', 'B01KA7VW8G', 'B07FDFP6MY', 'B07KPTQBF1', 'B014W1C2VM', 'B001NXC5YC', 'B0160R1MO4', 'B01EA5ZHIA', 'B072NDTS6F', 'B00DWKEHU

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 

334 review pages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273

IndexError: list index out of range

In [None]:
amazon_scrape_to_df('coffee machine', [1,2,3,4,5], ['B008J8MJ9K', 'B074WH5S44', 'B07DR89BR6', 'B000N22JPE', 'B07L6PWR83', 'B076DP5TQD', 'B000T9XPHC', 'B010SN80UK', 'B07KW5CYQ7', 'B008YS1WXE', 'B01KA7VW8G', 'B07FDFP6MY', 'B07KPTQBF1', 'B014W1C2VM', 'B001NXC5YC', 'B0160R1MO4', 'B01EA5ZHIA', 'B072NDTS6F', 'B00DWKEHU4', 'B00EI7DPS0', 'B07FCRVQB6', 'B0000A1ZMS', 'B01KA7VW40', 'B07DV44R5H', 'B01EA5ZHIA', 'B06Y3Q79W1', 'B0777K7422', 'B076X4DTT5', 'B0793P737W', 'B07CPMQ9L5', 'B07H9TT7J2', 'B016UO0I9W', 'B079M4J2HV', 'B0719SW2MK', 'B07DDMSS7Z', 'B001G8Y2WW', 'B07MB9H6R6', 'B000FFRYYK', 'B001K66LPQ', 'B07CPJF2BG', 'B003W8JSH8', 'B018UQ56IG', 'B06Y29Y264', 'B004JMZGLI', 'B075G7B98P', 'B01K4NLLQY', 'B074B58C9Z', 'B075BDVPSH', 'B018UQ5RC6', 'B004SOZVQ2', 'B0746DV91S', 'B07K1R37G4', 'B013ZWNASS', 'B073ZCCQMG', 'B078RQVQF1', 'B07K1145HV', 'B01MAZKUBO', 'B0037ZG3DS', 'B007G948YW', 'B06Y3YM8K8', 'B01EA5ZKX2', 'B075G7B98P', 'B01N4QXRBL', 'B001L2S01C', 'B07KPTQBF1', 'B01M68TFFD', 'B077S526PB', 'B072X1M76L', 'B01N5S8PE7', 'B079VTTN6T', 'B079WD2QZN', 'B07CC4JY1H', 'B074XPSDT5', 'B06XDFHVL4', 'B074JPZWYR', 'B076R32128', 'B009PLQ5H2', 'B000VTP45Q', 'B07CNTK8RC', 'B076PFMRGX', 'B07FX73Y7H', 'B000FFRZ26', 'B0041847SI', 'B0000YTYGM', 'B079NKGZL9', 'B007DKTSWQ', 'B0081PTLBU', 'B01MCYAFOD', 'B002FIL2DE', 'B0041847TW', 'B002YI49H6', 'B07GBR72S5', 'B00LU2I46E', 'B06WLJBVDV', 'B00N3L224K', 'B00QKKYI84', 'B00G4LCAJI', 'B07CPJF2BG', 'B07FK4FQNJ', 'B06Y2WCHDV', 'B01MR8XXLP', 'B01MSACZGH', 'B00QKKYEX8', 'B00G4KR3QE', 'B07DDRGVW7', 'B01JP0LAN6', 'B006ZCE38M', 'B0097D2YE6', 'B07KGJ1479', 'B000FFILRO', 'B0037XIP22', 'B07HDRLFFZ', 'B0175VTVB0', 'B07L45B41Y', 'B06XP4DKGL', 'B00KQWLGOG', 'B00VGGVQCI', 'B07F63PF42', 'B018UQ5CSA', 'B002Q8HNDG', 'B011EBK3KA', 'B01N4QXRBL', 'B079M4J2HV', 'B0008JIW8U', 'B01D4PRG3Q', 'B07F9MXR1Y', 'B07F5P4WB8', 'B07CCMTDVR', 'B07JXY9FNX', 'B000FFRZ2Q', 'B07MLRGT5T', 'B00QKKYLGS', 'B07FKHYD6Z', 'B01ETTUKN4', 'B01NAUPLDT', 'B001K66LPQ', 'B013DPYPAO', 'B07GRL9GFH', 'B00EI7DPPI', 'B01M7UII5H', 'B07684BPLB', 'B01MDR4IAS', 'B01MZCQ5O6'])

Scraping reviews for B008J8MJ9K, 1 of 143 products.
69 review pages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 