In [4]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [7]:
import requests
from bs4 import BeautifulSoup
import csv
import time

def scrape_amazon_Electronics_reviews():
    url = "https://www.amazon.com/s?k=Electronics"  
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        product_links = []
        
        for product in soup.find_all("div", class_="sg-col-inner"):
            link = product.find("a", class_="a-link-normal")
            if link:
                product_links.append("https://www.amazon.com" + link['href'])
        
        with open("Electronics_reviews.csv", "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Rating", "Comment"])
            
            for link in product_links:
                response = requests.get(link, headers=headers)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, "html.parser")
                    product_title = soup.find("span", class_="a-size-large product-title-word-break").text.strip()
                    
                    review_section = soup.find("div", {"data-hook": "reviews-collapsed"})
                    if review_section:
                        reviews = review_section.find_all("span", {"data-hook": "review-body"})
                        for review in reviews:
                            rating = review.find_previous("i").text.strip()  
                            comment = review.text.strip()  # Get the comment text
                            writer.writerow([rating, comment])
                            
                            # Introduce a delay to avoid overwhelming the server
                            time.sleep(2)  
                else:
                    print(f"Failed to scrape {link}. Status code: {response.status_code}")
                    # Retry the request after waiting for a while
                    time.sleep(10)  
    else:
        print(f"Failed to access the URL. Status code: {response.status_code}")


if __name__ == "__main__":
    scrape_amazon_Electronics()

Failed to access the URL. Status code: 503


### Reading Json File

In [1]:
import json
import os


file_path = os.path.expanduser("~/Desktop/Final Project/Electronics.json")

def stream_json_data(file_path):
    with open(file_path, "r") as json_file:
        for line_number, line in enumerate(json_file, start=1):
            try:
                data = json.loads(line)
                yield data
            except json.JSONDecodeError:
                print(f"Error parsing line {line_number}: {line}")

for index, item in enumerate(stream_json_data(file_path), start=1):
    print(f"Row {index}: {item}")
    if index == 20:
        break

Row 1: {'overall': 5.0, 'verified': True, 'reviewTime': '07 17, 2002', 'reviewerID': 'A1N070NS9CJQ2I', 'asin': '0060009810', 'style': {'Format:': ' Hardcover'}, 'reviewerName': 'Teri Adams', 'reviewText': "This was the first time I read Garcia-Aguilera.  I came upon the name of this book on Live with Regis and Kelly. This book was exactly what I was looking for ... it hit the spot.  I really enjoyed this book because it was well written. Once I started this book it kept me coming back for more. It had culture, family, friendship and romance. I was looking for a little more romance when I picked this book but in the end it turned out to be just right.  I love the main chartachter Margarita (aka Daisy). I've never been to Miami but the way Daisy told the story I certainly felt I'd been there.\nAlso after going through all of Daisy's perils ... I closed the book with a feeling I had grown emotionally as well.", 'summary': 'Hit The Spot!', 'unixReviewTime': 1026864000}
Row 2: {'overall': 5

### Converting it into Excel File

In [5]:
import os
import json
import pandas as pd
from itertools import islice

file_path = os.path.expanduser("~/Desktop/Final Project/Electronics.json")

chunk_size = 1000

json_directory = os.path.dirname(file_path)

output_file_path = os.path.join(json_directory, "Electronics.xlsx")

excel_writer = pd.ExcelWriter(output_file_path, engine="xlsxwriter")

row_counter = 0

with open(file_path, "r") as json_file:
    while True:
        data_chunk = list(islice(json_file, chunk_size))
        if not data_chunk:
            break

        df_chunk = pd.DataFrame([json.loads(line) for line in data_chunk])

        df_chunk.to_excel(excel_writer, sheet_name="Sheet1", index=False, startrow=row_counter)

        row_counter += len(df_chunk)

excel_writer.save()

print("Data saved to Excel file:", output_file_path)

Data saved to Excel file: /Users/0xjoex/Desktop/Final Project/Electronics.xlsx


In [7]:
import pandas as pd
import os

file_path = os.path.expanduser("~/Desktop/Final Project/Electronics.xlsx")

df = pd.read_excel(file_path)

print(df.head())

  overall verified   reviewTime      reviewerID        asin  \
0       5     True  07 17, 2002  A1N070NS9CJQ2I  0060009810   
1       5    False   07 6, 2002  A3P0KRKOBQK1KN  0060009810   
2       5    False   07 3, 2002  A192HO2ICJ75VU  0060009810   
3       4    False  06 30, 2002  A2T278FKFL3BLT  0060009810   
4       5    False  06 28, 2002  A2ZUXVTW8RXBXW  0060009810   

                       style reviewerName  \
0  {'Format:': ' Hardcover'}   Teri Adams   
1  {'Format:': ' Hardcover'}     Willa C.   
2  {'Format:': ' Hardcover'}          Kit   
3  {'Format:': ' Hardcover'}       Andres   
4  {'Format:': ' Hardcover'}         John   

                                          reviewText  \
0  This was the first time I read Garcia-Aguilera...   
1  As with all of Ms. Garcia-Aguilera's books, I ...   
2  I've not read any of Ms Aguilera's works befor...   
3  This romance novel is right up there with the ...   
4  Carolina Garcia Aguilera has done it again.  S...   

             

In [8]:
print("Shape of the dataset:", df.shape)

Shape of the dataset: (1048575, 12)


In [9]:
print("\nBasic Information:")
print(df.info())


Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   overall         1035845 non-null  object
 1   verified        887548 non-null   object
 2   reviewTime      1042266 non-null  object
 3   reviewerID      1048575 non-null  object
 4   asin            1047571 non-null  object
 5   style           902959 non-null   object
 6   reviewerName    1013386 non-null  object
 7   reviewText      1046804 non-null  object
 8   summary         1048484 non-null  object
 9   unixReviewTime  799614 non-null   object
 10  vote            275785 non-null   object
 11  image           72538 non-null    object
dtypes: object(12)
memory usage: 96.0+ MB
None


In [10]:
print("\nSummary Statistics:")
print(df.describe())


Summary Statistics:
        overall verified reviewTime reviewerID        asin  \
count   1035845   887548    1042266    1048575     1047571   
unique    39424      977      39728     638976      267167   
top           5     True       True       True  B000BQ7GW8   
freq     598079   596838     161443      42032       13755   

                                           style     reviewerName  reviewText  \
count                                     902959          1013386     1046804   
unique                                    228626           669843      836949   
top     {'Package Type:': ' Standard Packaging'}  Amazon Customer  Five Stars   
freq                                       33969            26733       38232   

           summary unixReviewTime    vote  \
count      1048484         799614  275785   
unique      447538          72620   14412   
top     Five Stars              2       2   
freq         79448          16755   25023   

                                    