In [None]:
# IMPORTS  --RUN THIS

from bs4 import BeautifulSoup
import requests
import time
import datetime
import pandas as pd
import numpy as np
import time
import random
import ast

In [None]:
# define get_xx methods on product listing page   --RUN THIS

def get_title(soup):
    """Return the stripped text of the ``productTitle`` span.

    Falls back to an empty string when the lookup hits a missing element
    (``find`` returned ``None``).
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        # No title element on this page.
        return ""

def get_original_price(soup):
    """Return the price text nested inside the ``basisPrice`` span.

    The value is read from the ``a-offscreen`` span inside the basis-price
    span. Returns "" when either span (or its string) is missing.
    """
    basis_attrs = {'class': 'a-size-small a-color-secondary aok-align-center basisPrice'}
    try:
        basis_span = soup.find("span", attrs=basis_attrs)
        hidden_span = basis_span.find("span", attrs={'class': 'a-offscreen'})
        return hidden_span.string.strip()
    except AttributeError:
        # One link of the lookup chain returned None.
        return ""

def get_sale_price(soup):
    """Return ``(price, discount)`` parsed from the price block's off-screen text.

    The text of the ``aok-offscreen`` span inside the price section is split
    on single spaces: the first token is returned as the price and, when there
    are more than two tokens, the third token is returned as the discount.

    Returns ``("", "")`` when the price section or the span is missing.
    """
    try:
        price_section = soup.find(
            "div",
            attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'},
        )
        # Filter on the CSS class explicitly. The previous set literal
        # ({"aok-offscreen"}) only worked because bs4 treats a non-dict
        # ``attrs`` value as a class filter.
        price_span = price_section.find("span", attrs={'class': 'aok-offscreen'})
        price_desc = price_span.text.strip().split(" ")
    except AttributeError:
        return "", ""

    price = price_desc[0]
    discount = price_desc[2] if len(price_desc) > 2 else ""
    return price, discount

def get_rating(soup):
    """Return the rating text, or "" when none can be found.

    The rating is read from the ``a-size-base a-color-base`` span inside the
    ``averageCustomerReviews`` div. When that lookup hits a missing element,
    the first ``a-icon-alt`` span on the page is used instead. A value that
    contains "Previous page" is discarded and "" is returned.
    """
    try:
        rating = soup.find("div", attrs={'id':'averageCustomerReviews'}).find("span", attrs={'class': 'a-size-base a-color-base'}).string.strip()

    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except AttributeError:
            # Only a missing element is expected here. A bare ``except`` would
            # also swallow KeyboardInterrupt/SystemExit during a long run.
            rating = ""

    if "Previous page" in rating:
        rating = ""

    return rating

def get_review_count(soup):
    """Return the first space-separated token of the ``acrCustomerReviewText`` span.

    Returns "" when the span (or its string) is missing.
    """
    try:
        review_text = soup.find("span", attrs={'id': 'acrCustomerReviewText'}).string
        leading_token, _, _ = review_text.strip().partition(" ")
        return leading_token
    except AttributeError:
        return ""

def get_categories(soup):
    """Return two category names taken from the "Best Sellers Rank" entry.

    The entry is searched first among the ``a-list-item`` spans of the
    ``detailBulletsWrapper_feature_div`` block and, when it is not found
    there, among the rows of the ``prodDetails`` block (matched on the row's
    header cell).

    Returns:
        ``(cat1, cat2)`` where ``cat1`` is the text of the entry's first link
        reduced to what follows the standalone word "in" (inner whitespace is
        collapsed to single spaces in that case), and ``cat2`` is the stripped
        text of the entry's last link. ``("", "")`` when no entry is found or
        the entry contains no links.
    """

    def extract_category(text):
        # Split on the standalone word "in". A plain substring search would
        # also match inside words (e.g. "Training" -> "ing").
        words = text.split()
        if "in" in words:
            return " ".join(words[words.index("in") + 1:])
        return text.strip()

    rank_entry = None

    bullets = soup.find("div", attrs={'id': 'detailBulletsWrapper_feature_div'})
    if bullets is not None:
        for item in bullets.find_all("span", attrs={'class': 'a-list-item'}):
            if 'Best Sellers Rank' in item.text:
                rank_entry = item
                break

    if rank_entry is None:
        details = soup.find("div", attrs={'id': 'prodDetails'})
        rows = details.find_all("tr") if details is not None else []
        for row in rows:
            header = row.find("th", attrs={'class': 'a-color-secondary a-size-base prodDetSectionEntry'})
            # Rows without the expected header cell are skipped instead of
            # aborting the whole search.
            if header is not None and 'Best Sellers Rank' in header.text:
                rank_entry = row
                break

    if rank_entry is None:
        return "", ""

    links = rank_entry.find_all("a")
    if not links:
        # An entry with no links used to raise an uncaught IndexError.
        return "", ""

    return extract_category(links[0].text), links[-1].text.strip()

def get_rankings(soup):
    """Return the text that follows the "Best Sellers Rank" label, or "".

    The entry is searched first among the ``a-list-item`` spans of the
    ``detailBulletsWrapper_feature_div`` block and then among the rows of the
    ``prodDetails`` block (matched on the row's header cell). The returned
    value is the entry's text after the label, with surrounding whitespace
    and a leading colon removed.
    """
    label = 'Best Sellers Rank'

    def text_after_label(text):
        # Cut at the label itself rather than at a fixed 20-character offset,
        # which truncated the value whenever the surrounding spacing differed.
        remainder = text.partition(label)[2]
        return remainder.strip().lstrip(":").strip()

    bullets = soup.find("div", attrs={'id': 'detailBulletsWrapper_feature_div'})
    if bullets is not None:
        for item in bullets.find_all("span", attrs={'class': 'a-list-item'}):
            if label in item.text:
                return text_after_label(item.text)

    details = soup.find("div", attrs={'id': 'prodDetails'})
    if details is not None:
        for row in details.find_all("tr"):
            header = row.find("th", attrs={'class': 'a-color-secondary a-size-base prodDetSectionEntry'})
            # Rows without the expected header cell are skipped instead of
            # aborting the whole search.
            if header is not None and label in header.text:
                return text_after_label(row.text)

    return ""

def get_purchase_count_last_month(soup):
    """Return the text of the first span inside the ``tk_bought`` social-proof span.

    Returns "" when either element (or its string) is missing.
    """
    try:
        proof_span = soup.find("span", attrs={'id': 'social-proofing-faceout-title-tk_bought'})
        inner_span = proof_span.find("span")
        return inner_span.string.strip()
    except AttributeError:
        return ""

def get_description(soup):
    """Return the stripped strings of the spans inside ``feature-bullets``.

    Spans whose ``.string`` is ``None`` are skipped. Previously a single such
    span raised AttributeError and discarded the whole description.

    Returns:
        A list of strings, or "" when the ``feature-bullets`` div is missing.
    """
    try:
        bullet_spans = soup.find("div", attrs={'id': 'feature-bullets'}).find_all("span")
    except AttributeError:
        # No feature-bullets section on this page.
        return ""

    return [span.string.strip() for span in bullet_spans if span.string is not None]

def get_review_insights(soup):
    """Return the ``data-csa-c-item-id`` value of each button in the insights widget.

    The buttons are looked up inside the div whose ``data-hook`` is
    ``cr-insights-widget-aspects``. Returns "" when that div is missing.
    """
    try:
        insights = soup.find("div", attrs={'data-hook': 'cr-insights-widget-aspects'}).find_all("button")
        # Replace the buttons with their item ids in the same list object.
        insights[:] = [button.get('data-csa-c-item-id') for button in insights]
    except AttributeError:
        insights = ""

    return insights

def get_store_name(soup):
    """Return the ``bylineInfo`` link text with its leading phrase removed.

    "Brand: " is removed when present; otherwise "Visit the " is removed when
    present. Returns "" when the link (or its string) is missing.
    """
    try:
        store_name = soup.find("a", attrs={'id': 'bylineInfo'}).string.strip()
    except AttributeError:
        return ""

    # Only the first phrase that occurs is removed, in this order.
    for phrase in ("Brand: ", "Visit the "):
        if phrase in store_name:
            store_name = store_name.replace(phrase, "")
            break

    return store_name

def get_supplier(soup):
    """Return the stripped text of the ``sellerProfileTriggerId`` link, or ""."""
    try:
        return soup.find("a", attrs={'id': 'sellerProfileTriggerId'}).string.strip()
    except AttributeError:
        # Link (or its string) not present on the page.
        return ""


def get_availability(soup):
    """Return the text of the first span inside the ``availability`` div.

    Returns "Not Available" when the div, the span, or its string is missing.
    """
    try:
        availability_div = soup.find("div", attrs={'id': 'availability'})
        status_span = availability_div.find("span")
        return status_span.string.strip()
    except AttributeError:
        return "Not Available"

# def get_shipping_locations(soup):
#     try:
#         location = soup.find("div", attrs={'class':'a-popover a-popover-modal a-declarative'})

#     except AttributeError:
#         location = ""
        
#     return location

# def get_delivery_time(soup):
#     # try:
        

#     # except AttributeError:
#     return
    

In [None]:
def load_headers(file_name):
    """Load one Python literal per line from ``file_name``.

    Each non-blank line is parsed with ``ast.literal_eval``. Blank or
    whitespace-only lines are skipped, because ``literal_eval("")`` raises
    SyntaxError.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with the parsed value of every non-blank line, in file order.

    Raises:
        ValueError / SyntaxError: if a non-blank line is not a valid literal.
    """
    with open(file_name, "r") as file:
        stripped_lines = (line.strip() for line in file)
        headers_list = [ast.literal_eval(entry) for entry in stripped_lines if entry]
    return headers_list

# Define a function to save data to CSV
def save_to_csv(data, file_name):
    """Append ``data`` to the CSV at ``file_name``.

    The header row is written only when the file does not exist yet or is
    empty. Empty strings in the ``title`` column are converted to NaN before
    writing.

    Args:
        data: Mapping of column name to a list of values; must contain a
            ``title`` column.
        file_name: Destination CSV path.
    """
    import os  # local import: the notebook's import cell does not import os

    df = pd.DataFrame.from_dict(data)
    # Assign the result back. Calling replace(..., inplace=True) on the
    # selected column is deprecated and does nothing under copy-on-write.
    df['title'] = df['title'].replace('', np.nan)

    try:
        needs_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0
    except TypeError:
        # Not a filesystem path (e.g. a buffer): treat as a fresh target.
        needs_header = True

    df.to_csv(file_name, mode='a', header=needs_header, index=False)
    print("SAVED TO CSV")


# Load headers
# A random entry of this list is sent as the request headers of every request.
HEADERS_LIST = load_headers("headers.txt")

# Connect to website and pull data
start_time = time.time()

if __name__ == "__main__":

    NUM_ITEMS = 5000 # DECLARE HOW MANY ROWS YOU WANT
    MAIN_CATEGORY = "Boy's Fashion" # INPUT THE CATEGORY YOU ARE SCRAPING. THIS WILL BE THE MAIN CATEGORY.
    CSV_FILE_NAME = 'Boys Fashion' + '.csv' # INPUT YOUR CSV FILE NAME

    BASE_URL = "https://www.amazon.com"
    PRODUCT_LINK_CLASS = 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'
    NEXT_PAGE_CLASS = 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'
    REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks the run forever
    SAVE_EVERY = 100       # flush the collected rows to CSV every this many items
    MAX_PAGE_RETRIES = 3   # consecutive failed next-page fetches tolerated before stopping

    URLS = []
    BOYS_FASHION_URL = 'https://www.amazon.com/s?i=fashion-boys-intl-ship&bbn=16225021011&rh=n%3A7141123011%2Cn%3A16225021011%2Cn%3A7147443011%2Cp_n_size_six_browse-vebin%3A4940401011&dc&ds=v1%3Aea9Nn8gVro9WrRJD1zLQrqASGRn9CcdrQy45VoGcq7o&fst=as%3Aoff&pf_rd_i=16225021011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=d84623b2-8aff-40df-9701-224067aef31e&pf_rd_r=RH2FKA9CSFGZCAXT2SV4&pf_rd_s=merchandised-search-3&pf_rd_t=101&qid=1718713695&ref=sr_ex_n_1'
    URLS.append(BOYS_FASHION_URL)

    # Fetch the first listing page and collect the product links on it.
    product_listing_page = requests.get(URLS[0], headers=random.choice(HEADERS_LIST), timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(product_listing_page.content, "html.parser")
    product_links = soup.find_all("a", attrs={'class': PRODUCT_LINK_CLASS})
    product_links_list = [link.get('href') for link in product_links]

    data = {"title": [], "original_price": [], "sale_price": [], "discount": [], "rating": [], "review_count": [], "main_category": [], "sub_category_1": [], "sub_category_2": [], "rankings": [], "description": [], "insights": [], "purchase_cnt_prev_month": [], "store_name": [], "supplier": [], "is_available": [], "url": []}

    i = 0  # number of rows collected so far
    # NOTE(review): `page` is only used in the progress message below; the
    # start URL carries no page parameter, so confirm this offset is intended.
    page = 327
    page_failures = 0
    while i < NUM_ITEMS:
        for link in product_links_list:
            if i >= NUM_ITEMS:
                # Stop mid-page once the requested number of rows is reached.
                break
            if not link:
                # Anchor without an href: nothing to request.
                continue

            try:
                product_webpage = requests.get(BASE_URL + link, headers=random.choice(HEADERS_LIST), timeout=REQUEST_TIMEOUT)
            except Exception as e:
                print(f"Encountered an error: {e}")
                continue
            finally:
                # Pause after every request, including failed ones, so that
                # errors do not turn into back-to-back requests.
                time.sleep(random.randint(1, 2))

            # Build the whole row before touching `data`, so a failing getter
            # cannot leave the columns with different lengths.
            try:
                product_soup = BeautifulSoup(product_webpage.content, "html.parser")
                sale_price, discount = get_sale_price(product_soup)
                if sale_price == "":
                    continue
                cat1, cat2 = get_categories(product_soup)
                row = {
                    'title': get_title(product_soup),
                    'original_price': get_original_price(product_soup),
                    'sale_price': sale_price,
                    'discount': discount,
                    'rating': get_rating(product_soup),
                    'review_count': get_review_count(product_soup),
                    'main_category': MAIN_CATEGORY,
                    'sub_category_1': cat1,
                    'sub_category_2': cat2,
                    'rankings': get_rankings(product_soup),
                    'description': get_description(product_soup),
                    'insights': get_review_insights(product_soup),
                    'purchase_cnt_prev_month': get_purchase_count_last_month(product_soup),
                    'store_name': get_store_name(product_soup),
                    'supplier': get_supplier(product_soup),
                    'is_available': get_availability(product_soup),
                    'url': link,
                }
            except Exception as e:
                print(f"Encountered an error: {e}")
                continue

            for key, value in row.items():
                data[key].append(value)

            print(f"retrieved listing {i}")
            i += 1

            # Save to CSV every SAVE_EVERY items
            if i % SAVE_EVERY == 0:
                save_to_csv(data, CSV_FILE_NAME)
                data = {key: [] for key in data}  # Reset the data dictionary

        # navigate to next page if more items are needed
        # (the row counter decides this; len(data) is only the column count)
        if i >= NUM_ITEMS:
            break

        next_page_element = soup.find("a", attrs={'class': NEXT_PAGE_CLASS})
        if not next_page_element:
            print("No more pages to retrieve.")
            break

        try:
            next_page_suffix = next_page_element.get('href')
            next_webpage = requests.get(BASE_URL + next_page_suffix, headers=random.choice(HEADERS_LIST), timeout=REQUEST_TIMEOUT)
            next_soup = BeautifulSoup(next_webpage.content, "html.parser")
        except Exception as e:
            print(f"Encountered an error: {e}")
            # Clear the current links so the retry does not re-scrape (and
            # duplicate) the page that was just processed.
            product_links_list = []
            page_failures += 1
            if page_failures >= MAX_PAGE_RETRIES:
                print(f"Giving up after {page_failures} failed attempts to load the next page.")
                break
            time.sleep(random.randint(1, 2))
            continue

        page_failures = 0
        soup = next_soup
        product_links = soup.find_all("a", attrs={'class': PRODUCT_LINK_CLASS})
        product_links_list = [link.get('href') for link in product_links]
        page += 1
        print(f"Retrieving page {page} with {len(product_links_list)} items")

    # Flush whatever is left since the last periodic save.
    save_to_csv(data, CSV_FILE_NAME)
    print("done")

end_time = time.time()
print(f"Time elapsed: {end_time - start_time}")

In [None]:
# # convert to csv

# CSV_FILE_NAME = 'Womens Fashion' + '.csv' # INPUT YOUR CSV FILE NAME

# amazon_df = pd.DataFrame.from_dict(data)
# amazon_df['title'].replace('', np.nan, inplace=True)
# # amazon_df = amazon_df.dropna(subset=['title'])
# amazon_df.to_csv(CSV_FILE_NAME, header=True, index=False)

In [None]:
# df = pd.read_csv('Womens Fashion.csv')

# # Remove duplicate rows
# df_cleaned = df.drop_duplicates()

# # Save the cleaned DataFrame back to a CSV file
# df_cleaned.to_csv('Womens Fashion Cleaned.csv', index=False)

In [None]:
# CHECK YOUR FUNCTION WORKS -- UNCOMMENT THE ONE YOU ARE TESTING

# uncomment these two as well
# product_listing_page = requests.get("https://www.amazon.com" + product_links_list[3], headers=random.choice(HEADERS_LIST))
# product_listing_soup = BeautifulSoup(product_listing_page.content, "html.parser")



# product_title = get_title(product_listing_soup)
# product_title

# product_list_price = get_original_price(product_listing_soup)
# product_list_price

# product_listing_soup.find_all("div", attrs={"class":"a-section a-spacing-none aok-align-center aok-relative"})

# sale_price = get_sale_price(product_listing_soup)
# print(sale_price)

# rating = get_rating(product_listing_soup)
# rating

# availability = get_availability(product_listing_soup)
# availability

# store_name = get_store_name(product_listing_soup)
# store_name

# desc = get_description(product_listing_soup)
# desc

# ranking = get_rankings(product_listing_soup)
# ranking

# purchase_last_month = get_purchase_count_last_month(product_listing_soup)
# purchase_last_month

# categories = get_categories(product_listing_soup)
# categories

# insights = get_review_insights(product_listing_soup)
# insights