# Unstructured Data Analytics Final Project
# ShampYou Project Scraping Code (Influenster and Ebay) 

### Group Members: Dameli Aziken, Twinkle Panda, Vishwa Patel, Sneha Sastry Rayadurgam, Kimberly Simmonds

# Part 1: Influenster Web Scraping
We used Influenster.com for the majority of our scraping and captured a list of the 100 most popular shampoos, their average ratings and total numbers of reviews, and 50 user reviews per shampoo.

## Get 100 URLs
Influenster automatically sorts products in descending order by number of ratings, so we scraped product page URLs for the top 100 products under the Shampoo category. 

In [90]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time

# Configure the Chrome session used to collect the product URLs.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
# Written with the explicit "--" prefix, matching the other scraping cells.
chrome_options.add_argument('--disable-notifications')
chrome_options.add_argument("window-size=1280,720")

# Update the path to ChromeDriver binary (if required)
webdriver_service = Service("/usr/local/bin/chromedriver")
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)  # Upper bound (seconds) for each explicit wait

url = 'https://www.influenster.com/search?categories=Hair&categories=Hair+Products&categories=Shampoo&tab=product'
browser.get(url)

# Not used in this cell; kept so the module-level names stay unchanged.
data = []

# Accept cookies. This is best-effort: a missing banner must not prevent the
# results-grid wait below from running, so the two steps are guarded separately.
try:
    cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All Cookies")]')))
    # Click through JavaScript so an overlay cannot intercept the click.
    browser.execute_script("arguments[0].click();", cookie_button)
    print('Accepted cookies')
except Exception as e:
    print(f'No cookie button found or already accepted: {e}')

# Wait for the search-results grid to be present before scrolling starts.
try:
    wait.until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "SearchResultsPage_search-results__grid__")]')))
    print("Page loaded successfully.")
except Exception as e:
    print(f'Page did not load properly: {e}')

# Create an empty list to store URLs (filled by url_scrape)
urls = []

# Function to scroll and load products until we have at least 100
def load_products_until(limit=100):
    """Scroll the results page until at least ``limit`` product links exist.

    Uses the module-level ``browser``. After every scroll it pauses, recounts
    the anchors carrying ``data-link-overlay="true"``, and stops as soon as the
    count reaches ``limit`` or a scroll fails to add any new products.
    """
    pause_seconds = 2  # Adjust if necessary
    link_xpath = '//a[@data-link-overlay="true"]'
    loaded_so_far = 0

    while True:
        if loaded_so_far >= limit:
            return

        # Jump to the bottom of the page to trigger the next batch of products.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        current_count = len(browser.find_elements(By.XPATH, link_xpath))
        if current_count <= loaded_so_far:
            # The scroll added nothing new, so stop trying.
            print("No more products loaded after scrolling.")
            return

        loaded_so_far = current_count
        print(f"Total products loaded: {loaded_so_far}")

# Function to scrape URLs
def url_scrape(limit=100):
    """Collect review-page URLs for the first ``limit`` product links.

    Reads the anchors carrying ``data-link-overlay="true"`` from the
    module-level ``browser`` and appends one ``{'URL': ...}`` dict per product
    to the module-level ``urls`` list. Links without an ``href`` are skipped
    with a message instead of failing on ``None``. Returns nothing.
    """
    product_links = browser.find_elements(By.XPATH, '//a[@data-link-overlay="true"]')

    print(f"Found {len(product_links)} products on the page.")

    for product_link in product_links[:limit]:
        # Only the attribute lookup talks to the browser, so only it is guarded.
        try:
            product_url = product_link.get_attribute('href')
        except Exception as e:
            print(f'Error finding information: {e}')
            continue

        if not product_url:
            print('Skipping a product link without an href attribute.')
            continue

        # Ensure the URL is absolute
        if product_url.startswith('/'):
            product_url = 'https://www.influenster.com' + product_url

        # Normalise the trailing slash, then point at the product's reviews page
        urls.append({'URL': product_url.rstrip('/') + '/reviews/'})


# Load products until we have at least 100, then collect their URLs.
# The finally clause closes the Chrome session even when scraping raises,
# so a failed run does not leave a browser process behind.
try:
    load_products_until(limit=100)
    url_scrape(limit=100)
finally:
    browser.quit()

# Convert the URLs list to a DataFrame and display it.
# columns=['URL'] keeps the 'URL' column present even when nothing was scraped.
urls_df = pd.DataFrame(urls, columns=['URL'])
print("Number of URLs pulled:", len(urls_df))
# NOTE: the file is tab-delimited despite its .csv extension.
urls_df.to_csv('Influenster URL List.csv', sep='\t', encoding='utf-8', index=False, header=True)
urls_df[:5]

Accepted cookies
Page loaded successfully.
Total products loaded: 60
Total products loaded: 80
Total products loaded: 100
Found 100 products on the page.
Number of URLs pulled: 100


Unnamed: 0,URL
0,https://www.influenster.com/reviews/garnier-sl...
1,https://www.influenster.com/reviews/head-shoul...
2,https://www.influenster.com/reviews/pantene-da...
3,https://www.influenster.com/reviews/tresemme-s...
4,https://www.influenster.com/reviews/pantene-pr...


## Get 50 Ratings/Reviews for Every Shampoo
Next, we used the URLs captured in the previous step to go to the reviews page for each product and capture the 50 most recent reviews.

In [92]:
# Review-page URLs for all 100 products, taken from the 'URL' column.
# NOTE: the name `list` shadows the Python builtin from this point on.
# For a quick test run, take only the first rows instead, e.g. urls_df['URL'][:3].
list = urls_df.loc[:, 'URL']

In [93]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-notifications')
chrome_options.add_argument("window-size=1280,720")
# Uncomment the line below to run Chrome in headless mode
# chrome_options.add_argument("--headless")

# Update the path to ChromeDriver binary (if required)
webdriver_service = Service("/usr/local/bin/chromedriver")

# Initialize the WebDriver
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)  # 20 seconds wait time

# List of product URLs to scrape
product_urls = list

# Accumulates one dict per scraped review, across all products
all_data = []

# XPath matching the body container of a single review
REVIEW_XPATH = '//div[@data-cy="review-ugc-container__body"]'


# The helpers are defined once, ahead of the loop, instead of being redefined
# for every product URL.
def load_reviews_until(limit=50, max_stalled_attempts=3):
    """Click "Load more" until at least ``limit`` reviews are on the page.

    Uses the module-level ``browser`` and ``wait``. Stops early when the button
    cannot be found or clicked, or when ``max_stalled_attempts`` consecutive
    clicks add no reviews, which guards against looping forever on a page
    whose button stays clickable but loads nothing.
    """
    previous_total = -1
    stalled_attempts = 0

    while True:
        # Get the current number of reviews
        total_reviews = len(browser.find_elements(By.XPATH, REVIEW_XPATH))
        print(f'Current number of reviews loaded: {total_reviews}')
        if total_reviews >= limit:
            print(f'Reached the limit of {limit} reviews.')
            break

        # Track whether the previous click actually added reviews.
        if total_reviews > previous_total:
            stalled_attempts = 0
        else:
            stalled_attempts += 1
            if stalled_attempts >= max_stalled_attempts:
                print("Clicking 'Load more' is not adding reviews; giving up.")
                break
        previous_total = total_reviews

        try:
            # Scroll to the bottom to make sure the "Load more" button is in view
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for the "Load more" button to be clickable and click it
            load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Load more")]')))
            print('Clicking "Load more" button...')
            browser.execute_script("arguments[0].click();", load_more_button)
            # Wait for new reviews to load
            time.sleep(2)  # Adjust the sleep time if necessary
        except (NoSuchElementException, TimeoutException):
            print("No more 'Load more' button found.")
            break
        except Exception as e:
            print(f"Error clicking 'Load more' button: {e}")
            break


def scrape_reviews(limit=50):
    """Append up to ``limit`` reviews from the current page to ``data``.

    Reads the module-level ``browser`` and ``product_title`` and appends one
    dict per review to the module-level ``data`` list. A review whose rating
    or text cannot be read is reported and skipped.
    """
    reviews = browser.find_elements(By.XPATH, REVIEW_XPATH)[:limit]
    print(f'Scraping {len(reviews)} reviews.')

    for review_element in reviews:
        try:
            # Get the star rating element
            star_rating = review_element.find_element(By.XPATH, './/div[starts-with(@class, "StarRating_star-rating__rating-text__")]').text.strip()

            # Get the review text element
            review_text = review_element.find_element(By.XPATH, './/div[starts-with(@class, "Review_review__body-text__")]').text.strip()

            data.append({
                'product_name': product_title,
                'user_rating': star_rating,
                'product_review': review_text
            })
        except Exception as e:
            print(f'Error finding information: {e}')


# Once the cookie banner has been accepted, later pages skip the 20-second
# wait for a button that will not reappear in this session.
cookies_accepted = False

# The finally clause closes the browser even if a page raises unexpectedly.
try:
    # Loop over each product URL
    for url in product_urls:
        print(f"Processing URL: {url}")
        browser.get(url)
        data = []  # reviews for this product only; filled by scrape_reviews

        # Accept cookies if the button appears
        if not cookies_accepted:
            try:
                cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All Cookies")]')))
                browser.execute_script("arguments[0].click();", cookie_button)
                print('Accepted cookies')
                cookies_accepted = True
            except TimeoutException:
                print('No cookie button found or already accepted.')

        # Wait for the page to load
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "InfiniteScroll_infinite-scroll__")]')))
            print("Page loaded successfully.")
        except TimeoutException:
            print("Page did not load properly.")
            continue  # Skip to the next URL if the page didn't load

        # Extract the product title from the page title
        try:
            full_title = browser.title
            print(f"Full page title: {full_title}")
            # Split the title at 'Reviews' to get the product name
            product_title = full_title.split('Reviews')[0].strip()
            print(f"Product title: {product_title}")
        except Exception as e:
            print(f'Error finding product title: {e}')
            product_title = "Unknown Product"

        # Load reviews until we have at least 50, then scrape them
        load_reviews_until(limit=50)
        scrape_reviews(limit=50)

        # Add the data from this product to the all_data list
        all_data.extend(data)

        # Optional: Pause between products to be polite to the server
        time.sleep(2)
finally:
    # Close the browser when done
    browser.quit()

# Convert the data to a DataFrame
df = pd.DataFrame(all_data)
print("Total number of reviews pulled: ", len(df))

# Optionally, save the DataFrame to a CSV file
df.to_csv('Influenster Reviews.csv', index=False)

# Preview the first rows (placed last so a notebook cell displays it)
df[:5]


Processing URL: https://www.influenster.com/reviews/garnier-sleek-shine-intensely-smooth-leave-in-conditioning-cream/reviews/
Accepted cookies
Page loaded successfully.
Full page title: Garnier Sleek & Shine Intensely Smooth Leave-In Conditioning Cream Reviews | Find the Best Shampoo Products | Influenster
Product title: Garnier Sleek & Shine Intensely Smooth Leave-In Conditioning Cream
Current number of reviews loaded: 10
Clicking "Load more" button...
Current number of reviews loaded: 20
Clicking "Load more" button...
Current number of reviews loaded: 30
Clicking "Load more" button...
Current number of reviews loaded: 40
Clicking "Load more" button...
Current number of reviews loaded: 50
Reached the limit of 50 reviews.
Scraping 50 reviews.
Processing URL: https://www.influenster.com/reviews/head-shoulders-classic-clean-anti-dandruff-shampoo/reviews/
No cookie button found or already accepted.
Page loaded successfully.
Full page title: Head & Shoulders Classic Clean Anti-Dandruff Sha

## Get Product Description, Total # of Reviews, and Average Rating for Every Shampoo
Lastly, we used the URLs captured in the first step to scrape the product description, total number of reviews, and average rating for each shampoo product. 

In [95]:
# Product-page URLs for all 100 products: the review-page URLs with the
# trailing '/reviews/' segment removed.
# NOTE: the name `list` shadows the Python builtin from this point on.
# For a quick test run, start from urls_df['URL'][:3] instead.
list = urls_df['URL'].str.replace('/reviews/$', '', regex=True)

In [100]:
# Reload the saved URL list so this cell also works in a fresh kernel.
# Previously the read_csv result was overwritten on the next line by the
# in-memory urls_df, so the file was never actually used.
# The file was written tab-delimited, hence sep='\t'.
saved_urls_df = pd.read_csv('Influenster URL List.csv', sep='\t')

# Product-page URLs: drop the trailing '/reviews/' segment from each URL.
# NOTE: the name `list` shadows the Python builtin from this point on.
list = saved_urls_df['URL'].str.replace('/reviews/$', '', regex=True)


In [103]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-notifications')
chrome_options.add_argument("window-size=1280,720")
# Uncomment the line below to run Chrome in headless mode (without opening a browser window)
# chrome_options.add_argument("--headless")

# Update the path to ChromeDriver binary (if required)
webdriver_service = Service("/usr/local/bin/chromedriver")  # Update this path if necessary

# Initialize the WebDriver
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 5)  # Adjust the wait time if necessary

# List of product URLs to scrape
product_urls = list

# Initialize an empty list to store product data (one dict per product)
product_data = []

# Once the cookie banner has been accepted, later pages skip the wait for a
# button that will not reappear in this session.
cookies_accepted = False

# The finally clause closes the browser even if a page raises unexpectedly.
try:
    # Loop over each product URL
    for url in product_urls:
        print(f"Processing URL: {url}")
        browser.get(url)
        time.sleep(2)  # Wait for the page to load

        # Accept cookies if the button appears
        if not cookies_accepted:
            try:
                cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All Cookies")]')))
                browser.execute_script("arguments[0].click();", cookie_button)
                print('Accepted cookies')
                cookies_accepted = True
            except (NoSuchElementException, TimeoutException):
                print('No cookie button found or already accepted.')

        # Click the "See more" button to expand the description if it exists
        try:
            see_more_button = browser.find_element(By.XPATH, '//button[contains(text(), "See more")]')
            browser.execute_script("arguments[0].click();", see_more_button)
            print("Clicked 'See more' button to expand the product description.")
            # Wait a moment for the content to expand
            time.sleep(1)
        except NoSuchElementException:
            print("'See more' button not found; description is already fully visible.")

        # Extract the product description
        try:
            # Use XPath with starts-with to handle dynamic class names
            description_element = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//p[starts-with(@class, "ProductDescription_product-description__text__")]')
            ))
            product_description = description_element.text.strip()
            print(f"Product Description: {product_description}")
        except TimeoutException:
            print("Product description not found.")
            product_description = None

        # Extract the average rating
        try:
            rating_element = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//div[contains(@class, "StarRating_star-rating__review-rating__")]')
            ))
            average_rating_text = rating_element.text.strip()
            # Expected format: '4.41 / 5'
            average_rating = float(average_rating_text.split(' / ')[0])
            print(f"Average Rating: {average_rating}")
        except (TimeoutException, ValueError, IndexError):
            print("Average rating not found or could not parse rating.")
            average_rating = None

        # Extract the number of reviews
        try:
            reviews_element = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//a[contains(@class, "StarRating_star-rating__review-count__")]')
            ))
            number_of_reviews_text = reviews_element.text.strip()
            # Expected format: '24090 reviews'
            # Empty text makes split()[0] raise IndexError; it is caught below
            # so one odd page cannot abort the whole run.
            number_of_reviews = int(number_of_reviews_text.split()[0].replace(',', ''))
            print(f"Number of Reviews: {number_of_reviews}")
        except (TimeoutException, ValueError, IndexError):
            print("Number of reviews not found or could not parse number.")
            number_of_reviews = None

        # Extract the product title from the page title
        try:
            full_title = browser.title
            # Split the title at 'Reviews' to get the product name
            product_title = full_title.split('Reviews')[0].strip()
        except Exception as e:
            print(f'Error finding product title: {e}')
            product_title = "Unknown Product"

        # Store the data
        product_data.append({
            'URL': url,
            'Product Name': product_title,
            'Product Description': product_description,
            'Average Rating': average_rating,
            'Number of Reviews': number_of_reviews
        })

        # Optional: Pause between products to be polite to the server
        time.sleep(2)
finally:
    # Close the browser when done
    browser.quit()

# Convert the data to a DataFrame
description_df = pd.DataFrame(product_data)

# Optionally, save the DataFrame to a CSV file
description_df.to_csv('Influenster Product Descriptions.csv', index=False)


Processing URL: https://www.influenster.com/reviews/garnier-sleek-shine-intensely-smooth-leave-in-conditioning-cream
Accepted cookies
'See more' button not found; description is already fully visible.
Product Description: Leave-in conditioning cream for up to 3 day sleek* *With shampoo, conditioner and leave-in cream
Average Rating: 4.41
Number of Reviews: 24090
Processing URL: https://www.influenster.com/reviews/head-shoulders-classic-clean-anti-dandruff-shampoo
No cookie button found or already accepted.
Clicked 'See more' button to expand the product description.
Product Description: Head & Shoulders® Classic Clean Shampoo is paraben free America’s #1 dandruff shampoo (*based on volume sales) Clinically proven to protect against flakes, itch, oil and dryness with regular use (**flakes and itch associated with dandruff; washes away oil & flakes) Clinically proven. Up to 100% dandruff protection. ( ***visible flakes, with regular use)
Average Rating: 4.22
Number of Reviews: 18437
Proc

In [105]:
# Preview the first five scraped product rows.
description_df.head()

Unnamed: 0,URL,Product Name,Product Description,Average Rating,Number of Reviews
0,https://www.influenster.com/reviews/garnier-sl...,Garnier Sleek & Shine Intensely Smooth Leave-I...,Leave-in conditioning cream for up to 3 day sl...,4.41,24090
1,https://www.influenster.com/reviews/head-shoul...,Head & Shoulders Classic Clean Anti-Dandruff S...,Head & Shoulders® Classic Clean Shampoo is par...,4.22,18437
2,https://www.influenster.com/reviews/pantene-da...,Pantene Daily Moisture Renewal Shampoo,Daily Moisture Renewal Shampoo,4.26,16833
3,https://www.influenster.com/reviews/tresemme-s...,Tresemme Silky & Smooth Shampoo for Frizzy Hair,"Looking for that high glam, smooth look? TRESe...",4.3,14193
4,https://www.influenster.com/reviews/pantene-pr...,Pantene Pro-V Smooth & Sleek Shampoo & Conditi...,Smooth things over with this frizz-fighting co...,4.32,13372


# Part 2: Ebay Web Scraping

Since Influenster.com does not have price data, we used eBay to scrape 10,000 products using the search term "shampoo." We first tried Amazon but were blocked when scraping with Selenium.
We also considered other websites that offer beauty product reviews and price data together, such as Makeup Alley; however, that website does not have a sufficient number of reviews for our purposes.

In [1]:
# Import necessary libraries
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# TimeoutException is used below but is not among this cell's imports.
from selenium.common.exceptions import TimeoutException

# Set up the WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
driver.maximize_window()

# URL of eBay shampoo listings
url = "https://www.ebay.com/b/Shampoos/177661/bn_8574035"

# Initialize WebDriverWait
wait = WebDriverWait(driver, 10)

# CSS selector for the section holding the listings, and for the listing items in it.
# NOTE(review): the class is spelled `bwrvr` here but `brwrvr` in the item-card
# selectors below -- confirm against the live page.
RIVER_SELECTOR = "body > div.page-container > div.main-content > section.brw-region.brw-region--right > section.brw-river.bwrvr"
ITEM_SELECTOR = RIVER_SELECTOR + " > ul > li"

# Initialize a list to store the scraped data (one dict per listing)
shampoo_data = []

# The finally clause closes the driver on any exit path, and a timeout on one
# page ends the loop instead of discarding everything scraped so far.
try:
    # Open the page
    driver.get(url)

    # Loop through multiple pages
    for page in range(1, 201):  # Adjust the range based on how many pages you want to scrape
        print(f"Scraping page {page}...")

        # Wait for the products to load
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, RIVER_SELECTOR)))
        except TimeoutException:
            print(f"Timed out waiting for products on page {page}; stopping.")
            break

        # Get all product containers
        products = driver.find_elements(By.CSS_SELECTOR, ITEM_SELECTOR)

        for product in products:
            # A missing field is recorded as 'N/A'. `except Exception` (not a
            # bare except) keeps that best-effort behavior while still letting
            # Ctrl+C interrupt the run.
            try:
                # Extract the product name
                name = product.find_element(By.CSS_SELECTOR, ".brwrvr__item-card__signals .bsig--header span").text
            except Exception:
                name = 'N/A'

            try:
                # Extract the product price
                price = product.find_element(By.CSS_SELECTOR, ".brwrvr__item-card__signals .brwrvr__item-card__signals__body span").text
            except Exception:
                price = 'N/A'

            # Append the data to the list
            shampoo_data.append({
                'product_name': name,
                'price': price,
            })

        # Go to the next page
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, ".pagination__next")
            # Presumably a disabled "next" control carries aria-disabled="true"
            # on the last page -- TODO confirm. If the attribute is absent this
            # check has no effect.
            if next_button.get_attribute("aria-disabled") == "true":
                print("No more pages.")
                break
            next_button.click()
            time.sleep(3)  # Wait for the next page to load
        except Exception:
            print("No more pages.")
            break
finally:
    # Close the WebDriver
    driver.quit()

# Convert the data into a pandas DataFrame
df = pd.DataFrame(shampoo_data)

# Save the data to a CSV file
df.to_csv("ebay_shampoo_data.csv", index=False)

print("Scraping complete!")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 