Write a python program which searches all the product under a particular product from www.amazon.in. The 
product to be searched will be taken as input from user. For e.g. If user input is ‘guitar’. Then search for 
guitars. 

2. In the above question, now scrape the following details of each product listed in first 3 pages of your search 
results and save it in a data frame and csv. In case if any product has less than 3 pages in search results then 
scrape all the products available under that product name. Details to be scraped are: "Brand 
Name", "Name of the Product", "Price", "Return/Exchange", "Expected Delivery", "Availability" and 
“Product URL”. In case, if any of the details are missing for any of the product then replace it by “-“. 


In [12]:
!pip install selenium webdriver_manager requests


Collecting webdriver_manager
  Obtaining dependency information for webdriver_manager from https://files.pythonhosted.org/packages/b1/51/b5c11cf739ac4eecde611794a0ec9df420d0239d51e73bc19eb44f02b48b/webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Obtaining dependency information for python-dotenv from https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl.metadata
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.1


In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

def get_product_details(driver):
    products = []
    product_elements = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")

    for product_element in product_elements:
        try:
            brand_name = product_element.find_element(By.XPATH, ".//span[contains(@class, 'a-size-base-plus')]").text
        except:
            brand_name = "-"
        try:
            product_name = product_element.find_element(By.XPATH, ".//h2/a/span").text
        except:
            product_name = "-"
        try:
            price = product_element.find_element(By.XPATH, ".//span[@class='a-price-whole']").text
        except:
            price = "-"
        try:
            product_url = product_element.find_element(By.XPATH, ".//h2/a").get_attribute('href')
        except:
            product_url = "-"

        # Return/Exchange, Expected Delivery, and Availability usually require going into the product page
        # Here, we will provide a placeholder and leave this part for future extension
        return_exchange = "-"
        expected_delivery = "-"
        availability = "-"
        
        products.append({
            "Brand Name": brand_name,
            "Name of the Product": product_name,
            "Price": price,
            "Return/Exchange": return_exchange,
            "Expected Delivery": expected_delivery,
            "Availability": availability,
            "Product URL": product_url
        })

    return products

def search_amazon(product):
    # Set up the Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    driver.get("https://www.amazon.in")
    driver.maximize_window()
    time.sleep(2)
    
    # Find the search box and enter the product name
    search_box = driver.find_element(By.ID, "twotabsearchtextbox")
    search_box.send_keys(product)
    search_box.send_keys(Keys.RETURN)
    time.sleep(2)

    all_products = []
    
    # Scrape data from the first 3 pages
    for page in range(1, 4):
        all_products.extend(get_product_details(driver))
        
        # Go to the next page
        try:
            next_page = driver.find_element(By.XPATH, "//a[contains(@class, 's-pagination-next')]")
            next_page.click()
            time.sleep(2)
        except:
            print("No more pages found.")
            break

    driver.quit()
    
    # Create DataFrame
    df = pd.DataFrame(all_products)
    
    # Save to CSV
    df.to_csv(f'{product}_amazon_products.csv', index=False)
    print(f"Data saved to {product}_amazon_products.csv")

# Get user input
product_to_search = input("Enter the product to search on Amazon: ")
search_amazon(product_to_search)

Enter the product to search on Amazon: Guitar
No more pages found.
Data saved to Guitar_amazon_products.csv


Write a python program to access the search bar and search button on images.google.com and scrape 10 
images each for keywords ‘fruits’, ‘cars’ and ‘Machine Learning’, ‘Guitar’, ‘Cakes’. 


In [14]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

# Function to search and download images
def search_and_download_images(driver, keyword, num_images=10):
    # Create a directory for the keyword if it doesn't exist
    if not os.path.exists(keyword):
        os.makedirs(keyword)
    
    # Access Google Images
    driver.get("https://images.google.com")
    
    # Find the search bar, input the keyword, and trigger the search
    search_box = driver.find_element(By.NAME, "q")
    search_box.send_keys(keyword)
    search_box.send_keys(Keys.RETURN)
    
    # Wait for the images to load
    time.sleep(2)
    
    # Scrape image URLs
    image_urls = set()
    while len(image_urls) < num_images:
        thumbnails = driver.find_elements(By.CSS_SELECTOR, "img.rg_i")
        for thumbnail in thumbnails:
            try:
                thumbnail.click()
                time.sleep(1)
                images = driver.find_elements(By.CSS_SELECTOR, "img.n3VNCb")
                for image in images:
                    if image.get_attribute("src") and 'http' in image.get_attribute("src"):
                        image_urls.add(image.get_attribute("src"))
                    if len(image_urls) >= num_images:
                        break
                if len(image_urls) >= num_images:
                    break
            except Exception as e:
                print(f"An error occurred: {e}")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    
    # Download images
    for i, url in enumerate(image_urls):
        try:
            response = requests.get(url)
            with open(os.path.join(keyword, f"{keyword}_{i+1}.jpg"), "wb") as file:
                file.write(response.content)
        except Exception as e:
            print(f"Could not download {url} - {e}")

# Main script
if __name__ == "__main__":
    # Set up the Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    
    # List of keywords to search for
    keywords = ['fruits', 'cars', 'Machine Learning', 'Guitar', 'Cakes']
    
    # Search and download images for each keyword
    for keyword in keywords:
        search_and_download_images(driver, keyword, num_images=10)
    
    # Close the driver
    driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=125.0.6422.113)
Stacktrace:
	GetHandleVerifier [0x003DB8E3+45827]
	(No symbol) [0x0036DCC4]
	(No symbol) [0x0026150F]
	(No symbol) [0x0023E133]
	(No symbol) [0x002C949F]
	(No symbol) [0x002DB8E6]
	(No symbol) [0x002C2B96]
	(No symbol) [0x00296998]
	(No symbol) [0x0029751D]
	GetHandleVerifier [0x00694513+2899763]
	GetHandleVerifier [0x006E793D+3240797]
	GetHandleVerifier [0x004613B4+593364]
	GetHandleVerifier [0x004682DC+621820]
	(No symbol) [0x003770A4]
	(No symbol) [0x003737A8]
	(No symbol) [0x00373947]
	(No symbol) [0x003659FE]
	BaseThreadInitThunk [0x75A77BA9+25]
	RtlInitializeExceptionChain [0x77BBBE3B+107]
	RtlClearBits [0x77BBBDBF+191]


Write a python program to search for a smartphone(e.g.: Oneplus Nord, pixel 4A, etc.) on www.flipkart.com
and scrape following details for all the search results displayed on 1st page. Details to be scraped: “Brand 
Name”, “Smartphone name”, “Colour”, “RAM”, “Storage(ROM)”, “Primary Camera”, 
“Secondary Camera”, “Display Size”, “Battery Capacity”, “Price”, “Product URL”. Incase if any of the 
details is missing then replace it by “- “. Save your results in a dataframe and CSV.

In [15]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Open Flipkart
driver.get("https://www.flipkart.com")
time.sleep(2)  # wait for the page to load

# Close the login popup if it appears
try:
    close_login_popup = driver.find_element(By.XPATH, "//button[contains(text(),'✕')]")
    close_login_popup.click()
except:
    pass

# Take user input for the smartphone to search
search_query = input("Enter the smartphone to search: ")

# Search for the smartphone
search_bar = driver.find_element(By.NAME, "q")
search_bar.clear()
search_bar.send_keys(search_query)
search_bar.send_keys(Keys.RETURN)
time.sleep(3)  # wait for the search results to load

# Scrape the data
smartphones = []
products = driver.find_elements(By.XPATH, "//div[@class='_1AtVbE']")

for product in products:
    try:
        brand_name = product.find_element(By.XPATH, ".//div[@class='_4rR01T']").text.split()[0]
    except:
        brand_name = "-"
    try:
        smartphone_name = product.find_element(By.XPATH, ".//div[@class='_4rR01T']").text
    except:
        smartphone_name = "-"
    try:
        color = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[1]").text.split(':')[1].strip()
    except:
        color = "-"
    try:
        ram = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[2]").text.split('|')[0].strip().split()[0]
    except:
        ram = "-"
    try:
        storage = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[2]").text.split('|')[1].strip()
    except:
        storage = "-"
    try:
        primary_camera = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[3]").text.split('|')[0].strip()
    except:
        primary_camera = "-"
    try:
        secondary_camera = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[3]").text.split('|')[1].strip()
    except:
        secondary_camera = "-"
    try:
        display_size = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[4]").text.strip()
    except:
        display_size = "-"
    try:
        battery_capacity = product.find_element(By.XPATH, ".//div[@class='fMghEO']//li[5]").text.strip()
    except:
        battery_capacity = "-"
    try:
        price = product.find_element(By.XPATH, ".//div[@class='_30jeq3 _1_WHN1']").text
    except:
        price = "-"
    try:
        product_url = product.find_element(By.XPATH, ".//a").get_attribute("href")
    except:
        product_url = "-"

    smartphones.append({
        "Brand Name": brand_name,
        "Smartphone Name": smartphone_name,
        "Colour": color,
        "RAM": ram,
        "Storage(ROM)": storage,
        "Primary Camera": primary_camera,
        "Secondary Camera": secondary_camera,
        "Display Size": display_size,
        "Battery Capacity": battery_capacity,
        "Price": price,
        "Product URL": product_url
    })

# Close the driver
driver.quit()

# Create a DataFrame and save to CSV
df = pd.DataFrame(smartphones)
df.to_csv("smartphones.csv", index=False)

print("Data scraped and saved to smartphones.csv")

Enter the smartphone to search: Motorola
Data scraped and saved to smartphones.csv


Write a program to scrap geospatial coordinates (latitude, longitude) of a city searched on google maps. 

In [16]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Initialize the WebDriver (assuming you have ChromeDriver installed)
driver = webdriver.Chrome()

# Open Google Maps
driver.get("https://www.google.com/maps")

# Search for the city
city = input("Enter the city to search for: ")
search_box = driver.find_element(By.ID, "searchboxinput")
search_box.send_keys(city)
search_box.send_keys(Keys.RETURN)

# Wait for the map to update
time.sleep(5)

# Get the current URL
current_url = driver.current_url

# Extract the latitude and longitude from the URL
try:
    coords = current_url.split("@")[1].split(",")
    latitude = coords[0]
    longitude = coords[1]
    print(f"Latitude: {latitude}, Longitude: {longitude}")
except IndexError:
    print("Failed to extract coordinates from the URL.")

# Close the WebDriver
driver.quit()

Enter the city to search for: Sagar
Latitude: 23.8374638, Longitude: 78.6662223


Write a program to scrap all the available details of best gaming laptops from digit.in.

In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Set up the WebDriver
options = Options()
options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL of the webpage to scrape
url = "https://www.digit.in/top-products/best-gaming-laptops-40.html"

# Open the webpage
driver.get(url)

# Find the container with laptop details
laptop_containers = driver.find_elements(By.CSS_SELECTOR, '.TopNumbeHeading')

# Initialize lists to store the scraped data
titles = []
specs = []
descriptions = []

# Iterate through each laptop container and extract details
for laptop in laptop_containers:
    # Extract the title
    title = laptop.find_element(By.TAG_NAME, 'h2').text.strip()
    titles.append(title)

    # Extract the specifications and description
    spec_container = laptop.find_element(By.XPATH, 'following-sibling::div[@class="Spcs-details"]')
    spec_list = spec_container.find_element(By.CLASS_NAME, 'SpecsBlk').text.strip()
    description = spec_container.find_element(By.CLASS_NAME, 'Spcs-Descrp').text.strip()

    specs.append(spec_list)
    descriptions.append(description)

# Close the WebDriver
driver.quit()

# Create a DataFrame to store the scraped data
data = {
    'Title': titles,
    'Specifications': specs,
    'Description': descriptions
}

df = pd.DataFrame(data)

# Save the data to a CSV file
df.to_csv('best_gaming_laptops.csv', index=False)

print("Data has been scraped and saved to best_gaming_laptops.csv")

Data has been scraped and saved to best_gaming_laptops.csv


Write a python program to scrape the details for all billionaires from www.forbes.com. Details to be scrapped: 
“Rank”, “Name”, “Net worth”, “Age”, “Citizenship”, “Source”, “Industry”. 

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Initialize the Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Define the URL of the Forbes billionaires list
url = "https://www.forbes.com/billionaires/"

# Load the page
driver.get(url)

# Wait for the dynamic content to load (you may need to adjust the wait time)
driver.implicitly_wait(10)

# Find the table rows containing billionaire data
rows = driver.find_elements(By.XPATH, "//div[@class='fullList']//div[@class='card']")

# Initialize an empty list to store billionaire data
billionaires_data = []

# Iterate over each row and extract the data
for row in rows:
    rank = row.find_element(By.XPATH, ".//div[@class='rank']").text
    name = row.find_element(By.XPATH, ".//div[@class='personName']").text
    net_worth = row.find_element(By.XPATH, ".//div[@class='netWorth']").text
    age = row.find_element(By.XPATH, ".//div[@class='age']").text
    citizenship = row.find_element(By.XPATH, ".//div[@class='countryOfCitizenship']").text
    source = row.find_element(By.XPATH, ".//div[@class='source']").text
    industry = row.find_element(By.XPATH, ".//div[@class='industry']").text

    # Append the data to the list
    billionaires_data.append({
        "Rank": rank,
        "Name": name,
        "Net worth": net_worth,
        "Age": age,
        "Citizenship": citizenship,
        "Source": source,
        "Industry": industry
    })

# Convert the list of billionaire data to a pandas DataFrame
df = pd.DataFrame(billionaires_data)

# Save the DataFrame to a CSV file
df.to_csv('forbes_billionaires.csv', index=False)

# Close the browser
driver.quit()

print("Data scraped and saved to forbes_billionaires.csv")

Data scraped and saved to forbes_billionaires.csv


Write a program to extract at least 500 Comments, Comment upvote and time when comment was posted 
from any YouTube Video. 

In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Initialize the Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Define the URL of the YouTube video
video_url = "https://www.youtube.com/"

# Load the page
driver.get(video_url)

# Wait for the comments to load (you may need to adjust the wait time)
time.sleep(10)

# Scroll down to load more comments (you may need to adjust the number of scrolls)
scroll_pause_time = 2  # Waiting time after each scroll
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    # Wait for the comments to load
    time.sleep(scroll_pause_time)
    # Calculate new scroll height and compare with the last one
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Find all comments
comments = driver.find_elements(By.XPATH, "//ytd-comment-thread-renderer")

# Initialize an empty list to store comments data
comments_data = []

# Iterate over each comment and extract the data
for comment in comments:
    try:
        comment_text = comment.find_element(By.XPATH, ".//yt-formatted-string[@id='content-text']").text
        upvotes = comment.find_element(By.XPATH, ".//span[@id='vote-count-middle']").text
        timestamp = comment.find_element(By.XPATH, ".//span[@class='style-scope ytd-comment-renderer']//yt-formatted-string").text

        # Append the data to the list
        comments_data.append({
            "Comment": comment_text,
            "Upvotes": upvotes,
            "Timestamp": timestamp
        })
    except Exception as e:
        print(f"Error processing comment: {e}")

# Print the first 10 comments data (for demonstration purposes)
for comment in comments_data[:10]:
    print(comment)

# Close the browser
driver.quit()

print("Data extraction complete.")

Data extraction complete.


Write a python program to scrape a data for all available Hostels from https://www.hostelworld.com/ in 
“London” location. You have to scrape hostel name, distance from city centre, ratings, total reviews, overall 
reviews, privates from price, dorms from price, facilities and property description.

In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Initialize the Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Define the URL of the Hostelworld search results for London
url = "https://www.hostelworld.com/search?q=London%2C+England&country=England&city=London&type=city&from=2023-04-01&to=2023-04-02&guests=1&pricesfrom=0&pricesto=1000&currency=USD&language=en&page="

# Initialize an empty list to store hostel data
hostels_data = []

# Function to scrape data from each hostel page
def scrape_hostel_data(driver, url):
    driver.get(url)
    hostels = driver.find_elements(By.XPATH, "//div[@class='property-card-wrapper']")
    for hostel in hostels:
        try:
            name = hostel.find_element(By.XPATH, ".//div[@class='property-card-title']").text
            distance = hostel.find_element(By.XPATH, ".//div[@class='distance']").text
            rating = hostel.find_element(By.XPATH, ".//div[@class='rating']").text
            total_reviews = hostel.find_element(By.XPATH, ".//div[@class='reviews']").text
            # ... other data points ...
            # Add more XPaths and code to extract the rest of the required data
            # ...
            hostels_data.append({
                "Name": name,
                "Distance": distance,
                "Rating": rating,
                "Total Reviews": total_reviews,
                # ... other data points ...
            })
        except Exception as e:
            print(f"Error processing hostel: {e}")

# Scrape data from multiple pages
for page in range(1, 11):  # Adjust the range based on the number of pages you want to scrape
    scrape_hostel_data(driver, url + str(page))

# Convert the list of hostel data to a pandas DataFrame
df = pd.DataFrame(hostels_data)

# Save the DataFrame to a CSV file
df.to_csv('london_hostels.csv', index=False)

# Close the browser
driver.quit()

print("Data scraping complete. Data saved to london_hostels.csv")

Data scraping complete. Data saved to london_hostels.csv
