In [1]:
from bs4 import BeautifulSoup

In [124]:
# find(): for a single element match.
# find_all(): for when you need multiple matches.
# select(): for when you need the flexibility of CSS selectors.
# select_one(): for the first match using CSS selectors.

In [198]:
html = """
<html>
    <body>
        <p class="intro">This is a paragraph with class "intro".</p>
        <p id="unique-para">This is a unique paragraph with id "unique-para".</p>
        <div class="container">
            <h1 class="header">This is the first h1</h1>
            <h1 class="header">This is the second h1</h1>
            <h1 class="header">This is the third h1</h1>
            <span>This is a span element inside the div.</span>
        </div>
    </body>
</html>
"""

In [199]:
soup = BeautifulSoup(html,"html.parser")

In [200]:
print(soup.contents)

['\n', <html>
<body>
<p class="intro">This is a paragraph with class "intro".</p>
<p id="unique-para">This is a unique paragraph with id "unique-para".</p>
<div class="container">
<h1 class="header">This is the first h1</h1>
<h1 class="header">This is the second h1</h1>
<h1 class="header">This is the third h1</h1>
<span>This is a span element inside the div.</span>
</div>
</body>
</html>, '\n']


In [201]:
data1 = soup.find("p", {"class":"intro"})  
data2 = soup.find("h1", class_="header")
data3 = soup.find(id="unique-para") 

In [202]:
print(data1.text)

This is a paragraph with class "intro".


In [160]:
data2 = soup.find_all("h1", class_="header")

In [161]:
print(data2)

[<h1 class="header">This is the first h1</h1>, <h1 class="header">This is the second h1</h1>, <h1 class="header">This is the third h1</h1>]


In [162]:
for h1 in data2:
    print(h1.text)

This is the first h1
This is the second h1
This is the third h1


In [225]:
data = soup.select("#unique-para")

In [229]:
print(data[0].text)

This is a unique paragraph with id "unique-para".


In [156]:
soup.select("div.container h1")  
soup.select("#unique-para") 
soup.select("p.intro")

[<p class="intro">This is a paragraph with class "intro".</p>]

In [159]:
soup.select_one("div.container h1").text

'This is the first h1'

In [163]:
def function2():
    # file = open('templates/index.html', 'r')
    # contents = file.read()
    # file.close()

    with open('./templates/index.html') as file:
        contents = file.read()

        soup = BeautifulSoup(contents, "html.parser")

        # find element named div
        # div = soup.find("div")
        # print(div.text)

        # find all elements named div
        divs = soup.find_all('div')
        
        # 1st occurrence of div element
        # print(soup.find('div'))
        
        
        # Below code gives result set
        """
        A ResultSet is essentially a list-like collection of BeautifulSoup 
        Tag objects. When you call find_all() to search for elements in an 
        HTML document, BeautifulSoup returns a ResultSet containing all 
        the elements that match the specified criteria. Each element in the 
        ResultSet is represented as a Tag object.

        You can iterate over a ResultSet like you would with a list, 
        accessing individual Tag objects to extract information such as 
        tag names, attributes, and text content.
        """
        # print(type(divs))
        print(divs[1].text)
        
        # parent elements
        # print(divs[1].parent)
       


# function2()


In [164]:
# day,max_temp,min_temp,condition

In [12]:
def function3():
    """
    Find temperature from html file from daily forecast website
    """
    
    info = []
    with open('templates/temp.html', 'r') as file:
        html = file.read()

        soup = BeautifulSoup(html, 'html.parser')

        # find container div
        div = soup.find('div', {"class": "DailyForecast--DisclosureList--350ZO"})

        # find all details elements
        details_elements = div.find_all('details', {"data-track-string": "detailsExpand"})

        for details in details_elements:
            # search for an <h2> tag with the attribute data-testid set to "daypartName" within the context of the details element.
            day = details.find("h2", {"data-testid": "daypartName"})

            # find span tag with data-testid set to TemperatureValue
            # 1st parameter is the tag and 2nd argument in {} are its attributes
            max_temp = details.find("span", {"data-testid": "TemperatureValue"})
            # print(max_temp.text[0:2])
            
            
            # find min_temp
            min_temp = details.find("span", {"data-testid": "TemperatureValue", "class": "DetailsSummary--lowTempValue--1DlJK"})

            # condition
            condition = details.find("span", {"class": "DetailsSummary--extendedData--aaFeV"})
            # print(condition.text)
            
            # print(f"day: {day.text}, max: {max_temp.text.replace('°', '')}, min: {min_temp.text.replace('°', '')}, condition: {condition.text}")
            info.append({
                "day": day.text,
                "max_temp": max_temp.text[0:2],
                "min_temp": min_temp.text[0:2],
                "condition": condition.text
            })

    print(info)
    
    # store details into csv file
    # with open('info.csv', 'w', encoding='utf-8') as file:
    #     file.write("day,max_temp_DegC,min_temp_DegC,condition\n")
    #     for data in info:
    #         file.write(f"{data['day']},{data['max_temp']},{data['min_temp']},{data['condition']}\n")


In [182]:
with open(r"templates\temp.html","r") as file:
    file_content = file.read()
    # print(file_content)
    soup = BeautifulSoup(file_content,"html.parser")
    print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Title
  </title>
 </head>
 <body>
  <div class="DailyForecast--DisclosureList--350ZO">
   <details class="Disclosure--themeList--uBa5q" data-track-string="false" event="true" open="">
    <summary aria-expanded="true" class="Disclosure--Summary--AvowU DaypartDetails--Summary--2nJx1 Disclosure--positionShowOpenSummaryContainer--2aHoR Disclosure--hideBorderOnSummaryOpen--LEvZQ" role="button">
     <div class="DaypartDetails--DetailSummaryContent--1c28m Disclosure--SummaryDefault--1z_mF Disclosure--positionShowOpenSummary--1hI0u">
      <div class="Accessibility--visuallyHidden--1432w">
       <div class="DetailsSummary--DetailsSummary--QpFD- DetailsSummary--fadeOnOpen--1MLf5" data-testid="DetailsSummary" id="detailIndex0">
        <h2 class="DetailsSummary--daypartName--1Mebr" data-testid="daypartName">
         Today
        </h2>
        <div class="DetailsSummary--temperature--3FMlw" data-testid="detailsTe

In [192]:
info = []
with open(r"templates\temp.html","r") as file:
    file_content = file.read()
    # print(file_content)
    soup = BeautifulSoup(file_content,"html.parser")
    # print(soup.prettify())
    div_cont = soup.find("div",{"class":"DailyForecast--DisclosureList--350ZO"})
    div_container = div_cont.find_all("details",{"data-track-string":"detailsExpand"})
    # print(div_container.prettify())
    for detail in div_container:
        day = detail.find("h2",{"data-testid":"daypartName"})
        # print(day.text)

        max_temp = detail.find("span",{"data-testid":"TemperatureValue"})
        # print(max_temp.text)

        min_temp = detail.find("span",{"class":"DetailsSummary--lowTempValue--1DlJK","data-testid":"TemperatureValue"})
        # print(min_temp.text)

        condition = detail.find("span",{"class":"DetailsSummary--extendedData--aaFeV"})
        # print(condition.text)

        info.append({
            "Day":day.text,
            "Max Temp":max_temp.text,
            "Min Temp":min_temp.text,
            "Condition":condition.text
        })
# print(info)

with open("info.csv","w",encoding="utf-8") as file:
    file.write("Day,Max Temperature,Min Temperature,Condition\n")
    for information in info:
        file.write(f"{information['Day']},{information['Max Temp']},{information['Min Temp']},{information['Condition']}\n")


In [13]:
function3()

[{'day': 'Wed 05', 'max_temp': '38', 'min_temp': '25', 'condition': 'Mostly Sunny'}, {'day': 'Thu 06', 'max_temp': '38', 'min_temp': '25', 'condition': 'Mostly Sunny'}, {'day': 'Fri 07', 'max_temp': '36', 'min_temp': '25', 'condition': 'Mostly Sunny'}, {'day': 'Sat 08', 'max_temp': '36', 'min_temp': '25', 'condition': 'Mostly Sunny'}, {'day': 'Sun 09', 'max_temp': '37', 'min_temp': '24', 'condition': 'Isolated T-Storms'}, {'day': 'Mon 10', 'max_temp': '36', 'min_temp': '23', 'condition': 'Partly Cloudy'}, {'day': 'Tue 11', 'max_temp': '36', 'min_temp': '23', 'condition': 'Mostly Sunny'}, {'day': 'Wed 12', 'max_temp': '36', 'min_temp': '23', 'condition': 'Mostly Sunny'}, {'day': 'Thu 13', 'max_temp': '36', 'min_temp': '23', 'condition': 'Mostly Sunny'}, {'day': 'Fri 14', 'max_temp': '36', 'min_temp': '23', 'condition': 'Mostly Sunny'}, {'day': 'Sat 15', 'max_temp': '35', 'min_temp': '24', 'condition': 'Partly Cloudy'}, {'day': 'Sun 16', 'max_temp': '35', 'min_temp': '24', 'condition': '

In [5]:
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options

# webdriver used for driving the browser
# webdriver has we need to download for specific browser.Selenium can be used with most major web browsers, 
# but it requires the appropriate WebDriver for each browser to control it. Selenium WebDrivers act as a bridge 
# between Selenium and the web browser.
# e.g. for chrome, we need chrome web driver similarly for firefox and so on.
from selenium.webdriver.common.keys import Keys

# to find elements by id
from selenium.webdriver.common.by import By


def function1():
    # instantiate the browser 
    # put the path of chrome driver here
    # In below code, is old vsersioned code where we don't have to specify chrome driver path
    # browser = webdriver.Chrome("C:/Users/DELL/Desktop/selenium/chromedriver.exe")
    
    chrome_options = Options()

    # Set the user agent
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
    chrome_options.add_argument(f"user-agent={user_agent}")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")

    browser = webdriver.Chrome(options=chrome_options)
    # below code will take handle browser and driver itself
    # browser = webdriver.Chrome()

    # open the browser to visit google.com on chrome window and automatically closes after 5 seconds
    browser.get("https://google.co.in")
    time.sleep(5)

    # search automatically on google
    # To execute below code, we need to find id element of google searchbox
    # to find id, right click on google search box and find what is the value "id" attribute
    # in this case, id="APjFqb"
    input = browser.find_element(By.ID, "APjFqb")
    
    # search string
    input.send_keys("iPhone 13 pro")
    
    # hold for 5 seconds
    time.sleep(5)

    # make search automatically
    input.send_keys(Keys.ENTER)
    time.sleep(30)

    # Closes the browser and shuts down the ChromiumDriver executable.
    browser.quit()


function1()

In [224]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def function2():
    info = []
    # options = Options()
    # options.add_argument("--headless")
    browser = webdriver.Chrome()
    browser.get("https://www.accuweather.com/en/in/pune/204848/daily-weather-forecast/204848")
    time.sleep(8)
    div_container = browser.find_elements(By.CLASS_NAME,"info")

    for container in div_container:
        high_temp = container.find_element(By.CSS_SELECTOR,".high")
        date = container.find_element(By.CLASS_NAME,"date")
        low_temp = container.find_element(By.CLASS_NAME,"low")

        date_data = date.text.replace("\n1/"," ")
        low_temp_data = low_temp.text.replace("/","")

        info.append({"Date":date_data,"High Temp":high_temp.text,"Low Temp":low_temp_data})

    print(info)

    browser.quit()

    with open('Temp_data.csv','w',encoding='utf-8') as file:
        file.write("Date,High Temp,Low Temp\n")
        for data in info:
            file.write(f"{data['Date']},{data['High Temp']},{data['Low Temp']}\n")


function2()

[{'Date': 'SUN 26', 'High Temp': '35°', 'Low Temp': '11°'}, {'Date': 'MON 27', 'High Temp': '34°', 'Low Temp': '10°'}, {'Date': 'TUE 28', 'High Temp': '34°', 'Low Temp': '12°'}, {'Date': 'WED 29', 'High Temp': '34°', 'Low Temp': '10°'}, {'Date': 'THU 30', 'High Temp': '35°', 'Low Temp': '10°'}, {'Date': 'FRI 31', 'High Temp': '35°', 'Low Temp': '13°'}, {'Date': 'SAT\n2/1', 'High Temp': '29°', 'Low Temp': '11°'}, {'Date': 'SUN\n2/2', 'High Temp': '31°', 'Low Temp': '16°'}, {'Date': 'MON\n2/3', 'High Temp': '29°', 'Low Temp': '17°'}, {'Date': 'TUE\n2/4', 'High Temp': '33°', 'Low Temp': '13°'}, {'Date': 'WED\n2/5', 'High Temp': '29°', 'Low Temp': '14°'}, {'Date': 'THU\n2/6', 'High Temp': '32°', 'Low Temp': '12°'}, {'Date': 'FRI\n2/7', 'High Temp': '34°', 'Low Temp': '13°'}, {'Date': 'SAT\n2/8', 'High Temp': '32°', 'Low Temp': '14°'}, {'Date': 'SUN\n2/9', 'High Temp': '32°', 'Low Temp': '14°'}, {'Date': 'MON\n2/10', 'High Temp': '32°', 'Low Temp': '14°'}, {'Date': 'TUE\n2/11', 'High Temp':

In [39]:
import requests


In [40]:
url = "https://www.amazon.in/gp/bestsellers/?ref_=nav_cs_bestsellers"

In [41]:
response = requests.get(url)
print(response.text)

<!doctype html><html lang="en-in" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->
<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<!-- sp:feature:csm:head-open-part1 -->

<!-- sp:end-feature:csm:head-open-part1 -->
<!-- sp:feature:cs-optimization -->
<meta http-equiv='x-dns-prefetch-control' content='on'>
<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">
<link rel="dns-prefetch" href="https://m.media-amazon.com">
<link rel="dns-prefetch" href="https://completion.amazon.com">
<!-- sp:end-feature:cs-optimization -->
<!-- sp:feature:csm:head-open-part2 -->

<!-- sp:end-feature:csm:head-open-part2 -->
<!-- sp:feature:aui-assets -->
<link rel="stylesheet" href="https://m.media-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,51FIeTurRAL.css,31fFxDf00KL.css,11j2+ObrspL.css,01qDClimA1L.css,01pOTCa2wPL.css,413Vvv3GONL.css,11TIuySqr6L.css,01Rw4F+QU6L.css,11JJsNcqOIL.css,01J3

In [42]:
soup = BeautifulSoup(response.text,"html.parser")

In [70]:
product_url_data = soup.find("a",{"class":"a-link-normal aok-block"})

In [71]:
print(product_url_data["href"])

/Oneplus-Bluetooth-Wireless-Earphones-Bombastic/dp/B09TVVGXWS/ref=zg_bs_c_electronics_d_sccl_1/000-0000000-0000000?pd_rd_w=PAAx2&content-id=amzn1.sym.cde02f8b-0594-439d-9e93-f4cced7ce3ce&pf_rd_p=cde02f8b-0594-439d-9e93-f4cced7ce3ce&pf_rd_r=ZCDVF346R73WRJ27J90P&pd_rd_wg=hgyCC&pd_rd_r=398fe485-69f9-4f35-b167-2cb1a72386ec&pd_rd_i=B09TVVGXWS&psc=1


In [72]:
product_title_data = soup.find("div",{"class":"p13n-sc-truncate-desktop-type2"})

In [111]:
print(product_title_data.text)

Oneplus Bullets Z2 Bluetooth Wireless in Ear Earphones with Mic, Bombastic Bass - 12.4 mm Drivers, 10 Mins Charge - 20 Hrs Music, 30 Hrs Battery Life, IP55 Dust and Water Resistant (Magico Black)


In [108]:
product_price_data = soup.select("span.a-color-price")

In [106]:
product_price_data = soup.find_all("span",{"class":"a-size-base a-color-price"})

In [109]:
print(product_price_data)

[]


In [101]:
for cl_name in product_price_data:
    print(cl_name)

<span class="a-carousel-pagination a-size-base"><span class="a-carousel-page-count">Page <span class="a-carousel-page-current">1</span> of <span class="a-carousel-page-max">1</span> </span><span class="a-carousel-restart-container"><span class="a-text-separator"></span><a class="a-carousel-restart" href="#">Start over</a></span><span aria-live="polite" class="a-carousel-accessibility-page-info a-offscreen">Page 1 of 1  </span></span>
<span class="a-carousel-pagination a-size-base"><span class="a-carousel-page-count">Page <span class="a-carousel-page-current">1</span> of <span class="a-carousel-page-max">1</span> </span><span class="a-carousel-restart-container"><span class="a-text-separator"></span><a class="a-carousel-restart" href="#">Start over</a></span><span aria-live="polite" class="a-carousel-accessibility-page-info a-offscreen">Page 1 of 1  </span></span>
<span class="a-carousel-pagination a-size-base"><span class="a-carousel-page-count">Page <span class="a-carousel-page-curren

In [110]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:csm:head-open-part1 -->
  <!-- sp:end-feature:csm:head-open-part1 -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:csm:head-open-part2 -->
  <!-- sp:end-feature:csm:head-open-part2 -->
  <!-- sp:feature:aui-assets -->
  <link href="https://m.media-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,51FIeTurRAL.css,31fFxDf00KL.css,11j2+ObrspL.css,01qDClimA1L.css,01pOTCa2wPL.css,413Vvv3GONL.css,11TIuySqr6L.css,01Rw4F+Q

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.amazon.in/gp/bestsellers/?ref_=nav_cs_bestsellers")
driver.implicitly_wait(10)
product_price_data = driver.find_element(By.XPATH, "/html/body/div[1]/div[1]/div[2]/div/div/div/div[2]/div/div[2]/div/div[1]/div/div/div/div/div[2]/div/div[2]/div/ol/li/div/div[2]/span/div/div/div/div[2]/div/div/a/div/span/span")
print(product_price_data.text)

driver.quit()


₹1,299.00


In [3]:
#Use Multithreading when :
# -Tasks are I/O-bound (waiting on input/output operations).
# -need concurrency but not full parallelism.
# -want to reduce memory overhead (threads share memory).
# -are working with tasks like:
#     Network requests (API calls, web scraping)
#     File I/O (reading/writing files, database queries)
#     Waiting for user input

In [4]:
#Use Multiprocessing when :
# -Tasks are CPU-bound (heavy computations, mathematical operations).
# -need true parallelism (utilize multiple CPU cores).
# -to avoid GIL (Global Interpreter Lock) limitations.
# -working with tasks like:
    # Data processing (image processing, video encoding)
    # Machine learning model training
    # Scientific computing (NumPy, Pandas, simulations)