In [1]:
import requests, random, html_to_json, time
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Every monitored station lives under the same Bangalore path on aqi.in,
# so the URLs are assembled from one base plus the per-station slug.
_STATION_BASE_URL = "https://www.aqi.in/weather/india/karnataka/bangalore"
_STATION_SLUGS = (
    "jp-nagar",
    "bapuji-nagar",
    "city-railway-station",
    "brigade-road",
    "koramangala",
    "bellandur",
    "doddanekundi",
)

stations = [f"{_STATION_BASE_URL}/{slug}" for slug in _STATION_SLUGS]

In [3]:
def get_page_with_cloudflare_bypass(url):
    """Download a page using browser-like headers and a Google referrer.

    A fresh session is created with a randomly chosen desktop User-Agent,
    Google is visited first to pick up cookies, and the target URL is then
    requested with ``Referer: https://www.google.com/``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text when the server answers with HTTP 200;
        otherwise ``None`` (the status code is printed).

    Raises:
        requests.RequestException: if the request for ``url`` itself fails
            (connection error, timeout, ...). A failure of the Google
            warm-up request is only reported, not raised.
    """
    # List of user agents to rotate through
    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203"
    ]

    # The context manager closes the session (and its pooled connections)
    # even when a request raises.
    with requests.Session() as session:
        # Set default headers
        # NOTE(review): 'br' is advertised below; requests only decodes
        # Brotli bodies when a brotli package is installed -- confirm it is
        # available in this environment.
        session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
            'DNT': '1',
        })

        # First visit Google to establish cookies and referrer. This step is
        # best-effort: it gets a timeout so it cannot hang forever, and a
        # failure here must not prevent the real request from being tried.
        try:
            session.get('https://www.google.com', timeout=30)
        except requests.RequestException as exc:
            print(f"Warning: warm-up request failed: {exc}")
        time.sleep(random.uniform(1, 2))

        # Add referrer from Google
        session.headers.update({
            'Referer': 'https://www.google.com/'
        })

        # Make the request
        response = session.get(url, timeout=30)

        if response.status_code == 200:
            return response.text

        print(f"Error: HTTP status code {response.status_code}")
        return None

In [4]:
def get_air_quality(station):
    """Scrape the current air-quality and weather readings from a station page.

    The page is downloaded, selected elements are converted to nested
    dicts/lists with ``html_to_json``, and each reading is picked out by a
    fixed path through that structure.

    Args:
        station: URL of the station page to scrape.

    Returns:
        The tuple ``(pm25, pm10, temperature, humidity, feels_like, aqi,
        wind_speed)``. Values are returned as extracted from the markup
        (presumably strings -- no numeric conversion is done here).

    Raises:
        RuntimeError: if the page could not be downloaded.
        KeyError: if an expected element is missing from the page.
        KeyError, IndexError: if the markup inside an element no longer
            matches the hard-coded paths below.
    """
    print(station)

    html_content = get_page_with_cloudflare_bypass(station)
    if not html_content:
        # Without this guard `soup` would never be bound and the first
        # lookup below would fail with an unrelated-looking NameError.
        raise RuntimeError(f"Could not download station page: {station}")

    # Parse data
    soup = BeautifulSoup(html_content, 'html.parser')

    def json_this(selector):
        """Convert the first element matching ``selector`` to nested dicts/lists."""
        element = soup.select_one(selector)
        if element is None:
            # Converting the string "None" would only fail later with an
            # opaque error on the first index lookup; fail here instead.
            raise KeyError(f"No element matches {selector!r} on {station}")
        return html_to_json.convert(str(element))

    json_data = json_this("div[class*='pm-data']")
    pm25 = json_data['div'][0]['span'][1]['_value']
    pm10 = json_data['div'][0]['span'][3]['_value']

    json_data = json_this("div[class*='temp flex']")
    temperature = json_data['div'][0]['p'][0]['span'][0]['_value']

    json_data = json_this("div[class*='condition grid']")
    humidity = json_data['div'][0]['div'][0]['span'][2]['span'][0]['_values'][0]
    # Keep only the part before the degree sign.
    feels_like = json_data['div'][0]['div'][0]['span'][1]['span'][0]['_value'].split("°")[0]

    json_data = json_this("a[class*='aqi']")
    aqi = json_data['a'][0]['div'][0]['div'][0]['div'][0]['div'][1]['div'][0]['p'][0]['span'][0]['_value']

    json_data = json_this("div[class*='wind-info']")
    wind_speed = json_data['div'][0]['div'][0]['div'][0]['div'][0]['div'][0]['div'][0]['div'][1]['div'][0]['p'][0]['span'][0]['_value']

    return pm25, pm10, temperature, humidity, feels_like, aqi, wind_speed

In [5]:
# One timestamp is taken up front so every row of this run shares it.
today = datetime.now()


def station_name(st):
    """Build a display label from the "/"-split parts of a station URL.

    The last part (station slug, dashes turned into spaces) and the part
    before it are upper-cased and joined as ``"<STATION>, <PARENT>"``.
    """
    # Single quotes inside the f-string: reusing the outer double quotes is
    # a SyntaxError on Python versions before 3.12.
    return f"{st[-1].replace('-', ' ').upper()}, {st[-2].upper()}"


# Collect plain dicts and build the frame once; concatenating a new
# DataFrame on every iteration re-copies all earlier rows each time.
records = []
for station in stations:
    pm25, pm10, temperature, humidity, feels_like, aqi, wind_speed = get_air_quality(station)

    data = {
        "date": today.date(),
        "time": today.time().strftime("%H:%M"),
        "station": station_name(station.split("/")),
        "aqi": aqi,
        "pm2.5": pm25,
        "pm10": pm10,
        "temperature": temperature,
        "feels_like": feels_like,
        "humidity": humidity,
        "wind_speed": wind_speed,
    }
    records.append(data)

df = pd.DataFrame(records)

# TODO(review): the readings are not persisted. The earlier commented-out
# draft called `driver.quit()` and `df.to_csv(out_file, mode="a",
# header=header, index=False)` with out_file = "blr_traffic_times.csv";
# `driver` and `header` are not defined in the visible cells, so confirm
# the intended output file and header handling before enabling a save step.

https://www.aqi.in/weather/india/karnataka/bangalore/jp-nagar
https://www.aqi.in/weather/india/karnataka/bangalore/bapuji-nagar
https://www.aqi.in/weather/india/karnataka/bangalore/city-railway-station
https://www.aqi.in/weather/india/karnataka/bangalore/brigade-road
https://www.aqi.in/weather/india/karnataka/bangalore/koramangala
https://www.aqi.in/weather/india/karnataka/bangalore/bellandur
https://www.aqi.in/weather/india/karnataka/bangalore/doddanekundi


In [6]:
# Show the collected readings as a rich table (the loop above adds one row per station).
display(df)

Unnamed: 0,date,time,station,aqi,pm2.5,pm10,temperature,feels_like,humidity,wind_speed
0,2025-07-29,19:38,"JP NAGAR, BANGALORE",60,14,45,22,25,88,31
1,2025-07-29,19:38,"BAPUJI NAGAR, BANGALORE",57,12,45,22,25,88,31
2,2025-07-29,19:38,"CITY RAILWAY STATION, BANGALORE",62,15,39,22,25,88,31
3,2025-07-29,19:38,"BRIGADE ROAD, BANGALORE",60,14,34,22,25,88,31
4,2025-07-29,19:38,"KORAMANGALA, BANGALORE",60,14,52,22,25,88,31
5,2025-07-29,19:38,"BELLANDUR, BANGALORE",63,16,51,22,25,88,31
6,2025-07-29,19:38,"DODDANEKUNDI, BANGALORE",63,15,44,22,25,88,31


In [15]:
# Exploration: fetch the first station page so its markup can be inspected.
html_content = get_page_with_cloudflare_bypass(stations[0])
if not html_content:
    # Stop here rather than leave `soup` undefined (or stale from an earlier
    # run of this cell) for the inspection cells that follow.
    raise RuntimeError(f"Could not download {stations[0]}")

# Parse data
soup = BeautifulSoup(html_content, 'html.parser')

In [16]:
# Inspect the parsed page; a bare `soup` renders the entire document, so expect very large output.
soup

<!DOCTYPE html>
<html class="text-[3px] min-[150px]:text-[4.3px] min-[200px]:text-[5.5px] min-[250px]:text-[7px] min-[300px]:text-[8.5px] min-[500px]:text-[10px] min-[600px]:text-[10px] min-[640px]:text-[9px] min-[768px]:text-[6.5px] min-[900px]:text-[7px] min-[970px]:text-[7.5px] min-[1024px]:text-[8px] min-[1400px]:text-[8.7px]" dir="ltr" lang="en"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link as="image" fetchpriority="high" href="https://www.aqi.in/media/misc/aqi-logo.svg" rel="preload"/><link as="image" fetchpriority="high" href="https://www.aqi.in/media/misc/chat-with-us.svg" rel="preload"/><link as="image" fetchpriority="high" href="https://www.aqi.in/media/sensor-ranges/temperature-cool-level.webp" rel="preload"/><link data-precedence="next" href="https://www.aqi.in/_next/static/css/09e867ee82502d30.css" rel="stylesheet"/><link data-precedence="next" href="https://www.aqi.in/_next/static/css/e67f24cfbff8d1cd.css" rel="sty

In [66]:
# Exploration: convert the wind-info card to nested dicts/lists so the path
# to the wind-speed value can be worked out from the displayed structure.
html_string = str(soup.select_one("div[class*='wind-info']"))
json_data = html_to_json.convert(html_string)
json_data['div'][0]

{'_attributes': {'class': ['wind-info', 'min-w-0']},
 'div': [{'_attributes': {'class': ['gradient-border-card',
     'overflow-hidden',
     'card',
     'rounded-[1.7rem]',
     'sm:rounded-2',
     'relative',
     'min-w-0',
     'h-full']},
   'div': [{'_attributes': {'class': ['wrapper',
       'w-full',
       'bg-[#1e374d]',
       'h-full',
       'relative',
       'flex',
       'flex-col',
       'max-w-full']},
     'div': [{'_attributes': {'class': ['content',
         'p-[2rem]',
         'sm:px-[2.5rem]',
         'grow',
         'flex',
         'items-center',
         'justify-center']},
       'div': [{'_attributes': {'class': ['flex',
           'flex-col',
           'justify-evenly',
           'gap-[2.5rem]',
           'w-full',
           'h-full',
           'pt-[2rem]']},
         'div': [{'_attributes': {'class': ['speed',
             'gap-[5rem_2rem]',
             'grid',
             'grid-cols-[auto_auto]',
             'lg:grid-cols-1',
             

In [89]:
# Wind-speed reading: the same path through the converted wind-info card that get_air_quality uses for wind_speed.
json_data['div'][0]['div'][0]['div'][0]['div'][0]['div'][0]['div'][0]['div'][1]['div'][0]['p'][0]['span'][0]['_value']

'27'