# Importing Washington AQI daily summary data via API

In [2]:
import requests
import pandas as pd
from datetime import datetime

In [3]:
# EPA API account credentials
email = "ch.vishal400@gmail.com"  
api_key = "sandkit13" 

In [4]:
# link to reteive data
url = "https://aqs.epa.gov/data/api/dailyData/byState"

# parameters of url to make it work
params = {
    "email": email,
    "key": api_key,
    "param": "88101",  # PM2.5 (Particulate Matter pollution Local Conditions)
    "bdate": "20240101",  # Begining date of data collection January 1, 2024
    "edate": "20241231",  # Ending date of data collection December 31, 2024
    "state": "53" }  # washington state code


In [5]:
# function to retrieve data from EPA API
def aqi_data(url, params):
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        if data["Header"][0]["status"] == "Success":
            return data["Data"]
        else:
            print("API request failed:", data["Header"][0]["error"])
            return None
    else:
        print(f"HTTP Error: {response.status_code}")
        return None

In [6]:
# gathering data of washington aqi
print("Gathering data for Washington, 2024...")
wa_aqi_data = aqi_data(url, params)

Gathering data for Washington, 2024...


In [7]:
# covert data to save in csv format
if wa_aqi_data:
    df = pd.DataFrame(wa_aqi_data)
    
    print("Columns Available in the data:", df.columns.tolist())  # print available columns to check
    print(f"Total Columns: {len(df.columns)}") # print total number of columns
    print(f"Total Rows: {len(df)}") # print total number of rows 
    
    file = "washington_aqi.csv" # save file in csv
    df.to_csv(file, index=False) 
    print(f"Data saved to {file}")
    
    print("\nFirst 5 rows of the data:") # to check few rows
    print(df.head())
else:
    print("no data found.")

Columns Available in the data: ['state_code', 'county_code', 'site_number', 'parameter_code', 'poc', 'latitude', 'longitude', 'datum', 'parameter', 'sample_duration_code', 'sample_duration', 'pollutant_standard', 'date_local', 'units_of_measure', 'event_type', 'observation_count', 'observation_percent', 'validity_indicator', 'arithmetic_mean', 'first_max_value', 'first_max_hour', 'aqi', 'method_code', 'method', 'local_site_name', 'site_address', 'state', 'county', 'city', 'cbsa_code', 'cbsa', 'date_of_last_change']
Total Columns: 32
Total Rows: 70939
Data saved to washington_aqi.csv

First 5 rows of the data:
  state_code county_code site_number parameter_code  poc   latitude  \
0         53         033        0030          88101    5  47.597222   
1         53         033        0030          88101    5  47.597222   
2         53         033        0030          88101    5  47.597222   
3         53         033        0030          88101    5  47.597222   
4         53         033    

# Connection to MongoDB

In [9]:
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017/')
dbname = client['washington_aqi']
collection_name = dbname['aqi_daily_summary']


In [10]:
# Read CSV data 
wa_aqi_data = pd.read_csv('washington_aqi.csv') # washington aqi data

print(wa_aqi_data.head())
print(wa_aqi_data.tail())

# count total number of rows
total_rows = len(wa_aqi_data)
print(f" Total number of rows: {total_rows}")

   state_code  county_code  site_number  parameter_code  poc   latitude  \
0          53           33           30           88101    5  47.597222   
1          53           33           30           88101    5  47.597222   
2          53           33           30           88101    5  47.597222   
3          53           33           30           88101    5  47.597222   
4          53           33           30           88101    5  47.597222   

    longitude  datum                 parameter sample_duration_code  ...  \
0 -122.319722  WGS84  PM2.5 - Local Conditions                    X  ...   
1 -122.319722  WGS84  PM2.5 - Local Conditions                    1  ...   
2 -122.319722  WGS84  PM2.5 - Local Conditions                    1  ...   
3 -122.319722  WGS84  PM2.5 - Local Conditions                    1  ...   
4 -122.319722  WGS84  PM2.5 - Local Conditions                    X  ...   

  method_code                                             method  \
0         170  Met One B

In [11]:
# convert dataframe to dictionary format
data = wa_aqi_data.to_dict(orient='records')

# insert data in dictonary format into mongodb
collection_name.insert_many(data)

InsertManyResult([ObjectId('67f8583fa48cef0646241d45'), ObjectId('67f8583fa48cef0646241d46'), ObjectId('67f8583fa48cef0646241d47'), ObjectId('67f8583fa48cef0646241d48'), ObjectId('67f8583fa48cef0646241d49'), ObjectId('67f8583fa48cef0646241d4a'), ObjectId('67f8583fa48cef0646241d4b'), ObjectId('67f8583fa48cef0646241d4c'), ObjectId('67f8583fa48cef0646241d4d'), ObjectId('67f8583fa48cef0646241d4e'), ObjectId('67f8583fa48cef0646241d4f'), ObjectId('67f8583fa48cef0646241d50'), ObjectId('67f8583fa48cef0646241d51'), ObjectId('67f8583fa48cef0646241d52'), ObjectId('67f8583fa48cef0646241d53'), ObjectId('67f8583fa48cef0646241d54'), ObjectId('67f8583fa48cef0646241d55'), ObjectId('67f8583fa48cef0646241d56'), ObjectId('67f8583fa48cef0646241d57'), ObjectId('67f8583fa48cef0646241d58'), ObjectId('67f8583fa48cef0646241d59'), ObjectId('67f8583fa48cef0646241d5a'), ObjectId('67f8583fa48cef0646241d5b'), ObjectId('67f8583fa48cef0646241d5c'), ObjectId('67f8583fa48cef0646241d5d'), ObjectId('67f8583fa48cef0646241d

In [12]:
# to check inserted data
for col in collection_name.find().limit(10):
    print(col)


{'_id': ObjectId('67f8583fa48cef0646241d45'), 'state_code': 53, 'county_code': 33, 'site_number': 30, 'parameter_code': 88101, 'poc': 5, 'latitude': 47.597222, 'longitude': -122.319722, 'datum': 'WGS84', 'parameter': 'PM2.5 - Local Conditions', 'sample_duration_code': 'X', 'sample_duration': '24-HR BLK AVG', 'pollutant_standard': 'PM25 24-hour 2012', 'date_local': '2024-01-03', 'units_of_measure': 'Micrograms/cubic meter (LC)', 'event_type': 'No Events', 'observation_count': 1, 'observation_percent': 100.0, 'validity_indicator': 'Y', 'arithmetic_mean': 4.9, 'first_max_value': 4.9, 'first_max_hour': 0, 'aqi': 27.0, 'method_code': 170, 'method': 'Met One BAM-1020 Mass Monitor w/VSCC - Beta Attenuation', 'local_site_name': 'Seattle-10th & Weller', 'site_address': '10th & Weller', 'state': 'Washington', 'county': 'King', 'city': 'Seattle', 'cbsa_code': 42660.0, 'cbsa': 'Seattle-Tacoma-Bellevue, WA', 'date_of_last_change': '2025-04-04'}
{'_id': ObjectId('67f8583fa48cef0646241d46'), 'state_c

In [13]:
# to count the number of the docs in the collection
total_docs = collection_name.count_documents({})
print(f"total number of rows/docs in the collection: {total_docs}")

total number of rows/docs in the collection: 70939


In [14]:
# Dropped columns that are not required

drop_columns = {"state_code": "","state": "",
                "site_number": "","parameter_code": "",
                "poc": "","latitude": "","longitude": "",
                "cbsa_code": "","sample_duration_code": "",
                "sample_duration": "","pollutant_standard": "",
                "units_of_measure": "","event_type": "",
                "site_address": "","site_address": ""}

collection_name.update_many({}, {"$unset":drop_columns}) # empty string values are just placeholders for $unset

UpdateResult({'n': 70939, 'nModified': 70939, 'ok': 1.0, 'updatedExisting': True}, acknowledged=True)

In [15]:
# to check the field names and keys 
updated_collection = collection_name.find_one()
print(updated_collection.keys())

dict_keys(['_id', 'county_code', 'datum', 'parameter', 'date_local', 'observation_count', 'observation_percent', 'validity_indicator', 'arithmetic_mean', 'first_max_value', 'first_max_hour', 'aqi', 'method_code', 'method', 'local_site_name', 'county', 'city', 'cbsa', 'date_of_last_change'])


In [16]:
# to check 5 docs in the collection

for doc in collection_name.find().limit(5):
    print(doc)

{'_id': ObjectId('67f8583fa48cef0646241d45'), 'county_code': 33, 'datum': 'WGS84', 'parameter': 'PM2.5 - Local Conditions', 'date_local': '2024-01-03', 'observation_count': 1, 'observation_percent': 100.0, 'validity_indicator': 'Y', 'arithmetic_mean': 4.9, 'first_max_value': 4.9, 'first_max_hour': 0, 'aqi': 27.0, 'method_code': 170, 'method': 'Met One BAM-1020 Mass Monitor w/VSCC - Beta Attenuation', 'local_site_name': 'Seattle-10th & Weller', 'county': 'King', 'city': 'Seattle', 'cbsa': 'Seattle-Tacoma-Bellevue, WA', 'date_of_last_change': '2025-04-04'}
{'_id': ObjectId('67f8583fa48cef0646241d46'), 'county_code': 33, 'datum': 'WGS84', 'parameter': 'PM2.5 - Local Conditions', 'date_local': '2024-01-01', 'observation_count': 24, 'observation_percent': 100.0, 'validity_indicator': 'Y', 'arithmetic_mean': 20.083333, 'first_max_value': 43.0, 'first_max_hour': 6, 'aqi': nan, 'method_code': 170, 'method': 'Met One BAM-1020 Mass Monitor w/VSCC - Beta Attenuation', 'local_site_name': 'Seattle-

# Preprocessing of the Data

In [18]:
import pandas as pd

df_data = collection_name.find() # retrieving all documents from the collection

df = pd.DataFrame(list(df_data)) # converted to dataframe


In [19]:
# removed the id column
df.drop(columns=['_id'], inplace= True)

In [20]:
df.head()

Unnamed: 0,county_code,datum,parameter,date_local,observation_count,observation_percent,validity_indicator,arithmetic_mean,first_max_value,first_max_hour,aqi,method_code,method,local_site_name,county,city,cbsa,date_of_last_change
0,33,WGS84,PM2.5 - Local Conditions,2024-01-03,1,100.0,Y,4.9,4.9,0,27.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04
1,33,WGS84,PM2.5 - Local Conditions,2024-01-01,24,100.0,Y,20.083333,43.0,6,,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04
2,33,WGS84,PM2.5 - Local Conditions,2024-01-02,24,100.0,Y,14.875,29.0,8,,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04
3,33,WGS84,PM2.5 - Local Conditions,2024-01-03,23,96.0,Y,4.956522,14.0,8,,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04
4,33,WGS84,PM2.5 - Local Conditions,2024-01-01,1,100.0,Y,20.0,20.0,0,71.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04


#### Removing duplicates

In [22]:
df.drop_duplicates(inplace= True)

In [23]:
df.dropna(subset=['aqi'], inplace = True) # dropped duplicate rows

In [24]:
print(df.isnull().sum()) # checking if there are any missing values

county_code              0
datum                    0
parameter                0
date_local               0
observation_count        0
observation_percent      0
validity_indicator       0
arithmetic_mean          0
first_max_value          0
first_max_hour           0
aqi                      0
method_code              0
method                   0
local_site_name        226
county                   0
city                     0
cbsa                   411
date_of_last_change      0
dtype: int64


In [25]:
df['cbsa'].fillna('Unknown', inplace = True)  #filling missing values in column 'cbsa'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cbsa'].fillna('Unknown', inplace = True)  #filling missing values in column 'cbsa'


In [26]:
df['local_site_name'].fillna('Unknown', inplace = True) #filling missing values in column 'local_site_name'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['local_site_name'].fillna('Unknown', inplace = True) #filling missing values in column 'local_site_name'


In [27]:
# convert normal date format to datetime, so that year and month can be extracted
df['date_local'] = pd.to_datetime(df['date_local'], errors = 'coerce') 


In [28]:
print(df['date_local'].head()) # check few entries to check the format

0    2024-01-03
4    2024-01-01
5    2024-01-02
28   2024-01-04
37   2024-01-05
Name: date_local, dtype: datetime64[ns]


In [29]:
df['year'] = df['date_local'].dt.year #extracted year from date_local column and create a new column 'year'
df['month'] = df['date_local'].dt.month #extracted month from date_local column and create a new column 'month'

In [30]:
df.head()

Unnamed: 0,county_code,datum,parameter,date_local,observation_count,observation_percent,validity_indicator,arithmetic_mean,first_max_value,first_max_hour,aqi,method_code,method,local_site_name,county,city,cbsa,date_of_last_change,year,month
0,33,WGS84,PM2.5 - Local Conditions,2024-01-03,1,100.0,Y,4.9,4.9,0,27.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04,2024,1
4,33,WGS84,PM2.5 - Local Conditions,2024-01-01,1,100.0,Y,20.0,20.0,0,71.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04,2024,1
5,33,WGS84,PM2.5 - Local Conditions,2024-01-02,1,100.0,Y,14.8,14.8,0,62.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04,2024,1
28,33,WGS84,PM2.5 - Local Conditions,2024-01-04,1,100.0,Y,3.6,3.6,0,20.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04,2024,1
37,33,WGS84,PM2.5 - Local Conditions,2024-01-05,1,100.0,Y,4.6,4.6,0,26.0,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Seattle-10th & Weller,King,Seattle,"Seattle-Tacoma-Bellevue, WA",2025-04-04,2024,1
