# AQI prediction Model with Python

- PM2.5 PM10
- NO, NO2
- NH3 - Ammonia
- CO
- So2
- o3
- Benzane, Toluene, Xylene

In [None]:
pip install numpy pandas matplotlib seaborn scikit-learn

In [5]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
df = pd. read_csv('air quality data.csv')
df.head() # top 5 rows!

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [7]:
#shape - rows and cols!
df.shape

(29531, 16)

In [8]:
#information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [12]:
# to know the duplicate values
df.duplicated().sum()

0

In [13]:
# to check missing values
df.isnull().sum()

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [None]:
# drop the rows where 'AQI' has missing values
df.dropna(subset=['AQI'], inplace = True)

In [14]:
df.isnull().sum().sort_values (ascending=False)

Xylene        18109
PM10          11140
NH3           10328
Toluene        8041
Benzene        5623
AQI            4681
AQI_Bucket     4681
PM2.5          4598
NOx            4185
O3             4022
SO2            3854
NO2            3585
NO             3582
CO             2059
City              0
Date              0
dtype: int64

In [15]:
df.shape

(29531, 16)

In [18]:
#summary of statistics in the dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PM2.5,24933.0,67.450578,64.661449,0.04,28.82,48.57,80.59,949.99
PM10,18391.0,118.127103,90.60511,0.01,56.255,95.68,149.745,1000.0
NO,25949.0,17.57473,22.785846,0.02,5.63,9.89,19.95,390.68
NO2,25946.0,28.560659,24.474746,0.01,11.75,21.69,37.62,362.21
NOx,25346.0,32.309123,31.646011,0.0,12.82,23.52,40.1275,467.63
NH3,19203.0,23.483476,25.684275,0.01,8.58,15.85,30.02,352.89
CO,27472.0,2.248598,6.962884,0.0,0.51,0.89,1.45,175.81
SO2,25677.0,14.531977,18.133775,0.01,5.67,9.16,15.22,193.86
O3,25509.0,34.49143,21.694928,0.01,18.86,30.84,45.57,257.73
Benzene,23908.0,3.28084,15.811136,0.0,0.12,1.07,3.08,455.03


In [21]:
#Percentage of null values
null_values_percentage = (df.isnull().sum()/df.isnull().count()+100).sort_values(ascending=False)
null_values_percentage

Xylene        100.613220
PM10          100.377231
NH3           100.349734
Toluene       100.272290
Benzene       100.190410
AQI           100.158511
AQI_Bucket    100.158511
PM2.5         100.155701
NOx           100.141715
O3            100.136196
SO2           100.130507
NO2           100.121398
NO            100.121296
CO            100.069723
City          100.000000
Date          100.000000
dtype: float64

#### Key Considerations:
- Xylene has the highest percentage of missing values - 61.86%
- PM10 and NH3 28 - 26 %