## EDA and classification prediction on AQI India data

This notebook is maintained as a worked analysis of the Kaggle dataset <https://www.kaggle.com/rohanrao/air-quality-data-in-india>.

In [1]:
import os

#import standard data sci libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#!pip install chart_studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

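For convenience, I have saved a copy of the dataset in my GitHub repo: <https://github.com/adityarc19/aqi-india/blob/main/city_day.csv>. Note that the next cell does not read `city_day.csv`; it loads a different, local file named `add_AQI.csv`.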

In [2]:
# Location of the AQI-augmented CSV. The default is the original author's
# local path; set the AQI_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'AQI_DATA_PATH',
    '/home/thu/INT3041E_AI_PM2.5-Concentration-Estimation/data/add_AQI.csv',
)
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,time,ID,pm25,lat,lon,SQRT_SEA_DEM_LAT,WSPD,WDIR,TMP,TX,...,NDVI,CO,HCHO,NO2,SO2,CLOUD,O3,AAI,AQI,AQI_Class
0,2020-01-01,19,116.949130,21.049750,105.741870,5.922647,1.136119,145.942749,20.811243,23.219995,...,0.000551,0.045586,0.000060,0.000072,0.000035,0.711612,0.101653,-0.836203,183,3
1,2020-01-01,79,105.103044,21.015250,105.800130,4.307231,1.136119,145.942749,20.811243,23.219995,...,-0.003006,0.041913,0.000107,0.000086,0.000056,0.761720,0.103635,-0.718537,177,3
2,2020-01-01,163,118.285100,21.024347,106.017288,4.988467,0.651509,145.395233,20.677492,23.029993,...,0.004388,0.042329,0.000108,0.000071,0.000026,0.867689,0.102144,-0.942304,183,3
3,2020-01-01,300,116.739130,21.023532,105.853941,4.865087,1.136119,145.942749,20.811243,23.219995,...,-0.001733,0.041913,0.000097,0.000085,0.000079,0.779294,0.102875,-0.692613,183,3
4,2020-01-02,19,76.856667,21.049750,105.741870,5.922647,2.744283,147.084442,22.176249,25.119989,...,0.016035,0.042298,0.000115,0.000082,-0.000345,0.705396,0.103488,-1.119681,162,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,2021-12-31,163,16.154054,21.024347,106.017288,4.988467,1.669253,22.916901,16.846243,18.989985,...,-0.002742,0.044463,0.000133,0.000080,0.000107,1.000000,0.113309,-0.477924,60,1
11504,2021-12-31,172,17.153631,20.927000,106.314000,4.784339,1.322285,353.486359,17.187494,18.839991,...,0.001342,0.047781,0.000043,0.000096,0.000121,1.000000,0.113023,-0.511212,62,1
11505,2021-12-31,173,11.901323,20.977400,106.430800,4.589203,0.235641,347.438232,17.732492,19.739985,...,0.010294,0.047366,0.000045,0.000073,0.000172,1.000000,0.112750,-0.629895,50,0
11506,2021-12-31,195,24.209832,21.338470,105.367300,5.228505,1.295622,97.865257,16.122246,17.824003,...,-0.003163,0.049873,0.000110,0.000077,0.000105,1.000000,0.112451,-0.337096,76,1


In [3]:
# List the column names of the loaded frame
df.columns

Index(['time', 'ID', 'pm25', 'lat', 'lon', 'SQRT_SEA_DEM_LAT', 'WSPD', 'WDIR',
       'TMP', 'TX', 'TN', 'TP', 'RH', 'PRES2M', 'NDVI', 'CO', 'HCHO', 'NO2',
       'SO2', 'CLOUD', 'O3', 'AAI', 'AQI', 'AQI_Class'],
      dtype='object')

In [4]:
df.info()  # per-column non-null count and dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11508 entries, 0 to 11507
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              11508 non-null  object 
 1   ID                11508 non-null  int64  
 2   pm25              11508 non-null  float64
 3   lat               11508 non-null  float64
 4   lon               11508 non-null  float64
 5   SQRT_SEA_DEM_LAT  11508 non-null  float64
 6   WSPD              11508 non-null  float64
 7   WDIR              11508 non-null  float64
 8   TMP               11508 non-null  float64
 9   TX                11508 non-null  float64
 10  TN                11508 non-null  float64
 11  TP                11508 non-null  float64
 12  RH                11508 non-null  float64
 13  PRES2M            11508 non-null  float64
 14  NDVI              11508 non-null  float64
 15  CO                11508 non-null  float64
 16  HCHO              11508 non-null  float6

In [5]:
# Number of missing (null) values in each column
df.isnull().sum()

time                0
ID                  0
pm25                0
lat                 0
lon                 0
SQRT_SEA_DEM_LAT    0
WSPD                0
WDIR                0
TMP                 0
TX                  0
TN                  0
TP                  0
RH                  0
PRES2M              0
NDVI                0
CO                  0
HCHO                0
NO2                 0
SO2                 0
CLOUD               0
O3                  0
AAI                 0
AQI                 0
AQI_Class           0
dtype: int64

In [6]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,ID,pm25,lat,lon,SQRT_SEA_DEM_LAT,WSPD,WDIR,TMP,TX,TN,...,NDVI,CO,HCHO,NO2,SO2,CLOUD,O3,AAI,AQI,AQI_Class
count,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,...,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0,11508.0
mean,125.563434,35.549743,21.101977,106.018574,5.325964,2.783826,128.810995,24.694296,28.78024,21.765171,...,0.138434,0.047092,0.000189,7.3e-05,1.8e-05,0.606207,0.119568,-0.89773,94.518683,1.402763
std,70.516935,29.450255,0.0854,0.238192,0.620108,1.37897,78.842842,5.099146,6.039901,4.751163,...,0.156417,0.013641,0.000113,4.4e-05,0.000282,0.348222,0.007462,1.057986,52.073276,0.97924
min,5.0,3.0231,20.927,105.3673,3.382894,0.020039,0.093567,9.192495,11.040003,7.230005,...,-0.073627,0.021639,-0.000376,4e-06,-0.000984,0.0,0.098486,-4.205145,13.0,0.0
25%,60.0,16.326228,21.027221,105.889544,4.865087,1.696836,87.5905,20.934993,24.40912,18.265497,...,0.022693,0.036886,0.000122,5.3e-05,-8.1e-05,0.315847,0.113938,-1.621201,60.0,1.0
50%,156.0,27.171879,21.113745,106.017288,5.465172,2.720441,136.356522,24.915009,28.959986,22.269983,...,0.065286,0.044785,0.00017,6.6e-05,5e-06,0.638603,0.1215,-1.00611,83.0,1.0
75%,163.0,44.422146,21.152745,106.15188,5.693217,3.773106,150.831589,28.925005,33.412499,25.661511,...,0.226967,0.054317,0.000242,8.2e-05,8.2e-05,0.999508,0.125447,-0.345595,124.0,2.0
max,300.0,241.724021,21.33847,106.5291,6.696101,9.585102,359.884491,34.650002,41.749992,30.85,...,1.0,0.115467,0.001042,0.000879,0.002998,1.0,0.136669,4.356118,500.0,5.0


#### Let's look at the missing values

In [8]:
# Missing values
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a short summary (total number of columns and how many of them have
    at least one missing value) and returns the details as a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'`` (count)
        and ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by percentage in descending order. The table is empty when
        nothing is missing.
    """
    # Count the nulls once and reuse the result for the percentage.
    mis_val = df.isnull().sum()

    # A frame without rows has nothing missing. Guard the division so that
    # 0/0 does not produce NaN percentages (a "percentage != 0" filter would
    # treat NaN as "has missing values" and report every column).
    n_rows = len(df)
    mis_val_percent = 100 * mis_val / n_rows if n_rows else mis_val.astype(float)

    mis_val_table = pd.DataFrame({
        'Missing Values': mis_val,
        '% of Total Values': mis_val_percent,
    })

    # Keep only the columns that really have missing entries, worst first.
    mis_val_table = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table.shape[0]) +
          " columns that have missing values.")

    return mis_val_table

# Build the missing-value summary for df and display it with a red colour gradient
missing_values= missing_values_table(df)
missing_values.style.background_gradient(cmap='Reds')

Your selected dataframe has 24 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


### EDA using Pandas Profiling 

In [9]:
import pandas_profiling

In [10]:
# Work on an independent copy of the frame loaded into `df` above instead of
# re-reading the same CSV through a second hard-coded path. `df` itself stays
# untouched by the modelling steps below.
data = df.copy()
data

Unnamed: 0,time,ID,pm25,lat,lon,SQRT_SEA_DEM_LAT,WSPD,WDIR,TMP,TX,...,NDVI,CO,HCHO,NO2,SO2,CLOUD,O3,AAI,AQI,AQI_Class
0,2020-01-01,19,116.949130,21.049750,105.741870,5.922647,1.136119,145.942749,20.811243,23.219995,...,0.000551,0.045586,0.000060,0.000072,0.000035,0.711612,0.101653,-0.836203,183,3
1,2020-01-01,79,105.103044,21.015250,105.800130,4.307231,1.136119,145.942749,20.811243,23.219995,...,-0.003006,0.041913,0.000107,0.000086,0.000056,0.761720,0.103635,-0.718537,177,3
2,2020-01-01,163,118.285100,21.024347,106.017288,4.988467,0.651509,145.395233,20.677492,23.029993,...,0.004388,0.042329,0.000108,0.000071,0.000026,0.867689,0.102144,-0.942304,183,3
3,2020-01-01,300,116.739130,21.023532,105.853941,4.865087,1.136119,145.942749,20.811243,23.219995,...,-0.001733,0.041913,0.000097,0.000085,0.000079,0.779294,0.102875,-0.692613,183,3
4,2020-01-02,19,76.856667,21.049750,105.741870,5.922647,2.744283,147.084442,22.176249,25.119989,...,0.016035,0.042298,0.000115,0.000082,-0.000345,0.705396,0.103488,-1.119681,162,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,2021-12-31,163,16.154054,21.024347,106.017288,4.988467,1.669253,22.916901,16.846243,18.989985,...,-0.002742,0.044463,0.000133,0.000080,0.000107,1.000000,0.113309,-0.477924,60,1
11504,2021-12-31,172,17.153631,20.927000,106.314000,4.784339,1.322285,353.486359,17.187494,18.839991,...,0.001342,0.047781,0.000043,0.000096,0.000121,1.000000,0.113023,-0.511212,62,1
11505,2021-12-31,173,11.901323,20.977400,106.430800,4.589203,0.235641,347.438232,17.732492,19.739985,...,0.010294,0.047366,0.000045,0.000073,0.000172,1.000000,0.112750,-0.629895,50,0
11506,2021-12-31,195,24.209832,21.338470,105.367300,5.228505,1.295622,97.865257,16.122246,17.824003,...,-0.003163,0.049873,0.000110,0.000077,0.000105,1.000000,0.112451,-0.337096,76,1


In [12]:
def aqi_bucket(aqi):
    """Map a numeric AQI value to its category label.

    The upper bound of every band is inclusive:
    <= 50 'Good', <= 100 'Moderate', <= 150 'Unhealthy for Sensitive',
    <= 200 'Unhealthy', <= 300 'Very Unhealthy', anything above 'Hazardous'.

    Missing input (NaN / None) yields NaN. Without this guard a NaN fails
    every ``<=`` comparison and would be labelled 'Hazardous'.
    """
    if pd.isna(aqi):
        return np.nan

    # (inclusive upper bound, label), checked in ascending order
    bands = (
        (50, 'Good'),
        (100, 'Moderate'),
        (150, 'Unhealthy for Sensitive'),
        (200, 'Unhealthy'),
        (300, 'Very Unhealthy'),
    )
    for upper_bound, label in bands:
        if aqi <= upper_bound:
            return label
    return 'Hazardous'

In [14]:
# Derive the categorical AQI bucket of every row from its numeric AQI value
data['AQI_Bucket'] = data['AQI'].map(aqi_bucket)

In [15]:
# Display the frame to check the newly added AQI_Bucket column
data

Unnamed: 0,time,ID,pm25,lat,lon,SQRT_SEA_DEM_LAT,WSPD,WDIR,TMP,TX,...,CO,HCHO,NO2,SO2,CLOUD,O3,AAI,AQI,AQI_Class,AQI_Bucket
0,2020-01-01,19,116.949130,21.049750,105.741870,5.922647,1.136119,145.942749,20.811243,23.219995,...,0.045586,0.000060,0.000072,0.000035,0.711612,0.101653,-0.836203,183,3,Unhealthy
1,2020-01-01,79,105.103044,21.015250,105.800130,4.307231,1.136119,145.942749,20.811243,23.219995,...,0.041913,0.000107,0.000086,0.000056,0.761720,0.103635,-0.718537,177,3,Unhealthy
2,2020-01-01,163,118.285100,21.024347,106.017288,4.988467,0.651509,145.395233,20.677492,23.029993,...,0.042329,0.000108,0.000071,0.000026,0.867689,0.102144,-0.942304,183,3,Unhealthy
3,2020-01-01,300,116.739130,21.023532,105.853941,4.865087,1.136119,145.942749,20.811243,23.219995,...,0.041913,0.000097,0.000085,0.000079,0.779294,0.102875,-0.692613,183,3,Unhealthy
4,2020-01-02,19,76.856667,21.049750,105.741870,5.922647,2.744283,147.084442,22.176249,25.119989,...,0.042298,0.000115,0.000082,-0.000345,0.705396,0.103488,-1.119681,162,3,Unhealthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,2021-12-31,163,16.154054,21.024347,106.017288,4.988467,1.669253,22.916901,16.846243,18.989985,...,0.044463,0.000133,0.000080,0.000107,1.000000,0.113309,-0.477924,60,1,Moderate
11504,2021-12-31,172,17.153631,20.927000,106.314000,4.784339,1.322285,353.486359,17.187494,18.839991,...,0.047781,0.000043,0.000096,0.000121,1.000000,0.113023,-0.511212,62,1,Moderate
11505,2021-12-31,173,11.901323,20.977400,106.430800,4.589203,0.235641,347.438232,17.732492,19.739985,...,0.047366,0.000045,0.000073,0.000172,1.000000,0.112750,-0.629895,50,0,Good
11506,2021-12-31,195,24.209832,21.338470,105.367300,5.228505,1.295622,97.865257,16.122246,17.824003,...,0.049873,0.000110,0.000077,0.000105,1.000000,0.112451,-0.337096,76,1,Moderate


In [None]:
# Remove the date, the station ID and the two numeric AQI columns so that only
# the remaining features and the AQI_Bucket target are left.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
# NOTE(review): pm25 stays in as a feature, but the displayed rows suggest that
# AQI (and therefore AQI_Bucket) is computed directly from pm25 - confirm. If
# so, this is target leakage and would explain the near-perfect model scores.
data = data.drop(columns=['time', 'ID', 'AQI', 'AQI_Class'], errors='ignore')

### Prediction of AQI Bucket using PyCaret

In [22]:
# Inspect the frame that is handed to PyCaret: remaining features plus the AQI_Bucket target
data

Unnamed: 0,pm25,lat,lon,SQRT_SEA_DEM_LAT,WSPD,WDIR,TMP,TX,TN,TP,RH,PRES2M,NDVI,CO,HCHO,NO2,SO2,CLOUD,O3,AAI,AQI_Bucket
0,116.949130,21.049750,105.741870,5.922647,1.136119,145.942749,20.811243,23.219995,18.939997,0.3750,74.750000,102197.203125,0.000551,0.045586,0.000060,0.000072,0.000035,0.711612,0.101653,-0.836203,Unhealthy
1,105.103044,21.015250,105.800130,4.307231,1.136119,145.942749,20.811243,23.219995,18.939997,0.3750,74.750000,102197.203125,-0.003006,0.041913,0.000107,0.000086,0.000056,0.761720,0.103635,-0.718537,Unhealthy
2,118.285100,21.024347,106.017288,4.988467,0.651509,145.395233,20.677492,23.029993,18.879999,0.1250,73.687500,102228.796875,0.004388,0.042329,0.000108,0.000071,0.000026,0.867689,0.102144,-0.942304,Unhealthy
3,116.739130,21.023532,105.853941,4.865087,1.136119,145.942749,20.811243,23.219995,18.939997,0.3750,74.750000,102197.203125,-0.001733,0.041913,0.000097,0.000085,0.000079,0.779294,0.102875,-0.692613,Unhealthy
4,76.856667,21.049750,105.741870,5.922647,2.744283,147.084442,22.176249,25.119989,20.299982,0.1875,78.012497,102025.328125,0.016035,0.042298,0.000115,0.000082,-0.000345,0.705396,0.103488,-1.119681,Unhealthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,16.154054,21.024347,106.017288,4.988467,1.669253,22.916901,16.846243,18.989985,14.730005,0.0625,65.837502,102317.015625,-0.002742,0.044463,0.000133,0.000080,0.000107,1.000000,0.113309,-0.477924,Moderate
11504,17.153631,20.927000,106.314000,4.784339,1.322285,353.486359,17.187494,18.839991,15.249994,0.0000,63.812500,102335.617188,0.001342,0.047781,0.000043,0.000096,0.000121,1.000000,0.113023,-0.511212,Moderate
11505,11.901323,20.977400,106.430800,4.589203,0.235641,347.438232,17.732492,19.739985,15.609980,0.0000,65.050003,102115.812500,0.010294,0.047366,0.000045,0.000073,0.000172,1.000000,0.112750,-0.629895,Good
11506,24.209832,21.338470,105.367300,5.228505,1.295622,97.865257,16.122246,17.824003,14.982986,0.0000,70.632500,101743.898438,-0.003163,0.049873,0.000110,0.000077,0.000105,1.000000,0.112451,-0.337096,Moderate


In [23]:
from pycaret.classification import *

In [24]:
# Initialise the PyCaret classification experiment on `data` with
# 'AQI_Bucket' as the target. session_id pins the random seed so that the
# experiment is reproducible after a kernel restart; 934 is the seed shown in
# the recorded setup output of this notebook.
reg = setup(data = data, 
             target = 'AQI_Bucket',
             session_id = 934,
             silent = True)

Unnamed: 0,Description,Value
0,session_id,934
1,Target,AQI_Bucket
2,Target Type,Multiclass
3,Label Encoded,"Good: 0, Hazardous: 1, Moderate: 2, Unhealthy: 3, Unhealthy for Sensitive: 4, Very Unhealthy: 5"
4,Original Data,"(11508, 21)"
5,Missing Values,False
6,Numeric Features,20
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# Run the model comparison and keep its result in `best`
best = compare_models()   #compare all models

IntProgress(value=0, description='Processing: ', max=74)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.9998,0.9999,0.9957,0.9998,0.9998,0.9996,0.9996,0.124
gbc,Gradient Boosting Classifier,0.9998,1.0,0.9957,0.9998,0.9998,0.9996,0.9996,37.697
rf,Random Forest Classifier,0.9948,0.9999,0.8387,0.9926,0.9932,0.9923,0.9923,0.936
lr,Logistic Regression,0.9222,0.9863,0.7134,0.9179,0.9193,0.8847,0.8849,6.353
et,Extra Trees Classifier,0.9155,0.9896,0.718,0.9124,0.9124,0.8732,0.8746,0.718
qda,Quadratic Discriminant Analysis,0.8744,0.973,0.7126,0.8808,0.8761,0.8172,0.8183,0.109
ada,Ada Boost Classifier,0.8345,0.9595,0.6667,0.7445,0.7763,0.7541,0.7857,1.2
lda,Linear Discriminant Analysis,0.8138,0.9589,0.6606,0.823,0.8037,0.7121,0.7221,0.132
nb,Naive Bayes,0.8094,0.9635,0.6885,0.8457,0.8173,0.7258,0.7326,0.079
knn,K Neighbors Classifier,0.7681,0.9133,0.5772,0.7629,0.7595,0.6444,0.6497,0.236


In [None]:
# Create the decision tree classifier ('dt' in the comparison table)
dt_model = create_model('dt')   #Model creation

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,1.0,0.0,1.0,1.0,1.0,1.0
1,1.0,0.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,1.0,1.0
3,1.0,0.0,1.0,1.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0
5,1.0,0.0,1.0,1.0,1.0,1.0
6,1.0,0.0,1.0,1.0,1.0,1.0
7,1.0,0.0,1.0,1.0,1.0,1.0
8,1.0,0.0,1.0,1.0,1.0,1.0
9,1.0,0.0,1.0,1.0,1.0,1.0


In [None]:
# Use the decision tree for all following cells; this replaces the
# compare_models() result previously stored in `best`
best = dt_model

In [None]:
# Evaluate the selected model (`best`)
evaluate_model(best)

In [None]:
# AUC plot for the selected model
plot_model(best, plot = 'auc')

In [None]:
# Confusion matrix for the selected model
plot_model(best, plot = 'confusion_matrix')

In [None]:
# Predict with the selected model and preview the first rows.
# NOTE(review): `data` is the same frame that was passed to setup(), so these
# are in-sample predictions, not an evaluation on unseen data.
predictions = predict_model(best, data=data)
predictions.head()

In [None]:
# Same prediction call with raw_score=True; overwrites `predictions` from the
# previous cell. Presumably this adds per-class scores to the output - confirm
# against the PyCaret documentation.
predictions = predict_model(best, data=data, raw_score=True)
predictions.head()

In [None]:
# Persist `best` under the name 'my_best_pipeline'; the recorded output reports
# that both the transformation pipeline and the model were saved
save_model(best, 'my_best_pipeline')

Transformation Pipeline and Model Succesfully Saved


In [None]:
# Get the list of models that have been created
# NOTE(review): this cell has no recorded output. Verify that 'models' is a
# valid get_config() key in the installed PyCaret version and that the result
# is a mapping - the loop below relies on .items().
all_models = get_config('models')

# Print the parameters of each model
for model_name, model_obj in all_models.items():
    print(f"Model: {model_name}")
    print(model_obj.get_params())

In [None]:
# # load model
# loaded_model = load_model('my_best_pipeline')
# print(loaded_model)