We are two UIUC students from Indonesia that is attempting to predict air quality trends in Jakarta. 

## Importing Libraries

In [679]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Cleaning The Data

In [680]:
df = pd.read_csv('dataset/airquality-data.csv')
df.drop(columns=['periode_data'])
df

Unnamed: 0,periode_data,tanggal,pm_10,pm_duakomalima,so2,co,o3,no2,max,critical,categori,lokasi_spku
0,202212,2022-12-02,60,94,52,18,42,30,94,"PM2,5",SEDANG,DKI4
1,202212,2022-12-03,60,100,52,16,49,31,100,"PM2,5",SEDANG,DKI4
2,202212,2022-12-04,65,95,52,19,45,34,95,"PM2,5",SEDANG,DKI4
3,202212,2022-12-05,74,117,52,19,48,33,117,"PM2,5",TIDAK SEHAT,DKI4
4,202212,2022-12-06,70,107,49,24,41,44,107,"PM2,5",TIDAK SEHAT,DKI4
...,...,...,...,...,...,...,...,...,...,...,...,...
360,202201,2022-01-16,67,102,47,20,65,32,102,"PM2,5",TIDAK SEHAT,DKI5
361,202201,2022-01-15,55,69,51,15,68,29,69,"PM2,5",SEDANG,DKI3
362,202201,2022-01-14,46,63,48,15,63,32,63,"PM2,5",SEDANG,DKI2
363,202201,2022-01-13,38,62,52,15,63,32,63,O3,SEDANG,DKI2


Translating the Data

In [681]:
translations = {
    'periode_data': 'Data Period',
    'tanggal' : 'Date',
    'pm_10' : 'PM 10',
    'so2' : 'SO2',
    'co' : 'CO',
    'o3' : 'O3',
    'no2' : 'NO2',
    'max' : 'Max',
    'critical' : 'Critical',
    'pm_duakomalima' : 'PM 2.5',
    'lokasi_spku' : 'Station Location',
    'categori' : 'Category'
}
df.rename(columns=translations, inplace=True)

category_translation = {
    'SEDANG': 'MODERATE',
    'TIDAK SEHAT' : 'POOR',
    'BAIK' : 'GOOD',
}
df['Category'] = df['Category'].replace(category_translation)

station_location_translation = {
    '0' : '0',
    'DKI1' : 'DKI1 (Bunderan HI)',
    'DKI2' : 'DKI2 (Kelapa Gading)',
    'DKI3' : 'DKI3 (Jagakarsa)',
    'DKI4' : 'DKI4 (Lubang Buaya)',
    'DKI5' : 'DKI5 (Kebon Jeruk)'
}

df['Station Location'] = df['Station Location'].replace(station_location_translation)
df

Unnamed: 0,Data Period,Date,PM 10,PM 2.5,SO2,CO,O3,NO2,Max,Critical,Category,Station Location
0,202212,2022-12-02,60,94,52,18,42,30,94,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
1,202212,2022-12-03,60,100,52,16,49,31,100,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
2,202212,2022-12-04,65,95,52,19,45,34,95,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
3,202212,2022-12-05,74,117,52,19,48,33,117,"PM2,5",POOR,DKI4 (Lubang Buaya)
4,202212,2022-12-06,70,107,49,24,41,44,107,"PM2,5",POOR,DKI4 (Lubang Buaya)
...,...,...,...,...,...,...,...,...,...,...,...,...
360,202201,2022-01-16,67,102,47,20,65,32,102,"PM2,5",POOR,DKI5 (Kebon Jeruk)
361,202201,2022-01-15,55,69,51,15,68,29,69,"PM2,5",MODERATE,DKI3 (Jagakarsa)
362,202201,2022-01-14,46,63,48,15,63,32,63,"PM2,5",MODERATE,DKI2 (Kelapa Gading)
363,202201,2022-01-13,38,62,52,15,63,32,63,O3,MODERATE,DKI2 (Kelapa Gading)


finding coordinate for each station

Translating category column

In [682]:
df = df.sort_values(by = 'Date', ascending = True).reset_index(drop = True)
df

Unnamed: 0,Data Period,Date,PM 10,PM 2.5,SO2,CO,O3,NO2,Max,Critical,Category,Station Location
0,202202,2020-02-01,64,89,52,13,76,21,89,"PM2,5",MODERATE,DKI3 (Jagakarsa)
1,202202,2020-02-02,66,101,47,22,64,26,101,"PM2,5",POOR,DKI4 (Lubang Buaya)
2,202202,2020-02-03,58,99,48,20,65,33,99,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
3,202202,2020-02-04,54,72,46,12,68,27,72,"PM2,5",MODERATE,DKI3 (Jagakarsa)
4,202202,2020-02-05,43,62,53,18,55,30,62,"PM2,5",MODERATE,DKI5 (Kebon Jeruk)
...,...,...,...,...,...,...,...,...,...,...,...,...
360,202212,2022-12-27,36,47,58,42,20,18,58,SO2,MODERATE,DKI2 (Kelapa Gading)
361,202212,2022-12-28,46,66,57,41,15,19,66,"PM2,5",MODERATE,DKI3 (Jagakarsa)
362,202212,2022-12-29,23,50,57,12,16,15,57,SO2,MODERATE,DKI2 (Kelapa Gading)
363,202212,2022-12-30,40,64,57,21,17,24,64,"PM2,5",MODERATE,DKI4 (Lubang Buaya)


In [699]:
df_clean = df.drop(364)



Unnamed: 0,Data Period,Date,PM 10,PM 2.5,SO2,CO,O3,NO2,Max,Critical,Category,Station Location
0,202202,2020-02-01,64,89,52,13,76,21,89,"PM2,5",MODERATE,DKI3 (Jagakarsa)
1,202202,2020-02-02,66,101,47,22,64,26,101,"PM2,5",POOR,DKI4 (Lubang Buaya)
2,202202,2020-02-03,58,99,48,20,65,33,99,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
3,202202,2020-02-04,54,72,46,12,68,27,72,"PM2,5",MODERATE,DKI3 (Jagakarsa)
4,202202,2020-02-05,43,62,53,18,55,30,62,"PM2,5",MODERATE,DKI5 (Kebon Jeruk)
...,...,...,...,...,...,...,...,...,...,...,...,...
359,202212,2022-12-26,43,62,58,54,25,27,62,"PM2,5",MODERATE,DKI2 (Kelapa Gading)
360,202212,2022-12-27,36,47,58,42,20,18,58,SO2,MODERATE,DKI2 (Kelapa Gading)
361,202212,2022-12-28,46,66,57,41,15,19,66,"PM2,5",MODERATE,DKI3 (Jagakarsa)
362,202212,2022-12-29,23,50,57,12,16,15,57,SO2,MODERATE,DKI2 (Kelapa Gading)


In [700]:
df2.replace('null', np.NaN, inplace=True)
df2

Unnamed: 0,Data Period,Date,PM 10,PM 2.5,SO2,CO,O3,NO2,Max,Critical,Category,Station Location
0,202202,2020-02-01,64,89,52,13,76,21,89,"PM2,5",MODERATE,DKI3 (Jagakarsa)
1,202202,2020-02-02,66,101,47,22,64,26,101,"PM2,5",POOR,DKI4 (Lubang Buaya)
2,202202,2020-02-03,58,99,48,20,65,33,99,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
3,202202,2020-02-04,54,72,46,12,68,27,72,"PM2,5",MODERATE,DKI3 (Jagakarsa)
4,202202,2020-02-05,43,62,53,18,55,30,62,"PM2,5",MODERATE,DKI5 (Kebon Jeruk)
...,...,...,...,...,...,...,...,...,...,...,...,...
359,202212,2022-12-26,43,62,58,54,25,27,62,"PM2,5",MODERATE,DKI2 (Kelapa Gading)
360,202212,2022-12-27,36,47,58,42,20,18,58,SO2,MODERATE,DKI2 (Kelapa Gading)
361,202212,2022-12-28,46,66,57,41,15,19,66,"PM2,5",MODERATE,DKI3 (Jagakarsa)
362,202212,2022-12-29,23,50,57,12,16,15,57,SO2,MODERATE,DKI2 (Kelapa Gading)


## Understanding the data

In [685]:
df2.head()

Unnamed: 0,Data Period,Date,PM 10,PM 2.5,SO2,CO,O3,NO2,Max,Critical,Category,Station Location
0,202202,2020-02-01,64,89,52,13,76,21,89,"PM2,5",MODERATE,DKI3 (Jagakarsa)
1,202202,2020-02-02,66,101,47,22,64,26,101,"PM2,5",POOR,DKI4 (Lubang Buaya)
2,202202,2020-02-03,58,99,48,20,65,33,99,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
3,202202,2020-02-04,54,72,46,12,68,27,72,"PM2,5",MODERATE,DKI3 (Jagakarsa)
4,202202,2020-02-05,43,62,53,18,55,30,62,"PM2,5",MODERATE,DKI5 (Kebon Jeruk)


In [686]:
df.tail()

Unnamed: 0,Data Period,Date,PM 10,PM 2.5,SO2,CO,O3,NO2,Max,Critical,Category,Station Location
360,202212,2022-12-27,36,47,58,42,20,18,58,SO2,MODERATE,DKI2 (Kelapa Gading)
361,202212,2022-12-28,46,66,57,41,15,19,66,"PM2,5",MODERATE,DKI3 (Jagakarsa)
362,202212,2022-12-29,23,50,57,12,16,15,57,SO2,MODERATE,DKI2 (Kelapa Gading)
363,202212,2022-12-30,40,64,57,21,17,24,64,"PM2,5",MODERATE,DKI4 (Lubang Buaya)
364,202212,44926.625,54,73,56,24,23,24,73,"PM2,5",MODERATE,DKI4 (Lubang Buaya)


In [687]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Data Period       365 non-null    int64 
 1   Date              365 non-null    object
 2   PM 10             365 non-null    int64 
 3   PM 2.5            365 non-null    int64 
 4   SO2               365 non-null    int64 
 5   CO                365 non-null    int64 
 6   O3                365 non-null    int64 
 7   NO2               365 non-null    int64 
 8   Max               365 non-null    int64 
 9   Critical          364 non-null    object
 10  Category          365 non-null    object
 11  Station Location  365 non-null    object
dtypes: int64(8), object(4)
memory usage: 34.3+ KB


In [688]:
df.shape

(365, 12)

The dataset used in this model is taken from the Jakarta Open Data Website. It contains information about the Air Pollution Quality Index (APSI) measured by 5 air quality monitoring systems (AQMS) in the DKI Jakarta province for the year 2022.

Dataset components:

1. Date: Data of air quality measurement
2. PM 10: Particulate matter, one of the monitored parameters
3. PM 2.5: Particulate matter with a size below 2.5 microns, one of the monitored parameters
4. SO2: Sulfide, one of the monitored parameters
5. CO: Carbon Monoxide, one of the monitored parameters
6. O3: Ozone, one of the monitored parameters
7. NO2: Nitrogen dioxide, one of the monitored parameters
8. Max: Highest measurement value from all parameters measured at the same time
9. Critical: Parameter with the highest measurement value
10. Station Location: Air Quality Monitoring Systems Location

In [689]:
df.describe()

Unnamed: 0,Data Period,PM 10,PM 2.5,SO2,CO,O3,NO2,Max
count,365.0,365.0,365.0,365.0,365.0,365.0,365.0,365.0
mean,202206.526027,59.819178,92.939726,47.926027,18.424658,57.243836,28.753425,93.852055
std,3.452584,13.330135,24.421363,4.569669,7.097666,23.345547,8.677337,24.43572
min,202201.0,23.0,40.0,37.0,7.0,15.0,6.0,49.0
25%,202204.0,52.0,75.0,44.0,14.0,39.0,23.0,76.0
50%,202207.0,60.0,92.0,49.0,17.0,54.0,28.0,93.0
75%,202210.0,68.0,111.0,51.0,21.0,71.0,34.0,111.0
max,202212.0,95.0,165.0,62.0,55.0,181.0,52.0,181.0


In [690]:
df.isnull().sum()
df = df[df.Critical != 'null']

In [1]:
#df.Date = pd.to_datetime(df.Date)

#df_resample = (df.set_index('Date')).resample('D').mean().reset_index()
#df_resample.plot(x='Date', y='PM 10', kind='line', color='red')
