In [25]:
import pandas as pd
import seaborn as sns
import datetime as dt

# Import the data

c_data = \
    pd.read_csv("jakarta-central (us consulate), indonesia-air-quality.csv", 
    header = 0, 
    names = ["Date", "PM25", "PM10"])
c_data["Loc"] = "Central Jakarta"

s_data = \
    pd.read_csv("jakarta-south (us consulate), indonesia-air-quality.csv", 
    header = 0, 
    names = ["Date", "PM25", "PM10"])
s_data["Loc"] = "South Jakarta"

# Combine the data for Central Jakarta and South Jakarta

data = pd.concat([c_data, s_data])

In [26]:
data.head()

Unnamed: 0,Date,PM25,PM10,Loc
0,2022/10/1,86,,Central Jakarta
1,2022/10/2,47,,Central Jakarta
2,2022/10/3,84,,Central Jakarta
3,2022/10/4,66,,Central Jakarta
4,2022/10/5,71,,Central Jakarta


In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4959 entries, 0 to 2461
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    4959 non-null   object
 1   PM25    4959 non-null   object
 2   PM10    4959 non-null   object
 3   Loc     4959 non-null   object
dtypes: object(4)
memory usage: 193.7+ KB


In [28]:
# Calculate the AQI as the maximum values between PM25 and PM10

data["AQI"] = data[["PM25", "PM10"]].max(axis = 1)

# Drop PM25 and PM10 columns

data.drop(["PM25", "PM10"], axis = 1, inplace = True)

# Convert date column to datetime format

data["Date"] = pd.to_datetime(data["Date"])

# Convert AQI column to numeric ("mean" operation behavior is changed in pandas 2.0)

data["AQI"] = pd.to_numeric(data["AQI"])

In [29]:
data.head()

Unnamed: 0,Date,Loc,AQI
0,2022-10-01,Central Jakarta,86
1,2022-10-02,Central Jakarta,47
2,2022-10-03,Central Jakarta,84
3,2022-10-04,Central Jakarta,66
4,2022-10-05,Central Jakarta,71


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4959 entries, 0 to 2461
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    4959 non-null   datetime64[ns]
 1   Loc     4959 non-null   object        
 2   AQI     4959 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 155.0+ KB


In [31]:
# Drop data below 2016 (the data is incomplete)

data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.strftime("%Y-%m")
data.drop(index = data[data["Year"] < 2016].index, inplace = True)

In [32]:
# Group data (monthly)

data_monthly = \
    data.drop(columns = ["Date", "Year"]).\
    groupby(["Month", "Loc"]).\
    agg(["min", "mean", "max"]).\
    reset_index()
data_monthly.columns = \
    [" ".join(column) for column in data_monthly.columns.to_flat_index()]
data_monthly.columns = \
    [column.strip() for column in data_monthly.columns]

In [33]:
data_monthly

Unnamed: 0,Month,Loc,AQI min,AQI mean,AQI max
0,2016-01,Central Jakarta,43,78.586207,121
1,2016-01,South Jakarta,50,90.655172,138
2,2016-02,Central Jakarta,45,76.304348,120
3,2016-02,South Jakarta,36,86.068966,147
4,2016-03,Central Jakarta,53,93.129032,138
...,...,...,...,...,...
159,2022-08,South Jakarta,88,110.291667,150
160,2022-09,Central Jakarta,53,93.200000,131
161,2022-09,South Jakarta,68,111.800000,149
162,2022-10,Central Jakarta,47,75.636364,119


In [35]:
# Group date (yearly)

data_yearly = \
    data.drop(columns = ["Date", "Month"]).\
    groupby(["Year", "Loc"]).\
    agg(["min", "mean", "max"]).\
    reset_index()
data_yearly.columns = \
    [" ".join(column) for column in data_yearly.columns.to_flat_index()]
data_yearly.columns = \
    [column.strip() for column in data_yearly.columns]

In [36]:
data_yearly

Unnamed: 0,Year,Loc,AQI min,AQI mean,AQI max
0,2016,Central Jakarta,32,99.891061,209
1,2016,South Jakarta,33,101.972527,165
2,2017,Central Jakarta,7,79.558282,148
3,2017,South Jakarta,7,82.29878,165
4,2018,Central Jakarta,12,73.144509,161
5,2018,South Jakarta,4,77.495702,209
6,2019,Central Jakarta,9,77.811798,159
7,2019,South Jakarta,23,83.713873,165
8,2020,Central Jakarta,7,62.746479,137
9,2020,South Jakarta,6,56.741758,140
