In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(30)
warnings.filterwarnings('ignore')

In [2]:
# Read the air-quality CSV from its relative path and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="./data/air-quality-index.csv")
df.head(n=5)

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [3]:
# Print each column's dtype and non-null count to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No      43824 non-null  int64  
 1   year    43824 non-null  int64  
 2   month   43824 non-null  int64  
 3   day     43824 non-null  int64  
 4   hour    43824 non-null  int64  
 5   pm2.5   41757 non-null  float64
 6   DEWP    43824 non-null  int64  
 7   TEMP    43824 non-null  float64
 8   PRES    43824 non-null  float64
 9   cbwd    43824 non-null  object 
 10  Iws     43824 non-null  float64
 11  Is      43824 non-null  int64  
 12  Ir      43824 non-null  int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 4.3+ MB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir
count,43824.0,43824.0,43824.0,43824.0,43824.0,41757.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0
mean,21912.5,2012.0,6.523549,15.72782,11.5,98.613215,1.817246,12.448521,1016.447654,23.88914,0.052734,0.194916
std,12651.043435,1.413842,3.448572,8.799425,6.922266,92.050387,14.43344,12.198613,10.268698,50.010635,0.760375,1.415867
min,1.0,2010.0,1.0,1.0,0.0,0.0,-40.0,-19.0,991.0,0.45,0.0,0.0
25%,10956.75,2011.0,4.0,8.0,5.75,29.0,-10.0,2.0,1008.0,1.79,0.0,0.0
50%,21912.5,2012.0,7.0,16.0,11.5,72.0,2.0,14.0,1016.0,5.37,0.0,0.0
75%,32868.25,2013.0,10.0,23.0,17.25,137.0,15.0,23.0,1025.0,21.91,0.0,0.0
max,43824.0,2014.0,12.0,31.0,23.0,994.0,28.0,42.0,1046.0,585.6,27.0,36.0


In [5]:
# Count the missing entries in every column; the percentage cell below reuses
# this Series.
missing_value = df.isna().sum()
missing_value

No          0
year        0
month       0
day         0
hour        0
pm2.5    2067
DEWP        0
TEMP        0
PRES        0
cbwd        0
Iws         0
Is          0
Ir          0
dtype: int64

In [6]:
# Share of all cells in the frame that are missing, expressed as a percentage.
# DataFrame.size is rows * columns; it replaces np.product(df.shape), which was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size
total_missing_value = missing_value.sum()

missing_percentage = (total_missing_value / total_cells) * 100
missing_percentage

0.36281489594742605

In [7]:
# Look at five randomly chosen rows instead of only the head of the table.
df.sample(n=5)

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
8268,8269,2010,12,11,12,41.0,-17,2.0,1021.0,SE,21.01,0,0
18492,18493,2012,2,10,12,23.0,-21,2.0,1030.0,cv,2.68,0,0
35305,35306,2014,1,11,1,141.0,-19,-9.0,1032.0,NE,1.78,0,0
15189,15190,2011,9,25,21,289.0,16,17.0,1017.0,SE,2.67,0,0
13225,13226,2011,7,6,1,173.0,21,23.0,998.0,cv,2.23,0,0


In [8]:
# Remove the row counter, the timestamp parts and the Is/Ir columns.
# Note: this rebinds `df`, so running the cell a second time raises KeyError
# because the listed columns no longer exist.
df = df.drop(columns=['No', 'year', 'month', 'day', 'hour', 'Ir', 'Is'])

In [9]:
# Spot-check ten random rows of the reduced frame.
df.sample(n=10)

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws
17120,11.0,-17,-5.0,1037.0,NW,13.86
10767,13.0,-13,14.0,1025.0,NE,4.02
9239,100.0,-21,-9.0,1029.0,cv,4.02
19311,68.0,-7,10.0,1020.0,SE,12.52
14408,,20,23.0,1013.0,SE,2.68
15576,212.0,14,15.0,1021.0,SE,1.78
11096,,-11,11.0,1023.0,NW,1.79
38351,28.0,6,21.0,1011.0,SE,52.29
34257,10.0,-27,0.0,1028.0,NW,386.21
29249,60.0,4,22.0,1015.0,SE,8.04


In [10]:
# List the distinct labels that occur in the `cbwd` column.
df.loc[:, 'cbwd'].unique()

array(['NW', 'cv', 'NE', 'SE'], dtype=object)

## Clean and Prepare Data

### Encode `cbwd` 
- NW : 1
- cv : 2
- NE : 3
- SE : 4

In [11]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

2067

In [23]:
# Impute missing values with the mean of each numeric column.
# numeric_only=True keeps the string column `cbwd` out of the mean: pandas >= 2.0
# raises TypeError on it, and older versions only skipped it with a warning that
# the notebook-wide warnings filter hides.
# Assigning the result (instead of inplace=True) keeps the data lineage explicit.
df = df.fillna(df.mean(numeric_only=True))

In [25]:
# Spot-check ten random rows after imputation.
df.sample(n=10)

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws
22506,48.0,24,26.0,1002.0,SE,33.97
34445,155.0,-11,-5.0,1020.0,NE,0.89
19407,128.0,-3,5.0,1023.0,SE,8.05
32153,49.0,11,28.0,1011.0,SE,3.13
31685,63.0,22,25.0,1006.0,SE,12.96
1590,61.0,-5,-4.0,1038.0,SE,13.87
21708,142.0,20,25.0,1002.0,SE,17.44
40604,80.0,19,28.0,1010.0,SE,32.17
24126,70.0,9,10.0,1016.0,NW,5.81
33842,20.0,-13,2.0,1031.0,cv,0.89
