Vy Duong - ALY6010 - Module 5 - Air Quality

Practice project on Air Quality dataset https://www.kaggle.com/datasets/fedesoriano/air-quality-data-set

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import math
from datetime import datetime

In [2]:
# Load the workbook and assign short, readable column names.
# The names are applied by position, so their order must match the sheet's columns.
# NOTE(review): the column labelled 'CO2' is presumably the dataset's carbon
# monoxide (CO) reference reading — confirm against the source file.
column_names = [
    'Date', 'Time',
    'CO2', 'CO2 sensor response',
    'NMHC', 'Benzene', 'NMHC sensor response',
    'NOx', 'NOx sensor response',
    'NO2', 'NO2 sensor response',
    'O3 sensor response',
    'Temperature', 'Humidity %', 'Absolute Humidity',
]
df = pd.read_excel('AirQualityUCI.xlsx', parse_dates=['Date'], names=column_names)

In [3]:
# Preview the first five rows to confirm the renamed columns line up with the data.
df.head(n=5)

Unnamed: 0,Date,Time,CO2,CO2 sensor response,NMHC,Benzene,NMHC sensor response,NOx,NOx sensor response,NO2,NO2 sensor response,O3 sensor response,Temperature,Humidity %,Absolute Humidity
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [4]:
# Summary statistics, transposed so each variable is a row.
df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,9357.0,2004-09-21 04:30:05.193972480,2004-03-10 00:00:00,2004-06-16 00:00:00,2004-09-21 00:00:00,2004-12-28 00:00:00,2005-04-04 00:00:00,
CO2,9357.0,-34.207524,-200.0,0.6,1.5,2.6,11.9,77.65717
CO2 sensor response,9357.0,1048.869652,-200.0,921.0,1052.5,1221.25,2039.75,329.817015
NMHC,9357.0,-159.090093,-200.0,-200.0,-200.0,-200.0,1189.0,139.789093
Benzene,9357.0,1.865576,-200.0,4.004958,7.886653,13.636091,63.741476,41.380154
NMHC sensor response,9357.0,894.475963,-200.0,711.0,894.5,1104.75,2214.0,342.315902
NOx,9357.0,168.6042,-200.0,50.0,141.0,284.2,1479.0,257.424561
NOx sensor response,9357.0,794.872333,-200.0,637.0,794.25,960.25,2682.75,321.977031
NO2,9357.0,58.135898,-200.0,53.0,96.0,133.0,339.7,126.931428
NO2 sensor response,9357.0,1391.363266,-200.0,1184.75,1445.5,1662.0,2775.0,467.192382


In [5]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  9357 non-null   datetime64[ns]
 1   Time                  9357 non-null   object        
 2   CO2                   9357 non-null   float64       
 3   CO2 sensor response   9357 non-null   float64       
 4   NMHC                  9357 non-null   int64         
 5   Benzene               9357 non-null   float64       
 6   NMHC sensor response  9357 non-null   float64       
 7   NOx                   9357 non-null   float64       
 8   NOx sensor response   9357 non-null   float64       
 9   NO2                   9357 non-null   float64       
 10  NO2 sensor response   9357 non-null   float64       
 11  O3 sensor response    9357 non-null   float64       
 12  Temperature           9357 non-null   float64       
 13  Humidity %        

In [6]:
# Count missing values per column. No NaN is reported, but describe() above
# shows -200 as the minimum of the measurement columns; later cells treat that
# value as a missing-data marker.
df.isna().sum()

Date                    0
Time                    0
CO2                     0
CO2 sensor response     0
NMHC                    0
Benzene                 0
NMHC sensor response    0
NOx                     0
NOx sensor response     0
NO2                     0
NO2 sensor response     0
O3 sensor response      0
Temperature             0
Humidity %              0
Absolute Humidity       0
dtype: int64

In [None]:
# Pairwise plots of the raw frame: a first look at potential relationships and
# outliers (the -200 values seen in describe() are still present here).
sns.pairplot(data=df)
plt.show()

In [None]:
# Drop both humidity columns: they show no apparent relationship with the
# other variables.
# The -200 readings recur so often that they are probably a missing-value
# marker; they are removed in the following cells.
df2 = df.drop(columns=['Humidity %', 'Absolute Humidity'])

In [None]:
# Treat the -200 placeholder as missing data. The result is rebound rather than
# written with inplace=True, so the cell stays chainable and safe to re-run.
df2 = df2.replace(to_replace=-200, value=np.nan)

In [None]:
# Keep only rows in which every remaining column has a value.
# NOTE(review): describe() above shows NMHC at -200 for the 25th, 50th and 75th
# percentiles, so requiring complete rows discards most of the 9357 records.
df3 = df2.dropna(axis=0, how='any')

In [None]:
# Summary statistics of the cleaned frame (rows with any missing value removed).
df3.describe()

In [None]:
# Re-check outliers after cleaning: one box plot per measurement column
# (positions 2-10, i.e. 'CO2' through 'NO2 sensor response').
for column_name, readings in df3.iloc[:, 2:11].items():
    sns.boxplot(x=readings)
    plt.show()

In [None]:
# Pairwise plots again, this time on the cleaned frame.
sns.pairplot(data=df3)
plt.show()

In [None]:
# Reference-monitor CO2 against its sensor response, coloured by temperature,
# with a least-squares straight line overlaid in red.
co2_reference = df3['CO2']
co2_sensor = df3['CO2 sensor response']

plt.scatter(co2_reference, co2_sensor, c=df3['Temperature'])
plt.xlabel('Traditional monitor for CO2')
plt.ylabel('Sensor response for CO2')
plt.title('Relationship between Traditional monitor and Sensor response for CO2 concerntration')
cbar = plt.colorbar()
cbar.set_label('Temperature')

# Fit a degree-1 polynomial and evaluate it at the sorted distinct x values.
co2_trend = np.poly1d(np.polyfit(co2_reference, co2_sensor, 1))
co2_grid = np.unique(co2_reference)
plt.plot(co2_grid, co2_trend(co2_grid), color='red')
plt.show()


In [None]:
# Pearson correlation (and its p-value) between the reference CO2 reading and
# the CO2 sensor response.
r_CO2, p_CO2 = pearsonr(df3['CO2'], df3['CO2 sensor response'])
for template, value in (('Pearsons correlation: %.3f', r_CO2), ('P-value: %.4f', p_CO2)):
    print(template % value)

In [None]:
# Reference-monitor NOx against its sensor response, coloured by temperature,
# with a least-squares straight line overlaid in red.
nox_reference = df3['NOx']
nox_sensor = df3['NOx sensor response']

plt.scatter(nox_reference, nox_sensor, c=df3['Temperature'])
plt.xlabel('Traditional monitor for NOx')
plt.ylabel('Sensor response for NOx')
plt.title('Relationship between Traditional monitor and Sensor response for NOx concerntration')
cbar = plt.colorbar()
cbar.set_label('Temperature')

# Fit a degree-1 polynomial and evaluate it at the sorted distinct x values.
nox_trend = np.poly1d(np.polyfit(nox_reference, nox_sensor, 1))
nox_grid = np.unique(nox_reference)
plt.plot(nox_grid, nox_trend(nox_grid), color='red')
plt.show()

In [None]:
# Pearson correlation (and its p-value) between the reference NOx reading and
# the NOx sensor response.
r_NOx, p_NOx = pearsonr(df3['NOx'], df3['NOx sensor response'])
for template, value in (('Pearsons correlation: %.3f', r_NOx), ('P-value: %.4f', p_NOx)):
    print(template % value)

In [None]:
# Reference-monitor NO2 against its sensor response, coloured by temperature,
# with a least-squares straight line overlaid in red.
no2_reference = df3['NO2']
no2_sensor = df3['NO2 sensor response']

plt.scatter(no2_reference, no2_sensor, c=df3['Temperature'])
plt.xlabel('Traditional monitor for NO2')
plt.ylabel('Sensor response for NO2')
plt.title('Relationship between Traditional monitor and Sensor response for NO2 concerntration')
cbar = plt.colorbar()
cbar.set_label('Temperature')

# Fit a degree-1 polynomial and evaluate it at the sorted distinct x values.
no2_trend = np.poly1d(np.polyfit(no2_reference, no2_sensor, 1))
no2_grid = np.unique(no2_reference)
plt.plot(no2_grid, no2_trend(no2_grid), color='red')
plt.show()

In [None]:
# Pearson correlation (and its p-value) between the reference NO2 reading and
# the NO2 sensor response.
r_NO2, p_NO2 = pearsonr(df3['NO2'], df3['NO2 sensor response'])
for template, value in (('Pearsons correlation: %.3f', r_NO2), ('P-value: %.4f', p_NO2)):
    print(template % value)

All three pairwise tests give p-values very close to zero, so at α = 0.05 (two-tailed test) we reject the null hypothesis of no correlation (H₀: ρ = 0) for each pair.

In [None]:
# Two column groups for the correlation charts: the reference-monitor readings
# and the matching sensor-response columns, in the same pollutant order.
interests2 = ['CO2', 'NMHC', 'NOx', 'NO2']
interests1 = [f'{pollutant} sensor response' for pollutant in interests2]

In [None]:
# Summary statistics for the four sensor-response columns.
df3.loc[:, interests1].describe()

In [None]:
# Pairwise Pearson correlations between the four sensor-response columns.
df3.loc[:, interests1].corr(method='pearson')

In [None]:
# Annotated heat map of the pairwise correlations between the four sensor
# responses. The heat map is fed the correlation matrix rather than the raw
# readings, so it is a 4x4 grid in which every annotated cell is a coefficient.
sns.heatmap(df3[interests1].corr(), annot=True)
plt.show()

In [None]:
# Correlation map in a simpler view: the sensor-response correlation matrix
# drawn with matshow, using shortened tick labels.
sensor_labels = ['CO2 sensor','NMHC sensor','NOx sensor','NO2 sensor']
tick_positions = np.arange(4)

plt.matshow(df3[interests1].corr())
plt.xticks(ticks=tick_positions, labels=sensor_labels, rotation='vertical')
plt.yticks(ticks=tick_positions, labels=sensor_labels)
plt.subplots_adjust(left=0.8, bottom=0.1, right=1, top=0.9)
plt.colorbar(orientation='horizontal')
plt.title("Correlation table of 4 chemicals monitored result by sensors")
plt.show()

In [None]:
# Summary statistics for the four reference-monitor columns.
df3.loc[:, interests2].describe()

In [None]:
# Pairwise Pearson correlations between the four reference-monitor columns.
df3.loc[:, interests2].corr(method='pearson')

In [None]:
# Correlation matrix of the reference-monitor readings, drawn with matshow.
pollutant_labels = ['CO2','NMHC','NOx','NO2']
tick_positions = np.arange(4)

plt.matshow(df3[interests2].corr())
plt.xticks(ticks=tick_positions, labels=pollutant_labels, rotation='vertical')
plt.yticks(ticks=tick_positions, labels=pollutant_labels)
plt.subplots_adjust(left=0.8, bottom=0.1, right=1, top=0.9)
plt.colorbar(orientation='horizontal')
plt.title("Correlation table of 4 chemicals monitored result by traditional stations")
plt.show()

In [None]:
# output used for report writing