In [1]:
import pandas as pd
import numpy as np
from pandas import datetime
from matplotlib import pyplot as plt


  from pandas import datetime


In [2]:
"""
Load AirQualityUCI Data
"""

def parser(x):
    """Parse timestamp text written as ``YYYY-MM-DD HH:MM:SS``.

    Implemented with ``pd.to_datetime`` so it does not depend on the
    ``datetime`` class re-exported by pandas (deprecated in pandas 1.x and
    removed in pandas 2.0), and so it accepts a whole column at once when
    ``read_csv`` passes one to ``date_parser``.

    Parameters
    ----------
    x : str or array-like of str
        One timestamp string, or a collection of them.

    Returns
    -------
    pandas.Timestamp or datetime-like collection
        A ``Timestamp`` (a ``datetime.datetime`` subclass) for a single
        string; the converted collection for array-like input.

    Raises
    ------
    ValueError
        If a value does not match the expected format.
    """
    return pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S')

# Path of the AirQualityUCI CSV file, relative to the working directory.
input_file = './data/AirQualityUCI_refined.csv'

# Column 0 is both parsed as dates (through `parser`) and used as the index.
read_options = dict(index_col=[0], parse_dates=[0], date_parser=parser)
df = pd.read_csv(input_file, **read_options)



# Visualization setup
# `%matplotlib` with no argument activates the default GUI backend
# (the captured output of this cell reports TkAgg).
%matplotlib
from matplotlib import pyplot as plt
import seaborn; seaborn.set()  # set plot styles
# NOTE(review): this option configures the *inline* backend; since a GUI
# backend is selected above it presumably has no effect here - confirm.
%config InlineBackend.figure_format = 'svg'
# Default size, in inches (width, height), of every figure created afterwards.
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # enable the interactive mode

# Visualize the raw 'CO(GT)' variable.
# A dedicated figure is created so the series is not drawn on top of
# whatever axes happens to be current.
fig, ax = plt.subplots()
df['CO(GT)'].plot(ax=ax)
ax.set_ylabel('CO(GT)')

# Linear interpolation of the missing values.
# `interpolate()` returns a new Series, so `df` itself is left untouched.
co = df['CO(GT)'].interpolate()

# Visualize original and imputed data on a figure of their own.
# The original is given the higher zorder so it is drawn on top; the
# interpolated line is then only visible where the original has gaps.
fig, ax = plt.subplots()
ax.plot(df['CO(GT)'], label='original', zorder=2)
ax.plot(co, label='linear interpolation', zorder=1)
ax.legend(loc='best')

# Detecting outliers using Boxplot
# Drawn on its own axes; otherwise the box would be added to the current
# (time-series) axes and overwrite its title and x-label.
fig, ax = plt.subplots()
# Forward interpolation leaves any leading gap as NaN, and boxplot does not
# skip NaN values, so they are dropped before computing the box statistics.
ax.boxplot(co.dropna())
ax.set_title("Detecting outliers using Boxplot")
ax.set_xlabel('CO(GT)')

# Pairwise Pearson correlation coefficients between the columns of `df`
# (Pearson is the method `DataFrame.corr` uses by default).
corr_matrix = df.corr(method='pearson')
print(corr_matrix)

# Choose the least correlated variable
# RH (Relative Humidity): smallest absolute correlation with CO(GT) in the
# matrix printed by this cell. `interpolate()` already returns a new Series.
rh = df['RH'].interpolate()

# Visualize a scatter plot(CO, RH) on its own axes so it is not mixed with
# other plots and keeps its own axis labels.
fig, ax = plt.subplots()
ax.scatter(co, rh, s=12, c='black')
ax.set_xlabel('CO(GT)')
ax.set_ylabel('RH')

# Choose the most correlated variable
# NMHC (non-methane hydrocarbons) sensor: largest correlation with CO(GT) in
# the matrix printed by this cell. `interpolate()` already returns a new Series.
nmhc = df['PT08.S2(NMHC)'].interpolate()

# Visualize a scatter plot(CO, NMHC) on its own axes; sharing the current
# axes would overlay it on the previous plot and replace that plot's labels.
fig, ax = plt.subplots()
ax.scatter(co, nmhc, s=12, c='black')
ax.set_xlabel('CO(GT)')
ax.set_ylabel('PT08.S2(NMHC)')


Using matplotlib backend: TkAgg
                 CO(GT)  PT08.S1(CO)  PT08.S2(NMHC)   NOx(GT)  PT08.S3(NOx)  \
CO(GT)         1.000000     0.877203       0.914973  0.792557     -0.701703   
PT08.S1(CO)    0.877203     1.000000       0.892964  0.713654     -0.771938   
PT08.S2(NMHC)  0.914973     0.892964       1.000000  0.704435     -0.796703   
NOx(GT)        0.792557     0.713654       0.704435  1.000000     -0.655707   
PT08.S3(NOx)  -0.701703    -0.771938      -0.796703 -0.655707      1.000000   
NO2(GT)        0.679262     0.641529       0.646245  0.763111     -0.652083   
PT08.S4(NO2)   0.639470     0.682881       0.777254  0.233731     -0.538468   
PT08.S5(O3)    0.851403     0.899324       0.880578  0.787046     -0.796569   
RH             0.040218     0.114606      -0.090380  0.221032     -0.056740   
AH             0.065809     0.135324       0.186933 -0.149323     -0.232017   
C6H6(GT)       0.845144     0.883795       0.981950  0.626638     -0.735744   

                NO2

Text(0, 0.5, 'PT08.S2(NMHC)')

In [None]:
"""
IQR-based Outlier Detection
"""

# Quartiles of the series: Q1 (25%), Q2 (median, 50%) and Q3 (75%),
# obtained with a single quantile call.
q1, median, q3 = co.quantile([0.25, 0.5, 0.75])
print(q1, median, q3)

# Interquartile range and the fences located 1.5 * IQR outside the quartiles.
iqr = q3 - q1
whisker = 1.5 * iqr
upper_fence = q3 + whisker
lower_fence = q1 - whisker
print(upper_fence, lower_fence)

# Filtering the outliers
# A point is an outlier when it lies outside either IQR fence. The explicit
# `co < 0` test is kept as an additional rule for the case where the lower
# fence itself is negative (presumably a negative CO reading is never
# valid - confirm against the data description).
outliers = co.loc[(co > upper_fence) | (co < lower_fence) | (co < 0)]
print(outliers)

# Mask for outliers: True at every position whose index label is an outlier.
mask = co.index.isin(outliers.index)

# Visualize the normal data and outliers on a dedicated figure, so the
# markers are not drawn over whatever axes is current.
fig, ax = plt.subplots()
ax.plot(co[~mask], label='normal', color='blue',
        marker='o', markersize=3, linestyle='None')
ax.plot(outliers, label='outliers', color='red',
        marker='x', markersize=3, linestyle='None')
ax.legend(loc='best')

# Removing the outliers: blank them out in a copy so `co` stays intact.
co_removed = co.copy()
co_removed[mask] = np.nan
print(co_removed[mask])

# Linear interpolation for reconstructing outliers removed.
co_refined = co_removed.interpolate()

# Show both stages on one dedicated figure. The gapped series gets the
# higher zorder so it is drawn on top; the interpolated line is then only
# visible across the gaps left by the removed outliers.
fig, ax = plt.subplots()
ax.plot(co_removed, label='outliers removed', zorder=2)
ax.plot(co_refined, label='linear interpolation', zorder=1)
ax.legend(loc='best')

In [3]:
"""
Detecting Outliers with Z-Scores
"""

# Visualize the distribution of the 'CO(GT)' variable
import seaborn as sns

# Draw on a dedicated figure; without an explicit `ax` the histogram would
# be added to whichever axes earlier cells left current.
# NOTE(review): `distplot` is deprecated since seaborn 0.11 in favour of
# `histplot`/`displot`; it is kept so the rendered plot does not change.
fig, ax = plt.subplots()
sns.distplot(co, ax=ax)

# Mean and standard deviation of the series.
# ddof=0 is passed explicitly: `np.std` forwards its default ddof=0 to the
# Series, whereas `Series.std()` called on its own would default to ddof=1.
mean = co.mean()
std = co.std(ddof=0)
print(mean, std)

# Point-by-point Z-score computation: print the score of every value whose
# absolute Z-score exceeds the threshold and collect those values.

thres = 3   # Z-score threshold
outliers = []

for value in co:
    z_score = (value - mean) / std
    is_extreme = np.abs(z_score) > thres
    if is_extreme:
        print(z_score)
        outliers.append(value)


# Simplified (vectorized) version of filtering outliers.
# The comparison uses `thres` rather than a literal 3, so changing the
# threshold affects this filter as well as the loop version.
z_scores = (co - mean) / std
outliers = co.loc[np.abs(z_scores) > thres].copy()

# Mask for outliers: True at every position whose index label is an outlier.
mask = co.index.isin(outliers.index)

# Comparison of distributions before/after outlier removal.
# Both curves share one dedicated axes so they can be compared directly
# without being mixed into a previously drawn plot.
fig, ax = plt.subplots()
sns.distplot(co, axlabel='CO(GT)', label='original', ax=ax)
sns.distplot(co[~mask], label='outliers removed', ax=ax)
ax.legend(loc='best')

# [exer] Adjust thres

# Flooring and Capping
# Values below the 10th percentile are raised to it and values above the
# 90th percentile are lowered to it; NaN entries are left as they are.
floor = co.quantile(0.1)
cap = co.quantile(0.9)

# `clip` applies both bounds in one step. `co` is rebound on purpose, so
# code that runs after this sees the capped series; re-running this cell
# therefore starts from already-capped values.
co = co.clip(lower=floor, upper=cap)

# Visualize the result on its own figure; a bare `co.plot()` would draw the
# time series onto the current axes (the density plot) instead.
fig, ax = plt.subplots()
co.plot(ax=ax)




2.126146200705357 1.4369815748596482
3.3221398818289734
3.113368937755246
4.157223658123882
4.087633343432639
3.0437786230640036
3.113368937755246
3.8092720846676698
3.182959252446488
3.113368937755246
3.600501140593942
3.5309108259027
3.0437786230640036
3.0437786230640036
3.182959252446488
3.0437786230640036
3.600501140593942
3.182959252446488
3.7396817699764275
3.0437786230640036
3.182959252446488
3.3917301965202156
4.087633343432639
3.878862399358912
3.2525495671377302
4.36599460219761
3.2525495671377302
4.018043028741397
4.36599460219761
4.087633343432639
4.36599460219761
5.131488063801275
3.0437786230640036
3.113368937755246
3.7396817699764275
3.2525495671377302
4.018043028741397
4.087633343432639
4.922717119727547
3.3917301965202156
3.786075313103922
4.180420429687629
4.574765546271336
4.157223658123882
4.087633343432639
3.113368937755246
3.2525495671377302
3.9484527140501537
5.618620266639971
3.113368937755246
3.7396817699764275
3.5309108259027
4.296404287506367
4.50517523158009



<AxesSubplot:xlabel='Datetime', ylabel='Density'>