In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
import pickle
%matplotlib inline

We will use two alternative techniques for handling missing data:
1. Fill the missing data using the imputer that achieves the best score.
2. If data is missing for more than 3 consecutive hours, remove those rows; otherwise, linearly interpolate the gap.

In [None]:
# Load the Karpos dataset (first CSV column becomes the index) and discard
# the NO2 column.
df = pd.read_csv('./datasets/Karpos.csv', index_col=0).drop(columns=['NO2'])

# use the following lines only if you want to interpolate the missing data
rows_before = len(df)
# NOTE: despite the printed label, this is the number of NaN values per column.
nan_counts = df.isna().sum()
print("Number of rows before interpolation: {}".format(rows_before))
print("Number of NaN rows: {}".format(nan_counts))

# limit=1 fills at most one consecutive NaN per gap, and limit_area="inside"
# only fills NaNs that are surrounded by valid values; every row that still
# holds a NaN afterwards is dropped.
# NOTE(review): the notebook text mentions a 3-hour threshold while limit=1 is
# used here -- confirm which one is intended.
df = df.interpolate(method="linear", limit=1, limit_area="inside").dropna()
print("Number of rows after interpolation: {}".format(len(df)))

Number of rows before interpolation: 17904
Number of NaN rows: relative_humidity      11
wind_speed              0
visibility              0
pressure               17
snow                   13
solar_radiation         0
wind_direction         11
temperature            13
precipitation           0
cloud_coverage          0
hour                    0
day                     0
month                   0
year                    0
weekend                 0
holiday                 0
PM10                 3326
PM25                  644
SO2                  2329
dtype: int64
Number of rows after interpolation: 13160


**Train-validation-test data split**

In [None]:
# Positional (unshuffled) split: the first 80% of the rows form the training
# set, the next 10% the validation set and whatever remains the test set.
n_rows = df.shape[0]
train_size = int(0.8 * n_rows)
validation_size = int(0.1 * n_rows)
validation_end = train_size + validation_size

df_train, df_validation, df_test = (
    df.iloc[:train_size],
    df.iloc[train_size:validation_end],
    df.iloc[validation_end:],
)

**Scaling the data**


In [None]:
# Columns scaled with StandardScaler.
features_to_standardize = ['PM10', 'PM25', 'SO2', 'relative_humidity', 'pressure',
                           'solar_radiation', 'wind_direction', 'temperature']
# Columns scaled with MinMaxScaler.
features_to_normalize = ['wind_speed', 'visibility', 'snow', 'precipitation', 'cloud_coverage',
                         'hour', 'day', 'month', 'year']

# Fit one scaler per feature, on the training split only. The standardized
# features are inserted first so the dictionary order matches the two lists.
scalers = {}
scaler_groups = ((StandardScaler, features_to_standardize),
                 (MinMaxScaler, features_to_normalize))
for scaler_cls, feature_group in scaler_groups:
  for feature in feature_group:
    # Double brackets select a single-column frame, i.e. an (n_rows, 1) array.
    train_column = df_train[[feature]].to_numpy()
    scalers[feature] = scaler_cls().fit(train_column)

# Persist every fitted scaler to its own pickle file, named after the feature.
for feature, scaler in scalers.items():
  with open(f'./scalers/Karpos/{feature}', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Work on copies so the unscaled splits stay untouched.
df_train_scaled, df_validation_scaled, df_test_scaled = (
    split.copy() for split in (df_train, df_validation, df_test)
)

# Apply each fitted scaler to the matching column of every split. Columns
# without an entry in `scalers` are left as they are.
split_pairs = ((df_train, df_train_scaled),
               (df_validation, df_validation_scaled),
               (df_test, df_test_scaled))
for feature, scaler in scalers.items():
  for raw_split, scaled_split in split_pairs:
    raw_column = raw_split[[feature]].to_numpy()
    scaled_split[feature] = scaler.transform(raw_column).ravel()

df_validation_scaled.describe()

Unnamed: 0,relative_humidity,wind_speed,visibility,pressure,snow,solar_radiation,wind_direction,temperature,precipitation,cloud_coverage,hour,day,month,year,weekend,holiday,PM10,PM25,SO2
count,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0,1316.0
mean,-0.320853,0.265458,0.482776,-0.366892,0.0,0.570652,0.122767,1.092187,0.009538,0.355114,0.501454,0.467376,0.479829,1.0,0.072948,0.0,-0.832714,-0.595272,-0.383143
std,0.936908,0.115307,0.431751,0.669971,0.0,1.379757,0.950437,0.625738,0.05794,0.338406,0.301912,0.293298,0.055731,0.0,0.260151,0.0,0.639086,0.635631,0.542258
min,-2.35356,0.028571,0.0,-1.688309,0.0,-0.654738,-1.397954,-0.378663,0.0,0.0,0.0,0.0,0.363636,1.0,0.0,0.0,-3.457721,-2.535463,-1.652397
25%,-1.105337,0.171429,0.111111,-0.896201,0.0,-0.654738,-0.594873,0.64007,0.0,0.03,0.26087,0.233333,0.454545,1.0,0.0,0.0,-1.212325,-0.977607,-0.714036
50%,-0.306474,0.257143,0.222222,-0.326557,0.0,-0.102313,0.157218,1.043086,0.0,0.25,0.521739,0.433333,0.454545,1.0,0.0,0.0,-0.769422,-0.491661,-0.436379
75%,0.492389,0.342857,1.0,0.037476,0.0,1.827367,0.981545,1.546855,0.0,0.66,0.782609,0.7,0.545455,1.0,0.0,0.0,-0.417934,-0.15279,-0.122801
max,1.640755,0.685714,1.0,1.547539,0.0,3.247922,1.661402,2.767096,0.654592,0.99,1.0,1.0,0.545455,1.0,1.0,0.0,1.80118,0.646321,2.612347


In [None]:
# use the following lines only if you want to save the interpolated datasets
# Each scaled split is written to its own CSV file, index included.
interpolated_outputs = {
    './interpolated_datasets/Karpos/train.csv': df_train_scaled,
    './interpolated_datasets/Karpos/validation.csv': df_validation_scaled,
    './interpolated_datasets/Karpos/test.csv': df_test_scaled,
}
for csv_path, scaled_split in interpolated_outputs.items():
  scaled_split.to_csv(csv_path, index=True)

**Imputing missing values**

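This part must be run on data that has not been interpolated: skip the interpolation and `dropna` lines in the data-loading cell, otherwise there are no missing values left to impute.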

In [None]:
# Number of missing values per column in the scaled training split.
missing_train = df_train_scaled.isna().sum().values
total_missing = missing_train.sum()

# If the training split has no NaNs at all (e.g. because the interpolation and
# dropna lines of the loading cell were executed), dividing by zero would
# silently produce an all-NaN probability vector that only fails later, when
# it is used for sampling. Fail here with an explicit message instead.
if total_missing == 0:
  raise ValueError(
      "df_train_scaled contains no missing values, so the distribution of "
      "missing values cannot be estimated; run this section on data that "
      "was not interpolated."
  )

# Share of all missing training values that falls into each column; the
# entries sum to 1.
missing_train_prob = missing_train / total_missing
missing_train_prob

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.86219739e-03,
       3.10366232e-04, 0.00000000e+00, 0.00000000e+00, 3.10366232e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       6.70391061e-01, 1.96772191e-01, 1.30353818e-01])

In [None]:
# Plain NumPy copies of the scaled splits (these may still contain NaNs).
train_values = df_train_scaled.values.copy()
validation_values = df_validation_scaled.values.copy()
test_values = df_test_scaled.values.copy()
# Ground truth for scoring the imputers: only the fully observed validation rows.
validation_values_true = df_validation_scaled.dropna().values.copy()

# Fixed seed so the artificial missing-value mask is reproducible.
np.random.seed(0)
validation_values_missing = validation_values_true.copy()
n_samples, n_features = validation_values_missing.shape

# Sampling without replacement cannot return more features than there are
# non-zero probabilities: np.random.choice raises a ValueError in that case.
# Cap the number of masked features accordingly. When at least four columns
# have missing values in the training split, the cap never applies and the
# generated mask is exactly the same as without it.
n_candidate_features = int(np.count_nonzero(missing_train_prob))

# Mask values in 10% of the complete validation rows.
missing_samples = np.random.choice(n_samples, int(0.1*n_samples), replace=False)
for s in missing_samples:
  # Between 1 and 4 features per selected row, drawn with the per-column
  # missing-value distribution observed on the training split.
  n_missing_features = min(np.random.randint(1, 5), n_candidate_features)
  missing_features = np.random.choice(n_features, n_missing_features, replace=False, p=missing_train_prob)
  validation_values_missing[s, missing_features] = np.nan

In [None]:
# Candidate imputers, keyed by a short name. Insertion order is kept because it
# determines the order in which the scores are printed.
imputers = {
    'mean': SimpleImputer(missing_values=np.nan, strategy='mean'),
    'median': SimpleImputer(missing_values=np.nan, strategy='median'),
    'bayesian_ridge': IterativeImputer(estimator=BayesianRidge(), random_state=0),
    'decision_tree': IterativeImputer(
        estimator=DecisionTreeRegressor(max_features='sqrt', random_state=0),
        random_state=0),
}
# k-nearest-neighbours variants: k_neighbors_7, k_neighbors_11, k_neighbors_15.
imputers.update({
    f'k_neighbors_{k}': IterativeImputer(
        estimator=KNeighborsRegressor(n_neighbors=k),
        random_state=0, skip_complete=True)
    for k in (7, 11, 15)
})
# Extra-trees variants: extra_trees_10, extra_trees_15, extra_trees_20.
imputers.update({
    f'extra_trees_{n_trees}': IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=n_trees, random_state=0),
        random_state=0, skip_complete=True)
    for n_trees in (10, 15, 20)
})

# Fit every imputer on the training values, fill the artificially masked
# validation values and report the Frobenius norm of the difference to the
# true values.
for imputer_name, imputer in imputers.items():
  validation_values_filled = imputer.fit(train_values).transform(validation_values_missing)
  reconstruction_error = validation_values_true - validation_values_filled
  norm = np.linalg.norm(reconstruction_error, ord='fro')
  print(f'{imputer_name}:\t\t{norm}')

mean:		12.474401747314596
median:		12.655102572349586




bayesian_ridge:		13.298130551688669




decision_tree:		15.617717245396236




k_neighbors_7:		12.406505319322077




k_neighbors_11:		12.197414295570242




k_neighbors_15:		12.094524532571054




extra_trees_10:		13.709553608450967




extra_trees_15:		14.043563220099912
extra_trees_20:		14.046359301036286




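Based on the scores (Frobenius norm of the difference between the true and the imputed validation values; lower is better), the K-Nearest Neighbors estimator with K=15 produced the best result for filling the missing values.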

In [None]:
# Use the k_neighbors_15 imputer: fit it on the training values, then apply
# the fitted imputer to the validation and test values.
imputer = imputers['k_neighbors_15']
imputed_train_values = imputer.fit_transform(train_values)
imputed_validation_values, imputed_test_values = (
    imputer.transform(split_values)
    for split_values in (validation_values, test_values)
)


def _as_frame(values, like):
  """Wrap `values` in a DataFrame that reuses the index and columns of `like`."""
  return pd.DataFrame(data=values, index=like.index, columns=like.columns)


df_train_imputed = _as_frame(imputed_train_values, df_train_scaled)
df_validation_imputed = _as_frame(imputed_validation_values, df_validation_scaled)
df_test_imputed = _as_frame(imputed_test_values, df_test_scaled)

# Write each imputed split to its own CSV file, index included.
preprocessed_outputs = {
    './preprocessed_datasets/Karpos/train.csv': df_train_imputed,
    './preprocessed_datasets/Karpos/validation.csv': df_validation_imputed,
    './preprocessed_datasets/Karpos/test.csv': df_test_imputed,
}
for csv_path, imputed_frame in preprocessed_outputs.items():
  imputed_frame.to_csv(csv_path, index=True)

