In [1]:
import os
import numpy as np
import pandas as pd
import datetime
from scipy.optimize import curve_fit
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import datasets, preprocessing, models, layers
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [2]:
Data = pd.read_csv('src/data/airquality-dataset/sample_dataset.csv', index_col=0, chunksize=50)

In [3]:
Data.get_chunk().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   channel_id  50 non-null     int64  
 1   pm2_5       50 non-null     float64
 2   pm10        50 non-null     float64
 3   s2_pm2_5    50 non-null     float64
 4   s2_pm10     50 non-null     float64
 5   Site        50 non-null     object 
 6   TimeStamp   50 non-null     object 
dtypes: float64(4), int64(1), object(2)
memory usage: 3.1+ KB


In [4]:
# Column dtypes handed to pandas when the air-quality CSV is parsed; the
# numeric columns are requested as 32-bit instead of the 64-bit defaults
# reported by .info() above.
dataType = dict(
    channel_id='uint32',
    pm2_5='float32',
    pm10='float32',
    s2_pm2_5='float32',
    s2_pm10='float32',
    Site='str',
    TimeStamp='str',
)

In [5]:
def load_data(dataPath, dataType: dict) -> pd.core.frame.DataFrame:
    """Read a CSV into a de-duplicated DataFrame with parsed timestamps.

    Parameters
    ----------
    dataPath : anything ``pd.read_csv`` accepts as its first argument.
    dataType : dict
        Column name -> dtype mapping, forwarded as ``dtype``.

    Returns
    -------
    DataFrame
        The first CSV column is consumed as the index, exact duplicate rows
        are removed, the index is renumbered from 0 and the ``TimeStamp``
        column is converted with ``pd.to_datetime``.
    """
    return (
        pd.read_csv(dataPath, dtype=dataType, index_col=0)
        .drop_duplicates(ignore_index=True)
        .assign(TimeStamp=lambda frame: pd.to_datetime(frame['TimeStamp']))
    )

In [6]:
data = load_data(dataPath='src/data/airquality-dataset/sample_dataset.csv', dataType=dataType)

  mask |= (ar1 == a)


In [None]:
data.head()

In [None]:
data.info()

In [None]:
# Map each channel id to the site recorded on its first row. Taking the first
# row per channel keeps every id paired with its own site; zipping the two
# columns' unique() arrays would mis-pair them as soon as two channels share a
# site name. Key order is still order of first appearance.
channelSite = (
    data.drop_duplicates(subset='channel_id')
    .set_index('channel_id')['Site']
    .to_dict()
)

In [None]:
siteGroups = data.groupby('channel_id')

In [None]:
# for x in siteGroups.groups:
#     if not os.path.exists('data_group'):
#         os.makedirs('data_group')
#     siteGroups.get_group(x).to_csv(f'data_group/{x}.csv')

In [None]:
def readChannel(channelId: int, dataPath='src/data/data_group/') -> pd.core.frame.DataFrame:
    """Load the per-channel CSV ``<dataPath>/<channelId>.csv``.

    Parameters
    ----------
    channelId : int
        Channel identifier; its string form is the file stem.
    dataPath : str or path-like
        Directory holding the per-channel files. A trailing separator is
        optional: the path is joined with ``os.path.join`` rather than by
        string concatenation.

    Returns
    -------
    DataFrame
        First CSV column used as the index, ``TimeStamp`` parsed as datetimes.
    """
    path = os.path.join(dataPath, f'{channelId}.csv')
    return pd.read_csv(path, index_col=0, parse_dates=['TimeStamp'])

In [None]:
channel_one = readChannel(channelId=list(channelSite.keys())[0])

In [None]:
# The previous cell already loads `channel_one`; this cell was an exact repeat.
# NOTE(review): `channel_one_new` is used by later cells (datetime `.index`,
# `.resample`, and it still carries `channel_id`) but is never defined in the
# notebook. Indexing the channel frame by TimeStamp matches those usages;
# confirm this is the intended definition.
channel_one_new = channel_one.set_index('TimeStamp')

In [None]:
# DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week is the
# supported way to get the ISO week numbers.
channel_one_new.index.isocalendar().week.unique()

In [None]:
channel_one_new2 = preprocess(channel_one, ['hour', 'day', 'month', 'day_of_week', 'week_of_year'])

In [None]:
def getFeatures(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Resample a channel frame to hourly means and add calendar columns.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``channel_id`` column (dropped) and a datetime
        ``TimeStamp`` column (becomes the index).

    Returns
    -------
    DataFrame
        Hourly means of the numeric columns, gaps forward-filled, plus the
        integer columns ``hour``, ``day``, ``month``, ``day_of_week`` and
        ``week_of_year`` (ISO week) derived from the hourly index.
    """
    hourly = (
        df.drop(labels='channel_id', axis=1)
        .set_index('TimeStamp')
        # An offset object avoids the deprecated 'H' string alias.
        .resample(pd.offsets.Hour())
        # Text columns (e.g. a site name) cannot be averaged; pandas >= 2
        # raises unless they are excluded explicitly.
        .mean(numeric_only=True)
        .ffill()
    )
    stamps = hourly.index
    return hourly.assign(
        hour=stamps.hour,
        day=stamps.day,
        month=stamps.month,
        day_of_week=stamps.dayofweek,
        # DatetimeIndex.week was removed in pandas 2.0; cast the nullable
        # UInt32 result back to a plain integer array.
        week_of_year=stamps.isocalendar().week.to_numpy(dtype='int64'),
    )

def generate_cyclical_features(df: pd.core.frame.DataFrame, col_name: list) -> pd.core.frame.DataFrame:
    """Add sine/cosine encodings for every column named in ``col_name``.

    For a column ``c`` the angle is ``2*pi*(c - c.min()) / c.nunique()`` and
    the results are stored as ``sin_c`` and ``cos_c``. The period is the
    number of distinct values present in the data, so a column that does not
    cover its full cycle is encoded with a shorter period.

    The input frame is left untouched. Columns are written with ``assign``,
    so calling the function on an already encoded frame overwrites the
    existing ``sin_*``/``cos_*`` columns instead of appending duplicate
    column labels.

    Parameters
    ----------
    df : DataFrame containing the columns listed in ``col_name``.
    col_name : list of column names to encode.

    Returns
    -------
    DataFrame with two extra columns per encoded column.
    """
    encoded = df
    for time_col in col_name:
        values = encoded[time_col]
        angle = 2 * np.pi * (values - values.min()) / values.nunique()
        encoded = encoded.assign(**{
            f'sin_{time_col}': np.sin(angle),
            f'cos_{time_col}': np.cos(angle),
        })
    return encoded

def oneHotEncoding(df: pd.core.frame.DataFrame, col_name:list) -> pd.core.frame.DataFrame:
    """Replace the columns in ``col_name`` with dummy indicator columns.

    The first level of every encoded column is dropped (``drop_first=True``);
    all other columns of ``df`` are passed through unchanged.
    """
    dummy_options = {'columns': col_name, 'drop_first': True}
    return pd.get_dummies(df, **dummy_options)

def preprocess(df, col_list=['hour', 'day', 'month', 'day_of_week', 'week_of_year']):
    """Run the feature pipeline on one channel frame.

    Applies, in order: ``getFeatures`` (hourly resampling plus calendar
    columns), ``generate_cyclical_features`` on ``col_list`` and
    ``oneHotEncoding`` on ``col_list``. ``col_list`` is only read, never
    modified.
    """
    with_calendar = getFeatures(df)
    with_cyclical = generate_cyclical_features(with_calendar, col_list)
    return oneHotEncoding(with_cyclical, col_list)

In [None]:
target=['pm2_5', 'pm10', 's2_pm2_5', 's2_pm10']

In [None]:
channel_one_new.resample('H').mean()

In [None]:
# DataFrame.plot(subplots=True) opens its own figure, so the earlier
# plt.figure() call only left an empty figure behind. Take the figure from the
# returned axes instead, keeping `fig` available.
axes = channel_one_new[:1000].drop('channel_id', axis=1).plot(
    subplots=True, layout=(2, 2), figsize=(14, 10), sharex=False, rot=45
)
fig = axes.flat[0].get_figure()
fig.tight_layout(pad=5)

In [None]:
# Cast explicitly to float32: the frame mixes float columns with dummy
# indicator columns, and with pandas >= 2 (boolean dummies) a bare to_numpy()
# yields an object array that Keras cannot consume.
series_values = channel_one_new2.to_numpy(dtype='float32')
pm25_targets = channel_one_new2['pm2_5'].to_numpy(dtype='float32').reshape(-1, 1)

# Each sample is a window of 24 consecutive rows; its target is the pm2_5
# value of the row right after the window.
batch_data = TimeseriesGenerator(
    data=series_values,
    targets=pm25_targets,
    length=24,
    sampling_rate=1,
    batch_size=100,
)

In [None]:
batch_0 = batch_data[0]
x, y = batch_0
print(y.shape)

In [None]:
channel_one_new2.shape

In [None]:
x

In [None]:
y

In [None]:
y

In [None]:
channel_one_new2[:20]

In [None]:
# Second batch of the generator. The original indexed `DDD`, which is only
# defined much further down as a 3-column summary DataFrame and cannot be
# unpacked into an (x, y) pair.
batch_1 = batch_data[1]
x1, y1 = batch_1
print(y1.shape)

In [None]:
x1.shape

In [None]:
x1

In [None]:
dataa = np.array([[i] for i in range(50)])
targets = np.array([[i] for i in range(50)])
data_gen = TimeseriesGenerator(dataa, targets,
                               length=10, sampling_rate=2,
                               batch_size=2)

In [None]:
data_gen[0][1].shape

In [None]:
layers.Dense?

In [None]:
TimeseriesGenerator?

In [None]:
model.fit?

In [None]:
# Dense baseline: a per-timestep Dense layer over the (window, features)
# input, flattened, then a single regression output.
model = models.Sequential([
    layers.Dense(100, input_shape=(x.shape[1], x.shape[2]), activation='relu'),
    layers.Flatten(),
    layers.Dense(1)
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None".
model.summary()

In [None]:
model.compile(optimizer='adam', loss='mse')

In [None]:
history = model.fit(batch_data, epochs=5)

In [None]:
history.history

In [None]:
checkpoint = './models/912223_NN_model.ckpt.data-00000-of-00001'

In [None]:
# `del models` removed: it deleted the tensorflow.keras `models` module, which
# the RNN and LSTM cells below still need for `models.Sequential`.

In [None]:
from src.model import Model

In [None]:
# TODO(review): the original cell was an unfinished assignment (`model_nn =`),
# which is a syntax error. Presumably it should build the `Model` imported
# from src.model; confirm and replace this placeholder.
model_nn = None

In [None]:
model.evaluate(batch_data)

In [None]:
model.fit?

In [None]:
model.outputs

In [None]:
YYY = model.evaluate(batch_data, verbose=0)

In [None]:
YYY

In [None]:
del model

In [None]:
batch_data[0][0].shape[1:]

RNN

In [None]:
x.shape

In [None]:
# Single SimpleRNN layer with input and recurrent dropout, followed by one
# regression output.
rnn_model = models.Sequential([
    layers.SimpleRNN(100, input_shape=(x.shape[1], x.shape[2]), dropout=0.5, recurrent_dropout=0.5),
    layers.Dense(1)
])
# summary() prints directly and returns None; print() around it only added a
# stray "None".
rnn_model.summary()

In [None]:
rnn_model.compile(optimizer='adam', loss='mse')
rnn_model.fit(batch_data, epochs=5)

LSTM

In [None]:
# Two stacked bidirectional LSTM layers; the first returns the full sequence
# so the second can consume it. One regression output follows.
lstm_model = models.Sequential([
    layers.Bidirectional(layers.LSTM(100, return_sequences=True, recurrent_dropout=0.5), input_shape=(x.shape[1], x.shape[2])),
    layers.Bidirectional(layers.LSTM(100, recurrent_dropout=0.5)),
    layers.Dense(1)
])
# summary() prints directly and returns None; print() around it only added a
# stray "None".
lstm_model.summary()

In [None]:
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit(batch_data, epochs=5)

In [None]:
channel_one.groupby(pd.Grouper(key='TimeStamp', freq='1H')).mean()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, RobustScaler, QuantileTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.multioutput import MultiOutputRegressor

In [None]:
def getFeatures(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Aggregate a channel frame to hourly means and add calendar columns.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``channel_id`` column (dropped) and a datetime
        ``TimeStamp`` column (becomes the index through the hourly grouping).

    Returns
    -------
    DataFrame
        Hourly means of the numeric columns, gaps forward-filled, plus the
        integer columns ``hour``, ``day``, ``month``, ``day_of_week`` and
        ``week_of_year`` (ISO week).
    """
    hourly = (
        df.drop(labels='channel_id', axis=1)
        # An offset object avoids the deprecated '1H' string alias.
        .groupby(pd.Grouper(key='TimeStamp', freq=pd.offsets.Hour()))
        # Text columns (e.g. a site name) cannot be averaged; pandas >= 2
        # raises unless they are excluded explicitly.
        .mean(numeric_only=True)
        .ffill()
    )
    # Derive the calendar columns from this frame's own index; the original
    # read them from an unrelated global named `KK`.
    stamps = hourly.index
    return hourly.assign(
        hour=stamps.hour,
        day=stamps.day,
        month=stamps.month,
        day_of_week=stamps.dayofweek,
        # DatetimeIndex.week was removed in pandas 2.0.
        week_of_year=stamps.isocalendar().week.to_numpy(dtype='int64'),
    )

def generate_cyclical_features(df: pd.core.frame.DataFrame, col_name: list) -> pd.core.frame.DataFrame:
    """Add sine/cosine encodings for every column named in ``col_name``.

    For a column ``c`` the angle is ``2*pi*(c - c.min()) / c.nunique()`` and
    the results are stored as ``sin_c`` and ``cos_c``. The period is the
    number of distinct values present in the data, so a column that does not
    cover its full cycle is encoded with a shorter period.

    The input frame is left untouched. Columns are written with ``assign``,
    so calling the function on an already encoded frame overwrites the
    existing ``sin_*``/``cos_*`` columns instead of appending duplicate
    column labels.

    Parameters
    ----------
    df : DataFrame containing the columns listed in ``col_name``.
    col_name : list of column names to encode.

    Returns
    -------
    DataFrame with two extra columns per encoded column.
    """
    encoded = df
    for time_col in col_name:
        values = encoded[time_col]
        angle = 2 * np.pi * (values - values.min()) / values.nunique()
        encoded = encoded.assign(**{
            f'sin_{time_col}': np.sin(angle),
            f'cos_{time_col}': np.cos(angle),
        })
    return encoded

def oneHotEncoding(df: pd.core.frame.DataFrame, col_name:list) -> pd.core.frame.DataFrame:
    """One-hot encode the columns in ``col_name``, dropping each first level.

    Columns not listed in ``col_name`` are returned unchanged.
    """
    encoded = pd.get_dummies(df, columns=col_name, drop_first=True)
    return encoded
    
    
def featureLabelSplit(df, target:list):
    """Split ``df`` into a feature array and a label array.

    Returns
    -------
    tuple ``(X, Y)``
        ``Y`` holds the ``target`` columns and ``X`` every remaining column,
        both converted with ``to_numpy()``.
    """
    labels = df[target]
    features = df.drop(columns=target)
    return features.to_numpy(), labels.to_numpy()
    
def train_test_spliting(X , Y, test_ratio:float):
    """Split ``X``/``Y`` without shuffling, keeping the original row order.

    ``test_ratio`` is forwarded as ``test_size``.

    Returns
    -------
    tuple ``(X_train, Y_train, X_val, Y_val)``. Note the order differs from
    the ``train_test_split`` return order.
    """
    parts = train_test_split(X, Y, test_size=test_ratio, shuffle=False)
    X_train, X_val, Y_train, Y_val = parts
    return X_train, Y_train, X_val, Y_val

In [None]:
def scaler_transform(X, Y, scaler="standard", return_x_scaler=False):
    """Scale features and targets with independently fitted scalers.

    Parameters
    ----------
    X, Y : array-like
        Feature and target arrays.
    scaler : str
        One of ``"minmax"``, ``"standard"``, ``"power"``, ``"robust"``,
        ``"quantile"``; any other key raises ``KeyError``.
    return_x_scaler : bool, default False
        When True the fitted feature scaler is returned as well, so new
        feature data can later be transformed consistently.

    Returns
    -------
    ``(y_scaler, X_scaled, Y_scaled)`` by default, matching the original
    contract: the returned scaler is the one fitted on ``Y``.
    ``(x_scaler, y_scaler, X_scaled, Y_scaled)`` when ``return_x_scaler``
    is True.
    """
    scalers = {
        "minmax": MinMaxScaler,
        "standard": StandardScaler,
        "power": PowerTransformer,
        "robust": RobustScaler,
        "quantile": QuantileTransformer
    }

    scaler_cls = scalers[scaler]
    # Separate instances: refitting one shared object on Y would discard the
    # statistics learned from X.
    x_scaler, y_scaler = scaler_cls(), scaler_cls()
    X = x_scaler.fit_transform(X)
    Y = y_scaler.fit_transform(Y)

    if return_x_scaler:
        return x_scaler, y_scaler, X, Y
    return y_scaler, X, Y

In [None]:
AA = getFeatures(channel_one)

In [None]:
BB = generate_cyclical_features(AA, ['hour', 'day', 'month',
       'day_of_week', 'week_of_year'])

In [None]:
CC = oneHotEncoding(BB, col_name=['hour', 'day', 'month','day_of_week', 'week_of_year'])

In [None]:
DD = featureLabelSplit(CC, target=['pm2_5', 'pm10', 's2_pm2_5', 's2_pm10'])

In [None]:
P, Q = DD

In [None]:
P_train, Q_train, P_val, Q_val = train_test_spliting(P, Q, test_ratio=0.3)

In [None]:
# Features and targets get their own scaler. Refitting a single object on the
# targets would throw away the feature statistics, so later feature data could
# no longer be scaled consistently.
feature_scaler = RobustScaler()
P_train_scale = feature_scaler.fit_transform(P_train)
P_val_scale = feature_scaler.transform(P_val)

# `scaler` stays the target scaler: a later cell uses it to inverse-transform
# the predictions.
scaler = RobustScaler()
Q_train_scale = scaler.fit_transform(Q_train)
Q_val_scale = scaler.transform(Q_val)

In [None]:
clf = MultiOutputRegressor(estimator=Ridge()).fit(P_train_scale, Q_train_scale)

In [None]:
Y_pred = clf.predict(P_val_scale)

In [None]:
Y_pred

In [None]:
scaler.inverse_transform(Y_pred)

In [None]:
Q_val

In [None]:
Q_val_scale

In [None]:
Y_pred

In [None]:
Q_train_scale

In [None]:
Q_val_scale

In [None]:
def generate_time_lags(df, n_lags, column="pm2_5"):
    """Add ``lag1`` ... ``lag<n_lags>`` columns holding shifted copies of ``column``.

    Parameters
    ----------
    df : DataFrame containing ``column``; it is not modified.
    n_lags : int
        Number of lag columns to create.
    column : str, default "pm2_5"
        Column whose past values become the lag features.

    Returns
    -------
    DataFrame
        ``df`` plus the lag columns, with the first ``n_lags`` rows removed
        (those are the rows that have no complete lag history).
    """
    source = df[column]
    # Build every lag first and attach them in a single assign() instead of
    # inserting columns one at a time.
    lagged = {f"lag{n}": source.shift(n) for n in range(1, n_lags + 1)}
    df_n = df.assign(**lagged)
    return df_n.iloc[n_lags:]

In [None]:
generate_time_lags(KK, 24)

In [None]:
KK.assign(hour=KK.index)

In [None]:
# Calendar columns derived from KK's datetime index. DatetimeIndex.week was
# removed in pandas 2.0, so the ISO week comes from isocalendar() instead.
feat = KK.assign(hour = KK.index.hour,
                 day = KK.index.day,
                 month = KK.index.month,
                 day_of_week = KK.index.dayofweek,
                 week_of_year = KK.index.isocalendar().week.to_numpy(dtype='int64'))

In [None]:
feat.day_of_week.nunique()

In [None]:
time_col = 'hour'

In [None]:
dico = {f'sin_{time_col}' :lambda x: np.sin(2*np.pi*(x[time_col] - x[time_col].min()) / x[time_col].nunique()),
        f'cos_{time_col}' :lambda x: np.cos(2*np.pi*(x[time_col] - x[time_col].min()) / x[time_col].nunique())}

In [None]:
np.sin(2*np.pi*(13-0)/24)

In [None]:
# The original line stopped at `month=KK.` (a syntax error); the month is taken
# from the index, as in the `feat` cell above.
KK.assign(hour=KK.index.hour, month=KK.index.month).assign(**dico)

In [None]:
KK.assign?

In [None]:
KKKK.agg?

In [None]:
KKKK['hour'].apply?

In [None]:
KKKK[['hour', 'month']].apply(lambda x: np.sin(2*np.pi*(x - x.min()) / x.nunique()), axis=1, result_type='expand')

In [None]:
KK_features = (KK.assign)

In [None]:
KK['pm2_5'].shift(1)

In [None]:
KK['pm2_5']

In [None]:
channel_one.groupby(pd.Grouper(key='TimeStamp', freq='1H')).median()

In [None]:
KK.corr()

In [None]:
channel_one.TimeStamp.dt

In [None]:
hour_mean = channel_one.groupby(channel_one.TimeStamp.dt.month)['pm2_5'].mean().to_numpy()

In [None]:
# Group once and reuse, instead of repeating the same groupby three times.
pm25_by_month = channel_one.groupby(channel_one.TimeStamp.dt.month)['pm2_5']

# The confidence level is passed positionally: the `alpha=` keyword was
# deprecated and then removed in recent SciPy releases in favour of
# `confidence=`, and the positional form works with both.
# NOTE(review): `scale` is the per-month standard deviation. If a confidence
# interval for the monthly *mean* is intended, the scale should presumably be
# the standard error (std / sqrt(count)); confirm.
lower, upper = stats.t.interval(
    0.95,
    df=pm25_by_month.count() - 1,
    loc=pm25_by_month.mean(),
    scale=pm25_by_month.std(),
)

In [None]:
lower.reshape(-1,1).shape

In [None]:
hour_mean

In [None]:
pd.DataFrame?

In [None]:
DDD = pd.DataFrame(data=np.array([hour_mean, lower, upper]).T)

In [None]:
DDD.plot()

In [None]:
channel_one.groupby(channel_one.TimeStamp.dt.year)['pm2_5'].describe()

In [None]:
plt.figure(figsize=(16,7))

ax=sns.boxplot(x=channel_one.TimeStamp.dt.year, y='pm2_5', data=channel_one, orient='v', palette = sns.color_palette("deep", 5))
ax.set(ylim=(-50, 600))

plt.title('Boxplots of Hourly PM 2.5 by Year', fontsize=16)
plt.xlabel('')
plt.ylabel('ug/m^3', fontsize=12);

In [None]:
fig = plt.figure(figsize=(14,10))
testing[['date', 'pm2_5', 'pm10', 's2_pm2_5', 's2_pm10']].plot(x='date', subplots=True, layout=(2,2),
                                                              figsize=(14,10), sharex=False, rot=45)
plt.tight_layout(pad=5)

In [None]:
#delta = datetime.timedelta(hours=24)

In [None]:
pd.to_numeric?

In [None]:
assert?