In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe

Using TensorFlow backend.


In [3]:
df = pd.read_csv("model_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27140 entries, 0 to 27139
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        27140 non-null  int64  
 1   Date              27140 non-null  object 
 2   temp              27140 non-null  float64
 3   dewp              27140 non-null  float64
 4   slp               27140 non-null  float64
 5   visib             27140 non-null  float64
 6   wdsp              27140 non-null  float64
 7   max               27140 non-null  float64
 8   min               27140 non-null  float64
 9   fog               27140 non-null  float64
 10  rain_drizzle      27140 non-null  float64
 11  snow_ice_pellets  27140 non-null  float64
 12  hail              27140 non-null  float64
 13  thunder           27140 non-null  float64
 14  year sin          27140 non-null  float64
 15  year cos          27140 non-null  float64
dtypes: float64(14), int64(1), object(1)
memo

In [4]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

In [5]:
values = df.drop(["Date"], axis=1).values
values = values.astype("float32")
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

In [6]:
values

array([[ 1.64300000e+03,  5.29000015e+01,  4.22000008e+01, ...,
         0.00000000e+00, -2.42320392e-02, -9.99706388e-01],
       [ 1.64400000e+03,  6.12999992e+01,  5.29000015e+01, ...,
         0.00000000e+00, -4.14253324e-02, -9.99141574e-01],
       [ 1.64500000e+03,  6.51999969e+01,  6.02999992e+01, ...,
         0.00000000e+00, -5.86063638e-02, -9.98281181e-01],
       ...,
       [ 2.87800000e+04,  7.27799988e+01,  5.95000000e+01, ...,
         0.00000000e+00,  9.93693531e-01, -1.12130076e-01],
       [ 2.87810000e+04,  7.28000031e+01,  5.85800018e+01, ...,
         0.00000000e+00,  9.91617680e-01, -1.29206926e-01],
       [ 2.87820000e+04,  7.63399963e+01,  6.75800018e+01, ...,
         0.00000000e+00,  9.89248335e-01, -1.46245554e-01]], dtype=float32)

In [7]:
series_to_supervised(scaled, 2, 1)

Unnamed: 0,var1(t-2),var2(t-2),var3(t-2),var4(t-2),var5(t-2),var6(t-2),var7(t-2),var8(t-2),var9(t-2),var10(t-2),...,var6(t),var7(t),var8(t),var9(t),var10(t),var11(t),var12(t),var13(t),var14(t),var15(t)
2,0.000000,0.478950,0.551425,0.580982,0.372414,0.169381,0.469502,0.447147,0.0,0.0,...,0.221498,0.591092,0.662809,0.0,0.0,0.0,0.0,0.0,0.470697,0.000859
3,0.000037,0.593028,0.684015,0.558163,0.396552,0.351792,0.529695,0.596347,0.0,0.0,...,0.407166,0.577849,0.730627,0.0,1.0,0.0,0.0,0.0,0.462115,0.001437
4,0.000074,0.645994,0.775712,0.512520,0.362069,0.221498,0.591092,0.662809,0.0,0.0,...,0.315961,0.577849,0.608554,0.0,1.0,0.0,0.0,1.0,0.453544,0.002163
5,0.000111,0.666365,0.812887,0.455467,0.296552,0.407166,0.577849,0.730627,0.0,1.0,...,0.201954,0.615169,0.623474,1.0,1.0,0.0,0.0,0.0,0.444987,0.003036
6,0.000147,0.617474,0.686493,0.225357,0.324138,0.315961,0.577849,0.608554,0.0,1.0,...,0.192182,0.540530,0.703499,1.0,1.0,0.0,0.0,0.0,0.436447,0.004055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27135,0.999779,0.741602,0.658736,0.548273,0.320690,0.160261,0.701605,0.683425,0.0,0.0,...,0.067101,0.730979,0.658739,0.2,0.0,0.0,0.0,0.0,0.998481,0.461059
27136,0.999816,0.693526,0.696654,0.504913,0.315172,0.087296,0.705939,0.623745,0.0,0.0,...,0.103583,0.704254,0.739850,0.0,0.0,0.0,0.0,0.0,0.997738,0.452490
27137,0.999853,0.736170,0.759851,0.515562,0.297241,0.067101,0.730979,0.658739,0.2,0.0,...,0.106189,0.731461,0.715978,0.0,0.0,0.0,0.0,0.0,0.996847,0.443935
27138,0.999889,0.756813,0.784634,0.519367,0.311034,0.103583,0.704254,0.739850,0.0,0.0,...,0.102280,0.745425,0.680441,0.0,0.0,0.0,0.0,0.0,0.995809,0.435397


In [8]:
reframed = series_to_supervised(scaled, 1, 1)
reframed.drop(reframed.columns[[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]], axis=1, inplace=True)

In [None]:
reframed.head()

In [44]:
reframe_values = reframed.values
X = reframe_values[:,:-1]
y = reframe_values[:, -1]

x_train, x_test, y_train, y_test = train_test_split(X, y, shuffle=False)
X_train = np.expand_dims(x_train, axis=1)
X_test = np.expand_dims(x_test, axis=1)
y_train = np.expand_dims(y_train, axis=1)
y_test = np.expand_dims(y_test, axis=1)

In [40]:
y_test

array([[0.7500276 ],
       [0.75006443],
       [0.7501013 ],
       ...,
       [0.9999263 ],
       [0.99996316],
       [1.        ]], dtype=float32)

In [51]:
def data():
    return X_train, y_train, X_test, y_test


In [42]:

def create_model(X_train, Y_train, X_test, Y_test):
    model = Sequential()
    model.add(LSTM({{uniform(10,50)}}, 
            input_shape=(X_train.shape[1], 
            X_train.shape[2])
            ))
    model.add(Dense({{choice([10, 20, 50, 100])}}))
    model.add(Dense(1))
    model.compile(loss={{choice(["mae", 'categorical_crossentropy'])}}, 
                    optimizer={{choice(["adam", "sgd", "rmsprop"])}}, 
                    metrics=["mse"])

    result = model.fit(X_train, Y_train, epochs=25, batch_size=60, validation_split=0.1, verbose=2, shuffle=False)

    validation_acc=np.amax(result.history['val_acc'])
    print('Best Validation acc of epoch:', validation_acc)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model':model}

In [53]:
X_train

array([[[0.0000000e+00, 4.7894979e-01, 5.5142498e-01, ...,
         0.0000000e+00, 4.8788399e-01, 1.4680624e-04]],

       [[3.6846846e-05, 5.9302849e-01, 6.8401486e-01, ...,
         0.0000000e+00, 4.7928733e-01, 4.2921305e-04]],

       [[7.3693693e-05, 6.4599359e-01, 7.7571243e-01, ...,
         0.0000000e+00, 4.7069681e-01, 8.5940957e-04]],

       ...,

       [[7.4988019e-01, 6.8809414e-01, 8.1734812e-01, ...,
         0.0000000e+00, 4.0242881e-02, 3.0347168e-01]],

       [[7.4991709e-01, 6.8048882e-01, 7.0161086e-01, ...,
         0.0000000e+00, 3.6930233e-02, 3.1140947e-01]],

       [[7.4995393e-01, 4.8275238e-01, 2.9962823e-01, ...,
         0.0000000e+00, 3.3754647e-02, 3.1940305e-01]]], dtype=float32)

In [49]:


if __name__ == '__main__':
    X_train, y_train, X_test, y_test = data()
    best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials(),
                                          notebook_name = 'BlackBox')
    
    print('Evaluation of best performing model:')
    print(best_model.evaluate(X_test, y_test))
    print("Best Performing Model Hyper-Parameters:")
    print(best_run)

>>> Imports:
#coding=utf-8

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    from matplotlib import pyplot as plt
except:
    pass

try:
    from sklearn.preprocessing import LabelEncoder
except:
    pass

try:
    from sklearn.preprocessing import MinMaxScaler
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.metrics import mean_squared_error
except:
    pass

try:
    from tensorflow.keras.models import Sequential
except:
    pass

try:
    from tensorflow.keras.layers import Dense
except:
    pass

try:
    from tensorflow.keras.layers import LSTM
except:
    pass

try:
    from tensorflow.keras.preprocessing import timeseries_dataset_from_array
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    from hyperop

NameError: name 'X_train' is not defined

In [None]:
def data():
    X_train = np.expand_dims(X_train, axis=1)
    X_test = np.expand_dims(X_test, axis=1)
    y_train = np.expand_dims(y_train, axis=1)
    y_test = np.expand_dims(y_test, axis=1)
    return X_train, y_train, X_test, y_test

def create_model(X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(LSTM({{uniform(10,50)}}, 
            input_shape=(X_train.shape[1], 
            X_train.shape[2])
            ))
    model.add(Dense({{choice([10, 20, 50, 100])}}))
    model.add(Dense(1))
    model.compile(loss={{choice(["mae", 'categorical_crossentropy'])}}, 
                    optimizer={{choice(["adam", "sgd", "rmsprop"])}}, 
                    metrics=["mse"])

    score = model.evaluate(X_test, y_test, verbose=0)
    accuracy = score[1]
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

In [None]:
    best_run = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials())

In [None]:
plt.plot(history.history["mse"], label="train")
plt.plot(history.history["val_mse"], label="test")
plt.legend()
plt.title('Train vs. Test MSE')
plt.show()

In [None]:
residual = pred - y_test
plt.plot(residual)

In [None]:
plt.plot(residual[0:1000])

In [None]:
pred = model.predict(X_test)
X_test_reshape = X_test.reshape((X_test.shape[0], X_test.shape[2]))
inv = np.concatenate((pred, X_test_reshape[:, 1:]), axis=1)



inv_scale = scaler.inverse_transform(inv)

y_inv = np.concatenate((y_test, X_test_reshape[:, 1:]), axis=1)
y_inv = scaler.inverse_transform(y_inv)
y_inv

# y_test.shape
inv_scale[0].shape
# scaler.inverse_transform(pred)
predictions = pd.DataFrame({"Prediction": inv_scale[:,:1].flatten(), "Actual": y_inv[:,:1].flatten()})

In [None]:
predictions.head(50)