In [1]:
!pip install kaggle



Upload your credential key from kaggle

In [2]:
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Download datasets with API Command from kaggle

In [3]:
!kaggle datasets download -d muthuj7/weather-dataset

Downloading weather-dataset.zip to /content
  0% 0.00/2.23M [00:00<?, ?B/s]
100% 2.23M/2.23M [00:00<00:00, 176MB/s]


Extract the data

In [None]:
!unzip weather-dataset.zip -d ./data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [7]:
df = pd.read_csv('/content/data/weatherHistory.csv')
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [8]:
main_df = df[['Formatted Date','Temperature (C)','Apparent Temperature (C)']]
main_df = main_df.rename(columns={'Formatted Date': 'time'})

main_df.head()

Unnamed: 0,time,Temperature (C),Apparent Temperature (C)
0,2006-04-01 00:00:00.000 +0200,9.472222,7.388889
1,2006-04-01 01:00:00.000 +0200,9.355556,7.227778
2,2006-04-01 02:00:00.000 +0200,9.377778,9.377778
3,2006-04-01 03:00:00.000 +0200,8.288889,5.944444
4,2006-04-01 04:00:00.000 +0200,8.755556,6.977778


In [9]:
def make_time_features(series):

    #convert series to datetimes
    times = series.apply(lambda x: x.split('+')[0])
    datetimes = pd.DatetimeIndex(times)

    hours = datetimes.hour.values
    day = datetimes.dayofweek.values
    months = datetimes.month.values

    hour = pd.Series(hours, name='hours')
    dayofw = pd.Series(day, name='dayofw')
    month = pd.Series(months, name='months')

    return hour, dayofw, month

In [10]:
def clean_data(series):
    """Fills missing values.

        Interpolate missing values with a linear approximation.
    """
    series_filled = series.interpolate(method='linear')

    return series_filled


def min_max_scale(dataframe):
    """ Applies MinMax Scaling

        Wrapper for sklearn's MinMaxScaler class.
    """
    mm = MinMaxScaler()
    return mm.fit_transform(dataframe)

def split_data(series, train_fraq, test_len=8760):
    """Splits input series into train, val and test.

        Default to 1 year of test data.
    """
    #slice the last year of data for testing 1 year has 8760 hours
    test_slice = len(series)-test_len

    test_data = series[test_slice:]
    train_val_data = series[:test_slice]

    #make train and validation from the remaining
    train_size = int(len(train_val_data) * train_fraq)

    train_data = train_val_data[:train_size]
    val_data = train_val_data[train_size:]

    return train_data, val_data, test_data

In [24]:
multivar_df = clean_data(main_df)

#add hour and month features
hours, day, months = make_time_features(multivar_df.time)
multivar_df = pd.concat([multivar_df.drop(['time'], axis=1), hours, day, months], axis=1)

print(multivar_df)

       Temperature (C)  Apparent Temperature (C)  hours  dayofw  months
0             9.472222                  7.388889      0       5       4
1             9.355556                  7.227778      1       5       4
2             9.377778                  9.377778      2       5       4
3             8.288889                  5.944444      3       5       4
4             8.755556                  6.977778      4       5       4
...                ...                       ...    ...     ...     ...
96448        26.016667                 26.016667     19       4       9
96449        24.583333                 24.583333     20       4       9
96450        22.038889                 22.038889     21       4       9
96451        21.522222                 21.522222     22       4       9
96452        20.438889                 20.438889     23       4       9

[96453 rows x 5 columns]


In [25]:
#scale
multivar_df = min_max_scale(multivar_df)

train_multi, val_multi, test_multi = split_data(multivar_df, train_fraq=0.65, test_len=8760)

In [19]:
train_multi

array([[0.50697507, 0.52348604, 0.        , 0.83333333, 0.27272727],
       [0.50508505, 0.52108359, 0.04347826, 0.83333333, 0.27272727],
       [0.50544505, 0.5531439 , 0.08695652, 0.83333333, 0.27272727],
       ...,
       [0.79371794, 0.82221854, 0.91304348, 0.        , 0.54545455],
       [0.79128791, 0.82428962, 0.95652174, 0.        , 0.54545455],
       [0.76203762, 0.7893298 , 1.        , 0.        , 0.54545455]])

In [20]:
def window_dataset(data, n_steps, n_horizon, batch_size, shuffle_buffer, multi_var=False, expand_dims=False):
    """ Create a windowed tensorflow dataset

    """

    #create a window with n steps back plus the size of the prediction length
    window = n_steps + n_horizon

    #expand dimensions to 3D to fit with LSTM inputs
    #creat the inital tensor dataset
    if expand_dims:
        ds = tf.expand_dims(data, axis=-1)
        ds = tf.data.Dataset.from_tensor_slices(ds)
    else:
        ds = tf.data.Dataset.from_tensor_slices(data)

    #create the window function shifting the data by the prediction length
    ds = ds.window(window, shift=n_horizon, drop_remainder=True)

    #flatten the dataset and batch into the window size
    ds = ds.flat_map(lambda x : x.batch(window))
    ds = ds.shuffle(shuffle_buffer)

    #create the supervised learning problem x and y and batch
    if multi_var:
        ds = ds.map(lambda x : (x[:-n_horizon], x[-n_horizon:, :1]))
    else:
        ds = ds.map(lambda x : (x[:-n_horizon], x[-n_horizon:]))

    ds = ds.batch(batch_size).prefetch(1)

    return ds

tf.random.set_seed(42)

n_steps = 72
n_horizon = 24
batch_size = 1
shuffle_buffer = 100

train_ds = window_dataset(train_multi, n_steps, n_horizon, batch_size, shuffle_buffer, multi_var=True)
val_ds = window_dataset(val_multi, n_steps, n_horizon, batch_size, shuffle_buffer, multi_var=True)
test_ds = window_dataset(test_multi, n_steps, n_horizon, batch_size, shuffle_buffer, multi_var=True)

print('Example sample shapes')
for idx,(x,y) in enumerate(train_ds):
    print("x = ", x.numpy().shape)
    print("y = ", y.numpy().shape)
    break

Example sample shapes
x =  (1, 72, 5)
y =  (1, 24, 1)


In [21]:
def get_params(multivar=False):
    lr = 3e-4
    n_steps=24*30
    n_horizon=24
    if multivar:
        n_features=5
    else:
        n_features=1

    return n_steps, n_horizon, n_features, lr

In [22]:
def lstm_model(n_steps, n_horizon, n_features, lr):

    tf.keras.backend.clear_session()

    model = tf.keras.models.Sequential([
        tf.keras.layers.LSTM(72, activation='relu', input_shape=(n_steps, n_features), return_sequences=True),
        tf.keras.layers.LSTM(48, activation='relu', return_sequences=False),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(n_horizon)
    ])

    loss = tf.keras.losses.Huber()
    optimizer = tf.keras.optimizers.Adam(lr=lr)

    model.compile(loss=loss, optimizer='adam', metrics=['mae'])

    return model

lstm = lstm_model(*get_params(multivar=True))
lstm.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 720, 72)           22464     
                                                                 
 lstm_1 (LSTM)               (None, 48)                23232     
                                                                 
 flatten (Flatten)           (None, 48)                0         
                                                                 
 dropout (Dropout)           (None, 48)                0         
                                                                 
 dense (Dense)               (None, 128)               6272      
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 24)                3

In [23]:
lstm.fit(train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b23ff7adcc0>