In [1]:
%load_ext autoreload
%autoreload 2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten, LSTM
import pandas as pd
import numpy as np

In [2]:
from RainPredictionMachine.data import CleanDataRpm
cleaner = CleanDataRpm()
df = cleaner.clean_data(1)

In [3]:
df.head(2).to_dict()

{'Chuva': {0: 0.0, 1: 0.0},
 'Pres': {0: 957.9, 1: 958.0},
 'Pres_max': {0: 957.9, 1: 958.3},
 'Pres_min': {0: 957.1, 1: 957.9},
 'Radiacao': {0: 0.0, 1: 0.0},
 'Temp': {0: 21.7, 1: 21.4},
 'Temp_orvalho': {0: 21.7, 1: 21.4},
 'Temp_max': {0: 23.7, 1: 21.7},
 'Temp_min': {0: 21.7, 1: 21.4},
 'Temp_orvalho_max': {0: 23.7, 1: 21.6},
 'Temp_orvalho_min': {0: 21.7, 1: 21.4},
 'Umid_max': {0: 100.0, 1: 100.0},
 'Umid_min': {0: 100.0, 1: 100.0},
 'Umid': {0: 100.0, 1: 100.0},
 'Dir_vento': {0: 130.5, 1: 109.0},
 'Rajada_vento': {0: 3.75, 1: 5.4},
 'Vel_vento': {0: 2.1, 1: 3.35},
 'Latitude': {0: -21.927251, 1: -21.927251},
 'Longitude': {0: -50.490251, 1: -50.490251},
 'Altitude': {0: 498.0, 1: 498.0},
 'datahora': {0: Timestamp('2021-01-01 00:00:00+0000', tz='UTC'),
  1: Timestamp('2021-01-01 01:00:00+0000', tz='UTC')},
 'classe_chuva': {0: 0, 1: 0}}

In [4]:
df.isnull().sum()

Chuva               0
Pres                0
Pres_max            0
Pres_min            0
Radiacao            0
Temp                0
Temp_orvalho        0
Temp_max            0
Temp_min            0
Temp_orvalho_max    0
Temp_orvalho_min    0
Umid_max            0
Umid_min            0
Umid                0
Dir_vento           0
Rajada_vento        0
Vel_vento           0
Latitude            0
Longitude           0
Altitude            0
datahora            0
classe_chuva        0
dtype: int64

In [5]:
def subsample_sequence(df, length):
    index = np.random.randint(0, df.shape[0] - length)
    df_sample = df.iloc[index:index+length]
    
    return df_sample

In [6]:
df_sample = subsample_sequence(df,48)
df_sample.head()

Unnamed: 0,Chuva,Pres,Pres_max,Pres_min,Radiacao,Temp,Temp_orvalho,Temp_max,Temp_min,Temp_orvalho_max,...,Umid_min,Umid,Dir_vento,Rajada_vento,Vel_vento,Latitude,Longitude,Altitude,datahora,classe_chuva
2122,0.0,955.6,955.6,955.0,49.2,19.6,19.6,20.1,18.8,20.0,...,100.0,100.0,107.0,2.4,0.5,-21.927251,-50.490251,498.0,2021-03-30 10:00:00+00:00,0
2123,0.0,956.1,956.2,955.6,558.5,22.0,20.9,22.1,19.6,21.0,...,90.0,93.0,37.0,2.7,1.8,-21.927251,-50.490251,498.0,2021-03-30 11:00:00+00:00,0
2124,0.118347,957.201226,957.464789,956.940262,829.614453,23.506298,15.272879,24.208814,22.826405,15.814165,...,61.544826,64.635291,147.019878,5.888722,2.862701,-21.927251,-50.490251,498.0,2021-03-30 12:00:00+00:00,1
2125,0.0,956.7,956.8,956.6,2201.8,26.2,21.5,26.6,24.0,21.6,...,70.0,75.0,222.0,3.4,1.2,-21.927251,-50.490251,498.0,2021-03-30 13:00:00+00:00,0
2126,0.0,956.4,956.7,956.4,2886.6,27.8,20.8,28.3,26.2,21.8,...,62.0,66.0,251.0,4.9,2.2,-21.927251,-50.490251,498.0,2021-03-30 14:00:00+00:00,0


In [7]:
def split_subsample_sequence(df, length):
    '''Create one single random (X,y) pair'''
    
    df_subsample = subsample_sequence(df, length)
    y_sample = df_subsample['classe_chuva'].iloc[length - 24:]
    
    X_sample = df_subsample.drop(columns= ['classe_chuva', 'datahora'])[0:length -24]
    X_sample = X_sample.values
   
    return np.array(X_sample), np.array(y_sample)

In [8]:
X,y = split_subsample_sequence(df,48)
X.shape

(24, 20)

In [9]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [10]:
X,y 

(array([[ 0.0000000e+00,  9.6020000e+02,  9.6030000e+02,  9.6000000e+02,
          0.0000000e+00,  1.5200000e+01,  8.4000000e+00,  1.5600000e+01,
          1.5000000e+01,  8.5000000e+00,  7.9000000e+00,  6.4000000e+01,
          6.1000000e+01,  6.4000000e+01,  1.2600000e+02,  4.4000000e+00,
          2.2000000e+00, -2.1927251e+01, -5.0490251e+01,  4.9800000e+02],
        [ 0.0000000e+00,  9.6090000e+02,  9.6090000e+02,  9.6020000e+02,
          9.6500000e+01,  1.4800000e+01,  8.8000000e+00,  1.5500000e+01,
          1.4800000e+01,  8.8000000e+00,  8.4000000e+00,  6.7000000e+01,
          6.4000000e+01,  6.7000000e+01,  1.0800000e+02,  4.8000000e+00,
          1.9000000e+00, -2.1927251e+01, -5.0490251e+01,  4.9800000e+02],
        [ 0.0000000e+00,  9.6200000e+02,  9.6200000e+02,  9.6100000e+02,
          1.9300000e+02,  1.6800000e+01,  9.5000000e+00,  1.6800000e+01,
          1.4800000e+01,  9.6000000e+00,  8.7000000e+00,  6.7000000e+01,
          6.2000000e+01,  6.2000000e+01,  1.09000

In [11]:
def get_X_y(df, n_sequences, length):
    '''Return a list of samples (X, y)'''
    X, y = [], []

    for i in range(n_sequences):
        (xi, yi) = split_subsample_sequence(df, length)
        X.append(xi)
        y.append(yi)
        
    X = np.array(X)
    y = np.array(y)
    return X, y

In [12]:
df.shape[0]*0.8

7008.0

In [13]:
train_size = int(df.shape[0]*0.8)
df_train = df.iloc[:train_size,:]
df_test = df.iloc[train_size:,:]

In [14]:
df_train.value_counts(['classe_chuva'])

classe_chuva
0               6580
1                391
2                 34
3                  3
dtype: int64

In [15]:
df_test.shape

(1752, 22)

In [16]:
X_train,y_train = get_X_y(df_train,6000,72)

In [17]:
X_test,y_test = get_X_y(df_test,6000,72)

In [18]:
y_train.shape

(6000, 24)

In [19]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# label = LabelEncoder()
# y_enc = label.fit_transform(y.reshape((-1,)))
y_cat = to_categorical(y_train)
y_cat.shape

(6000, 24, 4)

In [20]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization

norm = Normalization()
norm.adapt(X_train)

2022-03-24 09:06:51.005914: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
model = Sequential()

model.add(norm)

model.add(LSTM(units=20, activation='tanh'))
model.add(Dense(10, activation="tanh"))
model.add(Dense(24, activation="sigmoid"))

model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, None, 20)         41        
 n)                                                              
                                                                 
 lstm (LSTM)                 (None, 20)                3280      
                                                                 
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 24)                264       
                                                                 
Total params: 3,795
Trainable params: 3,754
Non-trainable params: 41
_________________________________________________________________


In [22]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=3, restore_best_weights=True)

model.fit(X_train, y_train, batch_size=32, epochs=10, verbose=1,
         validation_split=0.2,
          callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a582bdc0>

In [23]:
model.predict(X_test)[0]

array([0.0204567 , 0.02107978, 0.0151518 , 0.01571482, 0.01991513,
       0.0170511 , 0.01246396, 0.01454297, 0.01182646, 0.01037672,
       0.01331779, 0.01705784, 0.02010471, 0.01637325, 0.01310965,
       0.02093032, 0.02258494, 0.01668856, 0.01950866, 0.02388427,
       0.02403748, 0.02877757, 0.02385899, 0.02777055], dtype=float32)