In [1]:
%load_ext autoreload
%autoreload 2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten, LSTM
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned weather dataset through the project's own cleaner class.
from RainPredictionMachine.data import CleanDataRpm
cleaner = CleanDataRpm()
# NOTE(review): the meaning of the argument 1 is not visible in this notebook — confirm against clean_data.
df = cleaner.clean_data(1)

In [3]:
# Show the first two rows as a dict so that every column name is listed.
df.head(2).to_dict()

{'Chuva': {0: 0.0, 1: 0.0},
 'Pres': {0: 957.9, 1: 958.0},
 'Pres_max': {0: 957.9, 1: 958.3},
 'Pres_min': {0: 957.1, 1: 957.9},
 'Radiacao': {0: 0.0, 1: 0.0},
 'Temp': {0: 21.7, 1: 21.4},
 'Temp_orvalho': {0: 21.7, 1: 21.4},
 'Temp_max': {0: 23.7, 1: 21.7},
 'Temp_min': {0: 21.7, 1: 21.4},
 'Temp_orvalho_max': {0: 23.7, 1: 21.6},
 'Temp_orvalho_min': {0: 21.7, 1: 21.4},
 'Umid_max': {0: 100.0, 1: 100.0},
 'Umid_min': {0: 100.0, 1: 100.0},
 'Umid': {0: 100.0, 1: 100.0},
 'Dir_vento': {0: 130.5, 1: 109.0},
 'Rajada_vento': {0: 3.75, 1: 5.4},
 'Vel_vento': {0: 2.1, 1: 3.35},
 'Latitude': {0: -21.927251, 1: -21.927251},
 'Longitude': {0: -50.490251, 1: -50.490251},
 'Altitude': {0: 498.0, 1: 498.0},
 'datahora': {0: Timestamp('2021-01-01 00:00:00+0000', tz='UTC'),
  1: Timestamp('2021-01-01 01:00:00+0000', tz='UTC')},
 'classe_chuva': {0: 0, 1: 0}}

In [4]:
# Count missing values per column.
df.isnull().sum()

Chuva               0
Pres                0
Pres_max            0
Pres_min            0
Radiacao            0
Temp                0
Temp_orvalho        0
Temp_max            0
Temp_min            0
Temp_orvalho_max    0
Temp_orvalho_min    0
Umid_max            0
Umid_min            0
Umid                0
Dir_vento           0
Rajada_vento        0
Vel_vento           0
Latitude            0
Longitude           0
Altitude            0
datahora            0
classe_chuva        0
dtype: int64

In [5]:
def subsample_sequence(df, length):
    """Return one random window of ``length`` consecutive rows of ``df``.

    The start position is drawn uniformly from every position at which a
    full window still fits, including the very last one.

    Args:
        df: DataFrame to sample from; rows are taken by position (``iloc``).
        length: Number of consecutive rows in the returned window.

    Returns:
        The ``length``-row slice ``df.iloc[start:start + length]``.

    Raises:
        ValueError: If ``length`` is not positive or exceeds the number of rows.
    """
    n_rows = df.shape[0]
    if not 0 < length <= n_rows:
        raise ValueError(
            f"length must be between 1 and {n_rows} (rows in df), got {length}"
        )

    # randint's upper bound is exclusive: the +1 makes the last full window
    # (starting at n_rows - length) reachable and allows length == n_rows.
    index = np.random.randint(0, n_rows - length + 1)
    df_sample = df.iloc[index:index + length]

    return df_sample

In [6]:
# Draw one random 48-row window to sanity-check the sampler.
df_sample = subsample_sequence(df,48)
df_sample.head()

Unnamed: 0,Chuva,Pres,Pres_max,Pres_min,Radiacao,Temp,Temp_orvalho,Temp_max,Temp_min,Temp_orvalho_max,...,Umid_min,Umid,Dir_vento,Rajada_vento,Vel_vento,Latitude,Longitude,Altitude,datahora,classe_chuva
1136,0.0,950.5,950.5,950.3,0.0,22.4,22.4,23.0,22.2,23.0,...,100.0,100.0,56.0,3.85,1.5,-21.927251,-50.490251,498.0,2021-02-17 08:00:00+00:00,0
1137,0.0,950.7,950.8,950.4,187.15,22.4,22.4,22.6,21.7,22.6,...,100.0,100.0,46.5,5.35,3.1,-21.927251,-50.490251,498.0,2021-02-17 09:00:00+00:00,0
1138,0.0,951.6,951.6,950.7,55.4,23.3,23.3,23.5,22.4,23.5,...,100.0,100.0,161.0,2.95,1.45,-21.927251,-50.490251,498.0,2021-02-17 10:00:00+00:00,0
1139,0.0,951.9,951.9,951.6,488.3,25.1,21.8,25.1,23.3,23.7,...,80.0,82.0,174.0,1.95,0.9,-21.927251,-50.490251,498.0,2021-02-17 11:00:00+00:00,0
1140,0.0,952.5,952.5,951.9,751.5,25.0,21.6,25.7,25.0,21.9,...,73.0,82.0,166.0,4.25,2.0,-21.927251,-50.490251,498.0,2021-02-17 12:00:00+00:00,0


In [15]:
def split_subsample_sequence(df, length, horizon=24):
    '''Create one single random (X, y) pair.

    A random window of ``length`` consecutive rows is drawn from ``df``. The
    first ``length - horizon`` rows become the input and the 'Chuva' values of
    the last ``horizon`` rows become the target.

    Args:
        df: Source DataFrame; must contain the columns 'Chuva',
            'classe_chuva' and 'datahora'.
        length: Total number of rows in the sampled window (input + target).
        horizon: Number of trailing rows used as the target. Defaults to 24,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(X_sample, y_sample)`` of NumPy arrays. ``X_sample`` has
        ``length - horizon`` rows and every column of ``df`` except
        'classe_chuva' and 'datahora' (so past 'Chuva' values are part of the
        input). ``y_sample`` has ``horizon`` 'Chuva' values.

    Raises:
        ValueError: If ``horizon`` is not strictly between 0 and ``length``,
            which would leave the input or the target empty.
    '''
    if not 0 < horizon < length:
        raise ValueError(
            f"horizon must satisfy 0 < horizon < length, got horizon={horizon}, length={length}"
        )

    df_subsample = subsample_sequence(df, length)
    n_past = length - horizon

    # Target: rainfall over the trailing `horizon` rows of the window.
    y_sample = df_subsample['Chuva'].iloc[n_past:]

    # Input: the leading rows, without the derived class label and the timestamp.
    X_sample = df_subsample.drop(columns=['classe_chuva', 'datahora']).iloc[:n_past]

    # np.array copies, so the returned arrays never alias the source frame.
    return np.array(X_sample), np.array(y_sample)

In [16]:
def get_X_y(df, n_sequences, length):
    '''Draw ``n_sequences`` random (X, y) samples and stack them into two arrays.

    Each sample comes from one call to ``split_subsample_sequence(df, length)``;
    the inputs are stacked into ``X`` and the targets into ``y``.
    '''
    pairs = [split_subsample_sequence(df, length) for _ in range(n_sequences)]

    X = np.array([features for features, _ in pairs])
    y = np.array([target for _, target in pairs])
    return X, y

In [17]:
# Ordered (unshuffled) 80/20 split: the first 80% of rows train, the rest tests.
train_size = int(len(df) * 0.8)
df_train = df.iloc[:train_size]
df_test = df.iloc[train_size:]

In [19]:
# Frequency of each distinct 'Chuva' value in the training split.
df_train.value_counts(['Chuva'])

Chuva    
0.000000     6580
0.118347      174
0.200000       71
0.400000       34
0.600000       19
1.000000       16
0.800000       11
2.600000        8
1.600000        8
1.400000        7
2.000000        7
1.200000        7
2.200000        6
1.800000        4
3.000000        4
7.600000        4
5.400000        4
9.800000        3
7.000000        3
9.000000        3
3.200000        3
2.400000        3
4.600000        2
4.800000        2
20.800000       2
8.600000        2
22.600000       1
24.800000       1
34.200000       1
9.200000        1
20.200000       1
18.800000       1
15.400000       1
10.200000       1
37.800000       1
5.000000        1
8.800000        1
8.200000        1
8.000000        1
7.800000        1
7.400000        1
5.800000        1
4.400000        1
4.000000        1
3.800000        1
3.600000        1
45.000000       1
dtype: int64

In [20]:
# Size of the held-out split as (rows, columns).
df_test.shape

(1752, 22)

In [21]:
# Sample 6000 random 72-row windows from the training split; get_X_y splits each
# one into input rows and the trailing 24 'Chuva' target values.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# windows (and every result below) change on each run.
X_train,y_train = get_X_y(df_train,6000,72)

In [22]:
# Same sampling on the held-out split.
# NOTE(review): df_test has only 1752 rows, so there are far fewer than 6000
# distinct 72-row windows — the test set necessarily contains repeated windows.
X_test,y_test = get_X_y(df_test,6000,72)

In [23]:
# Expect (n_sequences, 24): one row of 24 target 'Chuva' values per window.
y_train.shape

(6000, 24)

In [24]:
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.utils import to_categorical

# # label = LabelEncoder()
# # y_enc = label.fit_transform(y.reshape((-1,)))
# y_cat = to_categorical(y_train)
# y_cat.shape

In [25]:
# Normalization is imported from its stable location; the older
# `layers.experimental.preprocessing` namespace is deprecated.
from tensorflow.keras.layers import Normalization

# Fit the layer's statistics on the training windows only, so nothing from the
# test split leaks into the preprocessing.
norm = Normalization()
norm.adapt(X_train)

2022-03-23 17:25:30.794338: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [28]:
# Expect (n_sequences, input time steps, features).
X_train.shape

(6000, 48, 20)

In [42]:
# Layers are passed as one list; this builds the same stack as repeated .add() calls.
model = Sequential([
    norm,                               # normalization layer adapted in an earlier cell
    LSTM(units=20, activation='tanh'),  # reads the whole input window, emits one vector
    Dense(10, activation="tanh"),
    Dense(24, activation="relu"),       # 24 non-negative outputs, one per target step
])

# NOTE(review): 'mape' divides by the true value; with mostly-zero rainfall
# targets its reported value is dominated by those zeros — treat it with care.
model.compile(
    loss='mse',
    optimizer='rmsprop',
    metrics=['mae', 'mse', 'mape'],
)

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, None, 20)         41        
 n)                                                              
                                                                 
 lstm_3 (LSTM)               (None, 20)                3280      
                                                                 
 dense_6 (Dense)             (None, 10)                210       
                                                                 
 dense_7 (Dense)             (None, 24)                264       
                                                                 
Total params: 3,795
Trainable params: 3,754
Non-trainable params: 41
_________________________________________________________________


In [43]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop after 3 epochs without improvement and roll back to the best weights.
es = EarlyStopping(patience=3, restore_best_weights=True)

# NOTE(review): validation_split holds out the tail of X_train, but those
# windows were sampled at random from the same df_train rows as the training
# windows, so validation and training data can overlap — confirm this is intended.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.2,
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a5ebcac0>

In [44]:
# Predict the 24 target values for every test window and display the array.
y_pred = model.predict(X_test)
y_pred

array([[1.4073079 , 1.6721784 , 0.        , ..., 0.6364898 , 0.4593822 ,
        0.        ],
       [0.        , 0.        , 0.07714836, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.0425825 , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.10916512, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.2632447 , 0.17786537, 0.        , ..., 0.39763182, 0.51449025,
        0.        ]], dtype=float32)

In [45]:
# Returns the loss followed by the compiled metrics, i.e. [mse loss, mae, mse, mape].
model.evaluate(X_test,y_test)



[1.89149808883667, 0.2292419970035553, 1.89149808883667, 52473084.0]

In [65]:
df_y_test = pd.DataFrame(y_test).T # after the transpose each column is one sampled window (6000 in total) and each row one future step

In [66]:
# Same layout as df_y_test: one column per sampled window, one row per future step.
df_y_pred = pd.DataFrame(y_pred).T

In [56]:
# Summary statistics of the predictions, one column per forecast step (0-23).
pd.DataFrame(y_pred).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.102326,0.093907,0.010789,0.038967,0.059802,0.058311,0.036377,0.023362,0.027757,0.027284,...,0.03924,0.056136,0.075604,0.087266,0.000164,0.1611,0.167479,0.135534,0.115474,0.000863
std,0.307448,0.315244,0.036708,0.210054,0.301974,0.183321,0.128109,0.103947,0.129587,0.134687,...,0.113718,0.134614,0.166078,0.173847,0.002638,0.223643,0.221037,0.189152,0.173022,0.008745
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.074136,0.090874,0.055378,0.023214,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.031146,0.06707,0.118914,0.0,0.248472,0.237673,0.206544,0.178965,0.0
max,2.326106,2.45781,0.314969,1.976095,2.529535,1.337049,0.963521,1.446426,1.370263,1.460431,...,0.847133,0.866307,1.020473,1.21639,0.070429,1.300596,1.192397,1.051156,0.953769,0.181308


In [68]:
def classe_chuva(precipitacao):
    """Map a precipitation amount to a rain-intensity class.

    Args:
        precipitacao: Numeric precipitation amount (presumably millimetres,
            going by the local name ``mm`` — confirm against the data source).

    Returns:
        ``"NaN"`` when the input is NaN, otherwise an int class:
        0 for exactly 0 (no rain), 1 for (0, 5.0] (light),
        2 for (5.0, 25.0] (moderate), 3 for everything else (heavy).
        Negative inputs also fall through to 3, as before.
    """
    mm = precipitacao

    # Return immediately for NaN. Previously this check was a separate `if`,
    # so the chain below overwrote "NaN" with 3 (every comparison against NaN
    # is False, which selected the final `else`).
    if np.isnan(mm):
        return "NaN"

    if mm == 0:
        return 0  # no rain
    if 0 < mm <= 5.0:
        return 1  # light
    if 5.0 < mm <= 25.0:
        return 2  # moderate
    return 3  # heavy

In [74]:
# Convert every true rainfall value to its class, then count how often each
# 24-step class pattern occurs across the sampled windows.
df_y_test_categorical = df_y_test.applymap(classe_chuva)
df_y_test_categorical.T.value_counts()

0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23
0  0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0     4065
1  0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0       33
0  0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   0   0   1       27
                                                          1   0   0   0   0   0   0       25
   1  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0       24
                                                                                        ... 
      1  1  1  2  1  0  0  0  0   0   1   2   0   0   0   1   1   0   1   1   0   0        1
               1  1  1  0  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0        1
   0  0  0  0  0  2  1  0  0  0   0   0   1   0   0   0   0   0   0   0   0   0   0        1
               1  0  0  0  0  0   0   0   0   2   0   0   0   0   1   1   0   

In [75]:
# Convert every predicted rainfall value to its class, then count how often
# each 24-step class pattern occurs across the sampled windows.
df_y_pred_categorical = df_y_pred.applymap(classe_chuva)
df_y_pred_categorical.T.value_counts()

0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23
0  0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0     597
                                                                  1   1   1   1   0     249
                                                  1   1   1   0   1   1   1   1   0     232
                                                  0   0   0   0   0   0   1   1   0     212
                                                                      1   1   1   0     202
                                                                                       ... 
               1  0  0  0  0  0   0   0   0   0   0   0   1   0   0   1   0   0   0       1
1  1  0  0  0  1  0  0  0  0  0   0   0   0   0   0   0   0   0   0   1   0   1   0       1
0  0  0  0  0  0  0  0  0  0  0   0   0   0   0   1   1   1   0   0   0   0   0   0       1
1  0  0  1  0  1  1  0  0  0  0   0   0   0   0   0   0   1   0   1   1   1   1   0    

In [78]:
(df_y_test_categorical == df_y_pred_categorical).sum() # how many of the 24 steps were classified correctly in each sampled window

0        7
1       21
2       23
3       14
4       16
        ..
5995    20
5996     5
5997    22
5998    21
5999    14
Length: 6000, dtype: int64

In [81]:
# Total number of predicted values: 6000 sampled windows x 24 steps each.
6000*24

144000

In [82]:
# Accuracy = fraction of cells whose predicted rain class equals the true class.
# The denominator is taken from the comparison frame itself rather than a
# hard-coded 6000*24, so it stays correct if the number of sampled windows or
# forecast steps changes.
matches = df_y_test_categorical == df_y_pred_categorical
matches.sum().sum() / matches.size #accuracy

0.7476180555555556