In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime as dt
from statistics import mean
import pdb
import itertools
import csv

In [34]:
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers import Dense, TimeDistributed, Flatten, Dropout
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers.advanced_activations import LeakyReLU

In [4]:
#reading data from webpage
url = "https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=2009-10"
tables = pd.io.html.read_html(url)

In [5]:
#choosing the second table from the tables object
#data summary
tables[1].head()

Unnamed: 0,0,1,2,3,4,5
0,Nap,T min,T max,Csapadék,Jelenségek,Hóréteg
1,1.,"13,3 °C","22,4 °C","0,0 mm",,0 cm
2,Leírás:,Leírás:,Leírás:,Leírás:,Leírás:,Leírás:
3,2.,"12,6 °C","20,1 °C","0,0 mm",,0 cm
4,Leírás:,Leírás:,Leírás:,Leírás:,Leírás:,Leírás:


In [6]:
#data cleaning, removing unnecessary columns and characters as we only need the values in the min and max columns
avgs = []
daterange = pd.date_range(start='10/1/2009', end='11/1/2019', freq='M')
for single_date in daterange:
    single_date = single_date.to_pydatetime()
    date = single_date.date()
    url = "https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=%s" % date.strftime("%Y-%m-%d")
    if date.strftime("%Y-%m-%d").split('-')[0] == '2019': # leaving out the test data
        continue
    if date.strftime("%Y-%m-%d").split('-')[1] not in ['9', '10', '11']: # choosing only september, october and november
        continue
    tables = pd.io.html.read_html(url)
    table = tables[1]
    table = table.drop([3, 4, 5], axis=1)
    table[0] = table[0].str.replace(r'^[a-zA-Z]+.*','') # filtering out unnecessary words and symbols
    table[1] = table[1].str.replace(r'^[a-zA-Z]+.*','')
    table[2] = table[2].str.replace(r'^[a-zA-Z]+.*','')
    table = table[table[1] != '']
    table[1] = table[1].str.replace('°','')
    table[2] = table[2].str.replace('°','')
    table[1] = table[1].str.replace(',','.')
    table[1] = table[1].str.replace('C','')
    table[2] = table[2].str.replace(',','.')
    table[2] = table[2].str.replace('C','')
    table = table.set_index(0)
    minmax = np.array(table, dtype=np.float32).tolist()
    middle = [mean(i) for i in minmax] # taking the average of max and min temperatures
    avgs.append(middle)
merged_avgs = list(itertools.chain.from_iterable(avgs)) # flattened list of lists
merged_avgs = list(filter(lambda a: a != 'nan', merged_avgs))



In [7]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        # check if we are beyond the sequence
        if out_end_ix > len(sequence):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:out_end_ix]
        X.append(seq_x)
        y.append(seq_y[-1])
    return np.array(X), np.array(y)

In [39]:
# I divided the data into 90 dim vectors and 1 dim labels so I can predict one day that is either the next day, a week later
# or a month later
raw_seq = merged_avgs
n_steps_in = 90
n_steps_out = 1
# split into samples
X, y = split_sequence(raw_seq, n_steps_in, n_steps_out)
# writing training set to csv file
wtr = csv.writer(open ('out_day.csv', 'w'), delimiter=',', lineterminator='\n')
for x in X : wtr.writerow ([x])

model_day = Sequential()
model_day.add(Dense(100, activation='relu', input_dim=n_steps_in))
model_day.add(Dense(50, activation='relu'))
model_day.add(Dense(30, activation='relu'))
model_day.add(Dense(15, activation='relu'))
model_day.add(Dense(1))
model_day.compile(optimizer='adam', loss='mse')
model_day.summary()

model_day.fit(X, y, epochs=40, verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 100)               9100      
_________________________________________________________________
dense_52 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_53 (Dense)             (None, 30)                1530      
_________________________________________________________________
dense_54 (Dense)             (None, 15)                465       
_________________________________________________________________
dense_55 (Dense)             (None, 1)                 16        
Total params: 16,161
Trainable params: 16,161
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x1a37499da0>

In [40]:
# choose a number of time steps
n_steps_in = 90
n_steps_out = 7
# split into samples
X, y = split_sequence(raw_seq, n_steps_in, n_steps_out)
# writing training set to csv file
wtr = csv.writer(open ('out_month.csv', 'w'), delimiter=',', lineterminator='\n')
for x in X : wtr.writerow ([x])

model_week = Sequential()
model_week.add(Dense(100, activation='relu', input_dim=n_steps_in))
model_week.add(Dense(50, activation='relu'))
model_week.add(Dense(30, activation='relu'))
model_week.add(Dense(15, activation='relu'))
model_week.add(Dense(1))
model_week.compile(optimizer='adam', loss='mse')
model_week.summary()

model_week.fit(X, y, epochs=40, verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_56 (Dense)             (None, 100)               9100      
_________________________________________________________________
dense_57 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_58 (Dense)             (None, 30)                1530      
_________________________________________________________________
dense_59 (Dense)             (None, 15)                465       
_________________________________________________________________
dense_60 (Dense)             (None, 1)                 16        
Total params: 16,161
Trainable params: 16,161
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x1a37b37e48>

In [41]:
# choose a number of time steps
raw_seq=merged_avgs
n_steps_in = 90
n_steps_out = 30
# split into samples
X, y = split_sequence(raw_seq, n_steps_in, n_steps_out)
# writing training set to csv file
wtr = csv.writer(open ('out_year.csv', 'w'), delimiter=',', lineterminator='\n')
for x in X : wtr.writerow ([x])

model_month = Sequential()
model_month.add(Dense(100, activation='relu', input_dim=n_steps_in))
model_month.add(Dense(50, activation='relu'))
model_month.add(Dense(30, activation='relu'))
model_month.add(Dense(15, activation='relu'))
model_month.add(Dense(1))
model_month.compile(optimizer='adam', loss='mse')
model_month.summary()

model_month.fit(X, y, epochs=40, verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_61 (Dense)             (None, 100)               9100      
_________________________________________________________________
dense_62 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_63 (Dense)             (None, 30)                1530      
_________________________________________________________________
dense_64 (Dense)             (None, 15)                465       
_________________________________________________________________
dense_65 (Dense)             (None, 1)                 16        
Total params: 16,161
Trainable params: 16,161
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x1a37fdffd0>

In [561]:
x_input = array(merged_test_avgs)
x_input = x_input.reshape((1, n_steps_in))
yhat = model_day.predict(x_input, verbose=0)
print(yhat)


[[-0.7236978]]


In [22]:
# predicting the days from the last 3 months:
test_avgs = []
daterange = pd.date_range(start='8/1/2019', end='11/1/2019', freq='M')
for single_date in daterange:
    single_date = single_date.to_pydatetime()
    date = single_date.date()
    url = "https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=%s" % date.strftime("%Y-%m-%d")
    tables = pd.io.html.read_html(url)
    table = tables[1]
    table = table.drop([3, 4, 5], axis=1)
    table[0] = table[0].str.replace(r'^[a-zA-Z]+.*','')
    table[1] = table[1].str.replace(r'^[a-zA-Z]+.*','')
    table[2] = table[2].str.replace(r'^[a-zA-Z]+.*','')
    table = table[table[1] != '']
    table[1] = table[1].str.replace('°','')
    table[2] = table[2].str.replace('°','')
    table[1] = table[1].str.replace(',','.')
    table[1] = table[1].str.replace('C','')
    table[2] = table[2].str.replace(',','.')
    table[2] = table[2].str.replace('C','')
    table = table.set_index(0)
    minmax = np.array(table, dtype=np.float32).tolist()
    middle = [mean(i) for i in minmax]
    test_avgs.append(middle)
merged_test_avgs = list(itertools.chain.from_iterable(test_avgs))
merged_test_avgs = list(filter(lambda a: a != 'nan', merged_test_avgs))



In [23]:
# added a 28 for the last day of august so we have a 90 dim vector, i thought 28 was a reasonable number for that
merged_test_avgs=[28] + merged_test_avgs[:-3] # last 3 days of october are nan obviously
merged_test_avgs = np.array(merged_test_avgs)
merged_test_avgs.shape

In [27]:
# prediction for tomorrow:
x_input = np.array(merged_test_avgs)
x_input = x_input.reshape((1, n_steps_in))
yhat = model_day.predict(x_input, verbose=0)
print(yhat)


[[13.124974]]


In [28]:
# prediction for the next week:
x_input = np.array(merged_test_avgs)
x_input = x_input.reshape((1, n_steps_in))
yhat = model_week.predict(x_input, verbose=0)
print(yhat)

[[19.609913]]


In [29]:
# prediction for the next month:
x_input = np.array(merged_test_avgs)
x_input = x_input.reshape((1, n_steps_in))
yhat = model_month.predict(x_input, verbose=0)
print(yhat)

[[13.092235]]
