In [1]:
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from progressbar import progressbar

In [188]:
def generator(data, lookback, delay, min_index, max_index,
              shuffle=False, batch_size=128, step=6, verbose=True):
    """
    Yield ``(samples, targets)`` batches of sliding windows over ``data``, forever.

    Parameters
    ----------
    data : 2-D array indexed as ``data[row][column]`` (expected to be
        normalized by the caller; this function does not normalize).
    lookback : how many rows back each input window reaches.
    delay : how many rows after the sample row the target is read.
    min_index, max_index : bounds on the sample rows that are drawn. Sample
        rows lie in ``[min_index + lookback, max_index)``. When ``max_index``
        is ``None`` it becomes ``len(data) - delay - 1``.
    shuffle : draw sample rows at random (with replacement) instead of in
        chronological order.
    batch_size : number of samples per batch.
    step : stride, in rows, between the timesteps kept inside a window.
    verbose : when true (the default, matching the original behaviour) a debug
        trace is printed for every batch and every sample; pass ``False`` to
        silence it.

    Yields
    ------
    samples : ``float`` array of shape
        ``(rows_in_batch, ceil(lookback / step), data.shape[-1])``.
    targets : ``float`` array of shape ``(rows_in_batch,)`` holding column 1
        of ``data`` at ``row + delay`` for each sample row.

    Notes
    -----
    In chronological mode the cursor wraps back to ``min_index + lookback`` as
    soon as ``cursor + batch_size >= max_index``, so the trailing rows that do
    not leave room for one more full batch are never yielded.
    """

    def _log(*args):
        # Single switch for the debug trace so the data path stays readable.
        if verbose:
            print(*args)

    if max_index is None:
        _log(f'max_index is None computing new max_index...')
        max_index = len(data) - delay - 1
        _log(f'max_index: {max_index}')

    first_row = min_index + lookback

    # Number of timesteps actually produced by range(row - lookback, row, step).
    # This is ceil(lookback / step); the previous `lookback // step` was one
    # short whenever lookback was not a multiple of step, which made the
    # per-sample assignment below fail with a shape mismatch.
    window_len = len(range(0, lookback, step))

    i = first_row
    _log(f'i: {i} = min_index + lookback')

    while True:
        if shuffle:
            rows = np.random.randint(first_row, max_index, size=batch_size)
            _log(f'rows shape: {rows.shape}')
        else:
            if i + batch_size >= max_index:
                _log('i + batch_size >= max_index (computing new i)')
                i = first_row
                _log(f'new \'i\'= :{i}')
            rows = np.arange(i, min(i + batch_size, max_index))
            _log(f'rows shape (no-suffle): {rows.shape}, rows len: {len(rows)}')
            i += len(rows)

            _log(f'i = {i}')

        samples = np.zeros((len(rows), window_len, data.shape[-1]))
        _log(f'samples (zeros): {samples.shape}')

        targets = np.zeros((len(rows)))
        _log(f'targets zeros: {targets.shape}')

        for j, row in enumerate(rows):
            _log(f'rows[{j}]:', row)
            indices = range(row - lookback, row, step)
            _log(f'indices:{indices}')

            samples[j] = data[indices]
            # Column 1 is the value being predicted.
            targets[j] = data[row + delay][1]

        yield samples, targets

In [189]:
# Load the Jena climate CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/jena_climate_2009_2016.csv')

In [190]:
# Keep every column except the first one (Date Time) as a plain NumPy array.
X = df.iloc[:, 1:].to_numpy()

In [191]:
# Sanity check: (number of rows, 14) — the 14 numeric columns left after dropping Date Time.
X.shape

(420451, 14)

In [225]:
lookback = 2*24*6 # (days * hours * samples per hour) Observations will go back 2 days (288 rows).
step = 6 # Observations will be sampled at one data point per hour.
delay = 1*24*6 # Targets will be 24 hours in the future (144 rows).
batch_size = 2 # Two samples per batch.

# Chronological (unshuffled) generator over X; max_index=None lets the
# generator derive its upper bound from len(X) and delay.
gen_train = generator(X,
                      lookback=lookback,
                      delay=delay,
                      min_index=0,
                      max_index=None,
                      shuffle=False,
                      step=step,
                      batch_size=batch_size)

In [226]:
# Window length in rows, target offset in rows, and timesteps kept per window.
# Derived from the config values instead of the hard-coded 288/6 so the check
# stays correct when lookback or step change.
lookback, delay, lookback / step

(288, 144, 48.0)

In [228]:
# Pull the first batch from the unshuffled generator; per the trace below it covers rows 288 and 289.
x, y = next(gen_train)

max_index is None computing new max_index...
max_index: 420306
i: 288 = min_index + lookback
rows shape (no-suffle): (2,), rows len: 2
i = 290
samples (zeros): (2, 48, 14)
targets zeros: (2,)
rows[0]: 288
indices:range(0, 288, 6)
rows[1]: 289
indices:range(1, 289, 6)


In [235]:
# Batch layout: (samples in batch, timesteps per window, features per timestep).
x.shape

(2, 48, 14)

In [242]:
# One target per sample: column 1 of X (T (degC)) read `delay` rows after the sample's row.
y

array([-1.83, -1.7 ])

In [236]:
# Row indices of the first sample's window: every 6th row from 0 up to (not including) 288.
range(0, 288, 6)

range(0, 288, 6)

In [237]:
# Peek at the first two rows of the raw frame.
df.head(2)

Unnamed: 0,Date Time,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
0,01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
1,01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1


In [238]:
# First timestep of the first sample window; this is row 0 of X, so it should
# match the numeric columns of the first df row shown above.
X[range(0,288,6)][0]

array([ 9.96520e+02, -8.02000e+00,  2.65400e+02, -8.90000e+00,
        9.33000e+01,  3.33000e+00,  3.11000e+00,  2.20000e-01,
        1.94000e+00,  3.12000e+00,  1.30775e+03,  1.03000e+00,
        1.75000e+00,  1.52300e+02])

In [241]:
# Target row for the first sample: 288 (sample row) + 144 (delay) = 432.
# Its column 1 (T (degC)) should equal y[0].
X[432]

array([ 992.79,   -1.83,  271.89,   -6.29,   71.4 ,    5.34,    3.81,
          1.53,    2.39,    3.84, 1272.79,    1.44,    2.63,  214.1 ])

In [198]:
# NOTE(review): scratch array; no cell visible in this part of the notebook reads `A` — confirm before removing.
A = np.array(['a', 'b', 'c', 'd'])

# Draft for bet data

In [252]:
# NOTE: this rebinds `df`, replacing the Jena climate frame loaded earlier in the notebook.
df = pd.read_csv('../../bet365-predictor/model/data/run3/futebol_eventos.csv')

In [254]:
# The draft `tmp_df = pd.read_csv()` call was removed: read_csv requires a
# path or buffer argument, so it raised TypeError before the preview below
# could be displayed. TODO: re-add the load once the intended file is known.
# Show the evento / tempo / percentualjogo columns of the events frame.
df[['evento','tempo', 'percentualjogo']]

Unnamed: 0,evento,tempo,percentualjogo
0,#/IP/EV15620970162C1,00:30,0.0
1,#/IP/EV15620970392C1,00:58,0.0
2,#/IP/EV15621025222C1,00:55,0.0
3,#/IP/EV15621026192C1,00:58,0.0
4,#/IP/EV15621376002C1,00:51,0.0
...,...,...,...
151215,#/IP/EV15632469442C1,99:51,1.1
151216,#/IP/EV15632479362C1,99:34,1.1
151217,#/IP/EV15632507502C1,99:43,1.1
151218,#/IP/EV15632507612C1,99:43,1.1
