In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import tensorflow as tf
from sklearn.model_selection import train_test_split
import h5py

from IPython.display import display, Image, HTML
display(HTML("<style>.container { width:100% ; }</style>"))
%matplotlib inline


# Check if any GPU is detected
print("GPU(s) found: ")
print(tf.config.list_physical_devices('GPU'))

GPU(s) found: 
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
def windows(data, size):
    start = 0
    while start < data.count():
        yield int(start), int(start + size)
        start += (size / 2)


def segment_signal(data,window_size = 12, num_features = 6):
    segments = np.empty((0,window_size, num_features))
    labels = np.empty((0))
    for (start, end) in windows(data['Date'], window_size):
        x = data["mav5"][start:end]
        y = data["mav10"][start:end]
        z = data["mav20"][start:end]
        a = data["mav30"][start:end]
        b = data["mav50"][start:end]
        c = data["mav100"][start:end]

        if(len(data['Date'][start:end]) == window_size):
            segments = np.vstack([segments, np.dstack([x,y,z,a,b,c])])
            labels = np.append(labels, stats.mode(data["Action"][start:end])[0][0])
    
    return segments, labels


def get_batches(X, y, batch_size = 100):
    """ Return a generator for batches """
    n_batches = len(X) // batch_size
    X, y = X[ : n_batches * batch_size], y[ : n_batches * batch_size]
    # Loop over batches and yield
    for b in range(0, len(X), batch_size):
        yield X[b : b + batch_size], y[b : b + batch_size]



def create_tensorflow_train_data(csvfilename):
    df = pd.read_csv('Data/sampledata/'+ csvfilename)
    df = df[['Date','symbolid','buyret','sellret','Action','mav5', 'mav10','mav20','mav30','mav50','mav100']]
    symbols = df.symbolid.unique()
    segments, labels = segment_signal(df[df.symbolid == symbols[0]])
    df = df[df.symbolid != symbols[0]]
    symbols = symbols[1:]
    for i in range(0,len(symbols)):
        x, a = segment_signal(df[df.symbolid == symbols[i]])
        segments = np.concatenate((segments, x), axis = 0)
        labels = np.concatenate((labels, a), axis = 0)
        df = df[df.symbolid != symbols[i]]
        print(str(round(i/len(symbols)*100,2)) + ' percent done')
    
    list_ch_train = pd.get_dummies(labels)
    list_ch_train = np.asarray(list_ch_train.columns)
    labels = np.asarray(pd.get_dummies(labels), dtype = np.int8)
    X_tr, X_vld, lab_tr, lab_vld = train_test_split(segments, labels, stratify = labels, random_state = 123)
    
    return X_tr, X_vld, lab_tr, lab_vld, list_ch_train


def create_tensorflow_test_data(csvfilename):
    df = pd.read_csv('Data/sampledata/'+ csvfilename)
    df = df[['time','symbolid','buyret','sellret','Action','mav5', 'mav10','mav20','mav30','mav50','mav100']]
    list_ch_test = df.Action.unique()
    symbols = df.symbolid.unique()
    segments, labels = segment_signal(df[df.symbolid == symbols[0]])
    df = df[df.symbolid != symbols[0]]
    symbols = symbols[1:]
    for i in range(0,len(symbols)):
        x, a = segment_signal(df[df.symbolid == symbols[i]])
        segments = np.concatenate((segments, x), axis = 0)
        labels = np.concatenate((labels, a), axis = 0)
        df = df[df.symbolid != symbols[i]]
        print(str(round(i/len(symbols)*100,2)) + ' percent done')

    list_ch_test = pd.get_dummies(labels)
    list_ch_test = np.asarray(list_ch_test.columns)
    labels = np.asarray(pd.get_dummies(labels), dtype = np.int8)
    X_test = segments
    y_test = labels
    
    return X_test, y_test, list_ch_test

In [3]:
csvfilename = 'train_nonCryptos.csv'
X_tr, X_vld, lab_tr, lab_vld, list_ch_train = create_tensorflow_train_data(csvfilename)

targetFile = 'hdf_nonCryptos.h5'
hf = h5py.File('h5files/'+targetFile, 'w')
hf.create_dataset('X_tr', data = X_tr)
hf.create_dataset('X_vld', data = X_vld)
hf.create_dataset('lab_tr', data = lab_tr)
hf.create_dataset('lab_vld', data = lab_vld)
# hf.create_dataset('list_ch_train', data = list_ch_train)

# hf.create_dataset('X_test', data = X_test)
# hf.create_dataset('y_test', data = y_test)
# hf.create_dataset('list_ch_test', data = list_ch_test)
hf.close()

0.0 percent done
25.0 percent done
50.0 percent done
75.0 percent done
