In [1]:
import math
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import MinMaxScaler

### UNIFIED DATA CONTAINER

In [3]:
# Shared container with one entry per feature set. Every entry starts with
# an empty-string 'raw' placeholder and empty 'train' / 'test' lists; each
# entry gets its own independent list objects.
data = {
    kind: {'raw': '', 'train': [], 'test': []}
    for kind in ('type-one', 'type-two')
}

### PROCESS FILE INTO USABLE DATA

In [4]:
def process(src):
    """Read a comma-separated price file and merge its date and time columns.

    The file must contain 'Date' and 'Timestamp' columns. They are read as
    text, joined with a single space and parsed into one datetime column
    named 'Date_Timestamp', which is placed first; the two source columns
    are dropped. This reproduces the layout that the nested-list
    `parse_dates=[["Date", "Timestamp"]]` option of `read_csv` produced,
    without relying on that deprecated option.

    Parameters
    ----------
    src : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        'Date_Timestamp' followed by the remaining columns in file order.
    """
    # Read both parts as strings so they can be concatenated verbatim
    frame = pd.read_csv(src, delimiter=',', dtype={'Date': str, 'Timestamp': str})

    # "<date> <time>" -> datetime64
    stamps = pd.to_datetime(frame['Date'] + ' ' + frame['Timestamp'])

    frame = frame.drop(columns=['Date', 'Timestamp'])
    frame.insert(0, 'Date_Timestamp', stamps)
    return frame

In [5]:
# Load the price file; the path is relative to the notebook's working directory.
raw = process('EURUSD1m.csv')

In [6]:
raw.shape

(3735014, 6)

In [7]:
raw.head(5)

Unnamed: 0,Date_Timestamp,Open,High,Low,Close,Volume
0,2010-01-01 11:00:00,1.43327,1.43335,1.43319,1.43335,56.299999
1,2010-01-01 11:01:00,1.43333,1.43333,1.43318,1.43327,82.100001
2,2010-01-01 11:02:00,1.43328,1.43333,1.43319,1.43333,86.9
3,2010-01-01 11:03:00,1.43325,1.43333,1.43319,1.43326,68.899999
4,2010-01-01 11:04:00,1.43326,1.43333,1.43319,1.43319,45.3


### RESAMPLE DATA BY DAY - DROP NAN

In [8]:
# Collapse the rows to one per calendar day by averaging every column,
# then drop rows containing NaN (days with no rows yield NaN means).
# NOTE(review): Open/High/Low/Close are all averaged, so the daily 'High' is
# the mean of the intraday highs, not the day's maximum (likewise 'Low').
# Confirm this is intended before feeding the stochastic features.
resampled = raw.resample('D', on='Date_Timestamp').mean().dropna()

In [9]:
resampled.shape

(3128, 5)

In [10]:
resampled.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,1.433194,1.433288,1.433089,1.433178,60.243939
2010-01-03,1.431891,1.432094,1.431711,1.431913,25.013333
2010-01-04,1.436431,1.436693,1.436159,1.436451,55.569028
2010-01-05,1.44087,1.441108,1.440588,1.440881,55.477153
2010-01-06,1.436875,1.437116,1.436592,1.436898,56.230417


### ADD LABEL FOR MODEL TRAINING - DROP NAN

In [11]:
# shift(1) moves values down one row, so each row's Label is the PREVIOUS
# row's Close; the first row gets NaN.
# NOTE(review): if the model is meant to predict the next close from today's
# features, this should be shift(-1) - confirm the intended direction.
resampled['Label'] = resampled['Close'].shift(1)

In [12]:
# Remove rows with NaN, i.e. the first row, whose Label is undefined after the shift.
resampled = resampled.dropna()

In [13]:
resampled.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Label
Date_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-03,1.431891,1.432094,1.431711,1.431913,25.013333,1.433178
2010-01-04,1.436431,1.436693,1.436159,1.436451,55.569028,1.431913
2010-01-05,1.44087,1.441108,1.440588,1.440881,55.477153,1.436451
2010-01-06,1.436875,1.437116,1.436592,1.436898,56.230417,1.440881
2010-01-07,1.435212,1.435446,1.434926,1.435225,56.11632,1.436898


### CLONE RESAMPLED DATA & ADD TYPE-1 FEATURES

In [14]:
# Work on a copy so the feature columns added below do not appear on `resampled`.
temp_one = resampled.copy()

In [15]:
def stochastic_k(dataframe, timeframe):
    """Return the stochastic %K series, expressed as a percentage.

    For every row this is 100 * (Close - lowest Low) / (highest High - lowest Low),
    where the lowest 'Low' and highest 'High' are taken over a rolling window
    of size `timeframe`. Rows for which the window is not yet full are NaN.
    """
    close = dataframe['Close']
    trough = dataframe['Low'].rolling(timeframe).min()
    peak = dataframe['High'].rolling(timeframe).max()

    # Distance of the close above the window low, relative to the window range
    position = close - trough
    span = peak - trough
    return (position / span) * 100

In [16]:
def stochastic_d(dataframe, timeframe):
    """Return the stochastic %D series: the rolling mean of the 'SK' column.

    `dataframe` must already carry an 'SK' column; the mean is taken over a
    rolling window of size `timeframe`.
    """
    percent_k = dataframe['SK']
    window = percent_k.rolling(timeframe)
    return window.mean()

#### ADD STOCHASTIC FEATURES AS COLUMNS

In [17]:
# %K over a 14-row window; rows before the window fills are NaN.
temp_one['SK'] = stochastic_k(temp_one, 14)

In [18]:
# %D = 14-row rolling mean of the 'SK' column, which must already exist on temp_one.
temp_one['SD'] = stochastic_d(temp_one, 14)

#### DROP NAN ROWS & IRRELEVANT COLUMNS

In [19]:
# Drop the rows left NaN by the rolling windows, then keep only the two
# features and the label (the price/volume columns are discarded).
type_one = temp_one.dropna().filter(['SK', 'SD', 'Label'])

In [20]:
type_one.head(5)

Unnamed: 0_level_0,SK,SD,Label
Date_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-02-02,14.752107,7.40558,1.389926
2010-02-03,18.040997,6.551441,1.393863
2010-02-04,0.837431,6.495203,1.394784
2010-02-05,0.713646,6.483462,1.382056
2010-02-07,0.486169,6.463243,1.368054


### NORMALIZE VALUES

In [21]:
def normalize(dataframe):
    """Min-max scale every column of `dataframe` into the range [0, 1].

    Each column is fitted and transformed on its own. The input frame is not
    modified; the scaled values are written to a copy, which is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns, holding the scaled values.
    """
    # Scale a copy so the caller's frame keeps its original values
    scaled = dataframe.copy()

    # INSTANTIATE THE SCALER OBJECT & SET MINMAX VALUES
    scaler = MinMaxScaler(feature_range=(0, 1))

    # FIT & NORMALIZE VALUES FOR EACH COLUMN (the scaler is refitted per column)
    for column in scaled.columns:
        scaled[column] = scaler.fit_transform(scaled[column].values.reshape(-1, 1))

    return scaled

#### SET IN DATA OBJECT

In [22]:
# Store the min-max scaled frame, replacing the '' placeholder.
# NOTE(review): scaling is fitted on the full series here, before the
# train/test split further down, so test-period min/max values influence the
# training data - consider fitting on the training portion only.
data['type-one']['raw'] = normalize(type_one)

In [23]:
data['type-one']['raw'].head(5)

Unnamed: 0_level_0,SK,SD,Label
Date_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-02-02,0.147037,0.06646,0.788176
2010-02-03,0.179976,0.05772,0.797017
2010-02-04,0.007675,0.057145,0.799086
2010-02-05,0.006435,0.057025,0.770506
2010-02-07,0.004157,0.056818,0.739063


### CREATE TRAIN & TEST DATA

In [34]:
def split(dataframe, percent):
    """Split a frame's rows into leading 'train' and trailing 'test' parts.

    The first two columns of each row are taken as the features and the third
    column as the label. Row order is preserved: the first
    ceil(n_rows * percent) rows go to 'train', the remainder to 'test'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with at least three columns (feature, feature, label).
    percent : float
        Fraction of rows assigned to the training part, e.g. 0.8.

    Returns
    -------
    dict
        {'train': {'features': [...], 'labels': [...]},
         'test':  {'features': [...], 'labels': [...]}}
        where 'features' is a list of two-element lists.
    """
    # CONVERT DF TO NUMPY ARRAY
    target = dataframe.to_numpy()

    # CONTAINERS
    # Iterate over the rows of `target`; iterating the module-level `data`
    # dict here would walk its string keys instead of the frame's rows.
    features = [[row[0], row[1]] for row in target]
    labels = [row[2] for row in target]

    # INDEX TO SPLIT AT
    limit = math.ceil(len(features) * percent)

    return {
        'train': {
            'features': features[:limit],
            'labels': labels[:limit]
        },
        'test': {
            'features': features[limit:],
            'labels': labels[limit:]
        }
    }

#### SPLIT INTO TRAIN & TEST DATA

In [32]:
# percent=0.8: split() puts the first ceil(0.8 * n) collected samples in
# 'train' and the remainder in 'test' (no shuffling).
dataset = split(data['type-one']['raw'], 0.8)

#### SET IN DATA OBJECT

In [33]:
# Replace the empty-list placeholders with the {'features', 'labels'} dicts
# returned by split().
data['type-one']['train'] = dataset['train']
data['type-one']['test'] = dataset['test']