In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from common_variables import *

In [2]:
def calculate_rsi(close_prices, period=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Parameters
    ----------
    close_prices : pandas.Series
        Price series; consecutive differences are taken in the order given.
    period : int, default 14
        Rolling window length used to average gains and losses.

    Returns
    -------
    pandas.Series
        RSI values on the same index as ``close_prices``.

    Notes
    -----
    * ``min_periods=1`` is used, so the earliest rows are averaged over fewer
      than ``period`` observations instead of being NaN.
    * A window containing gains but no losses divides by zero and ends up with
      an RSI of 100; a window with neither gains nor losses (including the
      very first row) yields NaN.
    """
    price_change = close_prices.diff()

    # Anything that is not a strict gain (including the leading NaN) counts as 0;
    # losses are handled the same way and flipped to positive magnitudes.
    upward = price_change.where(price_change > 0, 0)
    downward = -price_change.where(price_change < 0, 0)

    def _rolling_average(series):
        # Simple (unweighted) mean over the trailing window.
        return series.rolling(window=period, min_periods=1).mean()

    relative_strength = _rolling_average(upward) / _rolling_average(downward)

    return 100 - (100 / (1 + relative_strength))

def calculate_macd(close_prices, short_period=12, long_period=26, signal_period=9):
    """MACD line and its signal line.

    Parameters
    ----------
    close_prices : pandas.Series
        Price series.
    short_period, long_period : int
        Spans of the fast and slow exponential moving averages.
    signal_period : int
        Span of the exponential moving average applied to the MACD line.

    Returns
    -------
    tuple of pandas.Series
        ``(macd, signal_line)`` where ``macd`` is fast EMA minus slow EMA and
        ``signal_line`` is the EMA of ``macd``.
    """
    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    macd = _ema(close_prices, short_period) - _ema(close_prices, long_period)

    return macd, _ema(macd, signal_period)

def calculate_bollinger_bands(close_prices, window=50, num_std_dev=2):
    """Upper and lower Bollinger Bands around a simple moving average.

    Parameters
    ----------
    close_prices : pandas.Series
        Price series.
    window : int, default 50
        Length of the rolling window for both the mean and the standard
        deviation.
    num_std_dev : float, default 2
        Number of rolling standard deviations between the mean and each band.

    Returns
    -------
    tuple of pandas.Series
        ``(bollinger_upper, bollinger_lower)``. Rows before the first complete
        window are NaN.
    """
    # One rolling view feeds both statistics.
    rolling_close = close_prices.rolling(window=window)
    center = rolling_close.mean()
    band_width = rolling_close.std() * num_std_dev

    return center + band_width, center - band_width

def calculate_kdj(df, n=9, m1=3, m2=3):
    """Compute the K, D and J lines of the KDJ (stochastic) indicator.

    The input frame is NOT modified; the three lines are returned in a new
    DataFrame so the caller decides where to store them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'high'``, ``'low'`` and ``'close'`` columns.
    n : int, default 9
        Rolling window used for the lowest low / highest high.
    m1 : int, default 3
        Smoothing factor for K (exponential weight ``alpha = 1 / m1``).
    m2 : int, default 3
        Smoothing factor for D (exponential weight ``alpha = 1 / m2``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'K'``, ``'D'``, ``'J'`` on the same index as ``df``.
    """
    lowest_low = df['low'].rolling(window=n).min()
    highest_high = df['high'].rolling(window=n).max()

    # Raw Stochastic Value: position of the close inside the n-period range, in percent.
    # NaN until the first full window, and wherever the range is zero.
    rsv = (df['close'] - lowest_low) / (highest_high - lowest_low) * 100

    # K smooths RSV, D smooths K, and J = 3K - 2D.
    k_line = rsv.ewm(alpha=1/m1, adjust=False).mean()
    d_line = k_line.ewm(alpha=1/m2, adjust=False).mean()
    j_line = 3 * k_line - 2 * d_line

    # Build a fresh frame instead of writing K/D/J into the caller's df.
    return pd.DataFrame({'K': k_line, 'D': d_line, 'J': j_line})

def calculate_atr(df, period=14):
    """Average True Range as a simple rolling mean of the True Range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'high'``, ``'low'`` and ``'close'`` columns.
    period : int, default 14
        Rolling window length for the mean.

    Returns
    -------
    pandas.Series
        ATR on the same index as ``df``. ``min_periods=1`` is used, so early
        rows average over fewer than ``period`` True Range values.
    """
    prev_close = df['close'].shift()

    # The three True Range candidates, one column each. The first row has no
    # previous close, so only high - low contributes there (NaNs are skipped by max).
    candidates = pd.concat(
        [
            df['high'] - df['low'],
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ],
        axis=1,
    )
    true_range = candidates.max(axis=1)

    return true_range.rolling(window=period, min_periods=1).mean()


In [None]:

# Read the full price history; the first CSV column becomes the index.
# full_time_series_path comes from the star import of common_variables.
df = pd.read_csv(full_time_series_path, index_col= 0)
# Parse the index into timezone-aware (UTC) timestamps.
df.index = pd.to_datetime(df.index, utc=True)
df.head(10)

Unnamed: 0_level_0,open,high,low,close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-10-23 12:00:00+00:00,1.29656,1.29822,1.29521,1.29754
2012-10-23 16:00:00+00:00,1.29752,1.29869,1.29654,1.29859
2012-10-23 20:00:00+00:00,1.29863,1.29867,1.29728,1.29810
2012-10-24 00:00:00+00:00,1.29810,1.29925,1.29780,1.29865
2012-10-24 04:00:00+00:00,1.29867,1.29967,1.29221,1.29324
...,...,...,...,...
2025-03-21 01:00:00+00:00,1.08530,1.08556,1.08275,1.08304
2025-03-21 05:00:00+00:00,1.08303,1.08457,1.08199,1.08361
2025-03-21 09:00:00+00:00,1.08359,1.08612,1.08248,1.08322
2025-03-21 13:00:00+00:00,1.08320,1.08361,1.07972,1.08183


In [4]:

# Technical indicators derived from the price columns.

# Simple moving averages of the close
df['SMA_50'] = df['close'].rolling(window=50).mean()
df['SMA_200'] = df['close'].rolling(window=200).mean()

# Momentum / trend
df['RSI'] = calculate_rsi(df['close'], period=14)
df['MACD'], df['Signal_Line'] = calculate_macd(df['close'])

# Volatility
df['Bollinger_Upper'], df['Bollinger_Lower'] = calculate_bollinger_bands(df['close'], window=50)
df['ATR'] = calculate_atr(df)

# Stochastic (KDJ)
df[['K', 'D', 'J']] = calculate_kdj(df)

# Moving average of the bar midpoint, and its log returns
HLAvg = (df['high'] + df['low']) / 2
df['MA'] = HLAvg.rolling(window=ma_periods).mean()
df['Returns'] = np.log(df['MA'] / df['MA'].shift(1))

# Drop the warm-up rows where any indicator is still NaN
df = df.dropna(how='any')
# Trim rows from the start so the row count is an exact multiple of batch_size
df = df.iloc[len(df) % batch_size:]
df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 19104 entries, 2012-12-11 12:00:00+00:00 to 2025-03-21 17:00:00+00:00
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   open             19104 non-null  float64
 1   high             19104 non-null  float64
 2   low              19104 non-null  float64
 3   close            19104 non-null  float64
 4   SMA_50           19104 non-null  float64
 5   SMA_200          19104 non-null  float64
 6   RSI              19104 non-null  float64
 7   MACD             19104 non-null  float64
 8   Signal_Line      19104 non-null  float64
 9   Bollinger_Upper  19104 non-null  float64
 10  Bollinger_Lower  19104 non-null  float64
 11  ATR              19104 non-null  float64
 12  K                19104 non-null  float64
 13  D                19104 non-null  float64
 14  J                19104 non-null  float64
 15  MA               19104 non-null  float64
 16  Returns    

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Columns that are standardized and fed into PCA
features = [
    'close', 'SMA_50', 'SMA_200', 'RSI', 'MACD', 'Signal_Line',
    'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'K', 'D', 'J',
]

# NOTE(review): the scaler and the PCA are both fitted on every row of df,
# i.e. before the train/validation/test split done later in this notebook.
# Confirm that letting validation/test rows influence the fit is intended.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# A float n_components keeps as many components as needed to explain
# that fraction (here 95%) of the variance.
pca = PCA(n_components=0.95)
pca_result = pca.fit_transform(scaled_features)

# Per-component and running share of explained variance
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Cumulative explained variance:", np.cumsum(pca.explained_variance_ratio_))

# Wrap the components in a frame with columns PC1, PC2, ...
# (default integer index; one row per row of df)
pca_columns = [f'PC{k}' for k in range(1, pca_result.shape[1] + 1)]
pca_df = pd.DataFrame(pca_result, columns=pca_columns)
len(pca_df)


Explained variance ratio: [0.41525711 0.34895225 0.11303935 0.08250434]
Cumulative explained variance: [0.41525711 0.76420935 0.8772487  0.95975304]


19104

In [None]:
# Concatenate the PCA features and target variable
pca_df = pca_df.reset_index(drop=True)  # Remove existing index
pca_df.index = df.index[:len(pca_df)]  # Assign date index from df (ensure they match in length)
dataset = pd.concat([pca_df, df], axis=1)
dataset.head(10)


Unnamed: 0_level_0,PC1,PC2,PC3,PC4,open,high,low,close,SMA_50,SMA_200,...,MACD,Signal_Line,Bollinger_Upper,Bollinger_Lower,ATR,K,D,J,MA,Returns
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-12-11 12:00:00+00:00,3.373803,-1.849751,-2.222553,0.357438,1.29927,1.30075,1.29822,1.29984,1.300558,1.288439,...,-0.001358,-0.002335,1.313314,1.287802,0.003124,81.683253,68.443827,108.162105,1.293071,0.000249
2012-12-11 16:00:00+00:00,3.350967,-2.244847,-2.105291,0.358316,1.29986,1.30151,1.29852,1.30040,1.300593,1.288449,...,-0.000859,-0.002040,1.313338,1.287849,0.003137,84.738547,73.875400,106.464840,1.293578,0.000392
2012-12-11 20:00:00+00:00,3.348567,-2.610816,-2.069897,0.040943,1.30034,1.30142,1.30005,1.30113,1.300658,1.288473,...,-0.000401,-0.001712,1.313379,1.287938,0.002785,88.690692,78.813831,108.444414,1.294283,0.000545
2012-12-12 00:00:00+00:00,3.351790,-2.558022,-1.811837,-0.070624,1.30114,1.30126,1.29995,1.30022,1.300717,1.288508,...,-0.000110,-0.001392,1.313401,1.288033,0.002684,87.816833,81.814832,99.820835,1.294768,0.000375
2012-12-12 04:00:00+00:00,3.337806,-2.883005,-1.644090,-0.232802,1.30020,1.30114,1.29958,1.30032,1.300780,1.288537,...,0.000127,-0.001088,1.313424,1.288136,0.002504,87.281519,83.637061,94.570436,1.295529,0.000587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-21 01:00:00+00:00,-1.567879,2.126618,0.645338,-0.250608,1.08530,1.08556,1.08275,1.08304,1.089069,1.058738,...,-0.001182,-0.000155,1.095358,1.082780,0.002971,26.188433,29.709775,19.145748,1.088478,-0.000681
2025-03-21 05:00:00+00:00,-1.564157,2.197181,0.610772,-0.169752,1.08303,1.08457,1.08199,1.08361,1.089038,1.058997,...,-0.001358,-0.000396,1.095420,1.082657,0.003053,24.372295,27.930615,17.255655,1.087720,-0.000696
2025-03-21 09:00:00+00:00,-1.561171,2.373766,0.625991,-0.043816,1.08359,1.08612,1.08248,1.08322,1.088994,1.059260,...,-0.001512,-0.000619,1.095508,1.082480,0.003192,21.895714,25.918981,13.849178,1.087051,-0.000615
2025-03-21 13:00:00+00:00,-1.556474,2.448633,0.514735,-0.186064,1.08320,1.08361,1.07972,1.08183,1.088796,1.059502,...,-0.001727,-0.000841,1.095567,1.082025,0.003017,21.213637,24.350533,14.939844,1.086422,-0.000579


In [7]:

def split(dataset, validate_rate, test_rate):
    """Split ``dataset`` chronologically into train/validation/test CSV files.

    The validation and test sizes are whole multiples of the module-level
    ``batch_size``. The validation and test frames each start ``window_size``
    rows before their own boundary, so they overlap the preceding split by
    that many rows (presumably look-back context for windowed models —
    confirm with the consumer of these files).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows in chronological order.
    validate_rate : float
        Fraction of the available batches assigned to validation.
    test_rate : float
        Fraction of the available batches assigned to testing.

    Side effects
    ------------
    Prints the three shapes and writes the frames to
    ``train_time_series_path``, ``validate_time_series_path`` and
    ``test_time_series_path``. Returns ``None``.
    """
    n_rows = dataset.shape[0]
    validation_size = int(int(n_rows / batch_size * validate_rate) * batch_size)
    test_size = int(int(n_rows / batch_size * test_rate) * batch_size)

    # Use absolute, clamped positions rather than negative offsets: a slice
    # end of "-0" is just 0, which silently produced an empty validation set
    # when test_size was 0 (and an empty training set when both sizes were 0).
    train_end = max(n_rows - validation_size - test_size, 0)
    validation_end = n_rows - test_size

    # Split files
    df_train = dataset.iloc[:train_end]
    df_validation = dataset.iloc[max(train_end - window_size, 0):validation_end]
    df_test = dataset.iloc[max(validation_end - window_size, 0):]
    print(f'df_train.shape {df_train.shape}, df_validation.shape {df_validation.shape}, df_test.shape {df_test.shape}')

    # Save each split to its CSV path
    df_train.to_csv(train_time_series_path)
    df_validation.to_csv(validate_time_series_path)
    df_test.to_csv(test_time_series_path)


In [8]:


# Write the train/validation/test CSVs, assigning 20% of the batches to
# validation and 20% to testing (the remainder is the training set).
split(dataset, 0.2, 0.2)
    

df_train.shape (11488, 21), df_validation.shape (4064, 21), df_test.shape (4064, 21)
