In [None]:
# Build and install the TA-Lib C library from source, then the Python wrapper.
# Download and unpack the TA-Lib 0.4.0 source tarball.
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
# %cd changes the notebook's working directory, so the following commands run inside ./ta-lib.
%cd ta-lib
# Configure with a /usr install prefix, compile, and install the C library.
!./configure --prefix=/usr
!make
!make install
# Install the Python bindings (they need the C library installed above).
!pip install Ta-Lib
# Importing here fails immediately if the build or the pip install did not succeed.
import talib

--2024-06-12 09:34:02--  http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
Resolving prdownloads.sourceforge.net (prdownloads.sourceforge.net)... 204.68.111.105
Connecting to prdownloads.sourceforge.net (prdownloads.sourceforge.net)|204.68.111.105|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz [following]
--2024-06-12 09:34:03--  http://downloads.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz
Resolving downloads.sourceforge.net (downloads.sourceforge.net)... 204.68.111.105
Reusing existing connection to prdownloads.sourceforge.net:80.
HTTP request sent, awaiting response... 302 Found
Location: http://onboardcloud.dl.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz?viasf=1 [following]
--2024-06-12 09:34:03--  http://onboardcloud.dl.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz?viasf=1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import talib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the dataset
def load_data(file_path):
    """Read a seven-column price CSV and index it by a combined timestamp.

    The columns are renamed, in file order, to Date, Time, Open, High, Low,
    Close and Volume. Date and Time are joined with a space, parsed with the
    format ``%Y.%m.%d %H:%M`` and used as the ``Datetime`` index; the two
    source columns are then removed.

    NOTE(review): ``pd.read_csv`` is called with its defaults, so the first
    line of the file is consumed as a header row before the names are
    overwritten. If the file has no header line, that first record is lost --
    confirm against the actual file.

    Parameters
    ----------
    file_path : str or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        Columns Open, High, Low, Close, Volume indexed by ``Datetime``.
    """
    frame = pd.read_csv(file_path)
    frame.columns = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']

    stamp_text = frame['Date'] + ' ' + frame['Time']
    frame['Datetime'] = pd.to_datetime(stamp_text, format='%Y.%m.%d %H:%M')

    return frame.set_index('Datetime').drop(columns=['Date', 'Time'])

# Load the raw bars from the Colab-local CSV (the file name suggests XAUUSD
# one-minute data for 2024-04 -- confirm). The path is absolute, so it does not
# depend on the working directory changed by the install cell.
data = load_data('/content/DAT_MT_XAUUSD_M1_202404.csv')

# Resampling function to change time frames
def resample_data(data, freq='5T'):
    """Aggregate OHLCV bars into coarser bars of width ``freq``.

    Each output bar takes the first Open, the highest High, the lowest Low,
    the last Close and the summed Volume of its time bucket. Buckets that
    contain no source rows produce NaN prices and are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-like index and Open/High/Low/Close/Volume columns.
    freq : str, default '5T'
        Pandas offset alias for the bucket width ('5T' is five minutes;
        newer pandas releases prefer the spelling '5min').

    Returns
    -------
    pandas.DataFrame
        The resampled bars without all-empty buckets.
    """
    aggregations = dict(Open='first', High='max', Low='min', Close='last', Volume='sum')
    bars = data.resample(freq).apply(aggregations)
    return bars.dropna()

# Resample the loaded bars to the working time frame (5-minute bars here);
# change `freq` to experiment with other time frames.
data_5min = resample_data(data, freq='5T')

# Wave extraction
def extract_waves(data, threshold=0.01):
    """Cut the candle series into consecutive straight-line "waves".

    Starting at row ``start``, the window is grown one row at a time. For each
    window a separate straight line is fitted to the 'High' and to the 'Low'
    values (x is the positional row number), and the mean squared error of each
    fit is measured. As soon as either error exceeds ``threshold`` the window
    -- including the row that pushed the error over the cut-off -- is recorded
    as a wave, and the next wave starts on that same last row, so neighbouring
    waves share a boundary row. If the data runs out first, the remainder is
    recorded as the final wave.

    The cut-off controls how long a trend is followed: a larger value yields
    fewer, longer waves. The returned waves are the input for the later
    feature-preparation step.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High' and 'Low' columns.
    threshold : float, default 0.01
        Maximum tolerated mean squared error of either line fit.

    Returns
    -------
    list of (int, int)
        Inclusive positional ``(start, end)`` row bounds of each wave.
    """
    def _line_mse(x, y):
        # Mean squared residual of an ordinary least-squares line through (x, y).
        line = LinearRegression().fit(x, y)
        return np.mean((line.predict(x) - y) ** 2)

    total = len(data)
    waves = []
    start = 0
    while start < total:
        for end in range(start + 2, total):
            # Candidate window: rows start .. end-1 (always at least two rows).
            positions = np.arange(start, end).reshape(-1, 1)
            loss_high = _line_mse(positions, data['High'].iloc[start:end].values)
            loss_low = _line_mse(positions, data['Low'].iloc[start:end].values)

            if loss_high > threshold or loss_low > threshold:
                last_row = end - 1
                waves.append((start, last_row))
                start = last_row
                break
        else:
            # No window from `start` ever broke the cut-off (or fewer than three
            # rows remain): everything left forms the last wave.
            waves.append((start, total - 1))
            break

    return waves
# Prepare features
def prepare_features(data, waves):
    """Build one 22-value feature row per wave.

    Every indicator is computed with TA-Lib on the candles of that wave only,
    and the value at the wave's last candle is used. TA-Lib yields NaN until an
    indicator's lookback is filled, so a wave shorter than the lookback gets
    NaN for that indicator.
    NOTE(review): with waves of only a few candles almost every indicator
    column ends up NaN; computing indicators on the full series may be what is
    actually wanted -- confirm.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High', 'Low' and 'Close' columns.
    waves : iterable of (int, int)
        Inclusive positional ``(start, end)`` row bounds of each wave.

    Returns
    -------
    list of list
        Per wave: num_candles, high_last, low_last, avg_high, avg_low,
        slope_upper, slope_lower, macd, macd_signal, macd_hist, ema, sma, adx,
        rsi, stoch_k, stoch_d, willr, upperband, middleband, lowerband,
        keltner_upper, keltner_lower.
    """
    features = []
    for start, end in waves:
        wave_data = data.iloc[start:end+1]
        num_candles = len(wave_data)

        # Price features.
        high_last = wave_data['High'].iloc[-1]
        low_last = wave_data['Low'].iloc[-1]
        avg_high = wave_data['High'].mean()
        avg_low = wave_data['Low'].mean()
        # A single-candle wave has no slope; report NaN without dividing by zero.
        span = num_candles - 1
        slope_upper = (wave_data['High'].iloc[-1] - wave_data['High'].iloc[0]) / span if span else np.nan
        slope_lower = (wave_data['Low'].iloc[-1] - wave_data['Low'].iloc[0]) / span if span else np.nan

        # Plain float arrays for TA-Lib, so every result is an ndarray and
        # `[-1]` below is always positional (never an index-label lookup).
        high = wave_data['High'].to_numpy(dtype=float)
        low = wave_data['Low'].to_numpy(dtype=float)
        close = wave_data['Close'].to_numpy(dtype=float)

        # Financial indicators
        macd, macd_signal, macd_hist = talib.MACD(close)
        ema = talib.EMA(close)
        sma = talib.SMA(close)
        adx = talib.ADX(high, low, close)
        rsi = talib.RSI(close)
        stoch_k, stoch_d = talib.STOCH(high, low, close)
        willr = talib.WILLR(high, low, close)
        upperband, middleband, lowerband = talib.BBANDS(close)

        # Keltner channel: EMA of the typical price +/- 2 * ATR. The upper band
        # previously omitted the ATR offset and was just the middle line.
        keltner_mid = talib.EMA((high + low + close) / 3, timeperiod=20)
        atr = talib.ATR(high, low, close)
        keltner_upper = keltner_mid + 2 * atr
        keltner_lower = keltner_mid - 2 * atr

        # Combine features
        feature_set = [num_candles, high_last, low_last, avg_high, avg_low, slope_upper, slope_lower,
                       macd[-1], macd_signal[-1], macd_hist[-1], ema[-1], sma[-1], adx[-1], rsi[-1],
                       stoch_k[-1], stoch_d[-1], willr[-1], upperband[-1], middleband[-1], lowerband[-1],
                       keltner_upper[-1], keltner_lower[-1]]

        features.append(feature_set)

    return features

# Split data into training, validation, and test sets
def split_data(data, train_size=0.8, val_size=0.1):
    """Split ``data`` chronologically into train, validation and test parts.

    Rows keep their order: the first ``train_size`` fraction is the training
    set, the next ``val_size`` fraction the validation set, and whatever is
    left the test set. Cut positions are rounded down to whole rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split by position.
    train_size : float, default 0.8
        Fraction of rows for the training part.
    val_size : float, default 0.1
        Fraction of rows for the validation part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``.
    """
    n_rows = len(data)

    def _cut(fraction):
        # Round away binary floating-point noise before truncating: 0.7 + 0.2
        # is 0.8999999999999999, which would otherwise lose a whole row.
        return int(round(n_rows * fraction, 6))

    train_end = _cut(train_size)
    val_end = _cut(train_size + val_size)
    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]
    return train_data, val_data, test_data

# Extract waves and prepare features
# Waves are (start, end) row positions into data_5min; each wave becomes one
# feature row below.
waves = extract_waves(data_5min)
features = prepare_features(data_5min, waves)
# The column names must stay in the same order as the feature_set list built
# inside prepare_features.
features_df = pd.DataFrame(features, columns=['num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low', 'slope_upper', 'slope_lower',
                                              'macd', 'macd_signal', 'macd_hist', 'ema', 'sma', 'adx', 'rsi', 'stoch_k', 'stoch_d', 'willr',
                                              'upperband', 'middleband', 'lowerband', 'keltner_upper', 'keltner_lower'])

# Split the data
# Positional split in row order (default 80% / 10% / 10%), no shuffling.
train_data, val_data, test_data = split_data(features_df)


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import talib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the dataset
def load_data(file_path):
    """Load a seven-column price CSV into a timestamp-indexed DataFrame.

    Columns are renamed positionally to Date, Time, Open, High, Low, Close and
    Volume. "Date Time" is parsed with ``%Y.%m.%d %H:%M`` into the
    ``Datetime`` index, after which Date and Time are dropped.

    NOTE(review): ``pd.read_csv`` runs with default settings, i.e. the first
    file line is treated as a header and replaced by the names above. A file
    without a header line would lose its first record -- confirm.

    Parameters
    ----------
    file_path : str or file-like
        Source accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Open/High/Low/Close/Volume indexed by ``Datetime``.
    """
    column_names = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
    table = pd.read_csv(file_path)
    table.columns = column_names

    table['Datetime'] = pd.to_datetime(
        table['Date'] + ' ' + table['Time'],
        format='%Y.%m.%d %H:%M',
    )
    table = table.set_index('Datetime')
    return table.drop(['Date', 'Time'], axis=1)

# Resampling function to change time frames
def resample_data(data, freq='5T'):
    """Re-bucket OHLCV bars to the time frame ``freq``.

    Per bucket: Open is the first value, High the maximum, Low the minimum,
    Close the last value and Volume the sum. Rows containing NaN (buckets with
    no source bars) are removed from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Datetime-indexed frame with Open/High/Low/Close/Volume columns.
    freq : str, default '5T'
        Pandas offset alias of the target bar width.

    Returns
    -------
    pandas.DataFrame
        Resampled bars.
    """
    how = {}
    how['Open'] = 'first'
    how['High'] = 'max'
    how['Low'] = 'min'
    how['Close'] = 'last'
    how['Volume'] = 'sum'

    resampler = data.resample(freq)
    return resampler.apply(how).dropna()

# Wave extraction
def extract_waves(data, threshold=0.01):
    """Segment the candles into waves that each follow a straight line.

    A window beginning at ``start`` is extended row by row. After each
    extension, one least-squares line is fitted to the window's 'High' values
    and one to its 'Low' values, with the positional row number as x. When the
    mean squared error of either fit rises above ``threshold`` the window
    (breaking row included) is stored as a wave and the following wave begins
    on that breaking row. When no more rows can be added, the rest of the data
    is stored as the final wave.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High' and 'Low' columns.
    threshold : float, default 0.01
        Cut-off on the mean squared fit error; larger values give longer waves.

    Returns
    -------
    list of (int, int)
        Inclusive positional ``(start, end)`` bounds; consecutive waves share
        their boundary row.
    """
    def _fit_error(x, y):
        # Mean squared residual of the best-fit straight line through (x, y).
        model = LinearRegression()
        model.fit(x, y)
        residuals = model.predict(x) - y
        return np.mean(residuals ** 2)

    n_candles = len(data)
    waves = []
    start = 0
    while start < n_candles:
        for end in range(start + 2, n_candles):
            x = np.arange(start, end).reshape(-1, 1)
            loss_high = _fit_error(x, data['High'].iloc[start:end].values)
            loss_low = _fit_error(x, data['Low'].iloc[start:end].values)

            # Stop extending once either envelope is no longer line-like.
            if loss_high > threshold or loss_low > threshold:
                waves.append((start, end - 1))
                start = end - 1
                break
        else:
            # The for-loop finished without a break: close the last wave and stop.
            waves.append((start, n_candles - 1))
            break

    return waves

# Prepare features
def prepare_features(data, waves):
    """Build one 22-value feature row per wave, tolerating indicator failures.

    Every indicator is computed with TA-Lib on the candles of that wave only
    and sampled at the wave's last candle. TA-Lib yields NaN until an
    indicator's lookback is filled, so short waves get NaN indicator values.
    If an indicator call raises, the error is printed and an all-NaN row is
    stored so that the output stays aligned with ``waves``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High', 'Low' and 'Close' columns.
    waves : iterable of (int, int)
        Inclusive positional ``(start, end)`` row bounds of each wave.

    Returns
    -------
    list of list
        Per wave: num_candles, high_last, low_last, avg_high, avg_low,
        slope_upper, slope_lower, macd, macd_signal, macd_hist, ema, sma, adx,
        rsi, stoch_k, stoch_d, willr, upperband, middleband, lowerband,
        keltner_upper, keltner_lower.
    """
    features = []
    for start, end in waves:
        wave_data = data.iloc[start:end+1]
        num_candles = len(wave_data)
        high_last = wave_data['High'].iloc[-1]
        low_last = wave_data['Low'].iloc[-1]
        avg_high = wave_data['High'].mean()
        avg_low = wave_data['Low'].mean()
        # A single-candle wave has no slope; report NaN without dividing by zero.
        span = num_candles - 1
        slope_upper = (wave_data['High'].iloc[-1] - wave_data['High'].iloc[0]) / span if span else np.nan
        slope_lower = (wave_data['Low'].iloc[-1] - wave_data['Low'].iloc[0]) / span if span else np.nan

        # Financial indicators
        try:
            # Plain float arrays for TA-Lib, so every result is an ndarray and
            # `[-1]` is always positional (never an index-label lookup).
            high = wave_data['High'].to_numpy(dtype=float)
            low = wave_data['Low'].to_numpy(dtype=float)
            close = wave_data['Close'].to_numpy(dtype=float)

            macd, macd_signal, macd_hist = talib.MACD(close)
            ema = talib.EMA(close)
            sma = talib.SMA(close)
            adx = talib.ADX(high, low, close)
            rsi = talib.RSI(close)
            stoch_k, stoch_d = talib.STOCH(high, low, close)
            willr = talib.WILLR(high, low, close)
            upperband, middleband, lowerband = talib.BBANDS(close)

            # Keltner channel: EMA of the typical price +/- 2 * ATR. The upper
            # band previously omitted the ATR offset and was just the middle line.
            keltner_mid = talib.EMA((high + low + close) / 3, timeperiod=20)
            atr = talib.ATR(high, low, close)
            keltner_upper = keltner_mid + 2 * atr
            keltner_lower = keltner_mid - 2 * atr

            # Combine features
            feature_set = [num_candles, high_last, low_last, avg_high, avg_low, slope_upper, slope_lower,
                           macd[-1], macd_signal[-1], macd_hist[-1], ema[-1], sma[-1], adx[-1], rsi[-1],
                           stoch_k[-1], stoch_d[-1], willr[-1], upperband[-1], middleband[-1], lowerband[-1],
                           keltner_upper[-1], keltner_lower[-1]]

            features.append(feature_set)
        except Exception as e:
            print(f"Error calculating indicators for wave [{start}, {end}]: {e}")
            # Append NaN values for features if indicator calculation fails
            # (22 = number of entries in feature_set above).
            features.append([np.nan] * 22)

    return features

# Split data into training, validation, and test sets
def split_data(data, train_size=0.8, val_size=0.1):
    """Split ``data`` by position into train, validation and test frames.

    No shuffling takes place: the leading ``train_size`` fraction of rows is
    the training set, the following ``val_size`` fraction the validation set
    and the remainder the test set. Cut positions are rounded down to whole
    rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    train_size : float, default 0.8
        Fraction of rows for training.
    val_size : float, default 0.1
        Fraction of rows for validation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``.
    """
    n_rows = len(data)

    def _cut(fraction):
        # Round away binary floating-point noise before truncating: 0.7 + 0.2
        # is 0.8999999999999999, which would otherwise lose a whole row.
        return int(round(n_rows * fraction, 6))

    train_end = _cut(train_size)
    val_end = _cut(train_size + val_size)
    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]
    return train_data, val_data, test_data

# Load the dataset
# Absolute Colab path, so it is independent of the current working directory.
data = load_data('/content/DAT_MT_XAUUSD_M1_202404.csv')

# Resample to desired time frame (example for 5-minute data)
data_5min = resample_data(data, freq='5T')

# Extract waves
# Each wave is an inclusive (start, end) pair of row positions into data_5min.
waves = extract_waves(data_5min)

# Prepare features
# The column names must stay in the same order as the feature_set list built
# inside prepare_features.
features = prepare_features(data_5min, waves)
features_df = pd.DataFrame(features, columns=['num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low', 'slope_upper', 'slope_lower',
                                              'macd', 'macd_signal', 'macd_hist', 'ema', 'sma', 'adx', 'rsi', 'stoch_k', 'stoch_d', 'willr',
                                              'upperband', 'middleband', 'lowerband', 'keltner_upper', 'keltner_lower'])

# Split the data
# Positional split in row order (default 80% / 10% / 10%), no shuffling.
train_data, val_data, test_data = split_data(features_df)

# Check if there are NaN values in the resulting DataFrame
# This only reports the NaN count per column; nothing is removed or imputed.
print("NaN values in features DataFrame:")
print(features_df.isna().sum())

# NOTE: features_df and the three splits above still contain every NaN counted
# here, so any cleaning has to be done before the data is used for modelling.

# Now, proceed with further analysis using the cleaned data


NaN values in features DataFrame:
num_candles         0
high_last           0
low_last            0
avg_high            0
avg_low             0
slope_upper         0
slope_lower         0
macd             2605
macd_signal      2605
macd_hist        2605
ema              2605
sma              2605
adx              2605
rsi              2605
stoch_k          2605
stoch_d          2605
willr            2605
upperband        2584
middleband       2584
lowerband        2584
keltner_upper    2605
keltner_lower    2605
dtype: int64


In [None]:
waves

[(0, 2),
 (2, 4),
 (4, 6),
 (6, 8),
 (8, 10),
 (10, 12),
 (12, 14),
 (14, 16),
 (16, 18),
 (18, 20),
 (20, 22),
 (22, 24),
 (24, 26),
 (26, 28),
 (28, 31),
 (31, 33),
 (33, 35),
 (35, 37),
 (37, 39),
 (39, 41),
 (41, 43),
 (43, 45),
 (45, 47),
 (47, 49),
 (49, 51),
 (51, 53),
 (53, 55),
 (55, 57),
 (57, 60),
 (60, 63),
 (63, 65),
 (65, 67),
 (67, 69),
 (69, 72),
 (72, 74),
 (74, 76),
 (76, 78),
 (78, 80),
 (80, 82),
 (82, 84),
 (84, 86),
 (86, 88),
 (88, 91),
 (91, 93),
 (93, 95),
 (95, 98),
 (98, 100),
 (100, 102),
 (102, 104),
 (104, 106),
 (106, 108),
 (108, 110),
 (110, 113),
 (113, 115),
 (115, 117),
 (117, 119),
 (119, 121),
 (121, 123),
 (123, 125),
 (125, 127),
 (127, 129),
 (129, 131),
 (131, 133),
 (133, 135),
 (135, 137),
 (137, 139),
 (139, 141),
 (141, 143),
 (143, 145),
 (145, 147),
 (147, 149),
 (149, 151),
 (151, 153),
 (153, 155),
 (155, 157),
 (157, 159),
 (159, 161),
 (161, 163),
 (163, 165),
 (165, 167),
 (167, 170),
 (170, 173),
 (173, 175),
 (175, 178),
 (178, 180

In [None]:
features_df

Unnamed: 0,num_candles,high_last,low_last,avg_high,avg_low,slope_upper,slope_lower,macd,macd_signal,macd_hist,...,adx,rsi,stoch_k,stoch_d,willr,upperband,middleband,lowerband,keltner_upper,keltner_lower
0,3,2262.315,2261.075,2264.385000,2262.225000,-1.635,-1.450,,,,...,,,,,,,,,,
1,3,2260.945,2260.165,2261.515000,2260.428333,-0.685,-0.455,,,,...,,,,,,,,,,
2,3,2260.875,2258.885,2260.965000,2259.691667,-0.035,-0.640,,,,...,,,,,,,,,,
3,3,2258.935,2258.035,2260.171667,2258.581667,-0.970,-0.425,,,,...,,,,,,,,,,
4,3,2259.985,2258.875,2259.608333,2258.475000,0.525,0.420,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2600,3,2339.225,2338.335,2339.512667,2338.848333,-0.305,-0.380,,,,...,,,,,,,,,,
2601,3,2338.815,2337.915,2338.988333,2338.206000,-0.205,-0.210,,,,...,,,,,,,,,,
2602,3,2337.865,2336.555,2338.411667,2336.981667,-0.475,-0.680,,,,...,,,,,,,,,,
2603,3,2337.575,2336.465,2337.708333,2336.609333,-0.145,-0.045,,,,...,,,,,,,,,,


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import talib

# Load the dataset
def load_data(file_path):
    """Parse a seven-column price CSV into a ``Datetime``-indexed frame.

    The columns are named, in order, Date, Time, Open, High, Low, Close and
    Volume. Date and Time are concatenated with a space and parsed using
    ``%Y.%m.%d %H:%M``; the result becomes the index and both text columns
    are discarded.

    NOTE(review): because ``pd.read_csv`` uses its defaults, the first line of
    the file is read as a header and its labels are overwritten. For a
    header-less file this drops the first record -- confirm.

    Parameters
    ----------
    file_path : str or file-like
        Input for ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns Open, High, Low, Close, Volume; index ``Datetime``.
    """
    raw = pd.read_csv(file_path)
    raw.columns = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']

    timestamps = pd.to_datetime(raw['Date'] + ' ' + raw['Time'], format='%Y.%m.%d %H:%M')
    raw['Datetime'] = timestamps

    indexed = raw.set_index('Datetime')
    del indexed['Date'], indexed['Time']
    return indexed

# Split data into training, validation, and test sets
def split_data(data, train_size=0.8, val_size=0.1):
    """Split ``data`` in row order into train, validation and test frames.

    The first ``train_size`` fraction of rows forms the training set, the next
    ``val_size`` fraction the validation set, and all remaining rows the test
    set. Cut positions are rounded down to whole rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split by position.
    train_size : float, default 0.8
        Fraction of rows for training.
    val_size : float, default 0.1
        Fraction of rows for validation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``.
    """
    n_rows = len(data)

    def _cut(fraction):
        # Round away binary floating-point noise before truncating: 0.7 + 0.2
        # is 0.8999999999999999, which would otherwise lose a whole row.
        return int(round(n_rows * fraction, 6))

    train_end = _cut(train_size)
    val_end = _cut(train_size + val_size)
    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]
    return train_data, val_data, test_data
# Resampling function to change time frames
def resample_data(data, freq='5T'):
    """Convert OHLCV bars to a coarser time frame.

    Within every ``freq``-wide bucket the first Open, maximum High, minimum
    Low, last Close and total Volume are kept. Buckets without any source bar
    yield NaN prices and are dropped from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Datetime-indexed bars with Open/High/Low/Close/Volume columns.
    freq : str, default '5T'
        Pandas offset alias for the new bar width.

    Returns
    -------
    pandas.DataFrame
        The aggregated bars.
    """
    rules = (
        ('Open', 'first'),
        ('High', 'max'),
        ('Low', 'min'),
        ('Close', 'last'),
        ('Volume', 'sum'),
    )
    aggregated = data.resample(freq).apply(dict(rules))
    return aggregated.dropna()

# Wave extraction
def extract_waves(data, threshold=0.01):
    """Partition the candles into waves whose highs and lows are near-linear.

    From each ``start`` row the window grows by one row per step. Each step
    fits one straight line to the window's 'High' values and another to its
    'Low' values (x = positional row number) and computes both mean squared
    errors. The first window for which either error exceeds ``threshold`` is
    recorded -- breaking row included -- and its last row becomes the next
    ``start``. If the rows run out first, the remaining rows form the last
    wave.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High' and 'Low' columns.
    threshold : float, default 0.01
        Cut-off on the mean squared fit error; it decides when a trend stops
        being followed and therefore how many waves are produced.

    Returns
    -------
    list of (int, int)
        Inclusive positional ``(start, end)`` bounds of every wave.
    """
    def _mse_of_line(x, y):
        # Fit y ~ a*x + b and return the mean of the squared residuals.
        fitted = LinearRegression().fit(x, y)
        return np.mean((fitted.predict(x) - y) ** 2)

    row_count = len(data)
    waves = []
    start = 0
    while start < row_count:
        for end in range(start + 2, row_count):
            window = slice(start, end)
            x = np.arange(start, end).reshape(-1, 1)
            loss_high = _mse_of_line(x, data['High'].iloc[window].values)
            loss_low = _mse_of_line(x, data['Low'].iloc[window].values)

            if loss_high > threshold or loss_low > threshold:
                boundary = end - 1
                waves.append((start, boundary))
                # The boundary row closes this wave and opens the next one.
                start = boundary
                break
        else:
            # Ran out of rows without breaking the cut-off.
            waves.append((start, row_count - 1))
            break

    return waves

# Prepare features
def prepare_features(data, waves):
    """Build one 22-value feature row per wave of at least two candles.

    Indicators are computed on the candles of the wave alone, using the
    module-level period constants (MACD_FAST, SMA_SLOW, RSI_PERIOD, ...), and
    the value at the wave's last candle is taken. Waves with fewer than two
    candles are skipped, so the result can be shorter than ``waves``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High', 'Low' and 'Close' columns.
    waves : iterable of (int, int)
        Inclusive positional ``(start, end)`` row bounds of each wave.

    Returns
    -------
    list of list
        Per kept wave: num_candles, high_last, low_last, avg_high, avg_low,
        slope_upper, slope_lower, macd, macd_signal, macd_hist, sma_f, sma_s,
        adx, rsi, stoch_k, stoch_d, willr, upperband, middleband, lowerband,
        keltner_upper, keltner_lower.
    """
    features = []
    for start, end in waves:
        wave_data = data.iloc[start:end+1]
        num_candles = len(wave_data)
        if num_candles < 2:  # a slope needs at least two candles
            continue

        high = wave_data['High']
        low = wave_data['Low']
        close = wave_data['Close']

        # Price data features
        steps = num_candles - 1
        price_part = [
            num_candles,
            high.iloc[-1],
            low.iloc[-1],
            high.mean(),
            low.mean(),
            (high.iloc[-1] - high.iloc[0]) / steps,
            (low.iloc[-1] - low.iloc[0]) / steps,
        ]

        # Indicator series over this wave only.
        macd, signal, hist = talib.MACD(close, fastperiod=MACD_FAST, slowperiod=MACD_SLOW, signalperiod=MACD_SIGNAL)
        sma_fast = close.rolling(window=SMA_FAST).mean()
        sma_slow = close.rolling(window=SMA_SLOW).mean()
        adx = talib.ADX(high, low, close, timeperiod=ADX_PERIOD)
        rsi = talib.RSI(close, timeperiod=RSI_PERIOD)
        # NOTE(review): slowk_period is set to STOCH_K as well as fastk_period -- confirm this is intended.
        stoch_k, stoch_d = talib.STOCH(high, low, close, fastk_period=STOCH_K, slowk_period=STOCH_K, slowd_period=STOCH_D)
        willr = talib.WILLR(high, low, close, timeperiod=WILLR_PERIOD)
        upperband, middleband, lowerband = talib.BBANDS(close, timeperiod=BBAND_PERIOD)

        # Keltner channel: EMA of the typical price +/- 2 * ATR.
        keltner_mid = talib.EMA((high + low + close) / 3, timeperiod=KELTNER_PERIOD)
        keltner_width = 2 * talib.ATR(high, low, close, timeperiod=KELTNER_PERIOD)
        keltner_upper = keltner_mid + keltner_width
        keltner_lower = keltner_mid - keltner_width

        # Keep only the most recent value of each indicator.
        indicator_series = (
            macd, signal, hist,
            sma_fast, sma_slow, adx, rsi,
            stoch_k, stoch_d, willr,
            upperband, middleband, lowerband,
            keltner_upper, keltner_lower,
        )
        features.append(price_part + [series.iloc[-1] for series in indicator_series])

    return features



# Define constants for indicators
# All values are window lengths in candles. prepare_features reads them as
# module-level globals at call time, which is why defining them after the
# function (but before the call below) works.
SMA_FAST = 10
SMA_SLOW = 30
RSI_PERIOD = 14
MACD_FAST = 12
MACD_SLOW = 26
MACD_SIGNAL = 9
STOCH_K = 14
STOCH_D = 3
BBAND_PERIOD = 20
KELTNER_PERIOD = 20
ADX_PERIOD = 14
WILLR_PERIOD = 14  # Define WILLR_PERIOD

# Load and preprocess the data
data = load_data('/content/DAT_MT_XAUUSD_M1_202404.csv')
data_5min = resample_data(data, freq='5T')

# Extract waves
# Each wave is an inclusive (start, end) pair of row positions into data_5min.
waves = extract_waves(data_5min)

# Prepare features
# Indicators are evaluated per wave, so waves shorter than an indicator's
# window produce NaN for that feature.
features = prepare_features(data_5min, waves)

# Convert features to DataFrame
# The column names must stay in the same order as the feature_set list built
# inside prepare_features.
features_df = pd.DataFrame(features, columns=['num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low', 'slope_upper', 'slope_lower',
                                              'macd', 'macd_signal', 'macd_hist', 'sma_f', 'sma_s', 'adx', 'rsi', 'stoch_k', 'stoch_d', 'willr',
                                              'upperband', 'middleband', 'lowerband', 'keltner_upper', 'keltner_lower'])

# Split the data
# Positional split in row order (default 80% / 10% / 10%), no shuffling.
train_data, val_data, test_data = split_data(features_df)


In [None]:
features

[[3,
  2262.315,
  2261.075,
  2264.385,
  2262.225,
  -1.634999999999991,
  -1.4500000000000455,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 [3,
  2260.945,
  2260.165,
  2261.515,
  2260.4283333333333,
  -0.6849999999999454,
  -0.45499999999992724,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 [3,
  2260.875,
  2258.885,
  2260.965,
  2259.691666666667,
  -0.035000000000081855,
  -0.6399999999998727,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 [3,
  2258.935,
  2258.035,
  2260.1716666666666,
  2258.5816666666665,
  -0.9700000000000273,
  -0.4250000000001819,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 [3,
  2259.985,
  2258.875,
  2259.6083333333336,
  2258.475,
  0.525000000000091,
  0.42000000000007276,
  nan,
  nan,
  nan,
  nan,
  na

In [None]:
# Debug check: MACD on the first two 5-minute candles. The displayed result is
# all NaN, which is why the per-wave indicator features come out as NaN for
# very short waves. (A stray leading '+' made this assignment a SyntaxError.)
wave_data = data_5min.iloc[0:2]
talib.MACD(wave_data['Close'])

(Datetime
 2024-04-01 00:00:00   NaN
 2024-04-01 00:05:00   NaN
 dtype: float64,
 Datetime
 2024-04-01 00:00:00   NaN
 2024-04-01 00:05:00   NaN
 dtype: float64,
 Datetime
 2024-04-01 00:00:00   NaN
 2024-04-01 00:05:00   NaN
 dtype: float64)

In [None]:
wave_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-04-01 00:00:00,2264.345,2265.585,2263.975,2265.265,0
2024-04-01 00:05:00,2265.255,2265.255,2261.625,2261.634,0


In [None]:
data_5min

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-04-01 00:00:00,2264.345,2265.585,2263.975,2265.265,0
2024-04-01 00:05:00,2265.255,2265.255,2261.625,2261.634,0
2024-04-01 00:10:00,2261.635,2262.315,2261.075,2261.075,0
2024-04-01 00:15:00,2261.125,2261.285,2260.045,2260.605,0
2024-04-01 00:20:00,2260.624,2260.945,2260.165,2260.165,0
...,...,...,...,...,...
2024-04-26 16:35:00,2337.015,2337.865,2336.555,2337.685,0
2024-04-26 16:40:00,2337.685,2337.685,2336.808,2336.888,0
2024-04-26 16:45:00,2336.875,2337.575,2336.465,2336.485,0
2024-04-26 16:50:00,2336.495,2337.875,2336.298,2337.325,0


In [None]:
data_5min.isnull().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [None]:
waves[:6]

[(0, 2), (2, 4), (4, 6), (6, 8), (8, 10), (10, 12)]

In [None]:
features_df

Unnamed: 0,num_candles,high_last,low_last,avg_high,avg_low,slope_upper,slope_lower,macd,macd_signal,macd_hist,...,adx,rsi,stoch_k,stoch_d,willr,upperband,middleband,lowerband,keltner_upper,keltner_lower
0,3,2262.315,2261.075,2264.385000,2262.225000,-1.635,-1.450,,,,...,,,,,,,,,,
1,3,2260.945,2260.165,2261.515000,2260.428333,-0.685,-0.455,,,,...,,,,,,,,,,
2,3,2260.875,2258.885,2260.965000,2259.691667,-0.035,-0.640,,,,...,,,,,,,,,,
3,3,2258.935,2258.035,2260.171667,2258.581667,-0.970,-0.425,,,,...,,,,,,,,,,
4,3,2259.985,2258.875,2259.608333,2258.475000,0.525,0.420,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2600,3,2339.225,2338.335,2339.512667,2338.848333,-0.305,-0.380,,,,...,,,,,,,,,,
2601,3,2338.815,2337.915,2338.988333,2338.206000,-0.205,-0.210,,,,...,,,,,,,,,,
2602,3,2337.865,2336.555,2338.411667,2336.981667,-0.475,-0.680,,,,...,,,,,,,,,,
2603,3,2337.575,2336.465,2337.708333,2336.609333,-0.145,-0.045,,,,...,,,,,,,,,,


In [None]:
import pandas as pd
import numpy as np
import talib

# Parameters for indicators
# Window lengths in candles. prepare_features below uses the largest of
# SMA_SLOW, RSI_PERIOD, MACD_SLOW and STOCH_K as the minimum wave length.
SMA_FAST = 10
SMA_SLOW = 30
RSI_PERIOD = 14
RSI_AVG_PERIOD = 5  # only referenced by the commented-out calculate_indicators below
MACD_FAST = 12
MACD_SLOW = 26
MACD_SIGNAL = 9
STOCH_K = 14
STOCH_D = 3
KELTNER_PERIOD = 20
BBAND_PERIOD = 20

# def calculate_indicators(data):
#     data['sma_f'] = data['Close'].rolling(window=SMA_FAST).mean()
#     data['sma_s'] = data['Close'].rolling(window=SMA_SLOW).mean()
#     data['rsi'] = talib.RSI(data['Close'], timeperiod=RSI_PERIOD)
#     data['sma_r'] = data['rsi'].rolling(window=RSI_AVG_PERIOD).mean()
#     data['macd'], data['macdSignal'], data['macdHist'] = talib.MACD(data['Close'], fastperiod=MACD_FAST, slowperiod=MACD_SLOW, signalperiod=MACD_SIGNAL)
#     data['stoch_k'], data['stoch_d'] = talib.STOCH(data['High'], data['Low'], data['Close'], fastk_period=STOCH_K, slowk_period=STOCH_K, slowd_period=STOCH_D)
#     data['adx'] = talib.ADX(data['High'], data['Low'], data['Close'])
#     data['willr'] = talib.WILLR(data['High'], data['Low'], data['Close'])
#     data['upperband'], data['middleband'], data['lowerband'] = talib.BBANDS(data['Close'], timeperiod=BBAND_PERIOD)

#     typical_price = (data['High'] + data['Low'] + data['Close']) / 3
#     data['keltner_upper'] = talib.EMA(typical_price, timeperiod=KELTNER_PERIOD) + 2 * talib.ATR(data['High'], data['Low'], data['Close'])
#     data['keltner_lower'] = talib.EMA(typical_price, timeperiod=KELTNER_PERIOD) - 2 * talib.ATR(data['High'], data['Low'], data['Close'])

#     # Fill NaN values forward and then backward to handle initial calculation gaps
#     data.fillna(method='bfill', inplace=True)
#     data.fillna(method='ffill', inplace=True)

#     # Remove any remaining NaNs
#     data.dropna(inplace=True)

#     return data

def prepare_features(data, waves):
    """Build one 22-value feature row per sufficiently long wave.

    Unlike a per-wave indicator calculation, this version expects the
    indicator columns to be present in ``data`` already and only reads their
    value at the last candle of each wave. Waves shorter than the largest of
    SMA_SLOW, RSI_PERIOD, MACD_SLOW and STOCH_K are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'High' and 'Low' plus the precomputed columns 'macd',
        'macdSignal', 'macdHist', 'sma_f', 'sma_s', 'adx', 'rsi', 'stoch_k',
        'stoch_d', 'willr', 'upperband', 'middleband', 'lowerband',
        'keltner_upper' and 'keltner_lower'.
    waves : iterable of (int, int)
        Inclusive positional ``(start, end)`` row bounds of each wave.

    Returns
    -------
    list of list
        Per kept wave: num_candles, high_last, low_last, avg_high, avg_low,
        slope_upper, slope_lower, followed by the indicator values in the
        column order listed above.

    Raises
    ------
    KeyError
        If a wave qualifies but ``data`` lacks required columns; the message
        lists every missing column instead of only the first one hit.
    """
    indicator_columns = [
        'macd', 'macdSignal', 'macdHist',
        'sma_f', 'sma_s', 'adx', 'rsi',
        'stoch_k', 'stoch_d', 'willr',
        'upperband', 'middleband', 'lowerband',
        'keltner_upper', 'keltner_lower',
    ]
    missing = [name for name in ['High', 'Low'] + indicator_columns if name not in data.columns]

    # Loop-invariant: enough data points for the slowest indicator.
    min_candles = max(SMA_SLOW, RSI_PERIOD, MACD_SLOW, STOCH_K)

    features = []
    for start, end in waves:
        wave_data = data.iloc[start:end+1]
        num_candles = len(wave_data)
        if num_candles < min_candles:
            continue

        # Checked only once a wave is actually processed, so inputs that
        # produce no features behave exactly as before.
        if missing:
            raise KeyError(
                f"prepare_features needs precomputed columns that are missing from data: {missing}"
            )

        high_last = wave_data['High'].iloc[-1]
        low_last = wave_data['Low'].iloc[-1]
        avg_high = wave_data['High'].mean()
        avg_low = wave_data['Low'].mean()
        slope_upper = (wave_data['High'].iloc[-1] - wave_data['High'].iloc[0]) / (num_candles - 1)
        slope_lower = (wave_data['Low'].iloc[-1] - wave_data['Low'].iloc[0]) / (num_candles - 1)

        # Extract the most recent value of each indicator
        feature_set = [num_candles, high_last, low_last, avg_high, avg_low, slope_upper, slope_lower]
        feature_set.extend(wave_data[name].iloc[-1] for name in indicator_columns)

        features.append(feature_set)

    return features

# Sample data (make sure your DataFrame has 'Close', 'High', 'Low' columns)
np.random.seed(0)
data = pd.DataFrame({
    'Close': np.random.randn(100) + 2000,  # Replace with actual data
    'High': np.random.randn(100) + 2005,   # Replace with actual data
    'Low': np.random.randn(100) + 1995     # Replace with actual data
})


def calculate_indicators(data):
    """Return a copy of ``data`` with every indicator column prepare_features reads.

    Indicators are computed once over the whole series (not per wave), so each
    wave can later pick the value at its last candle. The leading gaps that
    the indicators leave while their windows fill are back-filled, then
    forward-filled; rows that still contain NaN are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with 'Close', 'High' and 'Low' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the added columns; the input frame is not modified.
    """
    enriched = data.copy()
    high, low, close = enriched['High'], enriched['Low'], enriched['Close']

    enriched['sma_f'] = close.rolling(window=SMA_FAST).mean()
    enriched['sma_s'] = close.rolling(window=SMA_SLOW).mean()
    enriched['rsi'] = talib.RSI(close, timeperiod=RSI_PERIOD)
    enriched['sma_r'] = enriched['rsi'].rolling(window=RSI_AVG_PERIOD).mean()
    enriched['macd'], enriched['macdSignal'], enriched['macdHist'] = talib.MACD(
        close, fastperiod=MACD_FAST, slowperiod=MACD_SLOW, signalperiod=MACD_SIGNAL)
    enriched['stoch_k'], enriched['stoch_d'] = talib.STOCH(
        high, low, close, fastk_period=STOCH_K, slowk_period=STOCH_K, slowd_period=STOCH_D)
    enriched['adx'] = talib.ADX(high, low, close)
    enriched['willr'] = talib.WILLR(high, low, close)
    enriched['upperband'], enriched['middleband'], enriched['lowerband'] = talib.BBANDS(
        close, timeperiod=BBAND_PERIOD)

    # Keltner channel: EMA of the typical price +/- 2 * ATR.
    typical_price = (high + low + close) / 3
    keltner_mid = talib.EMA(typical_price, timeperiod=KELTNER_PERIOD)
    atr = talib.ATR(high, low, close)
    enriched['keltner_upper'] = keltner_mid + 2 * atr
    enriched['keltner_lower'] = keltner_mid - 2 * atr

    # Fill the initial calculation gaps backward, then forward, and remove any
    # rows that still hold NaN.
    return enriched.bfill().ffill().dropna()


# Calculate indicators for the entire dataset. Without this step the columns
# that prepare_features reads ('macd', 'sma_f', ...) do not exist and the call
# below fails with KeyError: 'macd'.
data = calculate_indicators(data)

# Define waves (example)
waves = [(i, i + 29) for i in range(0, len(data) - 30, 30)]  # Replace with actual wave indices

# Prepare features for each wave
features = prepare_features(data, waves)

# Display features
features_df = pd.DataFrame(features, columns=[
    'num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low',
    'slope_upper', 'slope_lower', 'macd', 'macdSignal', 'macdHist',
    'sma_f', 'sma_s', 'adx', 'rsi', 'stoch_k', 'stoch_d',
    'willr', 'upperband', 'middleband', 'lowerband',
    'keltner_upper', 'keltner_lower'
])

print(features_df)


KeyError: 'macd'

In [None]:
data

Unnamed: 0,Close,High,Low
0,2001.764052,2006.883151,1994.630818
1,2000.400157,2003.652241,1994.760621
2,2000.978738,2003.729515,1996.099660
3,2002.240893,2005.969397,1995.655264
4,2001.867558,2003.826877,1995.640132
...,...,...,...
95,2000.706573,2004.828454,1996.136891
96,2000.010500,2005.771791,1995.097725
97,2001.785870,2005.823504,1995.582954
98,2000.126912,2007.163236,1994.600551


In [None]:
def create_sequences(data, target, sequence_length):
    """Turn a series into sliding windows and their next-step targets.

    For every start position ``s`` with ``s + sequence_length < len(data)``,
    the window ``data[s : s + sequence_length]`` is paired with
    ``target[s + sequence_length]``, i.e. the target value directly after the
    window.

    Parameters
    ----------
    data : sequence or numpy.ndarray
        Input rows, sliced along the first axis.
    target : sequence or numpy.ndarray
        Target values aligned row-by-row with ``data``.
    sequence_length : int
        Number of rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, targets)``; both are empty when ``data`` has no more
        than ``sequence_length`` rows.
    """
    window_starts = range(len(data) - sequence_length)
    sequences = [data[s:s + sequence_length] for s in window_starts]
    targets = [target[s + sequence_length] for s in window_starts]
    return np.array(sequences), np.array(targets)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models

# Assuming train_data, val_data, and test_data are pre-defined Pandas DataFrames

# Function to create sequences
def create_sequences(data, target, sequence_length):
    """Build sliding windows over `data` and the target that follows each window.

    For every offset ``k`` in ``range(len(data) - sequence_length)`` the window
    ``data[k:k + sequence_length]`` is paired with ``target[k + sequence_length]``.

    Returns:
        ``(sequences, targets)`` as two NumPy arrays of equal length.
    """
    n_windows = len(data) - sequence_length
    pairs = [
        (data[k:k + sequence_length], target[k + sequence_length])
        for k in range(n_windows)
    ]
    sequences = [window for window, _ in pairs]
    targets = [label for _, label in pairs]
    return np.array(sequences), np.array(targets)

# For each partition: every column except 'high_last' is a model input,
# and 'high_last' itself is the value to predict.
X_train, y_train = train_data.drop(columns=['high_last']).values, train_data['high_last'].values
X_val, y_val = val_data.drop(columns=['high_last']).values, val_data['high_last'].values
X_test, y_test = test_data.drop(columns=['high_last']).values, test_data['high_last'].values

# Number of consecutive feature rows fed to the network per sample
sequence_length = 24

# Turn each partition into (window, next-step target) pairs
X_train_seq, y_train_seq = create_sequences(
    X_train, y_train, sequence_length
)
X_val_seq, y_val_seq = create_sequences(
    X_val, y_val, sequence_length
)
X_test_seq, y_test_seq = create_sequences(
    X_test, y_test, sequence_length
)

# Sanity-check the resulting array shapes
print(f"X_train_seq shape: {X_train_seq.shape}")
print(f"X_val_seq shape: {X_val_seq.shape}")
print(f"X_test_seq shape: {X_test_seq.shape}")

# Size of the last axis of the training windows = number of input features
feature_count = X_train_seq.shape[2]

# Define the model architecture: self-attention -> layer norm -> dilated Conv1D
# stack -> concatenate selected conv outputs -> global average pool -> dense head.
input_shape = (sequence_length, feature_count)
inputs = layers.Input(shape=input_shape)

# Multi-Head self-attention (query and value are both the input) followed by
# Layer Normalization. `feature_count // 8` is 0 when there are fewer than
# 8 features, so the per-head size is clamped to at least 1.
x = layers.MultiHeadAttention(num_heads=8, key_dim=max(1, feature_count // 8))(inputs, inputs)
x = layers.LayerNormalization()(x)

# Convolutional layers with SELU activation and same padding
# (dilation rates 1, 2, 2, 2, 4; 'same' padding keeps the sequence length).
conv1 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=1, activation='selu', padding='same')(x)
conv2 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv1)
conv3 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv2)
conv4 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv3)
conv5 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=4, activation='selu', padding='same')(conv4)

# Dropout layers
# NOTE(review): `dropout5` is never used below, so it is not part of the model
# graph -- confirm whether it was meant to replace `dropout5_final`.
dropout4 = layers.Dropout(0.2)(conv4)
dropout5 = layers.Dropout(0.4)(conv5)
dropout5_final = layers.Dropout(0.5)(conv5)

# Concatenation of convolutional layers (conv1 plus the dropped-out conv4/conv5)
concatenated = layers.Concatenate()([conv1, dropout4, dropout5_final])

# Global Average Pooling over the time axis
gap = layers.GlobalAveragePooling1D()(concatenated)

# MLP with 1 layer of 32 nodes with ReLU activation
mlp = layers.Dense(32, activation='relu')(gap)

# Output layer with 1 node (since we are predicting a single value)
outputs = layers.Dense(1, activation='linear')(mlp)

# Create model
model = models.Model(inputs=inputs, outputs=outputs)
# Adam with gradient-norm clipping at 1.0
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)

# Compile the model (MSE loss, MAE reported as a metric)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

# Print model summary
model.summary()

# Fit on the training windows, monitoring the validation windows each epoch
history = model.fit(
    X_train_seq,
    y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=50,
    batch_size=32,
)

# Loss (MSE) and MAE on each partition
train_loss, train_mae = model.evaluate(X_train_seq, y_train_seq)
val_loss, val_mae = model.evaluate(X_val_seq, y_val_seq)
test_loss, test_mae = model.evaluate(X_test_seq, y_test_seq)

print(f'Train MAE: {train_mae}, Validation MAE: {val_mae}, Test MAE: {test_mae}')

# Model outputs for every partition
y_pred_train = model.predict(X_train_seq)
y_pred_val = model.predict(X_val_seq)
y_pred_test = model.predict(X_test_seq)

# Overlay test targets and test predictions on a single axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test_seq, label='True Values')
ax.plot(y_pred_test, label='Predicted Values')
ax.legend()
ax.set_xlabel('Index')
ax.set_ylabel('Price')
ax.set_title('True vs Predicted Prices')
plt.show()

# Persist the three feature tables (row index omitted)
for csv_name, frame in (
    ('train_data.csv', train_data),
    ('val_data.csv', val_data),
    ('test_data.csv', test_data),
):
    frame.to_csv(csv_name, index=False)

# Persist the trained network
model.save('stock_price_prediction_model.h5')


X_train_seq shape: (2060, 24, 21)
X_val_seq shape: (236, 24, 21)
X_test_seq shape: (237, 24, 21)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 24, 21)]             0         []                            
                                                                                                  
 multi_head_attention_3 (Mu  (None, 24, 21)               369       ['input_4[0][0]',             
 ltiHeadAttention)                                                   'input_4[0][0]']             
                                                                                                  
 layer_normalization_3 (Lay  (None, 24, 21)               42        ['multi_head_attention_3[0][0]
 erNormalization)                                                   ']                        

KeyboardInterrupt: 

In [None]:

# Baseline: a Random Forest that predicts 'high_last' from all other feature columns.
X_train, y_train = train_data.drop(columns=['high_last']), train_data['high_last']
X_val, y_val = val_data.drop(columns=['high_last']), val_data['high_last']
X_test, y_test = test_data.drop(columns=['high_last']), test_data['high_last']

# Fixed random_state keeps the fitted forest reproducible
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predictions for every partition
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

# Overlay test targets and predictions, both plotted against the test rows' index
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test.index, y_test, label='True Values')
ax.plot(y_test.index, y_pred_test, label='Predicted Values')
ax.legend()
ax.set_xlabel('Index')
ax.set_ylabel('Price')
ax.set_title('True vs Predicted Prices')
plt.show()

# Persist the three feature tables (row index omitted)
for csv_name, frame in (
    ('train_data.csv', train_data),
    ('val_data.csv', val_data),
    ('test_data.csv', test_data),
):
    frame.to_csv(csv_name, index=False)


In [None]:
# Read the raw 1-minute XAUUSD export into `dt`.
# NOTE(review): read_csv treats the first line of the file as a header by default;
# if this export has no header row, its first candle is being consumed as column
# names -- confirm, and pass header=None if so.
path="/content/DAT_MT_XAUUSD_M1_202404.csv"
dt=pd.read_csv(path)

In [None]:
# Overwrite the column labels with the seven field names used by the rest of the notebook.
dt.columns= ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']

In [None]:
# Echo the renamed frame as the cell output.
dt

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume
0,2024.04.12,00:01,2383.545,2384.105,2383.545,2383.848,0
1,2024.04.12,00:02,2383.845,2384.195,2383.498,2383.555,0
2,2024.04.12,00:03,2383.555,2383.595,2383.214,2383.315,0
3,2024.04.12,00:04,2383.395,2383.395,2382.898,2383.065,0
4,2024.04.12,00:05,2383.195,2383.765,2382.805,2382.825,0
...,...,...,...,...,...,...,...
27232,2024.04.23,23:54,2327.085,2327.155,2326.845,2326.855,0
27233,2024.04.23,23:55,2326.855,2326.945,2326.535,2326.685,0
27234,2024.04.23,23:56,2326.685,2327.595,2326.495,2327.465,0
27235,2024.04.23,23:57,2327.405,2327.755,2327.175,2327.725,0


In [None]:
# Number of columns in `dt` (the recorded output is 7, matching the names assigned above).
len(dt.columns)

7

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import talib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
# Resampling function to change time frames
def resample_data(data, freq='5min'):
    """Aggregate OHLCV candles into coarser candles of width `freq`.

    Args:
        data: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume' columns
            and an index that supports ``DataFrame.resample``.
        freq: pandas offset alias for the target candle width. The default
            '5min' is the same five-minute frequency as the former '5T'
            default; the 'T' minute alias is deprecated in recent pandas.

    Returns:
        DataFrame with one row per `freq` bucket. Buckets that contain no source
        rows (their OHLC values are NaN) are dropped.
    """
    # How each column collapses within one bucket
    ohlc_dict = {
        'Open': 'first',   # first 'Open' of the bucket
        'High': 'max',     # highest 'High' of the bucket
        'Low': 'min',      # lowest 'Low' of the bucket
        'Close': 'last',   # last 'Close' of the bucket
        'Volume': 'sum'    # total 'Volume' of the bucket
    }

    # Per-column aggregation, then discard buckets left with missing values
    resampled_data = data.resample(freq).agg(ohlc_dict).dropna()

    return resampled_data
# Resample to desired time frame (example for 5-minute data)
# `data` is (re)loaded here because this cell otherwise fails with
# "NameError: name 'data' is not defined"; load_data is the loader defined in the
# notebook's data-loading cell. '5min' replaces the deprecated '5T' alias.
data = load_data('/content/DAT_MT_XAUUSD_M1_202404.csv')
data_5min = resample_data(data, freq='5min')

# Wave extraction
def extract_waves(data, threshold=0.01):
    """Split the candles into consecutive waves that each stay close to a straight line.

    Starting at `start`, the window ``[start, end)`` grows one row at a time. A
    separate straight line is fitted to the window's 'High' values and to its
    'Low' values; once either fit's mean squared error exceeds `threshold`, the
    wave ``(start, end - 1)`` is recorded and the next wave starts at ``end - 1``.

    Args:
        data: DataFrame with 'High' and 'Low' columns.
        threshold: largest mean squared error tolerated before a wave is closed.

    Returns:
        List of ``(start, end)`` positional row bounds, both inclusive. The final
        entry always ends at the last row; an empty frame yields an empty list.
    """
    total = len(data)
    waves = []
    start = 0
    while start < total:
        boundary = None  # last row of the current wave, once a fit breaks down
        for end in range(start + 2, total):
            high_fit = LinearRegression()
            low_fit = LinearRegression()
            # Row positions as a single-feature column vector
            steps = np.arange(start, end).reshape(-1, 1)
            highs = data['High'].iloc[start:end].values
            lows = data['Low'].iloc[start:end].values
            high_fit.fit(steps, highs)
            low_fit.fit(steps, lows)
            high_mse = np.mean((high_fit.predict(steps) - highs) ** 2)
            low_mse = np.mean((low_fit.predict(steps) - lows) ** 2)
            if high_mse > threshold or low_mse > threshold:
                boundary = end - 1
                break
        if boundary is None:
            # No window broke the threshold: the wave runs to the final row
            waves.append((start, total - 1))
            break
        waves.append((start, boundary))
        # The closing row of this wave is also the opening row of the next
        start = boundary
    return waves

# Prepare features
def prepare_features(data, waves):
    """Turn each wave into one flat feature row.

    Args:
        data: DataFrame of candles; the 'High', 'Low' and 'Close' columns are read.
        waves: iterable of ``(start, end)`` positional row bounds, `end` inclusive.

    Returns:
        A list with one 22-element list per wave, ordered as
        [num_candles, high_last, low_last, avg_high, avg_low, slope_upper,
        slope_lower, macd, macd_signal, macd_hist, ema, sma, adx, rsi, stoch_k,
        stoch_d, willr, upperband, middleband, lowerband, keltner_upper,
        keltner_lower]. Each indicator entry is that indicator's final value on
        the wave.

    NOTE(review): every indicator is computed on the wave slice alone, so waves
    shorter than an indicator's warm-up period presumably yield NaN -- confirm
    that this is intended.
    """
    def last(values):
        # Positional "final element". Plain ``values[-1]`` on a pandas Series is a
        # label lookup: it raises KeyError on an integer index and relies on a
        # deprecated positional fallback on other indexes.
        return values.iloc[-1] if hasattr(values, 'iloc') else values[-1]

    features = []  # one feature row per wave
    for start, end in waves:
        wave_data = data.iloc[start:end + 1]  # rows of the current wave (end inclusive)
        high, low, close = wave_data['High'], wave_data['Low'], wave_data['Close']

        num_candles = len(wave_data)
        high_last = high.iloc[-1]
        low_last = low.iloc[-1]
        avg_high = high.mean()
        avg_low = low.mean()
        # Net change from the first to the last candle, divided by the candle count
        slope_upper = (high.iloc[-1] - high.iloc[0]) / num_candles
        slope_lower = (low.iloc[-1] - low.iloc[0]) / num_candles

        # TA-Lib indicators, all with their default periods
        macd, macd_signal, macd_hist = talib.MACD(close)
        ema = talib.EMA(close)
        sma = talib.SMA(close)
        adx = talib.ADX(high, low, close)
        rsi = talib.RSI(close)
        stoch_k, stoch_d = talib.STOCH(high, low, close)
        willr = talib.WILLR(high, low, close)
        upperband, middleband, lowerband = talib.BBANDS(close)

        # Keltner channel: 10-period EMA of the typical price +/- 2 * ATR. The
        # upper band previously was only a default-period EMA with no ATR term,
        # which is a moving average rather than an upper band.
        typical_price = (high + low + close) / 3
        keltner_mid = talib.EMA(typical_price, timeperiod=10)
        keltner_width = 2 * talib.ATR(high, low, close)
        keltner_upper = keltner_mid + keltner_width
        keltner_lower = keltner_mid - keltner_width

        features.append([
            num_candles, high_last, low_last, avg_high, avg_low, slope_upper, slope_lower,
            last(macd), last(macd_signal), last(macd_hist), last(ema), last(sma),
            last(adx), last(rsi), last(stoch_k), last(stoch_d), last(willr),
            last(upperband), last(middleband), last(lowerband),
            last(keltner_upper), last(keltner_lower),
        ])
    return features
# Split data into training, validation, and test sets
def split_data(data, train_size=0.8, val_size=0.1):
    """Cut `data` into three consecutive, order-preserving parts.

    Args:
        data: DataFrame to partition by row position.
        train_size: fraction of rows in the first (training) part.
        val_size: fraction of rows in the second (validation) part.

    Returns:
        ``(train_data, val_data, test_data)``; the test part is whatever rows
        remain after the first two.
    """
    total_rows = len(data)
    first_cut = int(total_rows * train_size)
    second_cut = int(total_rows * (train_size + val_size))
    return (
        data.iloc[:first_cut],
        data.iloc[first_cut:second_cut],
        data.iloc[second_cut:],
    )

# Segment the 5-minute candles into waves and build one feature row per wave
waves = extract_waves(data_5min)
features = prepare_features(data_5min, waves)
feature_columns = [
    'num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low', 'slope_upper', 'slope_lower',
    'macd', 'macd_signal', 'macd_hist', 'ema', 'sma', 'adx', 'rsi', 'stoch_k', 'stoch_d', 'willr',
    'upperband', 'middleband', 'lowerband', 'keltner_upper', 'keltner_lower',
]
features_df = pd.DataFrame(features, columns=feature_columns)

# Consecutive train / validation / test partitions of the feature table
train_data, val_data, test_data = split_data(features_df)

# Baseline: a Random Forest that predicts 'high_last' from all other feature columns
X_train, y_train = train_data.drop(columns=['high_last']), train_data['high_last']
X_val, y_val = val_data.drop(columns=['high_last']), val_data['high_last']
X_test, y_test = test_data.drop(columns=['high_last']), test_data['high_last']
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predictions for every partition
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

# Overlay test targets and predictions, both plotted against the test rows' index
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test.index, y_test, label='True Values')
ax.plot(y_test.index, y_pred_test, label='Predicted Values')
ax.legend()
ax.set_xlabel('Index')
ax.set_ylabel('Price')
ax.set_title('True vs Predicted Prices')
plt.show()

# Persist the three feature tables (row index omitted)
for csv_name, frame in (
    ('train_data.csv', train_data),
    ('val_data.csv', val_data),
    ('test_data.csv', test_data),
):
    frame.to_csv(csv_name, index=False)


NameError: name 'data' is not defined

In [None]:
# Look up the 'High' column of `data`.
# NOTE(review): the recorded output is "KeyError: 'High'", so whatever `data` was
# bound to at that point had no such column -- verify which frame `data` refers to.
data['High']

KeyError: 'High'

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import talib


# Wave extraction
def extract_waves(data, threshold=0.01):
    """Partition `data` into waves whose highs and lows are each nearly linear.

    A wave grows from `start` one row at a time. Straight lines are fitted
    separately to the 'High' and 'Low' values of rows ``[start, end)``; as soon as
    either fit's mean squared error is above `threshold` the wave is closed as
    ``(start, end - 1)`` and the next one begins at row ``end - 1``.

    Returns:
        List of inclusive ``(start, end)`` row positions; the last wave always
        runs to the final row. Empty input gives an empty list.
    """
    def mse_of_line(positions, observed):
        # Mean squared residual of a least-squares line through the points
        line = LinearRegression()
        line.fit(positions, observed)
        return np.mean((line.predict(positions) - observed) ** 2)

    n_rows = len(data)
    waves = []
    start = 0
    while start < n_rows:
        cut = None
        for end in range(start + 2, n_rows):
            positions = np.arange(start, end).reshape(-1, 1)
            window_high = data['High'].iloc[start:end].values
            window_low = data['Low'].iloc[start:end].values
            loss_high = mse_of_line(positions, window_high)
            loss_low = mse_of_line(positions, window_low)
            if loss_high > threshold or loss_low > threshold:
                cut = end - 1
                break
        if cut is None:
            # Reached the end of the data without exceeding the threshold
            waves.append((start, n_rows - 1))
            break
        waves.append((start, cut))
        start = cut
    return waves

# Prepare features
def prepare_features(data, waves):
    """Build one 22-value feature row for every wave.

    Args:
        data: DataFrame of candles; 'High', 'Low' and 'Close' columns are read.
        waves: iterable of ``(start, end)`` positional row bounds, `end` inclusive.

    Returns:
        List of lists ordered as [num_candles, high_last, low_last, avg_high,
        avg_low, slope_upper, slope_lower, macd, macd_signal, macd_hist, ema, sma,
        adx, rsi, stoch_k, stoch_d, willr, upperband, middleband, lowerband,
        keltner_upper, keltner_lower]; indicator entries are the indicator's final
        value on the wave.

    NOTE(review): indicators are evaluated on each wave slice in isolation, so
    waves shorter than an indicator's warm-up presumably produce NaN -- confirm.
    """
    def final(values):
        # Read the last element by position. ``values[-1]`` on a pandas Series is
        # a label lookup (KeyError on an integer index; deprecated positional
        # fallback on other indexes), so use .iloc when it is available.
        return values.iloc[-1] if hasattr(values, 'iloc') else values[-1]

    features = []
    for start, end in waves:
        wave_data = data.iloc[start:end + 1]
        highs, lows, closes = wave_data['High'], wave_data['Low'], wave_data['Close']

        num_candles = len(wave_data)
        high_last = highs.iloc[-1]
        low_last = lows.iloc[-1]
        avg_high = highs.mean()
        avg_low = lows.mean()
        slope_upper = (highs.iloc[-1] - highs.iloc[0]) / num_candles
        slope_lower = (lows.iloc[-1] - lows.iloc[0]) / num_candles

        # Financial indicators (TA-Lib defaults)
        macd, macd_signal, macd_hist = talib.MACD(closes)
        ema = talib.EMA(closes)
        sma = talib.SMA(closes)
        adx = talib.ADX(highs, lows, closes)
        rsi = talib.RSI(closes)
        stoch_k, stoch_d = talib.STOCH(highs, lows, closes)
        willr = talib.WILLR(highs, lows, closes)
        upperband, middleband, lowerband = talib.BBANDS(closes)

        # Keltner channel bands share one centre line (10-period EMA of the typical
        # price) and sit 2 * ATR above / below it. The upper band used to be a bare
        # default-period EMA, inconsistent with the lower band.
        centre = talib.EMA((highs + lows + closes) / 3, timeperiod=10)
        half_width = 2 * talib.ATR(highs, lows, closes)
        keltner_upper = centre + half_width
        keltner_lower = centre - half_width

        # Combine features
        feature_set = [num_candles, high_last, low_last, avg_high, avg_low, slope_upper, slope_lower,
                       final(macd), final(macd_signal), final(macd_hist), final(ema), final(sma),
                       final(adx), final(rsi), final(stoch_k), final(stoch_d), final(willr),
                       final(upperband), final(middleband), final(lowerband),
                       final(keltner_upper), final(keltner_lower)]

        features.append(feature_set)

    return features

# Split data into training, validation, and test sets
def split_data(data, train_size=0.8, val_size=0.1):
    """Return consecutive (train, validation, test) slices of `data`.

    The first ``int(len(data) * train_size)`` rows form the training part, rows up
    to ``int(len(data) * (train_size + val_size))`` form the validation part, and
    the remainder is the test part. Row order is preserved.
    """
    n = len(data)
    edges = [0, int(n * train_size), int(n * (train_size + val_size)), n]
    train_data, val_data, test_data = (
        data.iloc[lo:hi] for lo, hi in zip(edges[:-1], edges[1:])
    )
    return train_data, val_data, test_data

# Segment `data` into waves and build one feature row per wave
waves = extract_waves(data)
features = prepare_features(data, waves)
feature_columns = [
    'num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low', 'slope_upper', 'slope_lower',
    'macd', 'macd_signal', 'macd_hist', 'ema', 'sma', 'adx', 'rsi', 'stoch_k', 'stoch_d', 'willr',
    'upperband', 'middleband', 'lowerband', 'keltner_upper', 'keltner_lower',
]
features_df = pd.DataFrame(features, columns=feature_columns)

# Consecutive train / validation / test partitions of the feature table
train_data, val_data, test_data = split_data(features_df)

# Persist the three feature tables (row index omitted)
for csv_name, frame in (
    ('train_data.csv', train_data),
    ('val_data.csv', val_data),
    ('test_data.csv', test_data),
):
    frame.to_csv(csv_name, index=False)


KeyError: 'High'

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
# Define input shape: (sequence length, features per step)
input_shape = (24, 39)
# Input layer
inputs = layers.Input(shape=input_shape)
# Multi-Head self-attention: the input is passed as both query and value
x = layers.MultiHeadAttention(num_heads=2, key_dim=2)(inputs,inputs)
# Layer Normalization of the attention output
x = layers.LayerNormalization()(x)
# Convolutional layers with SELU activation and same padding
# (dilation rates 1, 2, 2, 2, 4; each layer consumes the previous layer's output)
conv1 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=1, activation='selu', padding='same')(x)
conv2 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv1)
conv3 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv2)
conv4 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv3)
conv5 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=4, activation='selu', padding='same')(conv4)

# Dropout layers
# NOTE(review): `dropout5` is not referenced anywhere below, so it never becomes
# part of the model graph -- confirm whether it was meant to be concatenated.
dropout4 = layers.Dropout(0.2)(conv4)
dropout5 = layers.Dropout(0.4)(conv5)
dropout5_final = layers.Dropout(0.5)(conv5)

# Concatenation of convolutional layers (conv1 plus the dropped-out conv4 and conv5)
concatenated = layers.Concatenate()([conv1, dropout4, dropout5_final])

# Global Average Pooling
gap = layers.GlobalAveragePooling1D()(concatenated)

# MLP with 1 layer of 32 nodes with ReLU activation
mlp = layers.Dense(32, activation='relu')(gap)

# Output layer with 2 nodes and linear activation
outputs = layers.Dense(2, activation='linear')(mlp)

# Create model
model = models.Model(inputs=inputs, outputs=outputs)

# Print model summary
model.summary()


In [None]:
import pandas as pd


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import talib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Define the neural network model
def create_model(input_shape):
    """Build the attention + dilated-Conv1D network and print its summary.

    Pipeline: multi-head self-attention -> layer normalization -> five stacked
    Conv1D layers (32 filters, kernel 3, SELU, 'same' padding, dilation rates
    1, 2, 2, 2, 4) -> concatenation of conv1 with dropped-out conv4 and conv5 ->
    global average pooling -> Dense(32, relu) -> Dense(2, linear).

    Args:
        input_shape: shape of one input sample, passed to ``layers.Input``.

    Returns:
        The uncompiled Keras ``Model``. ``model.summary()`` is printed as a side
        effect.
    """
    inputs = layers.Input(shape=input_shape)
    # Self-attention: the input serves as both query and value
    x = layers.MultiHeadAttention(num_heads=2, key_dim=2)(inputs, inputs)
    x = layers.LayerNormalization()(x)
    conv1 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=1, activation='selu', padding='same')(x)
    conv2 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv1)
    conv3 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv2)
    conv4 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=2, activation='selu', padding='same')(conv3)
    conv5 = layers.Conv1D(filters=32, kernel_size=3, dilation_rate=4, activation='selu', padding='same')(conv4)
    dropout4 = layers.Dropout(0.2)(conv4)
    # The former `dropout5 = Dropout(0.4)(conv5)` was never used by any later
    # layer, so it was dead code outside the model graph and has been removed.
    # NOTE(review): confirm it was not meant to be concatenated instead of the
    # 0.5 dropout below.
    dropout5_final = layers.Dropout(0.5)(conv5)
    concatenated = layers.Concatenate()([conv1, dropout4, dropout5_final])
    gap = layers.GlobalAveragePooling1D()(concatenated)
    mlp = layers.Dense(32, activation='relu')(gap)
    outputs = layers.Dense(2, activation='linear')(mlp)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()
    return model

# Resampling function to change time frames
def resample_data(data, freq='5min'):
    """Aggregate OHLCV candles into coarser candles of width `freq`.

    Args:
        data: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume' columns
            and an index that supports ``DataFrame.resample``.
        freq: pandas offset alias for the target candle width. '5min' is the same
            five-minute frequency as the former '5T' default; the 'T' minute
            alias is deprecated in recent pandas.

    Returns:
        DataFrame with one row per `freq` bucket; buckets with no source rows
        (NaN OHLC values) are dropped.
    """
    # Open = first, High = max, Low = min, Close = last, Volume = sum per bucket
    ohlc_dict = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum'
    }
    resampled_data = data.resample(freq).agg(ohlc_dict).dropna()
    return resampled_data

# Wave extraction
def extract_waves(data, threshold=0.01):
    """Segment `data` into consecutive waves with near-linear highs and lows.

    For the current `start`, each candidate `end` fits one straight line to the
    'High' values and one to the 'Low' values of rows ``[start, end)``. The first
    `end` at which either fit's mean squared error exceeds `threshold` closes the
    wave as ``(start, end - 1)``; row ``end - 1`` then opens the next wave.

    Returns:
        List of inclusive ``(start, end)`` row positions. The final wave always
        ends on the last row; an empty frame returns an empty list.
    """
    def line_error(xs, ys):
        # Mean squared residual of the least-squares line through (xs, ys)
        fitted = LinearRegression()
        fitted.fit(xs, ys)
        return np.mean((fitted.predict(xs) - ys) ** 2)

    found = []
    start = 0
    row_count = len(data)
    while start < row_count:
        for end in range(start + 2, row_count):
            xs = np.arange(start, end).reshape(-1, 1)
            ys_high = data['High'].iloc[start:end].values
            ys_low = data['Low'].iloc[start:end].values
            err_high = line_error(xs, ys_high)
            err_low = line_error(xs, ys_low)
            if err_high > threshold or err_low > threshold:
                found.append((start, end - 1))
                start = end - 1
                break
        else:
            # The inner loop never broke: this wave extends to the last row
            found.append((start, row_count - 1))
            break
    return found

# Prepare features
def prepare_features(data, waves):
    """Compute one 22-value feature row per wave.

    Args:
        data: DataFrame of candles; 'High', 'Low' and 'Close' columns are read.
        waves: iterable of ``(start, end)`` positional row bounds, `end` inclusive.

    Returns:
        List of lists ordered as [num_candles, high_last, low_last, avg_high,
        avg_low, slope_upper, slope_lower, macd, macd_signal, macd_hist, ema, sma,
        adx, rsi, stoch_k, stoch_d, willr, upperband, middleband, lowerband,
        keltner_upper, keltner_lower]; indicator entries are the indicator's final
        value on the wave.

    NOTE(review): indicators only see the rows of a single wave, so waves shorter
    than an indicator's warm-up presumably give NaN -- confirm this is intended.
    """
    def tail(values):
        # Positional last element. ``values[-1]`` on a pandas Series looks up the
        # *label* -1 (KeyError on an integer index; deprecated positional fallback
        # on other indexes), hence .iloc whenever the object provides it.
        return values.iloc[-1] if hasattr(values, 'iloc') else values[-1]

    features = []
    for start, end in waves:
        wave_data = data.iloc[start:end + 1]
        high_s, low_s, close_s = wave_data['High'], wave_data['Low'], wave_data['Close']

        num_candles = len(wave_data)
        high_last = high_s.iloc[-1]
        low_last = low_s.iloc[-1]
        avg_high = high_s.mean()
        avg_low = low_s.mean()
        slope_upper = (high_s.iloc[-1] - high_s.iloc[0]) / num_candles
        slope_lower = (low_s.iloc[-1] - low_s.iloc[0]) / num_candles

        macd, macd_signal, macd_hist = talib.MACD(close_s)
        ema = talib.EMA(close_s)
        sma = talib.SMA(close_s)
        adx = talib.ADX(high_s, low_s, close_s)
        rsi = talib.RSI(close_s)
        stoch_k, stoch_d = talib.STOCH(high_s, low_s, close_s)
        willr = talib.WILLR(high_s, low_s, close_s)
        upperband, middleband, lowerband = talib.BBANDS(close_s)

        # Keltner channel: 10-period EMA of the typical price, +/- 2 * ATR. The
        # upper band was previously a bare default-period EMA with no ATR term,
        # which did not match the lower band's definition.
        keltner_base = talib.EMA((high_s + low_s + close_s) / 3, timeperiod=10)
        keltner_offset = 2 * talib.ATR(high_s, low_s, close_s)
        keltner_upper = keltner_base + keltner_offset
        keltner_lower = keltner_base - keltner_offset

        feature_set = [
            num_candles, high_last, low_last, avg_high, avg_low, slope_upper, slope_lower,
            tail(macd), tail(macd_signal), tail(macd_hist), tail(ema), tail(sma), tail(adx), tail(rsi),
            tail(stoch_k), tail(stoch_d), tail(willr), tail(upperband), tail(middleband), tail(lowerband),
            tail(keltner_upper), tail(keltner_lower)
        ]
        features.append(feature_set)
    return features

# Split data into training, validation, and test sets
def split_data(data, train_size=0.8, val_size=0.1):
    """Partition `data` by row position into train, validation and test frames.

    Args:
        data: DataFrame whose rows are split in their existing order.
        train_size: fraction of rows assigned to the training frame.
        val_size: fraction of rows assigned to the validation frame.

    Returns:
        ``(train_data, val_data, test_data)``; the test frame holds all rows after
        the validation frame.
    """
    row_total = len(data)
    train_stop = int(row_total * train_size)
    val_stop = int(row_total * (train_size + val_size))
    head = data.iloc[:train_stop]
    middle = data.iloc[train_stop:val_stop]
    rest = data.iloc[val_stop:]
    return head, middle, rest

# Load the 1-minute candles. This cell previously failed with
# "NameError: name 'data' is not defined" because it relied on `data` left over
# from another cell; load_data is the loader defined in the notebook's
# data-loading cell.
data = load_data('/content/DAT_MT_XAUUSD_M1_202404.csv')

# Resample to desired time frame (example for 5-minute data).
# '5min' replaces the deprecated '5T' minute alias.
data_5min = resample_data(data, freq='5min')

# Extract waves and prepare features (one feature row per wave)
waves = extract_waves(data_5min)
features = prepare_features(data_5min, waves)
features_df = pd.DataFrame(features, columns=[
    'num_candles', 'high_last', 'low_last', 'avg_high', 'avg_low', 'slope_upper', 'slope_lower',
    'macd', 'macd_signal', 'macd_hist', 'ema', 'sma', 'adx', 'rsi', 'stoch_k', 'stoch_d', 'willr',
    'upperband', 'middleband', 'lowerband', 'keltner_upper', 'keltner_lower'
])

# Split the data into consecutive train / validation / test partitions
train_data, val_data, test_data = split_data(features_df)

# Train a simple model (Random Forest Regressor as an example):
# 'high_last' is the target, every other column is an input.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
X_train = train_data.drop(columns=['high_last'])
y_train = train_data['high_last']
X_val = val_data.drop(columns=['high_last'])
y_val = val_data['high_last']
X_test = test_data.drop(columns=['high_last'])
y_test = test_data['high_last']
model_rf.fit(X_train, y_train)

# Make predictions
y_pred_train = model_rf.predict(X_train)
y_pred_val = model_rf.predict(X_val)
y_pred_test = model_rf.predict(X_test)

# Visualize the predictions (test targets vs. test predictions, by row index)
plt.figure(figsize=(14, 7))
plt.plot(y_test.index, y_test, label='True Values')
plt.plot(y_test.index, y_pred_test, label='Predicted Values')
plt.legend()
plt.xlabel('Index')
plt.ylabel('Price')
plt.title('True vs Predicted Prices')
plt.show()

# Save the prepared datasets (row index omitted)
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


NameError: name 'data' is not defined