### Imports

In [None]:
# Remove unwanted warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Data Management
import pandas as pd
import numpy as np
from pandas_datareader.data import DataReader
from ta import add_all_ta_features
import os

# Statistics
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA
import joblib

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import os
# Reporting
import matplotlib.pyplot as plt

### Initial Data Extraction

In [None]:
# Build the path of the input pickle file (directory + file name)
augmented_directory = 'Data Augmented'
file_name = 'XAU_USD_M15_2024.pk1'
file_path = os.path.join(augmented_directory, file_name)

# Fail fast with a clear message: silently skipping the load would leave `df`
# undefined and surface later as an unrelated NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file does not exist at the specified path: {file_path}")

# NOTE(review): unpickling can execute arbitrary code - only load files produced by this project.
df = pd.read_pickle(file_path)

# Drop every column whose name contains 'ask' or 'bid'; only the mid prices and volume are kept
cols_to_drop = [col for col in df.columns if 'ask' in col or 'bid' in col]
df = df.drop(columns=cols_to_drop)

# Add the technical-analysis features computed by the `ta` package.
# Requires the columns 'mid_o', 'mid_h', 'mid_l', 'mid_c' and 'volume'.
df = add_all_ta_features(
    df,
    open="mid_o", high="mid_h", low="mid_l", close="mid_c", volume="volume",
    fillna=True
)

# Ensure 'time' is a datetime column (a no-op if it already is), then use it as the index
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time')




In [None]:
# Preview the DataFrame prepared in the previous cell.
# The pickle is deliberately NOT re-read here: reloading would overwrite `df` with
# the raw data and throw away the bid/ask column drop, the TA features and the
# time index that the rest of the notebook relies on.
if os.path.exists(file_path):
    # Display the first few rows of the DataFrame
    print("The first few rows of the DataFrame:")
    print(df.head())

    # Display the last few rows of the DataFrame
    print("\nThe last few rows of the DataFrame:")
    print(df.tail())
else:
    print(f"The file does not exist at the specified path: {file_path}")

The first few rows of the DataFrame:
                       time  volume     mid_o     mid_h     mid_l     mid_c  \
0 2024-01-01 23:00:00+00:00     397  2065.845  2068.995  2064.135  2065.865   
1 2024-01-01 23:15:00+00:00     498  2065.870  2066.295  2065.235  2065.275   
2 2024-01-01 23:30:00+00:00     457  2065.225  2065.535  2064.300  2064.620   
3 2024-01-01 23:45:00+00:00     554  2064.560  2064.590  2063.250  2063.795   
4 2024-01-02 00:00:00+00:00     488  2063.835  2065.035  2063.475  2064.105   

     bid_o     bid_h    bid_l    bid_c    ask_o     ask_h    ask_l    ask_c  
0  2064.40  2066.495  2063.04  2065.62  2067.29  2071.495  2064.62  2066.11  
1  2065.63  2066.080  2065.01  2065.07  2066.11  2066.540  2065.43  2065.48  
2  2065.07  2065.360  2064.14  2064.46  2065.38  2065.770  2064.46  2064.78  
3  2064.40  2064.430  2063.10  2063.62  2064.72  2064.750  2063.40  2063.97  
4  2063.67  2064.880  2063.31  2063.95  2064.00  2065.190  2063.64  2064.26  

The last few rows o

### Data Preprocessing - Target Setting

In [None]:
# Build the three-class target from the NEXT bar relative to the current close:
#   2 = buy  (next close is higher than the current close)
#   1 = sell (next close is not higher)
#   0 = no trade (the adverse next-bar extreme is further away than the next close)
close = df["mid_c"]
next_close = close.shift(-1)
next_high = df["mid_h"].shift(-1)
next_low = df["mid_l"].shift(-1)

# Absolute distance from the current close to the next close
close_move = (next_close - close).abs()

df["signal"] = 1  # Default to sell
df.loc[next_close > close, "signal"] = 2  # Change to buy if next close is higher

# For sell positions, if the distance to next mid_h is greater than the distance to next mid_c, set signal to 0
df.loc[(df["signal"] == 1) & ((next_high - close).abs() > close_move), "signal"] = 0

# For buy positions, if the distance to next mid_l is greater than the distance to next mid_c, set signal to 0
df.loc[(df["signal"] == 2) & ((next_low - close).abs() > close_move), "signal"] = 0

# The shifted series are never stored in `df`, so a plain dropna() cannot see the
# NaN they contain for the final bar; that bar would keep the default label 1
# without any future data to justify it. Remove rows that have no next bar
# explicitly, then drop rows with NaN in any remaining column as before.
# NOTE: re-running this cell trims one more trailing bar each time.
df = df.loc[next_close.notna()].dropna()

In [None]:
# Show the labelled DataFrame (features plus the new 'signal' column) as the cell output
df

In [None]:
# Separate the feature matrix from the target.
# The split is positional: the last column is taken as the target and everything
# before it as features, so 'signal' must be the final column of `df`.
x, y = df.iloc[:, :-1], df.iloc[:, -1]


### Data Preprocessing - Stationarity and Scaling

In [None]:
# Collect the names of feature columns that fail the Augmented Dickey-Fuller test
numeric_dtypes = ('float64', 'int64')
non_stationaries = []
for col in x.columns:
    series = x[col]

    # Only float64 / int64 columns are tested; anything else is skipped
    if series.dtype not in numeric_dtypes:
        continue

    # NaN values are removed first because the test is run on complete data only
    result = adfuller(series.dropna())
    test_statistic, p_value = result[0], result[1]
    critical_value = result[4]["1%"]

    # Flag the column when the p-value exceeds 0.05 or the statistic lies above the 1% critical value
    if p_value > 0.05 or test_statistic > critical_value:
        non_stationaries.append(col)

print(f"Non-Stationary Features Found: {len(non_stationaries)}")

In [None]:
# Replace each non-stationary feature with its bar-to-bar percentage change
pct_changes = x[non_stationaries].pct_change()
df_stationary = x.copy()
df_stationary[non_stationaries] = pct_changes

# The first row has no previous bar to compare against, so it is discarded
df_stationary = df_stationary.iloc[1:]

In [None]:
# Find every COLUMN that still contains at least one NaN and remove it entirely
has_nan = df_stationary.isna().any()
na_list = df_stationary.columns[has_nan.tolist()]
df_stationary.drop(columns=na_list, inplace=True)

In [None]:
# Overwrite positive and negative infinity with 0 so the scaler receives finite values
infinite_values = [np.inf, -np.inf]
df_stationary.replace(to_replace=infinite_values, value=0, inplace=True)

# Preview the cleaned feature set
df_stationary.head()

In [None]:
# Standardise the features on a copy of the stationary frame.
# X_fs is the scaled array that feeds the PCA step.
df_sc = df_stationary.copy()
scaler = StandardScaler()
X_fs = scaler.fit(df_sc).transform(df_sc)

### Unsupervised ML - PCA Dimensionality Reduction

In [None]:
# Number of principal components to keep
n_components = 26

# A fixed random_state keeps the projection reproducible across runs: scikit-learn
# may select a randomized SVD solver for this input size, which would otherwise
# return slightly different components each time.
pca = PCA(n_components=n_components, random_state=42)

# Fit PCA on the scaled feature set 'X_fs' and project the data onto the components
X_pca = pca.fit_transform(X_fs)

In [None]:
# Report the share of variance captured by each principal component and in total
variance_ratios = pca.explained_variance_ratio_
total_variance_pct = round(sum(variance_ratios) * 100, 2)
print("Variance of each component: ", variance_ratios)
print("\n Total Variance Explained: ", total_variance_pct)

In [None]:
# Scree plot over ALL principal components of the scaled features `X_fs`.
# The diagnostic model lives in its own variable (`pca_full`) so that it does not
# overwrite `pca`, the fitted n_components model that produced `X_pca`.
# (PCA and plt come from the imports cell at the top of the notebook.)
pca_full = PCA()
pca_full.fit(X_fs)

# Per-component and cumulative explained variance ratios
explained_variance = pca_full.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()
component_numbers = range(1, len(explained_variance) + 1)

# Draw both series on one explicit Axes
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(component_numbers, explained_variance, alpha=0.5, align='center', label='Individual explained variance')
ax.step(component_numbers, cumulative_variance, where='mid', label='Cumulative explained variance')

# Labels, title and legend
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('Scree Plot')
ax.legend(loc='best')

# Turn the grid on explicitly (a bare grid() call toggles the current state)
ax.grid(True)

plt.show()

In [None]:
# One column label per retained principal component: PC_0, PC_1, ...
pca_cols = [f"PC_{i}" for i in range(n_components)]
pca_cols

In [None]:
# Wrap the projected data in a DataFrame with one named column per component
df_pca = pd.DataFrame(X_pca, columns=pca_cols)

# Only the last expression of a cell is rendered, so the tail is what gets displayed
df_pca.head()
df_pca.tail()

In [None]:
# Attach the target to the principal components.
# `df_stationary` (and therefore `X_pca` / `df_pca`) lost its first row when the
# percentage changes were computed, while `y` still has one label per original bar.
# Resetting both indices and assigning would pair the features of bar i+1 with the
# label of bar i. Select the labels by the surviving row index instead.
signal_aligned = y.loc[df_stationary.index]
if len(signal_aligned) != len(df_pca):
    raise ValueError(
        f"Target/feature length mismatch: {len(signal_aligned)} labels for {len(df_pca)} PCA rows"
    )

# Assign by position: the rows of df_pca are in the same order as df_stationary.
# `y` itself is left untouched so this cell can be re-run safely.
df_pca = df_pca.reset_index(drop=True)
df_pca['signal'] = signal_aligned.to_numpy()

In [None]:
# Print the first and last rows of the PCA frame, now including the 'signal' column
for preview in (df_pca.head(), df_pca.tail()):
    print(preview)

In [None]:
# Directory the augmented (PCA + signal) DataFrame is written to.
# Same folder name as `augmented_directory`, which holds the input pickle.
directory_path = 'Data Augmented'

In [None]:
# Output location of the augmented dataset.
# Note: this rebinds `file_path`, which earlier pointed at the input pickle.
file_path = os.path.join(directory_path, 'XAU_USD_M15_AUGMENTED_MULTICLASS.pkl')

# Write the PCA frame to disk and report either success or the failure reason
try:
    df_pca.to_pickle(file_path)
except Exception as save_error:
    print(f"An error occurred while saving the DataFrame: {save_error}")
else:
    print(f"DataFrame saved as a pickle file at: {file_path}")

In [None]:
# Read the saved pickle back and print both ends of it as a sanity check
try:
    df_from_pickle = pd.read_pickle(file_path)
    previews = (
        ("First few rows of the DataFrame:", df_from_pickle.head()),
        ("\nLast few rows of the DataFrame:", df_from_pickle.tail()),
    )
    for heading, rows in previews:
        print(heading)
        print(rows)
except Exception as load_error:
    print(f"An error occurred while loading the DataFrame: {load_error}")

In [None]:
# Location of the augmented dataset written earlier in the notebook
pickle_parts = ('Data Augmented', 'XAU_USD_M15_AUGMENTED_MULTICLASS.pkl')
pickle_file_path = os.path.join(*pickle_parts)

# Load the augmented dataset into `df` (replacing the earlier feature frame)
# and print its first and last rows to verify the load
try:
    df = pd.read_pickle(pickle_file_path)
    print("DataFrame loaded successfully from the pickle file.")
    for preview in (df.head(), df.tail()):
        print(preview)
except FileNotFoundError:
    print(f"The file {pickle_file_path} was not found.")
except Exception as load_error:
    print(f"An error occurred: {load_error}")