In [17]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Workspace

  # Load the workspace
ws = Workspace.from_config()

  # Create a new environment with required packages
env = Environment("env1-trainingDEC")
env.python.conda_dependencies = CondaDependencies.create(
    conda_packages=["scikit-learn", "pandas", "numpy", "pathlib"],
    pip_packages=["azureml-defaults", "tensorflow", "matplotlib"])


from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your compute target
compute_name = "computetarget1-trainingDEC"

try:
    # Check if the compute target already exists
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print("Found existing compute target.")
except ComputeTargetException:
    # If the compute target doesn't exist, create it
    print("Creating new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6",  # Change this to the VM size you want to use
        max_nodes=4  # Change this to the maximum number of nodes you want to use
    )
    compute_target = ComputeTarget.create(ws, compute_name, compute_config)
    compute_target.wait_for_completion(show_output=True)

  # Create a new run configuration
script_config = ScriptRunConfig(
    source_directory="./",
    script="test.ipynb",
    arguments=[],
    compute_target=compute_name,
    environment=env)

# Create a new experiment
experiment = Experiment(workspace=ws, name="experiment2-trainingDEC")

# Submit the run
run = experiment.submit(script_config)

Creating new compute target...
InProgress.......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [13]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

import numpy as np
import pandas as pd

np.random.seed(42)
idx = pd.IndexSlice


meta = pd.read_hdf('/home/groovyjac/projects/Autonomous-Portfolio-Management/main_data_store_JDKv1.h5',
                     'stocks/base_fundamentals')

print(meta.loc[("AAPL", "General", ["Type", "Exchange"])])
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

selected_features = [
    'Sector', 'GicSector', 'MarketCapitalization', 'DividendYield', 'PERatio',
    'Beta', 'Industry', 'GicGroup', 'GicIndustry', 'GicSubIndustry', 'BookValue',
    'EBITDA', 'RevenueTTM', 'GrossProfitTTM', 'OperatingMarginTTM', 'ReturnOnAssetsTTM',
    'ReturnOnEquityTTM', 'PriceSalesTTM', 'PriceBookMRQ', 'EnterpriseValue',
    'EnterpriseValueRevenue', 'EnterpriseValueEbitda', 'SharesOutstanding',
    'PercentInstitutions', 'PayoutRatio', '52WeekHigh', '52WeekLow', '50DayMA',
    '200DayMA', 'ForwardPE', 'SharesFloat', 'PercentInsiders',
    'ShortPercent', 'ForwardAnnualDividendRate', 'ForwardAnnualDividendYield',
    'TrailingPE', 'PriceEarningsRatio', 'DividendShare', 'EarningsShare',
    'EPSEstimateCurrentYear', 'EPSEstimateNextYear', 'EPSEstimateNextQuarter',
    'EPSEstimateCurrentQuarter', 'ProfitMargin',
    'QuarterlyRevenueGrowthYOY', 'DilutedEpsTTM', 'QuarterlyEarningsGrowthYOY'
]

df = pd.DataFrame(meta.loc[idx[:, :, 
selected_features]])

df = df.droplevel(1)[~df.droplevel(1).index.duplicated(keep='first')]

df = df.unstack(level=1)

df.columns = df.columns.droplevel(0)

df = df.drop(['Sector', 'GicSector', 'Industry',	'GicGroup', 'GicIndustry', 'GicSubIndustry'], axis=1)
df = df.astype('float64')
df.info()
# Check for missing values
print(df.isna().sum())
# Fill in missing values with the mean of the column
df = df.fillna(df.mean())
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)



AAPL  General  Type        Common Stock
               Exchange          NASDAQ
dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 503 entries, AAPL to NWS
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   MarketCapitalization        502 non-null    float64
 1   DividendYield               502 non-null    float64
 2   PERatio                     463 non-null    float64
 3   Beta                        499 non-null    float64
 4   BookValue                   501 non-null    float64
 5   EBITDA                      473 non-null    float64
 6   RevenueTTM                  503 non-null    float64
 7   GrossProfitTTM              503 non-null    float64
 8   OperatingMarginTTM          503 non-null    float64
 9   ReturnOnAssetsTTM           503 non-null    float64
 10  ReturnOnEquityTTM           503 non-null    float64
 11  PriceSalesTTM               503 non-null    float64
 12  

In [14]:
import tensorflow as tf
import pandas as pd
import numpy as np

# Define the model architecture
class DECModel(tf.keras.Model):
    def __init__(self, n_clusters, input_shape):
        super(DECModel, self).__init__()
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=input_shape),
            tf.keras.layers.Dense(500, activation='relu'),
            tf.keras.layers.Dense(500, activation='relu'),
            tf.keras.layers.Dense(2000, activation='relu'),
            tf.keras.layers.Dense(10, activation=None)
        ])
        
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(2000, activation='relu'),
            tf.keras.layers.Dense(500, activation='relu'),
            tf.keras.layers.Dense(500, activation='relu'),
            tf.keras.layers.Dense(input_shape, activation='linear')
        ])
        
        self.n_clusters = n_clusters
        
    def compile(self, optimizer, loss_fn):
        super(DECModel, self).compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        
    def call(self, inputs):
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded
        
    def cluster_layer(self, inputs):
        q = tf.reduce_sum(tf.square(tf.expand_dims(inputs, axis=1) - self.encoder.weights[-1]) , axis=2)
        q = 1.0 / (1.0 + q / self.n_clusters)
        q = q ** (2.0 / 3.0)
        q = tf.transpose(tf.transpose(q) / tf.reduce_sum(q, axis=1))
        return q

# Define the loss function for the model
def dec_loss(x, x_reconstructed, q):
    reconstruction_loss = tf.keras.losses.mse(x, x_reconstructed)
    clustering_loss = tf.keras.losses.KLD(q, q_target)
    return reconstruction_loss + clustering_loss

# Create a function to generate target distribution for clustering loss
def target_distribution(q):
    weight = q ** 2 / q.sum(0)
    return (weight.T / weight.sum(1)).T


In [15]:
# Load the data and scale it
data = df.copy()
scaled_data = (data - data.mean()) / data.std()

# Initialize the model and compile it with the optimizer and loss function
model = DECModel(n_clusters=10, input_shape=scaled_data.shape[1])
optimizer = tf.keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=optimizer, loss_fn=dec_loss)

# Create a TensorFlow Dataset object from your scaled dataset
batch_size = 32
ds = tf.data.Dataset.from_tensor_slices(scaled_data)
ds = ds.shuffle(buffer_size=len(scaled_data), reshuffle_each_iteration=True)
ds = ds.batch(batch_size=batch_size)

# Train the model
n_epochs = 100
for epoch in range(n_epochs):
    for batch in ds:
        with tf.GradientTape() as tape:
            x = batch
            x_reconstructed = model(x)
            q = model.cluster_layer(x)
            q_target = target_distribution(q)
            loss = dec_loss(x, x_reconstructed, q)

        gradients = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    print("Epoch {}: Loss {:.3f}".format(epoch+1, loss.numpy()))

# Save the model
model.save('dec_model')

2023-04-05 13:09:28.411857: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-05 13:09:28.561284: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-05 13:09:28.561367: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-05 13:09:28.568876: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in 

InvalidArgumentError: cannot compute Sub as input #1(zero-based) was expected to be a double tensor but is a float tensor [Op:Sub]