In [1]:
import os
import sys
import numpy as np
import pandas as pd

np.random.seed(42)  # Seed NumPy's global RNG for reproducibility

# Remember the directory the kernel started in. The guard keeps the value from
# the first execution, so re-running this cell does not climb one directory
# higher each time.
if 'original_directory' not in globals():
    original_directory = os.getcwd()

# Always start from the original directory, then move to its parent directory
# (presumably the project root -- see the path printed below).
os.chdir(original_directory)
os.chdir("../")

# Verify the current working directory
print("Working directory set to:", os.getcwd())

# Make the packages in the working directory importable. Skip the append when
# the path is already registered, so repeated executions of this cell do not
# accumulate duplicate sys.path entries.
project_root = os.path.abspath(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)


Working directory set to: c:\Users\Biebert\OneDrive - Universität St.Gallen\Dokumente\OneDrive Dokumente\02_Bildung\01_BVWL Bachelor\01_Kurse\07_Thesis\Code\Portfolio_Optimization_DDPG


# Get Tickers

In [None]:
from data_scripts import generate_data as gd
# Get the list of S&P 500 stocks

overview_path = "data/stock_overview.csv"

# Both branches must leave `liquid_stocks_df` defined: the selection below
# reads it whether the overview was just built or loaded from the cache.
if not os.path.exists(overview_path):
    # No cached overview yet: build it and cache it for later runs.
    sp500_tickers = gd.get_sp500_stocks()
    data_dict = gd.filter_liquid_stocks(sp500_tickers)
    liquid_stocks_df = pd.DataFrame.from_dict(data_dict, orient='index').transpose()
    liquid_stocks_df.to_csv(overview_path, index=False)
else:
    liquid_stocks_df = pd.read_csv(overview_path)

valid_stocks = liquid_stocks_df['Liquid Stocks'].dropna()

# Seed immediately before the draw so the sample does not depend on whether
# the (possibly RNG-consuming) build branch above was executed.
np.random.seed(4)
tickers = valid_stocks.sample(5).tolist()
tickers

In [121]:
from data_scripts import plotting as pl

# Call the project's plotting helper for the selected tickers over the
# 2000-01-01 .. 2024-01-01 window.
pl.plot_original_stock(
    tickers=tickers,
    start_date="2000-01-01",
    end_date="2024-01-01",
)


# Simulate

In [27]:
# Simulation settings read by the cells below: how many simulated paths are
# generated per stock, and how many days each path covers.
num_simulation_episodes, num_simulation_days = 100, 750

# Fixed set of tickers used from here on.
tickers = [
    'MSFT',
    'TGT',
    'QCOM',
    'MU',
    'CAT',
]

from data_scripts import generate_data as gd
from data_scripts import plotting as pl


In [None]:
from data_scripts import generate_data as gd
# Fixed set of tickers to download.
tickers = [
    'MSFT',
    'TGT',
    'QCOM',
    'MU',
    'CAT',
]

# Download the data for 2000-01-01 .. 2023-12-31 and derive log returns.
data = gd.download_data(
    tickers,
    "2000-01-01",
    "2023-12-31",
)
log_returns = gd.create_log_return(data=data)

# Split the log returns into a train and a test part.
train, test = gd.stock_train_test_split(returns=log_returns)

# Save both parts so later cells can reload them without downloading again.
for split_frame, csv_path in (
    (train, "data/train_data.csv"),
    (test, "data/test_data.csv"),
):
    split_frame.to_csv(csv_path)

In [None]:
# Reload the saved train/test splits, using the first CSV column as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train_data.csv", "data/test_data.csv")
)

In [30]:
import pandas as pd
import numpy as np
from arch import arch_model
import yfinance as yf
from sklearn.model_selection import train_test_split
import pickle


def download_data(tickers, start_date, end_date):
    """Download the 'Adj Close' prices for ``tickers`` via ``yf.download``.

    Args:
        tickers: Ticker symbol(s) forwarded to ``yf.download``.
        start_date: Start of the download window (forwarded as ``start``).
        end_date: End of the download window (forwarded as ``end``).

    Returns:
        The ``'Adj Close'`` selection of the frame returned by ``yf.download``.
    """
    # Ask for unadjusted data explicitly instead of relying on the library
    # default: the 'Adj Close' column is only present when auto-adjustment is
    # off, and recent yfinance releases default to auto_adjust=True, which
    # would make the lookup below fail with a KeyError.
    prices = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)
    return prices['Adj Close']

def create_log_return(data):
    """Compute log returns between consecutive rows of ``data``.

    Each value is ``log(data[t] / data[t-1])``. Entries with missing values
    are dropped, which includes the first row because it has no predecessor.

    Args:
        data: pandas Series or DataFrame of prices.

    Returns:
        Object of the same type as ``data`` holding the log returns.
    """
    price_ratio = data.div(data.shift(1))
    return np.log(price_ratio).dropna()

def save_data_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV through its ``to_csv`` method.

    Args:
        data: Object exposing ``to_csv`` (e.g. a pandas DataFrame).
        filename: Destination path.
    """
    data.to_csv(filename)
    return None

def load_data_csv(filename):
    """Read a CSV file into a DataFrame, using its first column as the index.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(filename, index_col=0)``.
    """
    return pd.read_csv(filename, index_col=0)

def stock_train_test_split(returns, test_size=0.2):
    """Split ``returns`` into a train part and a test part without shuffling.

    The split is delegated to ``train_test_split`` with ``shuffle=False``.

    Args:
        returns: Data to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    train_part, test_part = train_test_split(
        returns,
        shuffle=False,
        test_size=test_size,
    )
    return train_part, test_part


def run_garch(data, p, q, dist = 'normal', save = False):
    """Build and fit an ``arch_model`` on ``data`` scaled by 100.

    Args:
        data: Return series; its ``name`` is used in the file name when
            ``save`` is true, so it must be a string in that case.
        p: Forwarded to ``arch_model`` as ``p``.
        q: Forwarded to ``arch_model`` as ``q``.
        dist: Forwarded to ``arch_model`` as ``dist``.
        save: When true, pickle ``[model, fit result]`` to
            ``saved_models/garch_and_fit_<data.name>.pkl``.

    Returns:
        Tuple ``(model, fit result)``.
    """
    model = arch_model(data * 100, p=p, q=q, dist=dist)
    fit_result = model.fit(disp="off")

    if not save:
        return model, fit_result

    target_path = "saved_models/garch_and_fit_" + data.name + ".pkl"
    save_garch_model([model, fit_result], target_path)
    return model, fit_result


def run_simulation(garch, garch_parameters, num_simulation_days):
    """Run one simulation on ``garch`` and return its ``'data'`` entry.

    Args:
        garch: Object exposing ``simulate(parameters, num_days)``.
        garch_parameters: Parameters forwarded to ``simulate``.
        num_simulation_days: Number of observations forwarded to ``simulate``.

    Returns:
        The ``'data'`` item of the simulation result.
    """
    return garch.simulate(garch_parameters, num_simulation_days)['data']

def simulate_many_episodes(data, p, q, dist='normal', num_days = 750, num_simulation_episodes = 50, test_size = 0.2,
                            initial_price = 100, load_model = False, save = False):
    """Simulate price paths from a GARCH fit and split them into train/test sets.

    Each simulated series is divided by 100, converted to simple returns and
    compounded from ``initial_price``. The initial price is prepended, so
    every path has ``num_days + 1`` rows. The first
    ``int(num_simulation_episodes * (1 - test_size))`` paths become training
    columns and the remaining ones test columns. Both frames are written to
    ``data/sim_train_<data.name>.csv`` and ``data/sim_test_<data.name>.csv``.

    Args:
        data: Return series; its string ``name`` is used in the file names.
        p, q, dist: Forwarded to ``run_garch`` when a model is fitted.
        num_days: Number of simulated days per path.
        num_simulation_episodes: Total number of paths to simulate.
        test_size: Fraction of the paths assigned to the test frame.
        initial_price: Starting price of every path.
        load_model: When true, load the pickled ``[model, fit result]`` pair
            from ``saved_models/garch_and_fit_<data.name>.pkl`` instead of
            fitting a new model.
        save: Forwarded to ``run_garch`` (pickle the freshly fitted model).

    Returns:
        Tuple ``(sim_df_train, sim_df_test)`` of DataFrames with one column
        per simulated path.
    """
    train_simulations = int(num_simulation_episodes * (1 - test_size))
    test_simulations = num_simulation_episodes - train_simulations

    sim_df_train = pd.DataFrame(index=range(num_days + 1), columns=range(train_simulations))
    sim_df_test = pd.DataFrame(index=range(num_days + 1), columns=range(test_simulations))

    if load_model:
        # load_garch_model takes only the file name and returns the saved
        # [model, fit result] pair. The previous call also passed the
        # not-yet-assigned locals `garch` and `res`, so this branch always failed.
        garch, res = load_garch_model("saved_models/garch_and_fit_" + data.name + ".pkl")
    else:
        garch, res = run_garch(data, p, q, dist, save)

    for i in range(num_simulation_episodes):
        simulation = run_simulation(garch, res.params, num_days)
        simulated_log_returns = simulation / 100  # undo the x100 scaling applied before fitting
        simulated_returns = np.exp(simulated_log_returns) - 1
        simulated_prices = initial_price * (1 + simulated_returns).cumprod()
        simulated_prices_with_initial = pd.concat([pd.Series([initial_price]), pd.Series(simulated_prices)], ignore_index=True)

        # Split the simulations into training and testing sets
        if i < train_simulations:
            sim_df_train[i] = simulated_prices_with_initial
        else:
            sim_df_test[i - train_simulations] = simulated_prices_with_initial

    # Write the finished frames once, after the loop, instead of rewriting
    # both CSV files after every simulated path.
    save_data_csv(sim_df_train, "data/sim_train_" + data.name + ".csv")
    save_data_csv(sim_df_test, "data/sim_test_" + data.name + ".csv")

    return sim_df_train, sim_df_test


def save_garch_model(fitted_models, filename):
    """Pickle ``fitted_models`` into the file ``filename``.

    Args:
        fitted_models: Object to serialise.
        filename: Destination path; the file is opened in binary write mode.
    """
    with open(filename, 'wb') as handle:
        handle.write(pickle.dumps(fitted_models))

def load_garch_model(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    project wrote itself.

    Args:
        filename: Path of the pickle file; opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as handle:
        return pickle.loads(handle.read())

In [31]:
import pandas as pd
import os

def load_simulation_data(tickers, folder_path="data"):
    """Load the simulated train/test CSV files for each ticker.

    For every ticker, ``sim_train_<ticker>.csv`` and ``sim_test_<ticker>.csv``
    are looked up in ``folder_path``. A ticker is only loaded when both files
    exist; otherwise a message is printed and the ticker is skipped.

    Args:
        tickers: Iterable of ticker symbols.
        folder_path: Directory containing the simulation CSV files. Defaults
            to ``"data"``, the location that was previously hard-coded, so the
            function can be called with ``tickers`` alone.

    Returns:
        Dict mapping each loaded ticker to ``{'train': DataFrame,
        'test': DataFrame}``, both read with the first column as the index.
    """
    stock_data_dict = {}
    for ticker in tickers:
        # Load training and testing data for each ticker
        train_file = os.path.join(folder_path, f'sim_train_{ticker}.csv')
        test_file = os.path.join(folder_path, f'sim_test_{ticker}.csv')
        if os.path.exists(train_file) and os.path.exists(test_file):
            train_data = pd.read_csv(train_file, index_col=0)
            test_data = pd.read_csv(test_file, index_col=0)
            stock_data_dict[ticker] = {'train': train_data, 'test': test_data}
        else:
            print(f"File not found for ticker: {ticker}")
    return stock_data_dict


def get_combined_simulation(stock_data_dict, simulation_index, set_type='train'):
    """Combine one simulation column from every stock into a single array.

    Args:
        stock_data_dict: Dict mapping a stock to a dict of DataFrames keyed by
            set type (e.g. ``'train'`` / ``'test'``).
        simulation_index: Positional index of the column to take from each
            stock's DataFrame.
        set_type: Which DataFrame of each stock to read.

    Returns:
        NumPy array with one column per stock (in dict order) and one row per
        row of the selected simulation column.
    """
    per_stock_paths = [
        splits[set_type].iloc[:, simulation_index].values
        for splits in stock_data_dict.values()
    ]
    return np.array(per_stock_paths).T


In [None]:
# For every stock column in the training returns: simulate episodes with
# p=1, q=1 (the fitted model is saved via save=True), then plot the resulting
# training and testing simulations.
for stock in train.columns:
    print(f"Simulating data for {stock}...")
    sim_df_train, sim_df_test = gd.simulate_many_episodes(
        train[stock],
        1,
        1,
        dist='normal',
        num_days=num_simulation_days,
        num_simulation_episodes=num_simulation_episodes,
        test_size=0.2,
        initial_price=100,
        load_model=False,
        save=True,
    )

    print(f"Simulated data for {stock} generated successfully!")
    for sim_frame, plot_title in (
        (sim_df_train, "Training Simulations"),
        (sim_df_test, "Testing Simulations"),
    ):
        pl.plot_simulations(sim_frame, title=plot_title)



In [5]:
# Load the pickled GARCH model saved for MSFT and display it.
model = gd.load_garch_model(
    "saved_models/garch_and_fit_MSFT.pkl"
)
model

[Constant Mean(constant: yes, no. of exog: 0, volatility: GARCH(p: 1, q: 1), distribution: Normal distribution), id: 0x1cf78f66b90,
                      Constant Mean - GARCH Model Results                      
 Dep. Variable:                   MSFT   R-squared:                       0.000
 Mean Model:             Constant Mean   Adj. R-squared:                  0.000
 Vol Model:                      GARCH   Log-Likelihood:               -9574.85
 Distribution:                  Normal   AIC:                           19157.7
 Method:            Maximum Likelihood   BIC:                           19183.8
                                         No. Observations:                 5030
 Date:                Tue, Oct 22 2024   Df Residuals:                     5029
 Time:                        23:38:05   Df Model:                            1
                                 Mean Model                                
                  coef    std err          t      P>|t|    95.0% Conf. I

# Continue: load and inspect the saved simulations


In [2]:
import pandas as pd
import os 
import matplotlib.pyplot as plt
from data_scripts import generate_data as gd
from data_scripts import plotting as pl

# Fixed set of tickers whose simulations are loaded.
tickers = [
    'MSFT',
    'TGT',
    'QCOM',
    'MU',
    'CAT',
]

# Load the simulation data for the selected stocks.
stock_data_dict = gd.load_simulation_data(tickers)

# Combine training simulation 18 of every stock and plot it.
combined_train_data = gd.get_combined_simulation(
    stock_data_dict,
    simulation_index=18,
    set_type='train',
)
pl.plot_one_combined_simulation(combined_train_data, tickers, 'train')


# Model: train the DDPG agent

In [None]:
import random
import numpy as np
from trading_envs.trading_env import TradingEnv
from models.ddpg_agent import Agent


# Load simulation data for the selected stocks
stock_data_dict = gd.load_simulation_data(tickers)

# Train/test bookkeeping: same arithmetic as the simulation step, so
# `train_simulations` equals the number of training simulations per stock.
test_size = 0.2
train_simulations = int(num_simulation_episodes * (1 - test_size))
test_simulations = num_simulation_episodes - train_simulations

# One training episode per training simulation.
num_episodes = train_simulations
max_steps_per_episode = num_simulation_days  # NOTE(review): currently not enforced by the loop below
training_performance = []
batch_size = 64
replay_start_size = 100  # NOTE(review): currently unused

# The first training simulation is only needed to build an environment whose
# observation/action spaces define the agent's input and output sizes.
train_simulation_index = 0
train_simulation_data = gd.get_combined_simulation(stock_data_dict, simulation_index=train_simulation_index, set_type='train')
# Keep the earlier name for simulation 0 without loading the same data twice.
combined_train_data = train_simulation_data

env = TradingEnv(stock_data=train_simulation_data)

# Use the `batch_size` variable instead of repeating the literal, so changing
# it above actually reaches the agent.
agent = Agent(alpha=0.0001, beta=0.001, input_dims=[env.observation_space.shape[0]],
              tau=0.001, env=env, batch_size=batch_size, layer1_size=400, layer2_size=300,
              n_actions=env.action_space.shape[0])

scores = []

# One list of portfolio values per episode.
wealth = []

# Training loop: a single pass over the training simulations
for episode in range(num_episodes):
    episode_wealth = []

    train_simulation_index = episode
    print(f"Training on simulation {train_simulation_index + 1} of {num_episodes}")
    train_simulation_data = gd.get_combined_simulation(stock_data_dict, simulation_index=train_simulation_index, set_type='train')

    # Set the new environment with this training data
    env = TradingEnv(stock_data=train_simulation_data)

    # Reset the environment and start a new episode
    state, _ = env.reset()
    done = False
    score = 0

    while not done:
        action = agent.choose_action(state)
        # Assumes the Gymnasium step signature
        # (obs, reward, terminated, truncated, info) -- TODO confirm against
        # TradingEnv. The fourth value used to be discarded, so a truncated
        # episode would never have ended this loop.
        state_, reward, terminated, truncated, _ = env.step(action)
        # The replay memory keeps receiving the termination flag only, as before.
        agent.remember(state, action, reward, state_, terminated)
        agent.learn()
        score += reward
        state = state_
        done = terminated or truncated

        episode_wealth.append(env.get_portfolio_value())

        env.render()

    wealth.append(episode_wealth)
    training_performance.append(score)

# TODO: save the model (not implemented in this cell)

import matplotlib.pyplot as plt
# Plot the training performance (sum of rewards per episode)
plt.plot(training_performance)
plt.title("Training Performance")
plt.xlabel("Episode")
plt.ylabel("Total Reward");


In [None]:
# Plot the different wealth trajectories
plt.figure(figsize=(10, 6))

# Iterate over the recorded trajectories themselves (capped at num_episodes)
# so a shorter `wealth` list, e.g. after an interrupted training run, does not
# raise an IndexError.
for episode_number, trajectory in enumerate(wealth[:num_episodes], start=1):
    plt.plot(trajectory, label=f"Episode {episode_number}")

# Title, axis labels and legend apply to the whole figure, so set them once
# after all lines are drawn instead of on every loop iteration.
plt.title("Wealth Trajectories for Training Episodes")
plt.xlabel("Days")
plt.ylabel("Portfolio Value")
plt.legend()

In [None]:
# Load the simulation data for the selected stocks.
stock_data_dict = gd.load_simulation_data(tickers)

# Combine training simulation 7 of every stock and plot it.
combined_train_data = gd.get_combined_simulation(
    stock_data_dict,
    simulation_index=7,
    set_type='train',
)
pl.plot_one_combined_simulation(combined_train_data, tickers, 'train')