In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import requests
import time

# Portfolio environment with transaction costs and liquidity risk
class PortfolioEnvRealTime:
    """Buy-only portfolio environment priced from Alpha Vantage intraday quotes.

    Each call to :meth:`step` refreshes prices over the network, converts the
    positive entries of the action vector into purchases, and returns a
    reward equal to the change in portfolio value minus transaction and
    liquidity charges.

    Args:
        asset_symbols: Ticker symbols to track; also fixes the action length.
        initial_cash: Cash balance at construction and after every reset.
        max_assets: Stored on the instance; not used by any method here.
        transaction_cost: Proportional cost charged against the reward per
            unit of traded cash.
        liquidity_penalty: Proportional penalty charged against the reward
            per unit of traded cash.
        api_key: Alpha Vantage API key sent with every request.
        max_steps: Number of steps after which an episode reports done.
        request_timeout: Timeout in seconds for each HTTP request.

    Note:
        A price is ``None`` when its fetch failed, and that ``None`` is passed
        through into the state returned by :meth:`step` and :meth:`reset`.
    """

    def __init__(self, asset_symbols, initial_cash=100000, max_assets=10,
                 transaction_cost=0.001, liquidity_penalty=0.1,
                 api_key="YOUR_ALPHA_VANTAGE_API_KEY", max_steps=1000,
                 request_timeout=10):
        self.asset_symbols = asset_symbols
        # Remembered so reset() restores the configured balance.
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.max_assets = max_assets
        self.transaction_cost = transaction_cost
        self.liquidity_penalty = liquidity_penalty
        self.max_steps = max_steps
        self.request_timeout = request_timeout
        self.portfolio = {}
        self.current_step = 0
        self.done = False
        self.api_key = api_key
        self.asset_prices = {symbol: None for symbol in asset_symbols}
        self.total_value = initial_cash

    def _fetch_latest_close(self, base_url, symbol):
        """Return the most recent 1-minute close for *symbol*, or None on failure."""
        params = {
            "function": "TIME_SERIES_INTRADAY",
            "symbol": symbol,
            "interval": "1min",
            "apikey": self.api_key
        }
        try:
            response = requests.get(base_url, params=params,
                                    timeout=self.request_timeout)
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failures and non-JSON bodies are treated like any other
            # failed fetch instead of aborting the whole episode.
            print(f"Failed to fetch data for {symbol}: {exc}")
            return None

        series = data.get('Time Series (1min)') if isinstance(data, dict) else None
        if not series:
            print(f"Failed to fetch data for {symbol}: {data}")
            return None

        # Keys are timestamp strings (assumed 'YYYY-MM-DD HH:MM:SS', which
        # sort chronologically), so max() selects the latest bar regardless
        # of the order the API returned them in.
        last_refreshed = max(series)
        try:
            return float(series[last_refreshed]['4. close'])
        except (KeyError, TypeError, ValueError):
            print(f"Failed to fetch data for {symbol}: {data}")
            return None

    def fetch_real_time_data(self):
        """Fetches real-time data for each asset.

        Updates ``self.asset_prices`` in place; a symbol whose fetch fails is
        set to ``None``.
        """
        base_url = "https://www.alphavantage.co/query"
        for symbol in self.asset_symbols:
            self.asset_prices[symbol] = self._fetch_latest_close(base_url, symbol)
            # Pause between requests (presumably to respect the API's rate
            # limit -- confirm against the plan in use).
            time.sleep(12)

    def calculate_portfolio_value(self):
        """Return cash plus the market value of holdings with a known price."""
        return self.cash + sum(self.portfolio.get(symbol, 0) * self.asset_prices[symbol]
                               for symbol in self.asset_symbols if self.asset_prices[symbol] is not None)

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Sequence indexed in parallel with ``asset_symbols``; a
                positive entry is the fraction of the current cash balance
                to spend on that asset. Non-positive entries are ignored.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the list
            of asset prices followed by cash and portfolio value, and
            ``info`` is an empty dict.
        """
        self.fetch_real_time_data()
        # Count the step so the episode can actually terminate.
        self.current_step += 1
        transaction_cost = 0
        liquidity_penalty = 0

        # Simulate trades based on action
        for i, symbol in enumerate(self.asset_symbols):
            if action[i] > 0:
                asset_price = self.asset_prices.get(symbol, None)
                # Skip assets whose price is missing (or zero): no quantity
                # can be computed for them.
                if asset_price:
                    trade_volume = action[i] * self.cash
                    transaction_cost += trade_volume * self.transaction_cost
                    liquidity_penalty += trade_volume * self.liquidity_penalty
                    self.portfolio[symbol] = self.portfolio.get(symbol, 0) + (trade_volume / asset_price)
                    self.cash -= trade_volume

        portfolio_value = self.calculate_portfolio_value()
        reward = (portfolio_value - self.total_value) - (transaction_cost + liquidity_penalty)
        self.total_value = portfolio_value
        state = [self.asset_prices[symbol] for symbol in self.asset_symbols] + [self.cash, portfolio_value]
        self.done = self.current_step >= self.max_steps

        return state, reward, self.done, {}

    def reset(self):
        """Start a new episode and return the initial state.

        Restores the configured initial cash, clears holdings and episode
        bookkeeping, then refreshes prices.
        """
        self.cash = self.initial_cash
        self.portfolio = {}
        self.current_step = 0
        self.done = False
        # With no holdings the portfolio is worth exactly the cash balance;
        # resetting it keeps the first reward of the new episode from being
        # measured against the previous episode's final value.
        self.total_value = self.initial_cash
        self.fetch_real_time_data()
        return [self.asset_prices[symbol] for symbol in self.asset_symbols] + [self.cash, self.total_value]

# PPO Policy and Value networks
class PolicyNetwork(nn.Module):
    """MLP mapping a state vector to a probability distribution over actions.

    Two ReLU hidden layers (64 then 32 units) feed a linear output layer
    whose activations are normalised with a softmax over the last dimension.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = (input_dim, 64, 32)
        stack = []
        # Hidden layers: each linear projection is followed by a ReLU.
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Output head: one probability per action.
        stack.append(nn.Linear(widths[-1], output_dim))
        stack.append(nn.Softmax(dim=-1))
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the action probabilities for input ``x``."""
        probabilities = self.fc(x)
        return probabilities

class ValueNetwork(nn.Module):
    """MLP mapping a state vector to a single scalar value estimate.

    Two ReLU hidden layers (64 then 32 units) feed a linear output of
    width one.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 64, 32)
        stack = []
        # Hidden layers: each linear projection is followed by a ReLU.
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Output head: unbounded scalar, no activation.
        stack.append(nn.Linear(widths[-1], 1))
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the value estimate for input ``x``."""
        estimate = self.fc(x)
        return estimate

# PPO Agent
class PPOAgent:
    """Clipped-surrogate policy-gradient agent with separate policy and value nets.

    Args:
        input_dim: Length of the state vector fed to both networks.
        output_dim: Number of discrete actions the policy chooses between.
        lr: Learning rate used by both Adam optimizers.
        gamma: Discount factor applied when computing returns.
        epsilon: Half-width of the clipping interval around a probability
            ratio of 1.
    """

    def __init__(self, input_dim, output_dim, lr=0.001, gamma=0.99, epsilon=0.2):
        self.policy_net = PolicyNetwork(input_dim, output_dim)
        self.value_net = ValueNetwork(input_dim)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon

    def select_action(self, state):
        """Sample an action for *state*.

        Returns:
            Tuple ``(action, prob)``: the sampled action index as an int and
            the probability the current policy assigned to it, as a 0-dim
            tensor that carries no autograd history.
        """
        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        # Sampling is not part of the loss; running it without gradient
        # tracking keeps the stored "old" probability a plain constant and
        # avoids retaining one autograd graph per environment step.
        with torch.no_grad():
            action_probs = self.policy_net(state_tensor)
        action = torch.distributions.Categorical(action_probs).sample()
        return action.item(), action_probs[action]

    def compute_returns(self, rewards, dones, last_value):
        """Compute discounted returns for a trajectory.

        Args:
            rewards: Per-step rewards, in time order.
            dones: Per-step termination flags; a truthy flag stops the
                return from bootstrapping past that step.
            last_value: Value used to bootstrap after the final step.

        Returns:
            A float32 tensor with one return per step.
        """
        returns = []
        R = last_value
        for r, d in zip(reversed(rewards), reversed(dones)):
            not_done = 0.0 if d else 1.0
            R = r + self.gamma * R * not_done
            returns.append(R)
        # Built back-to-front with O(1) appends, then flipped into time order.
        returns.reverse()
        # Explicit dtype so the returns always match the networks' float32
        # outputs, whatever numeric type the rewards arrive in.
        return torch.tensor(returns, dtype=torch.float32)

    def update(self, states, actions, rewards, dones, old_action_probs):
        """Run five epochs of clipped-surrogate updates on one trajectory.

        Args:
            states: Sequence of state vectors.
            actions: Sequence of action indices taken in those states.
            rewards: Sequence of rewards received.
            dones: Sequence of termination flags.
            old_action_probs: Probability of each taken action under the
                policy that collected the data (floats or 0-dim tensors).

        Does nothing when the trajectory is empty.
        """
        if len(states) == 0:
            return

        returns = self.compute_returns(rewards, dones, 0)
        states_tensor = torch.as_tensor(states, dtype=torch.float32)
        actions_tensor = torch.as_tensor(actions, dtype=torch.long)
        # float() accepts both plain numbers and 0-dim tensors and drops any
        # autograd history the caller may have kept.
        old_action_probs_tensor = torch.tensor(
            [float(p) for p in old_action_probs], dtype=torch.float32
        )

        for _ in range(5):  # Multiple epochs
            action_probs = self.policy_net(states_tensor).gather(1, actions_tensor.unsqueeze(-1)).squeeze(-1)
            ratio = action_probs / old_action_probs_tensor
            values = self.value_net(states_tensor).squeeze(-1)
            # The advantage is a fixed weight for the policy loss; detaching
            # stops the policy loss from back-propagating into the value net.
            advantage = (returns - values).detach()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantage
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = nn.functional.mse_loss(values, returns)

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

# Example usage
env = PortfolioEnvRealTime(asset_symbols=['AAPL', 'GOOGL', 'TSLA'], initial_cash=100000)
agent = PPOAgent(input_dim=len(env.asset_symbols) + 2, output_dim=len(env.asset_symbols))

num_episodes = 100
# Fraction of the current cash balance spent on the asset the policy picks
# at each step. This is a tunable choice, not something the environment
# prescribes.
trade_fraction = 0.1
# Upper bound on steps collected per episode, so the rollout is guaranteed
# to end even if the environment never reports done.
max_steps_per_episode = 1000

for episode in range(num_episodes):
    state = env.reset()
    episode_reward = 0

    states, actions, rewards, dones, old_action_probs = [], [], [], [], []

    for _ in range(max_steps_per_episode):
        action, action_prob = agent.select_action(state)

        # The agent samples a single asset index, whereas env.step() reads one
        # entry per asset symbol. Expand the index into a per-asset
        # allocation vector that buys only the chosen asset.
        allocation = [0.0] * len(env.asset_symbols)
        allocation[action] = trade_fraction
        next_state, reward, done, _ = env.step(allocation)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        # Store a plain float so no autograd graph is kept per step.
        old_action_probs.append(float(action_prob))

        state = next_state
        episode_reward += reward

        if done:
            break

    agent.update(states, actions, rewards, dones, old_action_probs)
    print(f"Episode {episode+1}: Total Reward = {episode_reward}")
