In [None]:
# import numpy as np
# import pandas as pd
# import gym
# from gym import spaces
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
# from tensorflow.keras.optimizers import Adam
# from collections import deque
# import random

# # [Previous ReplayBuffer and DQNAgent classes remain the same]

# class TeacherForcingEnv(gym.Env):
#     def __init__(self, model, scaler, X_train, y_train, time_step=24):
#         super(TeacherForcingEnv, self).__init__()
#         self.model = model
#         self.scaler = scaler
#         self.X_train = X_train  # Now expecting shape (n_sequences, time_step, 1)
#         self.y_train = y_train
#         self.time_step = time_step
#         self.current_step = 0
        
#         self.action_space = spaces.Discrete(2)
#         self.observation_space = spaces.Box(
#             low=-np.inf, high=np.inf, 
#             shape=(time_step, 1), 
#             dtype=np.float32
#         )
        
#         # Take the first sequence as initial state
#         self.state = self.X_train[0]

#     def reset(self):
#         self.current_step = 0
#         self.state = self.X_train[0]
#         return self.state

#     def step(self, action):
#         actual_value = self.y_train[self.current_step]
        
#         if action == 0:
#             next_input = actual_value
#         else:
#             # State is already in correct shape (time_step, 1)
#             state_reshaped = np.expand_dims(self.state, 0)  # Add batch dimension
#             next_input = self.model.predict(state_reshaped, verbose=0)[0, 0]
        
#         # Create next state with correct shape
#         next_state = np.roll(self.state, -1, axis=0)
#         next_state[-1, 0] = next_input
        
#         # Make prediction for reward calculation
#         state_reshaped = np.expand_dims(self.state, 0)
#         prediction = self.model.predict(state_reshaped, verbose=0)[0, 0]
#         reward = -abs(actual_value - prediction)
        
#         self.state = next_state
#         self.current_step += 1
#         done = self.current_step >= len(self.y_train)
        
#         return next_state, reward, done, {}

# def create_bilstm_model(time_step):
#     model = Sequential([
#         Bidirectional(LSTM(250, activation='relu', return_sequences=True, 
#                           input_shape=(time_step, 1))),
#         Dropout(0.2),
#         Bidirectional(LSTM(250, activation='relu', return_sequences=False)),
#         Dropout(0.2),
#         Dense(1)
#     ])
#     model.compile(optimizer='adam', loss='mean_squared_error')
#     return model

# def prepare_sequences(data, time_step):
#     """Prepare sequences for training with proper reshaping"""
#     sequences = []
#     targets = []
    
#     for i in range(len(data) - time_step):
#         sequences.append(data[i:(i + time_step)])
#         targets.append(data[i + time_step])
    
#     return np.array(sequences), np.array(targets)

# def train_bilstm_with_rl(X_train, y_train, scaler, time_step=24, 
#                         episodes=10, batch_size=64):
#     # Scale the data first
#     X_scaled = scaler.fit_transform(X_train.reshape(-1, 1))
    
#     # Prepare sequences
#     X_sequences, y_sequences = prepare_sequences(X_scaled, time_step)
    
#     # Reshape sequences for BiLSTM (samples, time_step, features)
#     X_sequences = X_sequences.reshape(-1, time_step, 1)
    
#     # Create models
#     bilstm_model = create_bilstm_model(time_step)
#     env = TeacherForcingEnv(bilstm_model, scaler, X_sequences, y_sequences, 
#                            time_step)
    
#     # Initialize DQN agent with flattened state shape
#     state_shape = (time_step,)  # Flattened shape for Dense network
#     agent = DQNAgent(state_shape, env.action_space.n)
    
#     # Training loop
#     total_steps = 0
#     for episode in range(episodes):
#         state = env.reset()
#         episode_reward = 0
#         done = False
        
#         while not done:
#             action = agent.get_action(state)
#             next_state, reward, done, _ = env.step(action)
            
#             agent.replay_buffer.push(state, action, reward, next_state, done)
            
#             if len(agent.replay_buffer) >= batch_size:
#                 agent.train(batch_size)
            
#             state = next_state
#             episode_reward += reward
#             total_steps += 1
            
#             if total_steps % 100 == 0:
#                 agent.update_target_network()
        
#         print(f"Episode {episode + 1}/{episodes}, "
#               f"Total Reward: {episode_reward:.2f}, "
#               f"Epsilon: {agent.epsilon:.2f}")
    
#     return bilstm_model

# # Example usage:
# """
# # Your data preparation
# X_train = your_time_series_data  # Original time series
# scaler = MinMaxScaler()
# time_step = 24

# # Train the model
# trained_model = train_bilstm_with_rl(X_train, X_train, scaler, time_step)
# """

'\n# Your data preparation\nX_train = your_time_series_data  # Original time series\nscaler = MinMaxScaler()\ntime_step = 24\n\n# Train the model\ntrained_model = train_bilstm_with_rl(X_train, X_train, scaler, time_step)\n'

In [None]:
import numpy as np
import gym
from gym import spaces
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout # type: ignore
# Import legacy Adam optimizer instead
from tensorflow.keras.optimizers.legacy import Adam # type: ignore
from collections import deque
import random

# ReplayBuffer stays the same
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    
    def __len__(self):
        return len(self.buffer)
    


class DQNAgent:
    # def __init__(self, state_shape, n_actions, learning_rate=1e-3, gamma=0.99, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995):
    def __init__(self, state_shape, n_actions, learning_rate=1e-1, gamma=0.99, epsilon=1.0):
        self.state_shape = state_shape
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        # self.epsilon_min = epsilon_min
        # self.epsilon_decay = epsilon_decay
        
        self.q_network = self._build_network(learning_rate)
        self.target_network = self._build_network(learning_rate)
        self.update_target_network()
        self.replay_buffer = ReplayBuffer(capacity=10000)
        
    def _build_network(self, learning_rate):
        model = Sequential([
            Dense(64, activation='relu', input_shape=self.state_shape),
            Dense(64, activation='relu'),
            Dense(self.n_actions, activation='linear')
        ])

        optimizer = Adam(learning_rate=0.001, clipvalue=1.0)

        # Use legacy Adam optimizer
        model.compile(optimizer=optimizer, loss='mse')
        return model
    
    def update_target_network(self):
        self.target_network.set_weights(self.q_network.get_weights())
    
    def get_action(self, state):
        state_flat = state.reshape(-1)
        if random.random() < self.epsilon:
            return random.randrange(self.n_actions)
        q_values = self.q_network.predict(state_flat[np.newaxis], verbose=0)
        return np.argmax(q_values[0])
    
    def train(self, batch_size):
        if len(self.replay_buffer) < batch_size:
            return
        
        batch = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        states = np.array([s.reshape(-1) for s in states])
        next_states = np.array([s.reshape(-1) for s in next_states])
        
        current_q_values = self.q_network.predict(states, verbose=0)
        next_q_values = self.target_network.predict(next_states, verbose=0)
        
        for i in range(batch_size):
            if dones[i]:
                current_q_values[i][actions[i]] = rewards[i]
            else:
                current_q_values[i][actions[i]] = rewards[i] + self.gamma * np.max(next_q_values[i])
        
        self.q_network.fit(states, current_q_values, batch_size=batch_size, verbose=0)
        # self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        # self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

class TeacherForcingEnv(gym.Env):
    def __init__(self, model, scaler, X_train, y_train, time_step=24):
        super(TeacherForcingEnv, self).__init__()
        self.model = model
        self.scaler = scaler
        self.X_train = X_train  # Now expecting shape (n_sequences, time_step, 1)
        self.y_train = y_train
        self.time_step = time_step
        self.current_step = 0
        
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, 
            shape=(time_step, 1), 
            dtype=np.float32
        )
        
        # Take the first sequence as initial state
        self.state = self.X_train[0]

    def reset(self):
        self.current_step = 0
        self.state = self.X_train[0]
        return self.state

    def step(self, action):
        actual_value = self.y_train[self.current_step]
        
        if action == 0:
            next_input = actual_value
        else:
            state_reshaped = np.expand_dims(self.state, 0)  # Add batch dimension
            next_input = self.model.predict(state_reshaped, verbose=0)[0, 0]
        
        # Create next state with correct shape
        next_state = np.roll(self.state, -1, axis=0)
        next_state[-1, 0] = next_input
        
        # Make prediction for reward calculation
        state_reshaped = np.expand_dims(self.state, 0)
        prediction = self.model.predict(state_reshaped, verbose=0)[0, 0]
        
        # Ensure reward is a scalar value
        reward = float(-abs(actual_value - prediction))  # Convert to float if needed
        
        self.state = next_state
        self.current_step += 1
        done = self.current_step >= len(self.y_train)
        
        return next_state, reward, done, {}

def create_bilstm_model(time_step):
    model = Sequential([
        Bidirectional(LSTM(200, activation='relu', return_sequences=True, 
                          input_shape=(time_step, 1))),
        Dropout(0.2),
        Bidirectional(LSTM(200, activation='relu', return_sequences=False)),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def prepare_sequences(data, time_step):
    """Prepare sequences for training with proper reshaping"""
    sequences = []
    targets = []
    
    for i in range(len(data) - time_step):
        sequences.append(data[i:(i + time_step)])
        targets.append(data[i + time_step])
    
    return np.array(sequences), np.array(targets)

def train_bilstm_with_rl(X_train, y_train, scaler, time_step=24, 
                        episodes=1000, batch_size=128):
    # Scale the data first
    X_scaled = scaler.fit_transform(X_train.reshape(-1, 1))
    
    # Prepare sequences
    X_sequences, y_sequences = prepare_sequences(X_scaled, time_step)
    
    # Reshape sequences for BiLSTM (samples, time_step, features)
    X_sequences = X_sequences.reshape(-1, time_step, 1)
    
    # Create models
    bilstm_model = create_bilstm_model(time_step)
    env = TeacherForcingEnv(bilstm_model, scaler, X_sequences, y_sequences, 
                           time_step)
    
    # Initialize DQN agent with flattened state shape
    state_shape = (time_step,)  # Flattened shape for Dense network
    agent = DQNAgent(state_shape, env.action_space.n)
    
    # Training loop
    total_steps = 0
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0.0  # Initialize as a float
        done = False
        
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            
            agent.replay_buffer.push(state, action, reward, next_state, done)
            
            if len(agent.replay_buffer) >= batch_size:
                agent.train(batch_size)
            
            state = next_state
            episode_reward += float(reward)  # This should now be a float
            total_steps += 1
            
            if total_steps % 100 == 0:
                agent.update_target_network()
        
        print(f"Episode {episode + 1}/{episodes}, "
            f"Total Reward: {episode_reward:.2f}, "
            f"Epsilon: {agent.epsilon:.2f}")
    
    return bilstm_model


In [37]:

# Training and Evaluation
time_step = 24
data = pd.read_csv('/Users/yashwanthkaruparthi/Documents/Acads/sem7/design project/execution/data/solar_weather copy 2.csv', header=0, infer_datetime_format=True, parse_dates=['Time'], index_col=['Time'])
data = data[(data.index.month.isin([5, 6, 7])) & (data.index.year == 2021)]
# dataset = data[['Energy delta[Wh]', 'GHI', 'temp', 'pressure', 'humidity']]
dataset = data[['Energy delta[Wh]', 'GHI']]
X = dataset.iloc[:, 1:].values  # Features
y = dataset.iloc[:, 0].values   # Target

# Split data into train and test sets (80% train, 20% test)
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
scaler = MinMaxScaler()

# Train model with RL-guided Teacher Forcing
trained_model = train_bilstm_with_rl(X_train, y_train, scaler, time_step)


  data = pd.read_csv('/Users/yashwanthkaruparthi/Documents/Acads/sem7/design project/execution/data/solar_weather copy 2.csv', header=0, infer_datetime_format=True, parse_dates=['Time'], index_col=['Time'])
  next_state[-1, 0] = next_input
  reward = float(-abs(actual_value - prediction))  # Convert to float if needed


Episode 1/1000, Total Reward: -1772.63, Epsilon: 1.00


KeyboardInterrupt: 