### Outline

Q-Learning grid example

DQN (CartPole)

ANN (Iris classification)

CNN (MNIST)

LSTM (forward pass on a synthetic random sequence)

Transformer (DistilBART summarization)

Rich visualizations

Final interactive dashboard

In [1]:
# reinforcement_intro.ipynb

# Reinforcement Learning & Deep Learning Introduction

"""
This notebook covers key concepts and implementations for:
- Reinforcement Learning: Q-Learning, Deep Q Networks (DQN)
- Deep Learning: ANN, CNN, RNN/LSTM, Transformers (e.g., BERT)
"""

# -----------------------------
# 1. Q-Learning (Grid World)
# -----------------------------

import numpy as np
import matplotlib.pyplot as plt

# Environment: a square grid whose only rewarding cell is the far corner.
grid_size = 5
rewards = np.zeros(shape=(grid_size, grid_size))
rewards[grid_size - 1, grid_size - 1] = 10  # Goal state

# One Q-value per (x, y, action); the action index follows the order of `actions`.
actions = ['up', 'down', 'left', 'right']
q_table = np.zeros(shape=(grid_size, grid_size, len(actions)))

# Hyperparameters: exploration probability, learning rate, discount factor.
epsilon = 0.9
alpha = 0.1
gamma = 0.9


def move(state, action, size=None):
    """Return the grid cell reached by taking `action` from `state`.

    Args:
        state: `(x, y)` pair of grid indices.
        action: 0 decrements x ('up'), 1 increments x ('down'),
            2 decrements y ('left'), 3 increments y ('right').
            Any other value leaves the position unchanged.
        size: Side length of the square grid. Defaults to the module-level
            `grid_size`, so existing two-argument calls behave as before.

    Returns:
        The new `(x, y)` tuple. A move that would leave the grid is a no-op.
    """
    if size is None:
        size = grid_size
    last = size - 1  # largest valid index along either axis
    x, y = state
    if action == 0 and x > 0:
        x -= 1
    elif action == 1 and x < last:
        x += 1
    elif action == 2 and y > 0:
        y -= 1
    elif action == 3 and y < last:
        y += 1
    return (x, y)

# Training: tabular Q-learning with an epsilon-greedy behaviour policy.
for _ in range(500):
    state = (0, 0)  # each episode starts in the corner opposite the goal
    while state != (4, 4):
        # Draw the exploration coin first, then pick either a uniformly
        # random action or the currently greedy one.
        if np.random.rand() < epsilon:
            action = np.random.choice(4)
        else:
            action = np.argmax(q_table[state])

        new_state = move(state, action)
        reward = rewards[new_state]

        # Temporal-difference step toward reward + discounted best next value.
        best_next = np.max(q_table[new_state])
        q_values = q_table[state]  # a view, so the write below lands in q_table
        q_values[action] += alpha * (reward + gamma * best_next - q_values[action])

        state = new_state



In [5]:
import numpy
import scipy
import sklearn
# Report the versions of the numerical stack the notebook is running against.
for label, module in (("NumPy", numpy), ("SciPy", scipy), ("sklearn", sklearn)):
    print(f"{label}: {module.__version__}")


NumPy: 1.24.4
SciPy: 1.16.0
sklearn: 1.2.2


In [2]:
# -----------------------------
# 2. Deep Q-Network (CartPole)
# -----------------------------

import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Create the CartPole-v1 environment with rendering disabled.
env = gym.make('CartPole-v1', render_mode=None)
# Length of the observation vector; used as the Q-network's input width.
n_states = env.observation_space.shape[0]
# Number of discrete actions; used as the Q-network's output width.
n_actions = env.action_space.n

class DQN(nn.Module):
    """Fully connected Q-network: one observation in, one Q-value per action out.

    Args:
        state_dim: Size of the observation vector. Defaults to the
            module-level `n_states`.
        action_dim: Number of discrete actions. Defaults to the module-level
            `n_actions`.
    """

    def __init__(self, state_dim=None, action_dim=None):
        super().__init__()
        # Fall back to the notebook globals so a bare `DQN()` keeps working.
        if state_dim is None:
            state_dim = n_states
        if action_dim is None:
            action_dim = n_actions
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, action_dim)
        )

    def forward(self, x):
        """Return Q-values whose last dimension has one entry per action."""
        return self.net(x)

model = DQN()
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
memory = deque(maxlen=10000)  # replay buffer; the oldest transitions are evicted first
epsilon, gamma, batch_size = 1.0, 0.99, 64

for episode in range(100):
    state = torch.tensor(env.reset()[0], dtype=torch.float32)
    done = False
    while not done:
        # Epsilon-greedy action selection. Acting needs no autograd graph.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                action = torch.argmax(model(state)).item()

        new_state, reward, terminated, truncated, _ = env.step(action)
        # An episode is over when the task fails (terminated) OR the time
        # limit is hit (truncated); ignoring `truncated` lets the loop keep
        # stepping an environment that has already finished.
        done = terminated or truncated
        next_state = torch.tensor(new_state, dtype=torch.float32)
        # Store `terminated`, not `done`: a time-limit cut-off is not a
        # terminal state, so its target should still bootstrap.
        memory.append((state, action, reward, next_state, bool(terminated)))
        state = next_state

        if len(memory) >= batch_size:
            batch = random.sample(memory, batch_size)
            # The `batch_` prefix keeps these tensors from overwriting the
            # Q-learning cell's `rewards` array and `actions` list.
            (batch_states, batch_actions, batch_rewards,
             batch_next_states, batch_terminals) = zip(*batch)
            batch_states = torch.stack(batch_states)
            batch_next_states = torch.stack(batch_next_states)
            batch_actions = torch.tensor(batch_actions, dtype=torch.int64)
            batch_rewards = torch.tensor(batch_rewards, dtype=torch.float32)
            batch_terminals = torch.tensor(batch_terminals, dtype=torch.bool)

            # Q(s, a) for the actions actually taken. squeeze(1) removes only
            # the gathered column, whatever the batch size.
            q_vals = model(batch_states).gather(1, batch_actions.unsqueeze(1)).squeeze(1)

            # Bootstrapped targets are constants for the loss, so build them
            # without tracking gradients.
            with torch.no_grad():
                next_q = model(batch_next_states).max(1)[0]
                targets = batch_rewards + gamma * next_q * (~batch_terminals)

            loss = loss_fn(q_vals, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # Decay exploration once per episode.
    epsilon *= 0.995



  if not isinstance(terminated, (bool, np.bool8)):


In [None]:
# -----------------------------
# 3. ANN (Iris Dataset)
# -----------------------------

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

X, y = load_iris(return_X_y=True)

# Split BEFORE scaling: fitting the scaler on all rows would leak test-set
# statistics into training. Fixed seeds make the reported accuracy repeatable,
# and stratifying keeps the class balance equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the mean and variance from the training rows only, then apply to both.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=1000, random_state=42)
model.fit(X_train, y_train)
print("ANN Accuracy:", accuracy_score(y_test, model.predict(X_test)))



In [None]:
# -----------------------------
# 4. CNN (MNIST One Batch)
# -----------------------------

import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader

# Convert each image to a tensor as it is read from the dataset.
transform = transforms.ToTensor()

# Training split of MNIST, downloaded into ./data when not already present.
trainset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Shuffled mini-batches of 64 samples.
loader = DataLoader(dataset=trainset, batch_size=64, shuffle=True)

class CNN(nn.Module):
    """Small convolutional classifier: two conv/pool stages, then two dense layers.

    The first dense layer takes 64*5*5 features, which is what the conv stack
    yields for a 1x28x28 input (28 -> 26 -> 13 -> 11 -> 5 per side).
    """

    def __init__(self):
        super().__init__()
        feature_layers = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        ]
        classifier_layers = [
            nn.Linear(in_features=64 * 5 * 5, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=10),
        ]
        self.conv = nn.Sequential(*feature_layers)
        self.fc = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Return one score per class (10) for each image in the batch."""
        batch = x.size(0)
        feature_maps = self.conv(x)
        # Flatten everything except the batch dimension for the dense layers.
        flat = feature_maps.view(batch, -1)
        return self.fc(flat)

model = CNN()
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)

# Demonstration only: take a single optimisation step on the first shuffled
# mini-batch, then stop.
for images, labels in loader:
    opt.zero_grad()
    preds = model(images)
    loss = loss_fn(preds, labels)
    loss.backward()
    opt.step()
    break



In [None]:
# -----------------------------
# 5. LSTM (Synthetic Sequence)
# -----------------------------

# Random input laid out as (batch=10, time=5, feature=1), matching batch_first=True.
seq = torch.rand(10, 5, 1)
lstm = nn.LSTM(1, 20, batch_first=True)

# `out` has the hidden state at every time step; `hn` and `cn` are the final
# hidden and cell states.
out, final_states = lstm(seq)
hn, cn = final_states
print(f"LSTM Output Shape: {out.shape}")



In [None]:
# -----------------------------
# 6. Transformer/BERT Summary
# -----------------------------

from transformers import pipeline
# Summarization pipeline backed by the distilbart-cnn-12-6 checkpoint.
summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-12-6")
text = "Reinforcement learning enables an agent to learn through interaction with environment."

# Deterministic decoding (no sampling), bounded to 10-40 tokens.
summary = summarizer(text, max_length=40, min_length=10, do_sample=False)
print(summary)



In [None]:
# -----------------------------
# 7. Interactive Dashboard
# -----------------------------

import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# One row per technique covered in the notebook.
# NOTE(review): the accuracy figures are literals typed in here, not values
# measured by the cells above; the two RL entries have no accuracy at all.
df = pd.DataFrame(
    {
        "model": ["Q-Learning", "DQN", "ANN", "CNN", "LSTM", "Transformer"],
        "accuracy": [None, None, 0.95, 0.98, 0.91, 0.99],
        "type": ["RL"] * 2 + ["DL"] * 4,
    }
)

# Plain list of labels: the documented form for Dropdown options.
dropdown = widgets.Dropdown(options=df["model"].tolist(), description='Model:')
output = widgets.Output()

def show_plot(change=None):
    """Redraw the accuracy bar chart for the selected model inside `output`.

    Args:
        change: Change dictionary delivered by `dropdown.observe`; its "new"
            entry is the newly selected model. When omitted, the dropdown's
            current value is used, so the function can also be called directly.
    """
    selected = dropdown.value if change is None else change["new"]
    rows = df[df["model"] == selected]
    # wait=True defers the clear until new content arrives, avoiding a flash
    # of empty output between selections.
    output.clear_output(wait=True)
    with output:
        if rows["accuracy"].notna().any():
            fig = px.bar(rows, x="model", y="accuracy", color="type")
            fig.show()
        else:
            # A bar chart of a missing value would render as an empty figure.
            print(f"No accuracy recorded for {selected}.")

dropdown.observe(show_plot, names="value")
display(dropdown, output)
# observe() only fires on later changes, so draw the initial selection here.
show_plot()
