In [36]:
import pandas as pd
import numpy as np
import torch
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from tqdm import trange
import matplotlib.pyplot as plt
import importlib
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from captum.attr import IntegratedGradients


In [4]:
import lstm
importlib.reload(lstm)

<module 'lstm' from '/Users/varunreddy/Desktop/Coding Projects/neuralnet-math/lstm.py'>

In [5]:
from lstm import MathLSTM

In [6]:
data = pd.read_csv('arithmetic_data.csv')

In [7]:
equations = list(data.itertuples(index=False, name=None))
equations[0]

(79, '*', 28, '=', 2212)

### tokenize the inputs

In [8]:
vocab = set()
for x, op, y, eq, z in equations:
    vocab.update([str(x), op, str(y), eq])

tokenized_math = {tok: i for i, tok in enumerate(sorted(vocab))}

vocab_size = len(tokenized_math)
print(vocab_size)


105


In [9]:
def encode(eqn):
    x, op, y, eq, z = eqn
    return [tokenized_math[str(x)], tokenized_math[op], tokenized_math[str(y)], tokenized_math[eq]], float(z)

encoded = [encode(eqn) for eqn in equations]

input_tensor = torch.tensor([e[0] for e in encoded], dtype=torch.long)
output_tensor = torch.tensor([e[1] for e in encoded], dtype=torch.float32)

In [10]:
encoded

[([81, 0, 25, 104], 2212.0),
 ([67, 0, 13, 104], 1122.0),
 ([80, 2, 65, 104], 14.0),
 ([10, 2, 29, 104], -17.0),
 ([60, 3, 16, 104], 3.0),
 ([53, 3, 4, 104], 53.0),
 ([60, 3, 60, 104], 1.0),
 ([30, 1, 64, 104], 95.0),
 ([77, 0, 52, 104], 3900.0),
 ([46, 1, 75, 104], 120.0),
 ([80, 1, 91, 104], 166.0),
 ([100, 1, 5, 104], 106.0),
 ([25, 2, 92, 104], -61.0),
 ([98, 0, 44, 104], 4230.0),
 ([36, 3, 4, 104], 38.0),
 ([58, 1, 71, 104], 65.0),
 ([33, 3, 71, 104], 5.0),
 ([49, 2, 95, 104], -86.0),
 ([50, 0, 6, 104], 5000.0),
 ([91, 0, 88, 104], 7480.0),
 ([10, 2, 40, 104], -27.0),
 ([82, 0, 40, 104], 328.0),
 ([39, 0, 86, 104], 3320.0),
 ([9, 1, 77, 104], 88.0),
 ([33, 2, 92, 104], -54.0),
 ([64, 1, 12, 104], 79.0),
 ([4, 3, 4, 104], 1.0),
 ([27, 2, 20, 104], -20.0),
 ([32, 0, 92, 104], 3026.0),
 ([74, 0, 26, 104], 2088.0),
 ([91, 0, 40, 104], 3608.0),
 ([99, 2, 21, 104], 71.0),
 ([71, 3, 4, 104], 7.0),
 ([91, 3, 38, 104], 22.0),
 ([22, 1, 87, 104], 109.0),
 ([67, 2, 92, 104], -23.0),
 ([11, 2

In [11]:
print(input_tensor.shape)
print(output_tensor.shape)

torch.Size([100000, 4])
torch.Size([100000])


In [12]:
model = MathLSTM(vocab_size=len(tokenized_math), embedding_dim=16, hidden_dim=32)
# regression loss function (output is a single value)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    input_tensor, output_tensor, test_size=0.2, random_state=42
)

## Train the LSTM model

In [14]:
num_epochs = 500
loop = trange(num_epochs, desc="Training", leave=True)

loss_curve = []
for epoch in loop:
    model.train()
    
    # Forward pass
    preds = model(X_train)
    loss = loss_fn(preds, y_train)
    
    # Backward pass
    optimizer.zero_grad()
    # calculate the gradient of the loss with respect to the model parameters
    loss.backward()
    # update the model parameters
    optimizer.step()

    # Update tqdm description every step
    loop.set_description(f"Epoch {epoch}")
    loop.set_postfix(train_loss=loss.item())

    # Evaluate every 50 epochs or at the end
    if epoch % 50 == 0 or epoch == num_epochs - 1:
        model.eval()
        with torch.no_grad():
            test_preds = model(X_test)
            test_loss = loss_fn(test_preds, y_test)
            loss_curve.append((epoch, test_loss.item()))
        print(f"Epoch {epoch}: Train Loss = {loss.item():.4f}, Test Loss = {test_loss.item():.4f}")


Epoch 1:   0%|          | 2/500 [00:00<01:48,  4.59it/s, train_loss=2.89e+6]

Epoch 0: Train Loss = 2890061.5000, Test Loss = 2826890.0000


Epoch 51:  10%|█         | 52/500 [00:09<01:20,  5.60it/s, train_loss=2.86e+6]

Epoch 50: Train Loss = 2865212.2500, Test Loss = 2802347.5000


Epoch 101:  20%|██        | 102/500 [00:18<01:13,  5.40it/s, train_loss=2.84e+6]

Epoch 100: Train Loss = 2841833.0000, Test Loss = 2779414.0000


Epoch 151:  30%|███       | 152/500 [00:27<01:03,  5.49it/s, train_loss=2.82e+6]

Epoch 150: Train Loss = 2818983.0000, Test Loss = 2756992.7500


Epoch 201:  40%|████      | 202/500 [00:36<00:54,  5.44it/s, train_loss=2.8e+6] 

Epoch 200: Train Loss = 2797561.0000, Test Loss = 2735988.7500


Epoch 251:  50%|█████     | 252/500 [00:45<00:45,  5.49it/s, train_loss=2.78e+6]

Epoch 250: Train Loss = 2776914.0000, Test Loss = 2715744.2500


Epoch 301:  60%|██████    | 302/500 [00:53<00:35,  5.54it/s, train_loss=2.76e+6]

Epoch 300: Train Loss = 2755901.5000, Test Loss = 2695173.2500


Epoch 351:  70%|███████   | 352/500 [01:02<00:27,  5.31it/s, train_loss=2.74e+6]

Epoch 350: Train Loss = 2735584.5000, Test Loss = 2675214.2500


Epoch 401:  80%|████████  | 402/500 [01:11<00:17,  5.51it/s, train_loss=2.72e+6]

Epoch 400: Train Loss = 2715704.0000, Test Loss = 2655728.0000


Epoch 451:  90%|█████████ | 452/500 [01:20<00:08,  5.50it/s, train_loss=2.7e+6] 

Epoch 450: Train Loss = 2696117.2500, Test Loss = 2636530.7500


Epoch 499: 100%|██████████| 500/500 [01:29<00:00,  5.60it/s, train_loss=2.68e+6]

Epoch 499: Train Loss = 2677145.7500, Test Loss = 2617934.7500





## Test the LSTM Model

In [None]:
deltas = []
model.eval()
with torch.no_grad():
    predictions = model(X_test)

    for i in range(len(X_test)):
        input_ids = X_test[i]
        pred = predictions[i].item()
        true = y_test[i].item()

        tokens = [list(tokenized_math.keys())[list(tokenized_math.values()).index(id.item())] for id in input_ids]
        equation_str = " ".join(tokens)
        print(f"{equation_str} → Predicted: {round(pred, 2)}, Actual: {true}")
        deltas.append(abs(pred - true))



In [None]:
# find the equation with the highest delta
max_delta_index = deltas.index(max(deltas))

print(f"Equation with the highest delta: {equations[max_delta_index]}")
print(f"Delta: {deltas[max_delta_index]}")

predictions[max_delta_index]


In [16]:
import importlib
import lstm
importlib.reload(lstm)

MathLSTM = lstm.MathLSTM

In [18]:
weights = model.get_weights()

## Lets get the weights so we can try interpreting it

In [35]:
def forward_embedded(embedded_batch):
    """
    embedded_batch : [batch, seq_len, embed_dim]  (requires_grad=True)
    returns        : [batch]  scalar prediction (one per example)
    """
    B, T, _ = embedded_batch.shape
    h = embedded_batch.new_zeros(B, model.lstm_cell.hidden_dim)
    c = h.clone()

    for t in range(T):                              
        h, c = model.lstm_cell(embedded_batch[:, t, :], h, c)

    return model.fc(h).squeeze(1)                 

idx2tok = [None] * len(tokenized_math)            
for tok, idx in tokenized_math.items():
    idx2tok[idx] = tok

ig = IntegratedGradients(forward_embedded)

num_samples = 100      
n_cols      = 5        
n_rows      = math.ceil(num_samples / n_cols)

fig = make_subplots(rows=n_rows, cols=n_cols,
                    horizontal_spacing=0.01, vertical_spacing=0.02)


for i in range(num_samples):
    ids     = X_test[i].unsqueeze(0)                      
    tokens  = [idx2tok[idx.item()] for idx in ids[0]]      
    eqn_str = " ".join(tokens)

    emb     = model.embedding(ids).detach().requires_grad_()
    attrs,_ = ig.attribute(emb, return_convergence_delta=True)

    scores  = attrs.squeeze(0).sum(dim=1)                  
    scores  = scores / torch.norm(scores)                 
    img     = scores.detach().cpu().numpy()[None, :]       

    r, c = divmod(i, n_cols)
    fig.add_trace(
        go.Heatmap(
            z           = img,
            x           = tokens,
            y           = [''],             
            zmin        = -1, zmax = 1,
            colorscale  = 'RdBu',
            showscale   = False,
            hovertemplate = (
                f"<b>{eqn_str}</b><br>"
                "Token: %{x}<br>"
                "Attribution: %{z:.3f}<extra></extra>"
            ),
        ),
        row=r+1, col=c+1
    )

fig.update_layout(
    height  = 25 * n_rows + 120,
    width   = 1000,
    margin  = dict(l=10, r=10, t=30, b=10),
    template='plotly_white',
    title_text="Integrated Gradients – Token Attributions (Custom LSTM)"
)

fig.write_html("attribution_grid.html")


✓ Saved interactive grid to  'attribution_grid.html'  — open it in any browser.
