<a href="https://colab.research.google.com/github/ywanglab/STAT4160/blob/main/notebooks/reproducibility_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab cell
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True re-mounts even when
# a mount is already present.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# --- Repository coordinates: adjust these two for YOUR repo ---
REPO_OWNER = "ywanglab"
REPO_NAME = "STAT4160"  # e.g., unified-stocks-team1

# --- Locations derived from the values above ---
BASE_DIR = "/content/drive/MyDrive/dspt25"  # parent folder that holds the clone
CLONE_DIR = f"{BASE_DIR}/{REPO_NAME}"  # local checkout path
REPO_URL = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"  # HTTPS remote

import os, pathlib
# Create BASE_DIR together with any missing ancestors; a no-op if it already exists.
os.makedirs(BASE_DIR, exist_ok=True)


In [3]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    !git status
    !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Refresh index: 100% (47/47), done.
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   notebooks/reproducibility_demo.ipynb[m
	[31mmodified:   notebooks/system_check.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mnotebooks/testing.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")
Already up to date.
Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [None]:
# Homework

In [None]:
# Colab cell: freeze exact versions
!pip freeze > requirements-lock.txt
print("Wrote requirements-lock.txt with exact versions")
!head -n 20 requirements-lock.txt

Wrote requirements-lock.txt with exact versions
absl-py==1.4.0
accelerate==1.10.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.2
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.10.0
anywidget==0.9.18
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
array_record==0.7.2
arviz==0.22.0
astropy==7.1.0


In [None]:
import numpy as np, torch, random, json, os, time

def set_seed(seed=123):
    """Seed every RNG used here and switch PyTorch to deterministic mode.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all CUDA
    devices), turns off cuDNN autotuning, and requests deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        The determinism switches and the ``CUBLAS_WORKSPACE_CONFIG`` variable
        are process-wide and remain set after this function returns.
    """
    import warnings

    # With deterministic algorithms enabled, CUDA matmuls raise a RuntimeError
    # on CUDA >= 10.2 unless this variable is set. setdefault keeps any value
    # the user exported themselves.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # safe to call when CUDA is unavailable

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # autotuning may pick different kernels per run
    try:
        torch.use_deterministic_algorithms(True)
    except Exception as exc:
        # Stay best-effort (e.g. a PyTorch build without this API), but tell the
        # user that the run is only seeded, not guaranteed deterministic.
        warnings.warn(
            f"Could not enable deterministic algorithms: {exc}", RuntimeWarning
        )

def make_toy(n=512, d=10, noise=0.1, seed=123):
    """Generate a seeded synthetic linear-regression problem.

    Args:
        n: Number of samples (rows of the feature matrix).
        d: Number of features (columns of the feature matrix).
        noise: Scale applied to the standard-normal noise added to the targets.
        seed: Seed handed to ``set_seed`` before any numbers are drawn.

    Returns:
        Tuple ``(X, y, true_w)`` of tensors with shapes ``(n, d)``, ``(n, 1)``
        and ``(d, 1)``, where ``y = X @ true_w + noise * eps``.
    """
    set_seed(seed)
    # Draw order (features, weights, noise) fixes which random numbers each
    # tensor receives for a given seed, so it must not be rearranged.
    features = torch.randn(n, d)
    weights = torch.randn(d, 1)
    perturbation = torch.randn(n, 1)
    targets = features @ weights + noise * perturbation
    return features, targets, weights

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the toy dataset with its default arguments, then move the features and
# targets to the selected device (true_w is left where make_toy created it).
X, y, true_w = make_toy()
X = X.to(device)
y = y.to(device)

In [None]:
# torch.use_deterministic_algorithms(False)
def train_once(lr=0.05, steps=300, seed=123):
    """Fit a bias-free linear model to the global ``X``/``y`` with full-batch SGD.

    Args:
        lr: Learning rate for ``torch.optim.SGD``.
        steps: Number of gradient updates. Must be at least 1, because the
            loss recorded at the last step is returned.
        seed: Seed passed to ``set_seed`` before the model is created.

    Returns:
        ``(weights, final_loss)``: the learned weight matrix as a NumPy array,
        and the MSE loss recorded at the last step (evaluated before that
        step's parameter update).
    """
    set_seed(seed)
    n_features = X.shape[1]
    model = torch.nn.Linear(n_features, 1, bias=False).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    def sgd_step():
        """Run one full-batch update and return the pre-update loss as a float."""
        optimizer.zero_grad(set_to_none=True)  # reset grads to None rather than zero-filling
        loss = criterion(model(X), y)  # conventional (prediction, target) order
        loss.backward()  # compute gradients of the loss w.r.t. the weights
        optimizer.step()  # apply the SGD update
        return loss.item()

    history = [sgd_step() for _ in range(steps)]
    # detach() drops autograd tracking; cpu() is needed before converting to NumPy.
    return model.weight.detach().cpu().numpy(), history[-1]

# Train twice from the same seed; a reproducible setup gives matching results.
(w1, final_loss1), (w2, final_loss2) = (train_once(seed=2025) for _ in range(2))

print("Final loss 1:", round(final_loss1, ndigits=6))
print("Final loss 2:", round(final_loss2, ndigits=6))
# Compare with an absolute tolerance of 1e-7 (relative tolerance left at NumPy's default).
print("Weights equal:", np.allclose(w1, w2, atol=1e-7))

Final loss 1: 0.01057
Final loss 2: 0.01057
Weights equal: True


In [None]:
# Persist the comparison as JSON under reports/ and echo it as the cell output.
os.makedirs("reports", exist_ok=True)
result = dict(
    device=device,
    final_loss1=float(final_loss1),  # plain floats so json can serialise them
    final_loss2=float(final_loss2),
    weights_equal=bool(np.allclose(w1, w2, atol=1e-7)),
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),  # local time, no timezone suffix
)
with open("reports/reproducibility_results.json","w") as f:
    f.write(json.dumps(result, indent=2))
result

{'device': 'cpu',
 'final_loss1': 0.010570256970822811,
 'final_loss2': 0.010570256970822811,
 'weights_equal': True,
 'timestamp': '2025-08-15 21:10:06'}

In [None]:
# Template with blank placeholders for the API keys; it contains no secret
# values, unlike a real .env file.
env_example = """\
# Example environment variables (do NOT commit a real .env with secrets)
ALPHA_VANTAGE_KEY=
FRED_API_KEY=
"""
# Context managers close each handle as soon as the write/read is done, and the
# explicit encoding keeps the file identical across platforms.
with open(".env.example", "w", encoding="utf-8") as f:
    f.write(env_example)
with open(".env.example", encoding="utf-8") as f:
    print(f.read())

# Example environment variables (do NOT commit a real .env with secrets)
ALPHA_VANTAGE_KEY=
FRED_API_KEY=

