<a href="https://colab.research.google.com/github/ywanglab/STAT4160/blob/main/notebooks/system_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab cell
COURSE_DIR = "/content/drive/MyDrive/dspt25"  # change if you prefer another path
PROJECT_NAME = "unified-stocks"               # course project folder/repo name

In [None]:
# Colab cell: make directories and cd into project folder
import os, pathlib
base = pathlib.Path(COURSE_DIR)
proj = base / PROJECT_NAME
for p in [base, proj, proj/"reports", proj/"notebooks", proj/"data"]:
    p.mkdir(parents=True, exist_ok=True)

import os
os.chdir(proj) # change dir
print("Working in:", os.getcwd())

Working in: /content/drive/MyDrive/dspt25/unified-stocks


In [None]:
# Colab cell: clone via HTTPS (public or your private; for private, you can upload later instead of pushing from Colab)
REPO_URL = "https://github.com/ywanglab/STAT4160.git"  # <- change me
import subprocess, os
os.chdir(base)  # clone next to your project folder, only clone one time
# subprocess.run(["git", "clone", REPO_URL], check=True)
# Optionally, use that cloned repo as the working directory:
REPO_NAME = REPO_URL.split("/")[-1].replace(".git","")
os.chdir(base/REPO_NAME)
print("Working in:", os.getcwd())

Working in: /content/drive/MyDrive/dspt25/STAT4160


In [None]:
# Adjust these two for YOUR repo
REPO_OWNER = "kadkins3880"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [None]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    !git status
    !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Cloning into '/content/drive/MyDrive/dspt25/STAT4160'...
remote: Enumerating objects: 678, done.[K
remote: Counting objects: 100% (165/165), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 678 (delta 72), reused 127 (delta 42), pack-reused 513 (from 1)[K
Receiving objects: 100% (678/678), 3.84 MiB | 12.61 MiB/s, done.
Resolving deltas: 100% (279/279), done.
Updating files: 100% (105/105), done.
fatal: cannot exec '/content/drive/MyDrive/dspt25/STAT4160/.git/hooks/post-checkout': Permission denied
Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [None]:
# Colab cell: write a soft-pinned requirements.txt
req = """\
pandas>=2.2,<3.0
numpy>=2.0.0,<3.0
pyarrow>=15,<17
matplotlib>=3.8,<4.0
scikit-learn>=1.6,<2.0
yfinance>=0.2,<0.3
python-dotenv>=1.0,<2.0
"""
open("requirements.txt","w").write(req)
print(open("requirements.txt").read())

pandas>=2.2,<3.0
numpy>=2.0.0,<3.0
pyarrow>=15,<17
matplotlib>=3.8,<4.0
scikit-learn>=1.6,<2.0
yfinance>=0.2,<0.3
python-dotenv>=1.0,<2.0



In [None]:
# Colab cell: install (quietly). Torch is usually preinstalled in Colab; we'll check separately.
!pip install -q -r requirements.txt

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Colab cell: environment info + GPU check
import sys, platform, json, time
import pandas as pd
import numpy as np

env = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "python": sys.version,
    "os": platform.platform(),
    "pandas": pd.__version__,
    "numpy": np.__version__,
}

try:
    import torch
    env["torch"] = torch.__version__
    env["cuda_available"] = bool(torch.cuda.is_available())
    env["cuda_device"] = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
except Exception as e:
    env["torch"] = "not importable"
    env["cuda_available"] = False
    env["cuda_device"] = "CPU"

print(env)
os.makedirs("reports", exist_ok=True)
with open("reports/environment.json","w") as f:
    json.dump(env, f, indent=2)

{'timestamp': '2025-08-25 13:27:12', 'python': '3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]', 'os': 'Linux-6.1.123+-x86_64-with-glibc2.35', 'pandas': '2.2.2', 'numpy': '2.0.2', 'torch': '2.8.0+cu126', 'cuda_available': False, 'cuda_device': 'CPU'}


In [None]:
# Colab cell: PyTorch check. If not available (rare in Colab), install CPU-only as a fallback.
try:
    import torch
    print("PyTorch:", torch.__version__)
except Exception as e:
    print("PyTorch not found; installing CPU-only wheel as fallback...")
    !pip install -q torch
    import torch
    print("PyTorch:", torch.__version__)

PyTorch: 2.8.0+cu126


In [None]:
# Colab cell: reproducibility helpers
import random
import numpy as np

def set_seed(seed: int = 42, deterministic_torch: bool = True):
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
        torch.manual_seed(seed)  #cpu side
        torch.cuda.manual_seed_all(seed)  # cuda side
        if deterministic_torch:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False  #disable automatic algorithm
            try:
                torch.use_deterministic_algorithms(True)
            except Exception:
                pass
    except Exception:
        pass

def sample_rng_fingerprint(n=5, seed=42):
    set_seed(seed)
    a = np.random.rand(n).round(6).tolist()
    try:
        import torch
        b = torch.rand(n).tolist()
        b = [round(x,6) for x in b]
    except Exception:
        b = ["torch-missing"]*n
    return {"numpy": a, "torch": b}

f1 = sample_rng_fingerprint(n=6, seed=123)
f2 = sample_rng_fingerprint(n=6, seed=123)
print("Fingerprint #1:", f1)
print("Fingerprint #2:", f2)
print("Match:", f1 == f2)

with open("reports/seed_fingerprint.json","w") as f:
    json.dump({"f1": f1, "f2": f2, "match": f1==f2}, f, indent=2)

Fingerprint #1: {'numpy': [0.696469, 0.286139, 0.226851, 0.551315, 0.719469, 0.423106], 'torch': [0.296112, 0.516562, 0.251671, 0.688557, 0.073972, 0.866522]}
Fingerprint #2: {'numpy': [0.696469, 0.286139, 0.226851, 0.551315, 0.719469, 0.423106], 'torch': [0.296112, 0.516562, 0.251671, 0.688557, 0.073972, 0.866522]}
Match: True


In [None]:
# Colab cell: create stock list if it doesn't exist yet
import pandas as pd, os
tickers = [
    "AAPL","MSFT","AMZN","GOOGL","META","NVDA","TSLA","JPM","JNJ","V",
    "PG","HD","BAC","XOM","CVX","PFE","KO","DIS","NFLX","INTC",
    "CSCO","ORCL","T","VZ","WMT"
]
path = "tickers_25.csv"
if not os.path.exists(path):
    pd.DataFrame({"ticker": tickers}).to_csv(path, index=False)
pd.read_csv(path).head()

Unnamed: 0,ticker
0,AAPL
1,MSFT
2,AMZN
3,GOOGL
4,META


In [None]:
# Colab cell: tiny GPU smoke test (safe if CUDA available)
import torch, time
import os

# change back to not use deterministic_algorithm to do the matrix computation if use GPU
# torch.use_deterministic_algorithms(False)

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1000, 1000, device=device)
y = x @ x.T
print("Device:", device, "| y shape:", y.shape, "| mean:", y.float().mean().item()) #cast y into float32, extract the mean as a python float.

Device: cpu | y shape: torch.Size([1000, 1000]) | mean: 0.963737428188324


In [None]:
# Colab cell: write a small Markdown summary for humans
from textwrap import dedent  # dedent removes the common indentation
summary = dedent(f"""
# System Check

- Timestamp: {env['timestamp']}
- Python: `{env['python']}`
- OS: `{env['os']}`
- pandas: `{env['pandas']}` | numpy: `{env['numpy']}` | torch: `{env['torch']}`
- CUDA available: `{env['cuda_available']}` | Device: `{env['cuda_device']}`

## RNG Fingerprint
- Match on repeated seeds: `{f1 == f2}`
- numpy: `{f1['numpy']}`
- torch: `{f1['torch']}`
""").strip()

open("reports/system_check.md","w").write(summary)
print(summary)

# System Check

- Timestamp: 2025-08-25 13:27:12
- Python: `3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]`
- OS: `Linux-6.1.123+-x86_64-with-glibc2.35`
- pandas: `2.2.2` | numpy: `2.0.2` | torch: `2.8.0+cu126`
- CUDA available: `False` | Device: `CPU`

## RNG Fingerprint
- Match on repeated seeds: `True`
- numpy: `[0.696469, 0.286139, 0.226851, 0.551315, 0.719469, 0.423106]`
- torch: `[0.296112, 0.516562, 0.251671, 0.688557, 0.073972, 0.866522]`


In [None]:
# code to save this notebook. (too complicated, don't bother)
from google.colab import  _message
notebook_name = "system_check.ipynb"
# Create the 'notebooks' subdirectory path
out_dir = proj / "notebooks"
out_path = out_dir / notebook_name

# Make sure the folder exists
out_dir.mkdir(parents=True, exist_ok=True)

# Get the CURRENT notebook JSON from Colab
resp = _message.blocking_request('get_ipynb', timeout_sec=10)
nb = resp.get('ipynb') if isinstance(resp, dict) else None

# Basic sanity check: ensure there are cells
if not nb or not isinstance(nb, dict) or not nb.get('cells'):
    raise RuntimeError("Could not capture the current notebook contents (no cells returned). "
                       "Try running this cell again after a quick edit, or use File → Save a copy in Drive once.")

# Write to Drive
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(nb, f, ensure_ascii=False, indent=2)

print("Saved notebook to:", out_path)


In [None]:
# Colab cell: freeze exact versions
!pip freeze > requirements-lock.txt
print("Wrote requirements-lock.txt with exact versions")
!head -n 20 requirements-lock.txt

Wrote requirements-lock.txt with exact versions
absl-py==1.4.0
accelerate==1.10.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.2
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.10.0
anywidget==0.9.18
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
array_record==0.7.2
arviz==0.22.0
astropy==7.1.0


In [3]:
import numpy as np, torch, random, json, os, time

def set_seed(seed=123):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    try:
        torch.use_deterministic_algorithms(True)
    except Exception:
        pass

def make_toy(n=512, d=10, noise=0.1, seed=123):
    set_seed(seed)
    X = torch.randn(n, d)
    true_w = torch.randn(d, 1)
    y = X @ true_w + noise * torch.randn(n, 1)
    return X, y, true_w

device = "cuda" if torch.cuda.is_available() else "cpu"
X, y, true_w = make_toy()
X, y = X.to(device), y.to(device)

In [4]:
# torch.use_deterministic_algorithms(False)
def train_once(lr=0.05, steps=300, seed=123):
    set_seed(seed)
    model = torch.nn.Linear(X.shape[1], 1, bias=False).to(device) #torch.nn.Linear(in_features, out_features, bias=False)
    opt = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()
    losses=[]
    for t in range(steps):
        opt.zero_grad(set_to_none=True) # set_to_none: slightly faster
        yhat = model(X)
        loss = loss_fn(yhat, y)
        loss.backward() # compute the loss
        opt.step() # update weights
        losses.append(loss.item())
    return model.weight.detach().cpu().numpy(), losses[-1] #detatch from computing graph, moved to cpu to create np array

w1, final_loss1 = train_once(seed=2025)
w2, final_loss2 = train_once(seed=2025)

print("Final loss 1:", round(final_loss1, 6))
print("Final loss 2:", round(final_loss2, 6))
print("Weights equal:", np.allclose(w1, w2, atol=1e-7))

RuntimeError: Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or `at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this case, you must set an environment variable before running your PyTorch application: CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8. For more information, go to https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility

In [None]:
os.makedirs("reports", exist_ok=True)
result = {
    "device": device,
    "final_loss1": float(final_loss1),
    "final_loss2": float(final_loss2),
    "weights_equal": bool(np.allclose(w1, w2, atol=1e-7)),
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
with open("reports/reproducibility_results.json","w") as f:
    json.dump(result, f, indent=2)
result

{'device': 'cpu',
 'final_loss1': 0.010570256970822811,
 'final_loss2': 0.010570256970822811,
 'weights_equal': True,
 'timestamp': '2025-08-15 21:10:06'}

In [None]:
env_example = """\
# Example environment variables (do NOT commit a real .env with secrets)
ALPHA_VANTAGE_KEY=
FRED_API_KEY=
"""
open(".env.example", "w").write(env_example)
print(open(".env.example").read())

# Example environment variables (do NOT commit a real .env with secrets)
ALPHA_VANTAGE_KEY=
FRED_API_KEY=

