<a href="https://colab.research.google.com/github/ywanglab/STAT4160/blob/main/notebooks/lec2_hw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab cell
# Mount Google Drive under /content/drive so later cells can read/write files there.
# NOTE(review): force_remount=True presumably refreshes an already-mounted Drive — confirm in the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os, pathlib

# GitHub coordinates of the repository -- adjust these two for YOUR repo
REPO_OWNER = "ywanglab"
REPO_NAME = "STAT4160"  # e.g., unified-stocks-team1

# Local layout on the mounted Drive: the clone lives in BASE_DIR/<REPO_NAME>
BASE_DIR = "/content/drive/MyDrive/dspt25"
CLONE_DIR = "{}/{}".format(BASE_DIR, REPO_NAME)
REPO_URL = "https://github.com/{}/{}.git".format(REPO_OWNER, REPO_NAME)

# Make sure the parent folder exists (no error when it is already there)
os.makedirs(BASE_DIR, exist_ok=True)


In [None]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    !git status
    !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

In [None]:
# Run in your repo root
import os, pathlib, textwrap

# The template is written to .github/pull_request_template.md, relative to the cwd.
github_dir = pathlib.Path(".github")
github_dir.mkdir(exist_ok=True)

# dedent() strips the 4-space indentation used here for readability, so the
# markdown headings start at column 0 in the written file.
tpl = textwrap.dedent("""\
    ## Summary
    What does this PR do and why?

    ## Changes
    -

    ## How to test
    - From a fresh clone: steps to run

    ## Checklist
    - [ ] Runs from a fresh clone (README steps)
    - [ ] No secrets committed; `.env` only (and `.env.example` updated if needed)
    - [ ] Large artifacts tracked by LFS (`git lfs ls-files` shows expected files)
    - [ ] Clear, small diff; comments where useful
""")

# write_text opens, writes and closes the file in one call, so no file handle
# is left open; the encoding is pinned so the result is platform-independent.
(github_dir / "pull_request_template.md").write_text(tpl, encoding="utf-8")
print("Wrote .github/pull_request_template.md")

Wrote .github/pull_request_template.md


In [None]:
import pathlib

# CODEOWNERS content: "*" applies the listed handles to every path in the repo.
owners = """\
# Replace with your GitHub handles
* @teammate1 @teammate2 @ywanglab
"""

codeowners_path = pathlib.Path(".github") / "CODEOWNERS"
# Create .github/ here as well so this cell works even when run on its own.
codeowners_path.parent.mkdir(exist_ok=True)
# write_text closes the file for us (the bare open(...).write(...) never did).
codeowners_path.write_text(owners, encoding="utf-8")
print("Wrote .github/CODEOWNERS (edit handles!)")

Wrote .github/CODEOWNERS (edit handles!)


In [None]:
# tools/guard_large_files.py
import os, subprocess, sys

LIMIT_MB = 10  # size threshold in megabytes: files at or above it are checked against the LFS list
ROOT = os.getcwd()  # directory tree to scan: the current working directory

def lfs_tracked_paths(): # find files tracked by lfs
    """Return the set of paths that Git LFS reports as tracked.

    Runs ``git lfs ls-files`` in the current working directory and extracts the
    path from every output line.

    Returns:
        set[str]: tracked paths, each passed through ``os.path.normpath`` so the
        separators are consistent (\\ vs /). The set is empty when the command
        cannot be run or fails (git/git-lfs missing, not a repository, ...);
        that best-effort fallback is deliberate.
    """
    try:
        out = subprocess.check_output(["git", "lfs", "ls-files"], text=True)
    except Exception:
        return set()

    tracked = set()
    for line in out.splitlines():
        if not line.strip():
            continue  # ignore blank lines instead of failing on them
        # line format: "<oid> <*|-> <path>"
        # ex: line = "3b2d8c7d53 * data/processed/file.parquet"
        fields = line.split(None, 2)
        if len(fields) == 3 and fields[1] in ("*", "-"):
            # Drop the oid AND the "*"/"-" marker; keep the rest (may contain spaces)
            p = fields[2]
        else:
            # Fallback for a plain "<oid> <path>" line
            p = line.split(None, 1)[-1]
        tracked.add(os.path.normpath(p.strip()))
    return tracked

def humanize(bytes_):
    """Format a byte count as megabytes (1 MB = 1024 * 1024 bytes) with two decimals."""
    bytes_per_mb = 1024 * 1024
    size_in_mb = bytes_ / bytes_per_mb
    return "{:.2f} MB".format(size_in_mb)

# Scan ROOT for files at or above the size limit that git-lfs does not track.
lfs_set = lfs_tracked_paths()
limit_bytes = LIMIT_MB * 1024 * 1024  # computed once instead of once per file
bad = []  # (relative path, size in bytes) of every offending file
for dirpath, dirnames, filenames in os.walk(ROOT):
    # os.walk() is a generator that recursively traverses a directory tree.
    # At each step it yields a tuple: (dirpath, subdirnames, filenames)

    # Prune .git in place: os.walk only descends into what is left in `dirnames`,
    # so git's object store is never traversed at all.
    dirnames[:] = [d for d in dirnames if d != ".git"]

    # Still skip when the current path itself lies inside a .git directory,
    # using the OS-specific separator os.sep (/ on Linux, \ on Windows).
    if ".git" in dirpath.split(os.sep):
        continue
    for fn in filenames:
        path = os.path.normpath(os.path.join(dirpath, fn))
        try:
            size = os.path.getsize(path)
        except OSError:
            # Vanished file, broken symlink or unreadable entry: nothing to measure
            continue
        if size >= limit_bytes:
            rel = os.path.relpath(path, ROOT)
            if rel not in lfs_set:
                bad.append((rel, size))

if bad:
    print("ERROR: Large non-LFS files found:")
    for rel, size in bad:
        print(f" - {rel} ({humanize(size)})")
    sys.exit(1)  # non-zero exit status signals failure
else:
    print("OK: No large non-LFS files detected.")

OK: No large non-LFS files detected.


In [None]:
# Path is imported in this cell so it also runs on a fresh kernel
# (it was previously only imported by a later cell).
from pathlib import Path

# Define the path to the tools directory
tools_dir = Path("tools")

# Create it if it doesn't exist (including any parents)
tools_dir.mkdir(parents=True, exist_ok=True)

print(f"Directory '{tools_dir}' is ready.")

Directory 'tools' is ready.


In [None]:
from pathlib import Path

# Write the large-file guard as a standalone script: tools/guard_large_files.py
tools_dir = Path("tools")
tools_dir.mkdir(parents=True, exist_ok=True)

script = tools_dir / "guard_large_files.py"

# Source of the script. It is kept in a '''-string, so it must not contain ''' itself.
code = '''#!/usr/bin/env python3
"""Exit with status 1 when a file of LIMIT_MB or more is not tracked by Git LFS."""
import os
import sys
import subprocess

LIMIT_MB = 10  # size threshold for LFS in megabytes
# Parent of the directory holding this script. abspath() is applied to __file__
# first so this also works when __file__ has no directory part.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)

def humanize(nbytes):
    # format size in human-friendly units
    for unit in ['B','KB','MB','GB','TB']:
        if nbytes < 1024:
            return f"{nbytes:.1f}{unit}"
        nbytes /= 1024
    return f"{nbytes:.1f}PB"

def lfs_tracked_paths():
    """Return the normalized paths listed by `git lfs ls-files` (empty set on failure)."""
    try:
        # Run in ROOT (not the caller's cwd) so the check does not depend on
        # where the script was started from.
        out = subprocess.check_output(
            ["git", "lfs", "ls-files"], text=True, cwd=ROOT
        )
    except Exception:
        # best effort: no git / git-lfs / repository means nothing is tracked
        return set()
    tracked = set()
    for line in out.splitlines():
        if not line.strip():
            continue
        # line format: "<oid> <*|-> <path>"
        fields = line.split(None, 2)
        if len(fields) == 3 and fields[1] in ("*", "-"):
            p = fields[2]
        else:
            # fallback for a plain "<oid> <path>" line
            p = line.split(None, 1)[-1]
        tracked.add(os.path.normpath(p.strip()))
    return tracked

def main():
    lfs_set = lfs_tracked_paths()
    limit_bytes = LIMIT_MB * 1024 * 1024
    bad = []
    for dirpath, dirnames, filenames in os.walk(ROOT):
        # prune .git so os.walk never descends into it
        dirnames[:] = [d for d in dirnames if d != ".git"]
        # also skip when the current path itself lies inside a .git directory
        if ".git" in dirpath.split(os.sep):
            continue
        for fn in filenames:
            path = os.path.normpath(os.path.join(dirpath, fn))
            try:
                size = os.path.getsize(path)
            except OSError:
                # vanished file, broken symlink or unreadable entry
                continue
            if size >= limit_bytes:
                rel = os.path.relpath(path, ROOT)
                if rel not in lfs_set:
                    bad.append((rel, size))

    if bad:
        print("ERROR: Large non-LFS files found:")
        for rel, size in sorted(bad, key=lambda x: x[1], reverse=True):
            print(f" - {rel} ({humanize(size)})")
        sys.exit(1)
    else:
        print(f"OK: No large non-LFS files detected (limit {LIMIT_MB} MB).")

if __name__ == "__main__":
    main()
'''

# Write the file
script.write_text(code, encoding="utf-8")
script.chmod(0o755)  # make it executable. 0o means base-8; r: 4, w: 2, x: 1

print(f"Created {script}")


Created tools/guard_large_files.py


In [None]:
# Create/append Makefile target
import re
from pathlib import Path

# guard: Makefile target; the recipe line must start with a tab (\t).
# .PHONY makes `make guard` run even if a file named "guard" exists.
text = "\n\n.PHONY: guard\nguard:\n\tpython tools/guard_large_files.py\n"
p = Path("Makefile")  # point to the Makefile

content = p.read_text() if p.exists() else ""

# Append only when no rule named exactly "guard" exists yet, so re-running this
# cell never duplicates the target. The pattern is anchored to the start of a
# line so names such as "safeguard:" do not count as a match.
if re.search(r"^guard\s*:", content, flags=re.MULTILINE):
    print("Makefile already has a 'guard' target; left unchanged")
else:
    p.write_text(content + text)
    print("Added 'guard' target to Makefile")

Added 'guard' target to Makefile


In [None]:
# Run the guard script from the repo root through the shell; its printed result appears below.
!python tools/guard_large_files.py

OK: No large non-LFS files detected (limit 10 MB).
