In [11]:
# Colab cell
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [12]:
# Adjust these two for YOUR repo
REPO_OWNER = "ywanglab"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [13]:

import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


%%bash: Jupyter notebook magic command. use bash (Bourne Again Shell) command
```
set -euo pipefail
```
This is a common shell “strict mode” setting:

-`e`: Exit immediately if any command returns a nonzero (error) status.

-`u`: Treat undefined variables as errors (instead of silently treating them as empty strings).

-`o` pipefail: If a pipeline has multiple commands (like cmd1 | cmd2), the whole pipeline fails if any command fails, not just the last one.

Together: helps catch errors early and avoids running later commands with broken assumptions.

In [14]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
pwd
ls -la

/content/drive/MyDrive/dspt25/STAT4160
total 670
-rw------- 1 root root      0 Sep 20 00:22 1
-rw------- 1 root root      6 Sep 17 13:37 a.txt
drwx------ 2 root root   4096 Sep 20 00:25 data
drwx------ 2 root root   4096 Sep 20 00:24 docs
drwx------ 2 root root   4096 Sep 20 00:22 docs1
-rw------- 1 root root      0 Sep 17 13:37 docs.txt
drwx------ 2 root root   4096 Sep 20 00:24 .git
-rw------- 1 root root    205 Sep 20 00:22 .gitattributes
drwx------ 2 root root   4096 Sep 20 00:22 .github
-rw------- 1 root root    452 Sep 20 00:22 .gitignore
-rw------- 1 root root     47 Sep 17 13:29 hello.sh
drwx------ 2 root root   4096 Sep 20 00:24 homework
-rw------- 1 root root 604095 Sep 20 00:22 index.pdf
-rw------- 1 root root    188 Sep  7 18:39 index.qmd
-rw------- 1 root root   2684 Sep 20 00:24 Makefile
-rw------- 1 root root     23 Sep 20 00:22 myfirstfrommac.txt
-rw------- 1 root root     22 Sep 20 00:22 myfirstlocalfile.txt
drwx------ 2 root root   4096 Sep 20 00:24 notebooks
drwx----

```
rng.integers(low, high=None, size=None, dtype=int, endpoint=False)
Series.diff(periods=1)
DataFrame.diff(periods=1, axis=0)
```

In [15]:
# Generates data/raw/prices.csv with columns: ticker,date,adj_close,volume,log_return
import pandas as pd, numpy as np, os
from pathlib import Path

Path("data/raw").mkdir(parents=True, exist_ok=True)
tickers = pd.read_csv("tickers_25.csv")["ticker"].tolist() if os.path.exists("tickers_25.csv") else [
    "AAPL","MSFT","AMZN","GOOGL","META","NVDA","TSLA","JPM","JNJ","V",
    "PG","HD","BAC","XOM","CVX","PFE","KO","DIS","NFLX","INTC","CSCO","ORCL","T","VZ","WMT"
]
dates = pd.bdate_range("2020-01-01", periods=180)  # ~ 9 months
rng = np.random.default_rng(7)

frames=[]
for t in tickers:
    r = rng.normal(0, 0.01, len(dates))  #rng.normal(mean, std, size)
    price = 100*np.exp(np.cumsum(r))  # cumsum: simulate random walk; exp(): multiplicative growth (geometric Brownian motion style)
    vol = rng.integers(1e5, 5e6, len(dates))
    df = pd.DataFrame({"ticker": t, "date": dates, "adj_close": price, "volume": vol})
    df["log_return"] = np.log(df["adj_close"]).diff().fillna(0) #log return. diff: log(P_t-P_{t-1}). fillna(0): fill the value for day 1 (no prev) with 0
    frames.append(df)

out = pd.concat(frames, ignore_index=True)
out.to_csv("data/raw/prices.csv", index=False)
out.head()

Unnamed: 0,ticker,date,adj_close,volume,log_return
0,AAPL,2020-01-01,100.00123,4457901,0.0
1,AAPL,2020-01-02,100.300426,2664190,0.002987
2,AAPL,2020-01-03,100.025841,4100245,-0.002741
3,AAPL,2020-01-06,99.138974,4586613,-0.008906
4,AAPL,2020-01-07,98.689241,1556062,-0.004547


1. `wc` Short for word count.
By default it prints number of lines, words, and bytes in a file.
Common options:

`wc -l file` → line count only.

`wc -w file` → word count only.

`wc -c file` → byte count.

2. `tee`
Think of a plumbing T-joint: it splits a stream into two.

`tee file.txt` → writes the output to a file and to the terminal at the same time.

3. `tail -n +2`
Normally `tail -n 10 file` shows the last 10 lines of a file.
But if you use +N, it starts printing from line N to the end.

4. `cut -d, -f1 data/raw/prices.csv`

`cut` splits text into fields.

`-d,` says delimiter is a comma.

`-f1` picks field 1 → the ticker column.

Output = first column of the CSV (including header).

In [16]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

# How many lines? (including header)
wc -l data/raw/prices.csv | tee reports/prices_wc.txt

# First 5 lines, save to a sample
head -n 5 data/raw/prices.csv | tee data/raw/prices_sample.csv

set +o pipefail  # uncomment to fix the SIGPIPE error
# Show ticker column only (field 1), excluding header
cut -d, -f1 data/raw/prices.csv | tail -n +2 | head -n 10  #|| true  # swallow SIGPIPE explicitly
set -o pipefail  #uncomment to change back

4501 data/raw/prices.csv
ticker,date,adj_close,volume,log_return
AAPL,2020-01-01,100.00123016092391,4457901,0.0
AAPL,2020-01-02,100.30042606816971,2664190,0.00298745537508438
AAPL,2020-01-03,100.02584117375997,4100245,-0.0027413785536225532
AAPL,2020-01-06,99.13897423972712,4586613,-0.008905918387572598
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL


## `grep` = search lines matching a regex and print them.
`grep` stands for `Global Regular Expression Print`.
* **Plain `grep`** = Basic Regular Expressions (BRE).

  * In BRE, some operators like `+` and `|` are treated as *literal characters* unless you escape them.
  * Example: `grep 'a\+'` matches “aaa…” in BRE.

* **`grep -E`** = Extended Regular Expressions (ERE).

  * Here, operators like `+` (one or more) and `|` (alternation) work *without* a backslash.
  * Example: `grep -E 'cat|dog' file.txt` matches lines with either “cat” or “dog”.
  * Example: `grep -E 'a+'` matches one or more “a”.

* `grep` (BRE) is stricter, you must escape.
* `grep -E` (ERE) is friendlier for regex like `|` and `+`.


3. `sort -t, -k3,3nr`

   * Sort by 3rd column (abs log return), numeric, reverse (largest first).

4. `head -n 5`

   * Take top 5 rows.

Effect: the 5 biggest up or down days for NVDA, sorted by absolute return.

---

`sort` → group identical tickers together.

`uniq -c` → collapse duplicates and count how many there were.

In [17]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

set +o pipefail
# All rows for NVDA OR MSFT (extended regex with alternation)
grep -E '^(NVDA|MSFT),' data/raw/prices.csv | head -n 3  # -E: extended regex

# Rows where ticker starts with a vowel (A, E, I, O, U)
grep -E '^(A|E|I|O|U)[A-Z]*,' data/raw/prices.csv | head -n 3  #[A-Z]*: 0+ uppercase letters

# Count rows per ticker quickly (just for demo)
cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort | uniq -c | head

MSFT,2020-01-01,99.86247367796362,593024,0.0
MSFT,2020-01-02,99.8551257815958,2835784,-7.358286291303529e-05
MSFT,2020-01-03,98.54112146451733,1525535,-0.013246455506441102
AAPL,2020-01-01,100.00123016092391,4457901,0.0
AAPL,2020-01-02,100.30042606816971,2664190,0.00298745537508438
AAPL,2020-01-03,100.02584117375997,4100245,-0.0027413785536225532
    180 AAPL
    180 AMZN
    180 BAC
    180 CSCO
    180 CVX
    180 DIS
    180 GOOGL
    180 HD
    180 INTC
    180 JNJ


```
mkdir -p path/to/dir
```
* “parents”: create any parent directories as needed.

* No error if existing: if the directory already exists, it just does nothing (and exits successfully).

## 1. **Replace ISO date dashes with slashes**
`sed`: Steam EDitor. Unix command-line tool for sarching,filtering,and transforming text streams.
```bash
sed -i '1!s/\([0-9]\{4\}\)-\([0-9]\{2\}\)-\([0-9]\{2\}\)/\1\/\2\/\3/g' data/interim/prices_copy.csv
```

* **`sed -i`** → edit the file *in place*.
* **`1!`** → apply the command to all lines *except line 1* (skip the header).
* **`s/.../.../g`** → substitution: find a pattern, replace it globally in each line.
* Regex:

  * `\([0-9]\{4\}\)` = capture 4 digits (year).
  * `-` = literal dash.
  * `\([0-9]\{2\}\)` = capture 2 digits (month).
  * Another dash.
  * `\([0-9]\{2\}\)` = capture 2 digits (day).
* Replacement: `\1/\2/\3`

  * Use capture groups 1, 2, 3 (year, month, day) separated by `/`.

Effect:
`2020-01-02` → `2020/01/02`
(but leaves header untouched).

---
* `sed` uses **Basic Regular Expressions (BRE)** by default.
* In BRE:

  * `(...)` (grouping) must be written as `\(...\)` (escaped).
  * `{n}` (quantifier) must be written as `\{n\}` (escaped).
* So `\([0-9]\{4\}\)` means “capture 4 digits.”

If you used `sed -E` (Extended RE mode), you could drop most backslashes:

```bash
sed -E '1!s/([0-9]{4})-([0-9]{2})-([0-9]{2})/\1\/\2\/\3/g' file.csv
```
* A **group** (aka “capture group”) in regex is a part of the pattern wrapped in parentheses.
* It “remembers” what matched inside it.
* Later, you can **reuse** it in the replacement string as `\1`, `\2`, `\3`, etc.

## 2. **Normalize ticker to lowercase (first column)**

1. **Keep the header**

   ```bash
   head -n 1 data/interim/prices_copy.csv > data/interim/prices_lower.csv
   ```

   * Take just the header line and save it as the new file.

2. **Process the body (data rows)**

   ```bash
   tail -n +2 data/interim/prices_copy.csv \
   | awk -F, 'BEGIN{OFS=","}{ $1=tolower($1); print }' >> data/interim/prices_lower.csv
   ```

   * `tail -n +2` → skip header, output from 2nd line onward.
   * `awk -F,` → split fields on commas.
   * `BEGIN{OFS=","}` → set output field separator as comma (preserve CSV format).
   * `{ $1=tolower($1); print }` → convert the first field (ticker) to lowercase, then print the whole row.
   * `>>` appends to the new file.

* Header stays the same.
* All tickers (first column) become lowercase (`AAPL` → `aapl`).

---


In [18]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

# Make a copy so we don't touch the raw file
cp data/raw/prices.csv data/interim/ || mkdir -p data/interim && cp data/raw/prices.csv data/interim
cp data/interim/prices.csv data/interim/prices_copy.csv

# Replace ISO date dashes with slashes (2020-01-02 -> 2020/01/02) in-place
sed -i '1!s/\([0-9]\{4\}\)-\([0-9]\{2\}\)-\([0-9]\{2\}\)/\1\/\2\/\3/g' data/interim/prices_copy.csv

# Normalize ticker to lowercase (first column) using sed's capture groups and tolower via awk (hybrid example)
head -n 1 data/interim/prices_copy.csv > data/interim/prices_lower.csv
tail -n +2 data/interim/prices_copy.csv | awk -F, 'BEGIN{OFS=","}{ $1=tolower($1); print }' >> data/interim/prices_lower.csv

head -n 3 data/interim/prices_lower.csv

ticker,date,adj_close,volume,log_return
aapl,2020/01/01,100.00123016092391,4457901,0.0
aapl,2020/01/02,100.30042606816971,2664190,0.00298745537508438


cp: cannot create regular file 'data/interim/': Not a directory


* **awk** is named after its original authors:

  * **A**lfred V. Aho
  * **P**eter J. **W**einberger
  * **B**rian W. **K**ernighan

So “awk” is their initials.
It’s both a **programming language** (for text/data processing) and a **command-line tool**.

---

## 2. Syntax in `awk`

The general structure is:

```bash
awk 'pattern { action }' file
```

* **`pattern`** → condition to match (like a filter).
* **`{ action }`** → what to do when the pattern matches.
* Both are optional:

  * If you omit `pattern`, action applies to every line.
  * If you omit `{ action }`, default action is `print $0` (print the whole line).

### Special symbols:

* **`$1, $2, ...`** → fields (columns) in the current line, split by `-F` delimiter.
* **`$0`** → the whole line.
* **`NR`** → current line number.
* **`{ ... }`** → block of code (can have multiple statements separated by `;`).
* **`END { ... }`** → run once, after processing all lines.

---

## 3. What is `-k2,2nr` in `sort`
general form:
```
sort -kM,N
```

`M` = starting field number.

`N` = ending field number.

If you give both, the sort key is everything from field `M` through field `N`.

If you only give `-kM`, then the key runs from field `M` all the way to the end of the line.

Fields are determined by the delimiter (`-t`).
* `-t,` → use **comma** as field separator.
* `-k2,2` → sort by **column 2 only**.
* `n` → numeric sort (treat “10” as 10, not as string).
* `r` → reverse order (largest first).

So `-k2,2nr` = “sort by **2nd column**, numerically, descending”.

---

* By default, `head` show  **10 lines**.
* You can override with `-n N`.

  * Example: `head -n 5 file.txt` shows the first 5 lines.

### (a) Compute mean log\_return per ticker
“Group by ticker, compute average log return, rank them.”
```bash
awk -F, 'NR>1 { sum[$1]+=$5; n[$1]++ }
         END { OFS=","; print "ticker","mean_log_return";
               for (t in sum) print t, sum[t]/n[t] }' data/raw/prices.csv \
| sort -t, -k2,2nr | head
```
* `-F,` → field separator is a comma (CSV).
* `NR>1` → skip header line (NR = record number).
* `{ sum[$1]+=$5; n[$1]++ }` →

  * `$1` = ticker (first column).
  * `$5` = log\_return (fifth column).
  * For each ticker, accumulate `sum[ticker]` and count `n[ticker]`.
* `END { ... }` → after reading all rows:

  * `OFS=","` → output field separator = comma.
  * Print a header row.
  * For each ticker `t`, print `t, average = sum[t]/n[t]`.
* Pipe to `sort -t, -k2,2nr`:

  * `-t,` → use comma as delimiter.
  * `-k2,2nr` → sort by column 2 (mean) numerically, descending.
* `head` → show top few.

 Effect: top tickers ranked by average log return.

---

### (b) Top 5 dates with highest absolute log\_return for NVDA
“Filter NVDA, compute absolute returns, find the 5 most extreme days.”
```bash
awk -F, 'NR>1 && $1=="NVDA" { print $2, $5 }' data/raw/prices.csv \
| awk '{ if ($2<0) s=-$2; else s=$2; print $1","$2","s }' \
| sort -t, -k3,3nr | head -n 5
```
1. `awk -F, 'NR>1 && $1=="NVDA" { print $2, $5 }'`

   * Skip header.
   * Only rows where ticker (`$1`) is NVDA.
   * Print date (`$2`) and log\_return (`$5`).

2. Pipe into another `awk`:

   ```awk
   { if ($2<0) s=-$2; else s=$2; print $1","$2","s }
   ```

   * Compute absolute value of log\_return (`s`).
   * Print: `date,log_return,abs_log_return`.


In [19]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

# Compute mean log_return per ticker (skip header). -F, sets comma as field separator.
awk -F, 'NR>1 { sum[$1]+=$5; n[$1]++ } END { OFS=","; print "ticker","mean_log_return"; for (t in sum) print t, sum[t]/n[t] }' data/raw/prices.csv \
| sort -t, -k2,2nr | head

# Top 5 dates with highest absolute log_return for NVDA
awk -F, 'NR>1 && $1=="NVDA" { print $2, $5 }' data/raw/prices.csv \
| awk '{ if ($2<0) s=-$2; else s=$2; print $1","$2","s }' \
| sort -t, -k3,3nr | head -n 5

WMT,2.24177e-05
PFE,0.00129911
T,0.000994674
ORCL,0.00073139
V,0.000592352
GOOGL,0.000489848
NFLX,0.000474115
JPM,0.000342749
AMZN,0.000134249
ticker,mean_log_return
2020-03-11,0.028288986337162925,0.028288986337162925
2020-06-08,-0.027802932151547388,0.0278029
2020-08-20,0.02705506369509436,0.02705506369509436
2020-07-23,0.026552935706134484,0.026552935706134484
2020-06-09,0.026508047390848333,0.026508047390848333


* `comm file1 file2` → compares **two sorted files line by line**.
* It outputs **three columns**:

  1. Lines only in `file1`.
  2. Lines only in `file2`.
  3. Lines in both.

You can suppress columns with options:

* `-1` → suppress col 1 (only in file1).
* `-2` → suppress col 2 (only in file2).
* `-3` → suppress col 3 (common lines).

* `comm -23 file1 file2`

  * Suppress col 2 and 3 → show **only lines unique to file1**.
* `comm -13 file1 file2`

  * Suppress col 1 and 3 → show **only lines unique to file2**.

```bash
sed 's/^/  /'
```

* `s/^/  /` = substitute start of line (`^`) with two spaces.
* Effect: indent each output line by 2 spaces → makes results more readable.

**Important:** `comm` requires both input files to be **sorted**. If they aren’t, results will be wrong.

In [20]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

# Unique tickers actually present in the file
cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort | uniq > data/interim/tickers_in_data.txt

# Compare to our canonical list from tickers_25.csv
cut -d, -f1 tickers_25.csv | tail -n +2 | sort > data/interim/tickers_25.txt

echo "Only in data:"; comm -23 data/interim/tickers_in_data.txt data/interim/tickers_25.txt | sed 's/^/  /'
echo "Only in canonical:"; comm -13 data/interim/tickers_in_data.txt data/interim/tickers_25.txt | sed 's/^/  /'

Only in data:
Only in canonical:


```bash
find data -type f -name "*.csv" -printf "%p,%s bytes\n" | sort | head
```

* `find data -type f -name "*.csv"` → find all regular files ending with `.csv` under `data/`.
* `-printf "%p,%s bytes\n"` → custom output:

  * `%p` = path of the file.
  * `%s` = file size in bytes.
* `| sort` → sort results alphabetically (by path).
* `| head` → show only the first 10 results.

Effect: a neat file listing with paths and sizes.
Note: `-printf` is GNU (GNU's not unix) `find` only; on macOS/Colab (BSD: Berkeley Software Distribution), you’d use `-exec stat` instead.

---

```bash
find data -type f -name "*.csv" -print0 | xargs -0 -I{} sh -c 'echo -n "{},"; wc -l < "{}"'
```

* `find ... -print0` → prints file paths separated by **nulls** (`\0`), safe for weird filenames (with spaces, quotes, etc.).
* `xargs -0` → reads null-separated paths. `xargs` builds and executes command lines from stdin.
* `-I{}` → replace `{}` with each filename in the command.
* `sh -c 'echo -n "{},"; wc -l < "{}"'` →

  * `echo -n "{},"` → print filename followed by a comma (no newline).
  * `-n`: no new line (in the same line).
  * `wc -l < "{}"` → count lines in the file, print number.
  * -c: scripts

Effect: outputs `filename.csv,<linecount>` for each CSV.

```bash
find data -type f -name "*.csv" -size +1000k -print0 | xargs -0 -I{} gzip -kf "{}"
```

* `-size +1000k` → match files larger than **1000 kilobytes ≈ 1 MB**.
* `-print0 | xargs -0` → null-safe passing to `gzip`.
* `gzip -kf "{}"` →

  * `-k` = keep original file (don’t delete after compressing).
  * `-f` = force overwrite if `.gz` already exists.

Effect: creates `filename.csv.gz` alongside the original CSV for any file bigger than \~1MB.

---

* **GNU find** → standard on Linux (e.g., Ubuntu, Debian, Colab).
* **BSD find** → standard on BSD-based systems like macOS and FreeBSD.

On macOS, you use the `stat` command inside `find`:

```bash
find data -type f -name "*.csv" -exec stat -f "%N,%z bytes" {} \;
```

* `-exec ... {} \;` → run `stat` on each found file.
* `-f "%N,%z bytes"` → BSD `stat` format string:

  * `%N` = filename
  * `%z` = file size in bytes

Example output:

```
data/raw/prices.csv,123456 bytes
```
On Linux → you typically get GNU utilities (e.g. GNU find with -printf).

On macOS → you get BSD utilities (e.g. BSD find without -printf).

Perfect — let’s go step by step.

---

### Syntax of `find`

General form:

```bash
find [path...] [options] [tests] [actions]
```

* **`[path...]`** → where to start searching (default is `.`).
* **`[options]`** → control things like depth or following symlinks.
* **`[tests]`** → conditions that must be true for a file to match (e.g. name, type, size).
* **`[actions]`** → what to do with each match (e.g. print, exec, delete).

---

### Common **tests**

* `-name "*.csv"` → filename matches pattern.
* `-type f` → regular file.
* `-type d` → directory.
* `-size +1000k` → larger than 1000 KB.
* `-mtime -7` → modified within last 7 days.

---

### Common **actions**

* `-print` → print the file path (default if no action given).
* `-printf` (GNU only) → custom output format.
* `-exec command {} \;` → run a command on each file (`{}` replaced with filename).
* `-delete` → remove matching files.

---

### Examples

```bash
# 1. Find all CSVs under data/
find data -type f -name "*.csv"

# 2. Find large files (>1MB)
find . -type f -size +1000k

# 3. Find and delete .tmp files
find . -type f -name "*.tmp" -delete

# 4. Find files and run wc -l on each
find . -type f -name "*.csv" -exec wc -l {} \;
```
---

## Difference between `' '` (single quotes) and `" "` (double quotes)

* **Single quotes `'...'`**

  * Take everything **literally**.
  * No variable expansion, no backslash escapes (except `'\''` trick).
  * Example:

    ```bash
    name=world
    echo 'Hello $name'   # → Hello $name
    ```

* **Double quotes `"..."`**

  * Allow **expansion** of variables, command substitution, and some escapes.
  * Example:

    ```bash
    name=world
    echo "Hello $name"   # → Hello world
    ```





In [21]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

# Show all CSVs under data/, printing sizes
find data -type f -name "*.csv" -printf "%p,%s bytes\n" | sort | head

# Count lines in each CSV (null-safe for weird filenames)
find data -type f -name "*.csv" -print0 | xargs -0 -I{} sh -c 'echo -n "{},"; wc -l < "{}"'

# Gzip-compress any CSV larger than ~1MB (demo threshold: 1e6 bytes)
find data -type f -name "*.csv" -size +1000k -print0 | xargs -0 -I{} gzip -kf "{}"  # -k keeps original

data/interim/prices_copy.csv,282169 bytes
data/interim/prices.csv,282169 bytes
data/interim/prices_lower.csv,282169 bytes
data/raw/prices.csv,282169 bytes
data/raw/prices_sample.csv,280 bytes
data/raw/prices.csv,4501
data/raw/prices_sample.csv,5
data/interim/prices.csv,4501
data/interim/prices_copy.csv,4501
data/interim/prices_lower.csv,4501


### Creating the script with a here-doc

```bash
cat > scripts/qa_csv.sh << 'EOF'
...
EOF
chmod +x scripts/qa_csv.sh
```

* Writes everything between `<< 'EOF'` and `EOF` to `scripts/qa_csv.sh`.
* **Quotes around `EOF`** are important: they **prevent variable/command expansion** while writing, so `$1`, `$FILE`, etc. are preserved literally.
* Here document (<<): a way to feed a block of text to a command’s standard input.

* cat > file.sh → runs cat and redirects its stdout into file.sh. Normally, cat just copies its stdin to stdout.

* << 'EOF' ... EOF → the lines in between are treated as stdin for cat.

So the effect: everything between EOF markers is written to file.sh.

---

```bash
#!/usr/bin/env bash
# Simple CSV health check
# Usage: scripts/qa_csv.sh path/to/file.csv required_columns_csv
set -euo pipefail
IFS=$'\n\t'
```

* Shebang(sh+bang: bang: `!`) picks bash from PATH.
* `IFS=$'\n\t'` set IFS (Internal Field Separator) to newline + tab. makes word-splitting safer (don’t split on spaces by default).
* $'...' is a special ANSI C quoting form in bash.
Inside it, escape sequences like `\n` (newline), `\t` (tab), `\x41` (hex A) are interpreted.
Without the `$`, '...' is just literal characters.



### Inputs & defaults

```bash
FILE="${1:-}"
REQUIRED="${2:-ticker,date,adj_close,volume,log_return}"
```

* `$1` first positional paramter passed to the script. It is the CSV path; if missing, empty string.
* `$2` is a comma-separated list of required columns; default shown.
* `${var:-default}`:
Use `$var` if set and non-empty; otherwise use default. `:-` requies to use `{ }`.
  *`${var-default}` → use default if unset (empty counts as set)

  *`${var:-default}` → use default if unset or empty

  * `${var:?msg}` → error (with msg) if unset/empty

  * `${var:+alt}` → use alt if var is set (else empty)

### Tiny error helper

```bash
err() { echo "ERROR: $*" >&2; exit 1; }
[[ -z "$FILE" ]] && err "No CSV file provided."
[[ ! -f "$FILE" ]] && err "File not found: $FILE"
```

* `err` prints to stderr (2) and exits. stdout: 1. `>&2`: send it to stderr.
* Basic presence & path checks.
* `$*` = all positional parameters (`$1` `$2` `$3` ...).
* In "${*}", they’re joined into one string.

* `$*` → expands to all args as one string (joined by IFS).

* `$@` → expands to each arg separately (safe for iteration).
* `[[ ... ]]` is bash’s test command (safer than [ ... ]): Supports regex matching.

* `-z` = “string length is zero.” So `[[ -z "$FILE" ]]` means “is `$FILE` empty?”

* Double quotes around `$FILE`:
Prevent errors if `$FILE` is unset or contains spaces.
Without quotes, `[[ -z $FILE ]]` could break if $FILE is empty or has spaces.

* `-f` = “is this a regular file?”
`[[ -f "$FILE" ]]` → true if $FILE exists and is a file.

### 1) Non-empty file with header

```bash
LINES=$(wc -l < "$FILE" || true)
[[ "${LINES:-0}" -lt 2 ]] && err "File has <2 lines (missing data?): $FILE"
HEADER=$(head -n 1 "$FILE")
```

* `wc -l < "$FILE"` counts lines; `|| true` prevents strict-mode exit if `wc` failed (defensive).
* `<2` lines → likely missing data row(s).
* `HEADER` grabs the first line to inspect column names.

### 2) Check required columns exist (exact token match)

```bash
IFS=',' read -r -a req <<< "$REQUIRED"
for col in "${req[@]}"; do
  echo "$HEADER" | grep -q -E "(^|,)${col}(,|$)" || err "Missing required column: $col"
done
```
* Temporary IFS: `IFS=',' cmd ...` sets IFS for that command only (and its children).
In bash, a VAR=value cmd prefix is a per-command environment assignment; it does not permanently change the parent shell’s IFS.

`read`: reads one line from stdin and splits it into fields using IFS.

`-r`: don’t treat backslashes as escapes (reads text literally).

`-a req`: put the split fields into array `req` (`req[0], req[1]`, …).

* `<<<` Feeds the string `REQUIRED` into the previous command; Splits `REQUIRED` into array `req`.
* The regex `(^|,),col,(|$)` ensures the **whole header token** equals `col` (not a substring).
* `"${req[@]}"`  req is an array.
`${req[@]}` expands to each element of the array separately.

* Contrast: `${req[*]}` expands to all elements as one string.

`grep -q -E "(^|,)${col}(,|$)"`

`-q` = quiet (suppress output, just return success/failure).

`-E` = extended regex (so | works without backslashes).

Regex breakdown:

  * `(^|,)` → start of line OR a comma before the column name.

  * `${col}` → variable with column name.

  * `(,|$)` → comma after the name OR end of line.


### 3) Basic NA/blank checks for numeric columns

```bash
NUMERIC="adj_close,volume,log_return"
IFS=',' read -r -a nums <<< "$NUMERIC"
for col in "${nums[@]}"; do
  # find column index (1-based)
  idx=$(awk -F, -v COL="$col" 'NR==1{for(i=1;i<=NF;i++) if($i==COL) print i}' "$FILE")
  [[ -z "${idx:-}" ]] && err "Column not found: $col"

  # count blanks/"NA" (rows 2+) in that column
  bad=$(awk -F, -v I="$idx" 'NR>1 && ($I=="" || $I=="NA") {c++} END{print c+0}' "$FILE")
  [[ "$bad" -gt 0 ]] && err "Found $bad blank/NA in column: $col"
done
```

* First `awk` scans the **header row** to find the index of `col`.
* Second `awk` scans data rows (`NR>1`) and increments for empty strings "" or literal `NA`.
* Fails the script if any blanks/`NA` found.
* `-v I="$idx"` `-v`: sets an `awk` variable before the program starts. passes the column index into awk as variable I.

* `NR>1` → skip header.

* `($I=="" || $I=="NA")` → check if that column is blank or NA.


* `END{print c+0}` → after finishing file, print the counter (defaults to 0).

* If `bad > 0`, error out.

---



In [22]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
mkdir -p scripts

cat > scripts/qa_csv.sh << 'EOF'
#!/usr/bin/env bash
# Simple CSV health check
# Usage: scripts/qa_csv.sh path/to/file.csv required_columns_csv
set -euo pipefail
IFS=$'\n\t'

FILE="${1:-}"
REQUIRED="${2:-ticker,date,adj_close,volume,log_return}"

err() { echo "ERROR: $*" >&2; exit 1; }
[[ -z "$FILE" ]] && err "No CSV file provided."
[[ ! -f "$FILE" ]] && err "File not found: $FILE"

# 1) Non-empty and header present
LINES=$(wc -l < "$FILE" || true)
[[ "${LINES:-0}" -lt 2 ]] && err "File has <2 lines (missing data?): $FILE"

HEADER=$(head -n 1 "$FILE")
# 2) All required columns present
IFS=',' read -r -a req <<< "$REQUIRED"
for col in "${req[@]}"; do
  echo "$HEADER" | grep -q -E "(^|,)${col}(,|$)" || err "Missing required column: $col"
done

# 3) No obvious NA/blank values in required numeric cols (basic check)
NUMERIC="adj_close,volume,log_return"
IFS=',' read -r -a nums <<< "$NUMERIC"
for col in "${nums[@]}"; do
  # find column index
  idx=$(awk -F, -v COL="$col" 'NR==1{for(i=1;i<=NF;i++) if($i==COL) print i}' "$FILE")
  [[ -z "${idx:-}" ]] && err "Column not found: $col"
  # check any blank values from row 2 onward
  bad=$(awk -F, -v I="$idx" 'NR>1 && ($I=="" || $I=="NA") {c++} END{print c+0}' "$FILE")
  [[ "$bad" -gt 0 ]] && err "Found $bad blank/NA in column: $col"
done

echo "OK: $FILE passed basic CSV QA ($LINES lines)."
EOF

chmod +x scripts/qa_csv.sh

In [23]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
scripts/qa_csv.sh data/raw/prices.csv

OK: data/raw/prices.csv passed basic CSV QA (4501 lines).


In [24]:
!scripts/qa_csv.sh data/interim/prices_lower.csv ticker,date,adj_close

OK: data/interim/prices_lower.csv passed basic CSV QA (4501 lines).


# Homework

In [25]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
mkdir -p reports data/interim

# 1) Count lines and unique tickers
{
  echo "Lines (incl header): $(wc -l < data/raw/prices.csv)";
  echo "Unique tickers: $(cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort | uniq | wc -l)";
} | tee reports/data_counts.txt

# 2) Top-10 days by absolute log_return across all tickers
tail -n +2 data/raw/prices.csv \
| awk -F, '{a=$5; if(a<0) a=-a; print $1","$2","$5","a}' \
| sort -t, -k4,4nr | head -n 10 \
| tee reports/top10_abs_moves.csv || true

# 3) Mean log_return per ticker (CSV)
awk -F, 'NR>1 { s[$1]+=$5; n[$1]++ } END { OFS=","; print "ticker,mean_log_return"; for(t in s) print t, s[t]/n[t] }' \
  data/raw/prices.csv | sort -t, -k2,2nr | tee reports/mean_return_by_ticker.csv

Lines (incl header): 4501
Unique tickers: 25
XOM,2020-05-08,-9.545439969294023e-05,9.54544e-05
NFLX,2020-02-26,9.243919974366577e-05,9.243919974366577e-05
HD,2020-07-10,9.230948293748042e-05,9.230948293748042e-05
PFE,2020-06-23,8.716362939864553e-05,8.716362939864553e-05
BAC,2020-06-04,-8.67879702681762e-05,8.6788e-05
CSCO,2020-06-18,8.461770772161259e-05,8.461770772161259e-05
VZ,2020-01-21,-8.349958001385716e-05,8.34996e-05
AAPL,2020-08-03,7.959099184873253e-05,7.959099184873253e-05
MSFT,2020-01-02,-7.358286291303529e-05,7.35829e-05
JNJ,2020-02-19,6.050414894520628e-05,6.050414894520628e-05
WMT,2.24177e-05
PFE,0.00129911
T,0.000994674
ORCL,0.00073139
V,0.000592352
GOOGL,0.000489848
NFLX,0.000474115
JPM,0.000342749
AMZN,0.000134249
ticker,mean_log_return
NVDA,-0.000120328
CVX,-0.000129288
HD,-0.00015012
BAC,-0.000158227
MSFT,-0.000274487
CSCO,-0.000283718
META,-0.0004515
KO,-0.000466965
JNJ,-0.000634333
XOM,-0.0010357
TSLA,-0.00109702
PG,-0.00115799
DIS,-0.00119287
VZ,-0.00161965
AAPL,

In [26]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
mkdir -p data/interim

# Extract header once
HEADER=$(head -n 1 data/raw/prices.csv)

# Create per-ticker files with header + rows (null-safe not necessary here)
cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort | uniq | while read -r T; do
  mkdir -p "data/interim/ticker=${T}"
  {
    echo "$HEADER"
    awk -F, -v TK="$T" 'NR==1 || $1==TK' data/raw/prices.csv # change NR==1 to NR>1 to avoid writing the header twice
  } > "data/interim/ticker=${T}/prices_${T}.csv"
done

# Verify one example
ls -la data/interim/ticker=AAPL | head

total 12
-rw------- 1 root root 11487 Sep 20 00:27 prices_AAPL.csv


The total 12 is the sum of disk blocks used by files in that directory (12 × 1 KiB blocks ≈ 11,487 bytes, which matches the size shown).

In [27]:
# To see that many per-ticker folders/files were created, list the parent:
!ls -1 data/interim | head          # shows directories like ticker=AAPL, ticker=MSFT, ...
!find data/interim -maxdepth 2 -name 'prices_*.csv' | head


prices_copy.csv
prices.csv
prices_lower.csv
ticker=AAPL
ticker=AMZN
ticker=BAC
ticker=CSCO
ticker=CVX
ticker=DIS
ticker=GOOGL
data/interim/prices_copy.csv
data/interim/prices_lower.csv
data/interim/ticker=AAPL/prices_AAPL.csv
data/interim/ticker=AMZN/prices_AMZN.csv
data/interim/ticker=BAC/prices_BAC.csv
data/interim/ticker=CSCO/prices_CSCO.csv
data/interim/ticker=CVX/prices_CVX.csv
data/interim/ticker=DIS/prices_DIS.csv
data/interim/ticker=GOOGL/prices_GOOGL.csv
data/interim/ticker=HD/prices_HD.csv


In [28]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

# Append or create a Makefile
{
  echo ""
  echo "qa:"
  echo "\tscripts/qa_csv.sh data/raw/prices.csv"
  echo ""
  echo "split-by-ticker:"
  echo "\tbash -c 'HEADER=\$(head -n 1 data/raw/prices.csv); cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort | uniq | while read -r T; do mkdir -p data/interim/ticker=\$\$T; { echo \"\$\$HEADER\"; awk -F, -v TK=\"\$\$T\" '\"'NR==1 || \$1==TK'\"' data/raw/prices.csv; } > data/interim/ticker=\$\$T/prices_\$\$T.csv; done'"
} >> Makefile

cat Makefile

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c
.ONESHELL:


PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.py

The previous `Makefile` will lead to errors as the tab `\t` is not expanded, which is required.

Let's first delete the lines starting from `qa:`, then use a here-doc to fix it.

* `^qa:` = regex that matches any line **starting with `qa:`** (`^` anchors to line start).
* `,` = address range operator (“from … to …”).
* `$` = the **last line** of the file.
* So `/^qa:/,$` = “all lines from the first line that begins with `qa:` **through to the end of the file**.”
* `d` = **delete** those lines (in sed, delete means “skip printing them”).
* So everything from `qa:` through end is removed.

General command

```bash
sed [options] 'address command' file
```

or with multiple commands:

```bash
sed [options] 'address1 command1; address2 command2' file
```

* **`[options]`** → e.g. `-n` (suppress automatic printing), `-i` (edit in place).
* **`address`** → which lines the command applies to.

  * Single line number: `3` → apply to line 3.
  * Range: `5,10` → lines 5 through 10.
  * Regex: `/pattern/` → any line matching `pattern`.
  * Range with regex: `/start/,/end/`.

  * `d` → delete the line(s).
  * `p` → print the line(s).
  * `s/old/new/` → substitute.
  * `q` → quit.

Example:

```bash
sed '2,4d' file.txt   # delete lines 2 through 4
sed '/foo/s/bar/baz/' file.txt   # substitute "bar" with "baz" only on lines matching "foo"
```

In [29]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

sed -i '/^qa:/,$d' Makefile    # remove the erroneous lines. singel quote to prevent the shell to expand `$d$`
cat Makefile


# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c
.ONESHELL:


PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.py

* `-c`: read commands from a string.
* `@bash`: make will not echo the command itself
* Every $ that should reach the shell must be $$ in a Makefile.

In [30]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

cat >> Makefile <<'MAKE'   # note need to attach >>
qa:
	# TAB above!
	scripts/qa_csv.sh data/raw/prices.csv

split-by-ticker:
	@bash -c 'HEADER=$$(head -n 1 data/raw/prices.csv); \
	  cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort -u | \
	  while read -r T; do \
	    mkdir -p data/interim/ticker=$$T; \
	    { echo "$$HEADER"; \
	      awk -F, -v TK="$$T" '"'"'NR>1 && $$1==TK'"'"' data/raw/prices.csv; \
	    } > data/interim/ticker=$$T/prices_$$T.csv; \
	  done'

MAKE


In [31]:
# check if Makefile is successfully modified.
!cat Makefile

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c
.ONESHELL:


PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.py

In [32]:
!chmod +x scripts/qa_csv.sh

In [33]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
# chmod +x scripts/qa_csv.sh
make qa
make split-by-ticker

# TAB above!
scripts/qa_csv.sh data/raw/prices.csv
OK: data/raw/prices.csv passed basic CSV QA (4501 lines).


## Approach B (much simpler): make the shell for the target be bash and avoid `bash -c`

```make
SHELL := bash
.ONESHELL:

split-by-ticker:
	set -euo pipefail
	HEADER=$$(head -n 1 data/raw/prices.csv)
	cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort -u | while read -r T; do
	  mkdir -p "data/interim/ticker=$$T"
	  {
	    echo "$$HEADER"
	    awk -F, -v TK="$$T" 'NR>1 && $$1==TK' data/raw/prices.csv
	  } > "data/interim/ticker=$$T/prices_$$T.csv"
	done
```

Notes:

* `SHELL := bash` makes recipes run in bash (not `/bin/sh`).
* `.ONESHELL:` makes the **whole recipe run in one shell**, so variables like `HEADER` persist across lines, and you don’t need `bash -c` or the quote gymnastics.
* You still need **`$$`** for every `$` intended for the shell/awk.


## Approach C:  move the loop into a script:

```bash
# scripts/split_by_ticker.sh
#!/usr/bin/env bash
set -euo pipefail
HEADER=$(head -n 1 data/raw/prices.csv)
cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort -u | while read -r T; do
  mkdir -p "data/interim/ticker=$T"
  { echo "$HEADER"
    awk -F, -v TK="$T" 'NR>1 && $1==TK' data/raw/prices.csv
  } > "data/interim/ticker=$T/prices_$T.csv"
done
```

Then in Makefile:

```make
split-by-ticker:
	./scripts/split_by_ticker.sh
```

This avoids all Makefile quoting rules and is easiest to maintain.


In [34]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"

{
  echo "# Mini EDA (shell-only)"
  echo "Generated: $(date)"
  echo
  echo "## Counts"
  echo "Lines (incl header): $(wc -l < data/raw/prices.csv)"
  echo "Unique tickers: $(cut -d, -f1 data/raw/prices.csv | tail -n +2 | sort | uniq | wc -l)"
  echo
  echo "## Top 5 absolute daily moves"
  tail -n +2 data/raw/prices.csv \
  | awk -F, '{a=$5; if(a<0) a=-a; print $1","$2","$5","a}' \
  | sort -t, -k4,4nr | head -n 5
  echo
  echo "## Mean log_return by ticker (top 10)"
  awk -F, 'NR>1 { s[$1]+=$5; n[$1]++ } END { for(t in s) printf "%s,%.6f\n", t, s[t]/n[t] }' \
    data/raw/prices.csv | sort -t, -k2,2nr | head -n 10
} | tee reports/mini_eda.txt || True

# Mini EDA (shell-only)
Generated: Sat Sep 20 12:27:55 AM UTC 2025

## Counts
Lines (incl header): 4501
Unique tickers: 25

## Top 5 absolute daily moves
XOM,2020-05-08,-9.545439969294023e-05,9.54544e-05
NFLX,2020-02-26,9.243919974366577e-05,9.243919974366577e-05
HD,2020-07-10,9.230948293748042e-05,9.230948293748042e-05
PFE,2020-06-23,8.716362939864553e-05,8.716362939864553e-05
BAC,2020-06-04,-8.67879702681762e-05,8.6788e-05

## Mean log_return by ticker (top 10)
PFE,0.001299
T,0.000995
ORCL,0.000731
V,0.000592
GOOGL,0.000490
NFLX,0.000474
JPM,0.000343
AMZN,0.000134
WMT,0.000022
INTC,-0.000099
