In [None]:
# from google.colab import drive
# drive.flush_and_unmount()           # ignore errors if already unmounted

#If cannot remount, simply delete the mounted drive and then remount
# rm -rf /content/drive


Drive not mounted, so nothing to flush and unmount.


In [None]:
# Colab cell
from google.colab import drive

drive.mount('/content/drive', force_remount=True)



Mounted at /content/drive


In [None]:
# Adjust these two for YOUR repo
REPO_OWNER = "ywanglab"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1
BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

# if on my office computer

# REPO_NAME  = "lectureNotes"   # e.g., on my office computer
# BASE_DIR = r"E:\OneDrive - Auburn University Montgomery\teaching\AUM\STAT 4160 Productivity Tools" # on my office computer
# CLONE_DIR  = f"{BASE_DIR}\{REPO_NAME}"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [28]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


* `glob` is a Python **standard library module** for Unix-style pathname pattern expansion (wildcards).
* `glob.glob(pattern)` returns a **list of pathnames** matching the given `pattern`.

### 2. Wildcards you can use

* `*` → matches **any number of characters** (including none).

  * `"*.csv"` → all CSV files.
* `?` → matches **exactly one character**.

  * `"file?.txt"` → `file1.txt`, `fileA.txt` but not `file10.txt`.
* `[abc]` → matches one character from the set.

  * `"file[12].txt"` → `file1.txt`, `file2.txt`.
* `[0-9]` → range inside brackets.

  * `"file[0-9].txt"` → `file0.txt` … `file9.txt`.

### 4. Key options

* `recursive=True` → lets `**` mean “match in all subdirectories.”

  ```python
  glob.glob("**/*.py", recursive=True)  
  # finds all .py files in current directory tree
  ```
* Returns a **list of strings** (file paths).
* The list order is arbitrary; use `sorted(...)` if you need consistency.

## `grep`
`grep` searches text for lines that match a pattern (string or regex) and prints the matching lines. It’s your go-to tool for “find in files” on the command line.

## Basic forms

```bash
grep "needle" file.txt          # print lines containing needle
grep -i "needle" file.txt       # case-insensitive
grep -n "needle" file.txt       # show line numbers
grep -r "needle" path/          # recursive through directories
grep -R "needle" path/          # like -r, also follows symlinks
```

## Regex vs fixed string

```bash
grep -E "foo|bar" file.txt      # extended regex (egrep; preferred)
grep -F "a?b*c" file.txt        # fixed string (treats metacharacters literally; fgrep)
```

## Show only what you need

```bash
grep -l "needle" -r path/       # list filenames with a match
grep -L "needle" -r path/       # list filenames with NO match
grep -c "needle" file.txt       # count matches per file
grep -oE "[A-Z]{3}[0-9]{2}" f   # print only the matched part(s)
```


## Exit codes (useful in scripts/CI)

* `0` = found at least one match
* `1` = no matches
* `>1` = error (e.g., unreadable file)


### Notes

* `egrep` and `fgrep` are legacy names; use `grep -E` and `grep -F`.
* Regex metacharacters: `. ^ $ * + ? ( ) [ ] { } |` (escape with `\` when needed).
* For binary-safe fixed searches (e.g., large data files), prefer `grep -F`.




In [None]:
from pathlib import Path
import pandas as pd, numpy as np, datetime as dt
import glob

raw_candidates = []
if Path("data/raw/prices.csv").exists():
    raw_candidates = ["data/raw/prices.csv"]
else:
    raw_candidates = sorted(glob.glob("data/raw/prices*.csv")) or sorted(glob.glob("data/raw/prices/*.csv"))

def _make_synthetic_prices():
    # Small 2-year synthetic daily prices for AAPL/MSFT/GOOGL
    tickers = ["AAPL","MSFT","GOOGL"]
    dates = pd.bdate_range("2022-01-03", periods=520, freq="B")
    rows = []
    rng = np.random.default_rng(0)
    for t in tickers:
        price = 100 + rng.normal(0, 1).cumsum()
        price = np.maximum(price, 1.0)
        vol = rng.integers(5e6, 2e7, size=len(dates)) #[low, high)
        df = pd.DataFrame({
            "date": dates,
            "ticker": t,
            "open": price * (1 + rng.normal(0, 0.002, size=len(dates))),
            "high": price * (1 + rng.normal(0.003, 0.003, size=len(dates))).clip(min=1), #set all values below 1 to 1, leave others unchanged.
            "low":  price * (1 - np.abs(rng.normal(0.003, 0.003, size=len(dates)))),
            "close": price,
            "adj_close": price * (1 + rng.normal(0, 0.0005, size=len(dates))),
            "volume": vol
        })
        rows.append(df)
    out = pd.concat(rows, ignore_index=True)
    Path("data/raw").mkdir(parents=True, exist_ok=True)
    out.to_csv("data/raw/prices.csv", index=False)
    return ["data/raw/prices.csv"]

if not raw_candidates:
    print("No raw prices found; creating a small synthetic dataset...")
    raw_candidates = _make_synthetic_prices()

raw_candidates

['data/raw/prices.csv']

In [None]:
from pathlib import Path
meta_path = Path("data/static/tickers.csv")
if meta_path.exists():
    meta = pd.read_csv(meta_path)
else:
    # Build a minimal metadata table from raw tickers
    tmp = pd.read_csv(raw_candidates[0])
    tickers = sorted(pd.unique(tmp["ticker"]))
    meta = pd.DataFrame({"ticker": tickers,
                         "name": tickers,
                         "sector": ["Unknown"]*len(tickers)})
    Path("data/static").mkdir(parents=True, exist_ok=True)
    meta.to_csv(meta_path, index=False)
meta.head()

Unnamed: 0,ticker,name,sector
0,AAPL,AAPL,Unknown
1,AMZN,AMZN,Unknown
2,BAC,BAC,Unknown
3,CSCO,CSCO,Unknown
4,CVX,CVX,Unknown



```python
s = re.sub(r"[^\w\s]", "_", s)
```

* Pattern: `[^\w\s]`

  * `\w` = “word characters” (`[A-Za-z0-9_]`)
  * `\s` = whitespace
  * `^` inside brackets = negation
* So `[^\w\s]` = **any character that is NOT a word character and NOT whitespace**.
* Replace those with `_`.
* Effect: strip punctuation and special symbols, turn them into underscores.

**Example:**

```python
"Price%Change!".sub(...)  →  "Price_Change_"
```

---


```python
s = re.sub(r"\s+", "_", s.strip().lower())
```

* First `s.strip().lower()` → trims leading/trailing spaces and makes lowercase.
* Regex: `\s+` = one or more whitespace characters.
* Replace with `_`.
* Effect: turn spaces (tabs, newlines, etc.) into underscores, collapse runs of them into one.

**Example:**

```python
"  Stock Price Change ".sub(...)  →  "stock_price_change"
```

---


```python
s = re.sub(r"_+", "_", s)
```

* Regex: `_+` = one or more underscores in a row.
* Replace with a single `_`.
* Effect: collapse multiple underscores into just one.

**Example:**

```python
"stock__price___change".sub(...)  →  "stock_price_change"
```



```python
pd.to_datetime(out["date"], errors="coerce")
```

 The option `errors="coerce"`

* Controls what happens if a value **cannot** be parsed as a date.
* Choices:

  * `"raise"` (default) → throw an error if parsing fails.
  * `"ignore"` → return the original value unchanged if parsing fails.
  * `"coerce"` → invalid parsing becomes `NaT` (Not a Time, like `NaN` for dates).

So with `"coerce"`, you won’t crash on bad data — instead, those entries become `NaT`.

---

### 4. Example

```python
import pandas as pd

s = pd.Series(["2020-01-01", "06/30/2020", "not_a_date", None])

parsed = pd.to_datetime(s, errors="coerce")
print(parsed)
```

Output:

```
0   2020-01-01
1   2020-06-30
2          NaT
3          NaT
dtype: datetime64[ns]
```


```python
out["volume"] = out["volume"].round().astype("Int64")  # nullable int
```


* **`.astype("Int64")`**
  → Converts the column to **pandas’ nullable integer dtype**, `Int64` (capital `I`).
  This is different from NumPy’s plain `int64`.

  * `int64` cannot hold missing values (`NaN`).
  * `Int64` can hold **`<NA>`** (pandas’ missing marker).

So now `volume` is an integer column that can still represent missing data.

---

### 2.

```python
out.loc[out["volume"] < 0, "volume"] = pd.NA
```

* **`out["volume"] < 0`** → Boolean mask of rows where `volume` is negative.
* **`.loc[..., "volume"]`** → Select those rows in the `volume` column.
* **`= pd.NA`** → Replace them with pandas’ `NA` (nullable missing value).


### 3. Example

```python
import pandas as pd
import numpy as np

out = pd.DataFrame({"volume": [1.2e7, -5000, 8.9e6, np.nan]})

out["volume"] = out["volume"].round().astype("Int64")
out.loc[out["volume"] < 0, "volume"] = pd.NA

print(out)
print(out.dtypes)
```

Output:

```
     volume
0  12000000
1      <NA>
2   8900000
3      <NA>

volume    Int64
dtype: object
```


```python
out.reset_index(drop=True)
```

* `reset_index()` **resets the index back to the default `RangeIndex` (0,1,2,...)**.

* By default, it takes the old index and moves it into a new column.

---

### 2. The option `drop=True`

* Without it:

  ```python
  df.reset_index()
  ```

  keeps the old index as a new column named `"index"`.

* With `drop=True`:

  ```python
  df.reset_index(drop=True)
  ```

  **discards** the old index entirely, instead of adding it back as a column.

---

### 3. Example

```python
import pandas as pd

out = pd.DataFrame({"A":[10,20,30]}, index=["x","y","z"])
print(out)
```

```
   A
x  10
y  20
z  30
```

Now reset:

```python
print(out.reset_index(drop=True))
```

```
    A
0  10
1  20
2  30
```

```python
pd.api.types.is_datetime64_any_dtype(out["date"])
```


* `pd.api.types` = pandas’ internal type-checking utilities.
* Functions like `is_numeric_dtype`, `is_string_dtype`, `is_datetime64_any_dtype` let you test the dtype of a Series/DataFrame column.

---

### 2. What `is_datetime64_any_dtype` checks

* Returns **True** if the data type of the Series (or array) is **any kind of datetime64**.
* That includes:

  * `datetime64[ns]` (the most common, nanosecond precision)
  * `datetime64[ns, tz]` (timezone-aware)
  * Or other datetime64 precisions (like `[s]`, `[ms]`, `[us]`).

`datetime64[ms]` → millisecond precision

`datetime64[us]` → microsecond precision (µs; written us since µ isn’t ASCII)
---

### 3. Example

```python
import pandas as pd

df = pd.DataFrame({
    "date": pd.to_datetime(["2020-01-01", "2020-06-30"]),
    "value": [1, 2]
})

print(pd.api.types.is_datetime64_any_dtype(df["date"]))   # True
print(pd.api.types.is_datetime64_any_dtype(df["value"]))  # False
```


```python
prices.merge(meta2, on="ticker", how="left", validate="many_to_one")
```

* `DataFrame.merge()` combines two DataFrames (like SQL JOIN).
* You specify the key(s) to join on (`on="ticker"` here).
* Returns a new DataFrame with columns from both.

---

### 2. `on="ticker"`

* The join key is the column `"ticker"`.
* Must exist in both `prices` and `meta2`.

---

### 3. `how="left"`

* Do a **left join**:

  * Keep **all rows from `prices`** (the left DataFrame).
  * Attach matching info from `meta2` (the right DataFrame).
  * If a `ticker` from `prices` doesn’t exist in `meta2`, the extra columns will be filled with `NaN`.

This mirrors SQL:

```sql
SELECT *
FROM prices
LEFT JOIN meta2 USING (ticker)  --USING only for the sanem name in both tables. Otherwise use ON
```

---

### 4. `validate="many_to_one"`

This is a **sanity check** to catch join mistakes.

* `"many_to_one"` means:

  * Left DataFrame (`prices`) may have **many rows per ticker** (because each ticker has daily prices).
  * Right DataFrame (`meta2`) must have **at most one row per ticker** (e.g. ticker metadata like name, sector).
* If `meta2` has duplicates in `"ticker"`, pandas will raise a `MergeError`.

This protects against accidentally duplicating rows.

Other options:

* `"one_to_one"`
* `"one_to_many"`
* `"many_to_many"`

---

### 5. Example

```python
import pandas as pd

prices = pd.DataFrame({
    "ticker": ["AAPL", "AAPL", "MSFT"],
    "date": ["2020-01-01", "2020-01-02", "2020-01-01"],
    "adj_close": [300, 305, 160]
})

meta2 = pd.DataFrame({
    "ticker": ["AAPL", "MSFT"],
    "sector": ["Tech", "Tech"]
})

merged = prices.merge(meta2, on="ticker", how="left", validate="many_to_one")
print(merged)
```

Output:

```
  ticker        date  adj_close sector
0   AAPL  2020-01-01        300   Tech
1   AAPL  2020-01-02        305   Tech
2   MSFT  2020-01-01        160   Tech
```




In [None]:
import re

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Lowercase snake_case; repair common price column name variants."""
    def snake(s):
        s = re.sub(r"[^\w\s]", "_", s)
        s = re.sub(r"\s+", "_", s.strip().lower())
        s = re.sub(r"_+", "_", s)
        return s
    out = df.copy()
    out.columns = [snake(c) for c in out.columns]
    # Normalize known variants
    ren = {
        "adjclose":"adj_close", "adj_close_":"adj_close",
        "close_adj":"adj_close", "adj_close_close":"adj_close"
    }
    out = out.rename(columns={k:v for k,v in ren.items() if k in out.columns})
    # If no adj_close but close exists, create it
    if "adj_close" not in out and "close" in out:
        out = out.assign(adj_close=out["close"])
    return out

def clean_prices(df: pd.DataFrame) -> pd.DataFrame:
    """Coerce dtypes, drop dupes, basic sanity checks; add minor derived fields."""
    cols = ["date","ticker","open","high","low","close","adj_close","volume"]
    keep = [c for c in cols if c in df.columns]
    out = df.loc[:, keep].copy()

    # Parse date, coerce numerics
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    for c in ["open","high","low","close","adj_close"]:
        if c in out: out[c] = pd.to_numeric(out[c], errors="coerce")
    if "volume" in out: out["volume"] = pd.to_numeric(out["volume"], errors="coerce")

    # Drop bad rows
    out = out.dropna(subset=["date","ticker","adj_close"])
    # Deduplicate by (ticker, date)
    out = out.sort_values(["ticker","date"])
    out = out.drop_duplicates(subset=["ticker","date"], keep="last")

    # Enforce dtypes
    if "volume" in out:
        out["volume"] = out["volume"].round().astype("Int64")  # nullable int
        out.loc[out["volume"] < 0, "volume"] = pd.NA
    # Use category for low-cardinality strings
    out["ticker"] = out["ticker"].astype("category")
    # Use consistent float dtype
    for c in ["open","high","low","close","adj_close"]:
        if c in out: out[c] = out[c].astype("float32")  #  change to float64 if you need more precision

    # Quick sanity checks
    assert out[["ticker","date"]].duplicated().sum() == 0, "Duplicates remain"
    assert pd.api.types.is_datetime64_any_dtype(out["date"]), "date not datetime"
    return out.reset_index(drop=True)

def join_meta(prices: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
    """Left join metadata; keep minimal meta columns; set dtypes."""
    keep_meta = [c for c in ["ticker","name","sector"] if c in meta.columns]
    meta2 = meta.loc[:, keep_meta].copy()
    # Make strings consistent and compact
    if "name" in meta2:   meta2["name"]   = meta2["name"].astype("string")
    if "sector" in meta2: meta2["sector"] = meta2["sector"].astype("category")
    out = prices.merge(meta2, on="ticker", how="left", validate="many_to_one")
    return out


```python
tidy = (raw
            .pipe(standardize_columns)  # <- consistent names
            .pipe(clean_prices))
```

### 2. `.pipe(func, *args, **kwargs)`

* In pandas, `.pipe()` lets you pass a DataFrame into a function in a **chainable** way.
* `df.pipe(f)` is equivalent to `f(df)`.
* If the function needs extra arguments:

  ```python
  df.pipe(f, arg1, kwarg1=value)
  # same as
  f(df, arg1, kwarg1=value)
  ```
* Advantage: keeps the **method-chaining style** consistent (like dplyr in R).


### 2. `memory_usage="deep"`

* By default, `info()` shows a **shallow** memory estimate (just the arrays).
* With `"deep"`, pandas does a **full introspection**, especially for `object` columns (like strings).

  * Measures actual Python object memory, not just array references.
  * Much more accurate for text-heavy data.
* It can take longer if the DataFrame is big.

Example difference:

```python
df = pd.DataFrame({"col": ["a"*100, "b"*200, "c"*300]})
print(df.info())                  # shallow
print(df.info(memory_usage="deep"))  # deep, counts string lengths
```

Output:

```
memory usage: 152.0+ bytes
memory usage: 1.1 KB
```



In [None]:
dfs = []
for path in raw_candidates:
    raw = pd.read_csv(path)
    tidy = (raw
            .pipe(standardize_columns)  # <- consistent names
            .pipe(clean_prices))        # <- dtypes and sanity checks
    dfs.append(tidy)

prices = pd.concat(dfs, ignore_index=True)
prices = prices.pipe(join_meta, meta=meta)

print("Preview:")
display(prices.head(3))
print("\nInfo:")
print(prices.info(memory_usage="deep"))

Preview:


Unnamed: 0,date,ticker,adj_close,volume,name,sector
0,2020-01-01,AAPL,100.001228,4457901,AAPL,Unknown
1,2020-01-02,AAPL,100.300423,2664190,AAPL,Unknown
2,2020-01-03,AAPL,100.025841,4100245,AAPL,Unknown



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       4500 non-null   datetime64[ns]
 1   ticker     4500 non-null   object        
 2   adj_close  4500 non-null   float32       
 3   volume     4500 non-null   Int64         
 4   name       4500 non-null   string        
 5   sector     4500 non-null   category      
dtypes: Int64(1), category(1), datetime64[ns](1), float32(1), object(1), string(1)
memory usage: 555.4 KB
None


```python
prices.to_parquet(single_path, engine="pyarrow", compression="zstd", index=False)
```


### 1. `to_parquet`

* A pandas method to save a DataFrame as a **Parquet file**.
* Parquet = a **columnar, compressed, binary format** widely used in data science and big data (efficient storage + fast reads).

---

* **`engine="pyarrow"`**
  Pandas can use either:

  * `pyarrow` (Apache Arrow’s Python library — fast, modern, supports advanced features).
  * `fastparquet` (older, pure Python, slower for some cases).
    You chose `pyarrow`, which is the default if installed.

* **`compression="zstd"`**

  * Use **Zstandard** compression (modern, high compression ratio, fast).
  * Alternatives: `"snappy"`, `"gzip"`, `"brotli"`, `"lz4"`, or `None`.
  * Zstandard is excellent for large datasets — smaller than Snappy, faster than Gzip.

* **`index=False`**

  * Don’t save the DataFrame’s index as a column in the Parquet file.
  * Keeps the file clean if your index is just `0,1,2,...`.

---

### 3. Example

```python
import pandas as pd

prices = pd.DataFrame({
    "ticker": ["AAPL", "MSFT"],
    "date": pd.to_datetime(["2020-01-01", "2020-01-02"]),
    "adj_close": [300, 160]
})

prices.to_parquet("prices.parquet", engine="pyarrow", compression="zstd", index=False)

# Read back
df = pd.read_parquet("prices.parquet", engine="pyarrow")
print(df)
```

Output:

```
  ticker       date  adj_close
0   AAPL 2020-01-01        300
1   MSFT 2020-01-02        160
```

---

### 4. Why use Parquet

* Much smaller than CSV.
* Much faster to load into pandas (especially with Arrow).
* Preserves dtypes (like `datetime64[ns]`, `Int64`, `category`).
* Plays nicely with Spark, Dask, DuckDB, BigQuery, etc.



```python
prices.to_parquet(
    part_dir,
    engine="pyarrow",
    compression="zstd",
    index=False,
    partition_cols=["ticker"]
)
```



### 1. `partition_cols=["ticker"]`

* Instead of writing **one single Parquet file**, pandas will write a **Parquet dataset**.
* The dataset is split into **subdirectories by the values in `ticker`**.
* Each unique `ticker` value gets its own folder.
* Inside each folder is one (or more) Parquet files containing only rows with that ticker.

---

### 2. Directory structure example

If `prices` has three tickers (`AAPL`, `MSFT`, `GOOG`), the output looks like:

```
part_dir/
├── ticker=AAPL/
│   └── part-0.parquet
├── ticker=MSFT/
│   └── part-0.parquet
└── ticker=GOOG/
    └── part-0.parquet
```

So the folder name encodes the partition column value: `ticker=AAPL`.


---

### 4. Other options

* You can partition by **multiple columns**:

  ```python
  partition_cols=["sector", "year"]
  ```

  Then the directory hierarchy will nest:

  ```
  sector=Tech/year=2020/part-0.parquet
  ```
* Works only with Parquet (not CSV).

---

### 5. Reading it back

```python
import pandas as pd

df = pd.read_parquet(part_dir, engine="pyarrow")
print(df.head())
```

* This automatically reassembles all partitions into one DataFrame.
* If you want only one ticker:

  ```python
  df = pd.read_parquet(part_dir, filters=[("ticker", "=", "AAPL")], engine="pyarrow")
  ```




In [None]:
# Single-file Parquet
single_path = "data/processed/prices.parquet"
prices.to_parquet(single_path, engine="pyarrow", compression="zstd", index=False)
print("Wrote:", single_path)

# Partitioned dataset by ticker (directory with /ticker=.../)
part_dir = "data/processed/prices_by_ticker"
# pandas to_parquet supports partition_cols with pyarrow engine
try:
    prices.to_parquet(part_dir, engine="pyarrow", compression="zstd",
                      index=False, partition_cols=["ticker"])
    print("Wrote partitioned dataset:", part_dir)
except TypeError:
    # Fallback via pyarrow dataset API
    import pyarrow as pa, pyarrow.parquet as pq
    pa_tbl = pa.Table.from_pandas(prices, preserve_index=False)
    pq.write_to_dataset(pa_tbl, root_path=part_dir, partition_cols=["ticker"], compression="zstd")
    print("Wrote (fallback) partitioned dataset:", part_dir)

Wrote: data/processed/prices.parquet
Wrote partitioned dataset: data/processed/prices_by_ticker


In [None]:
# 6a) Read a few columns from single-file Parquet
cols = ["ticker","date","adj_close","volume"]
df_small = pd.read_parquet("data/processed/prices.parquet", columns=cols)
df_small.head()

# 6b) Read one ticker from the partitioned dataset using pyarrow.dataset
import pyarrow.dataset as ds
dataset = ds.dataset("data/processed/prices_by_ticker", format="parquet", partitioning="hive")
# Choose a ticker present in the data
one_ticker = str(prices["ticker"].cat.categories[0])
flt = (ds.field("ticker") == one_ticker)
tbl = dataset.to_table(filter=flt, columns=["date","adj_close","volume"])
df_one = tbl.to_pandas()
df_one.head()

In [None]:
import json, pathlib
schema = {c: str(t) for c,t in prices.dtypes.items()}
pathlib.Path("data/processed").mkdir(parents=True, exist_ok=True)
with open("data/processed/prices_schema.json","w") as f:
    json.dump(schema, f, indent=2)
print("Wrote data/processed/prices_schema.json")