In [None]:
# Colab cell: mount Google Drive at /content/drive.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive

# force_remount=True asks for a fresh mount even if Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os, pathlib

# --- Adjust these two for YOUR repo ---
REPO_OWNER = "ywanglab"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

# All course material lives under one Drive folder; the clone goes in a
# sub-folder named after the repository.
BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = "/".join([BASE_DIR, REPO_NAME])
REPO_URL   = "https://github.com/" + REPO_OWNER + "/" + REPO_NAME + ".git"

# Create the base folder (and any missing parents); no error if it is already there.
os.makedirs(BASE_DIR, exist_ok=True)


In [None]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [None]:
# pandas / numpy are third-party; sqlite3 ships with the standard library.
import pandas as pd, sqlite3, numpy as np, os
from pathlib import Path

prices_csv = Path("data/raw/prices.csv")
prices_csv.parent.mkdir(parents=True, exist_ok=True)

if not prices_csv.exists():
    print("No prices.csv found; generating a small synthetic one.")
    tickers = ["AAPL","MSFT","NVDA","AMZN","GOOGL"]
    dates = pd.bdate_range("2022-01-03", periods=120)
    rng = np.random.default_rng(7)   # fixed seed -> the same synthetic file every time
    n_days = len(dates)

    def _synthetic_frame(symbol):
        """Build one ticker's synthetic price path (draws returns first, then volumes)."""
        daily = rng.normal(0, 0.01, n_days)
        closes = 100*np.exp(np.cumsum(daily))
        volumes = rng.integers(1e5, 5e6, n_days)
        frame = pd.DataFrame({"ticker": symbol, "date": dates, "adj_close": closes, "volume": volumes})
        # First row has no previous close, so its log return is filled with 0.
        return frame.assign(log_return=np.log(frame["adj_close"]).diff().fillna(0))

    frames = [_synthetic_frame(symbol) for symbol in tickers]
    pd.concat(frames, ignore_index=True).to_csv(prices_csv, index=False)

# Show a peek
pd.read_csv(prices_csv).head()

Unnamed: 0,ticker,date,adj_close,volume,log_return
0,AAPL,2020-01-01,100.00123,4457901,0.0
1,AAPL,2020-01-02,100.300426,2664190,0.002987
2,AAPL,2020-01-03,100.025841,4100245,-0.002741
3,AAPL,2020-01-06,99.138974,4586613,-0.008906
4,AAPL,2020-01-07,98.689241,1556062,-0.004547


`db_path.unlink()`

Deletes the file at that path (similar to `rm` in Unix or `del` in Windows).

1. **`unlink()`**

   * Low-level **system call** in Unix/POSIX.
   * Removes a *directory entry* (a name → inode mapping).
   * If no more directory entries or processes reference that inode, the filesystem reclaims the storage.
   * In Python’s `pathlib`, `Path.unlink()` is a wrapper that calls this system-level removal.

   
2. **`rm`**

   * A **Unix command-line utility** (higher-level tool).
   * Under the hood, `rm` calls `unlink()` (or `unlinkat()`) to actually remove the file.
   * Supports **flags** like:

     * `rm -r dir/` → remove directories recursively.
     * `rm -f file` → force remove, ignore errors.
     * `rm -i file` → interactive, ask before deleting.
   * Works with globbing (`rm *.txt`).

### **So the difference**

* `unlink()` = **primitive** system call (or Python API) → removes one name from the filesystem.
* `rm` = **user-facing command** that wraps `unlink()` (and adds options, recursion, safety prompts, etc.).

### **What is an inode?**

* **inode = index node** (used in Unix/Linux filesystems like ext4, XFS, etc.).
* It’s a **data structure** on disk that stores **metadata about a file**, *not* the file’s name.

**An inode contains things like:**

* File type (regular file, directory, symlink, etc.)
* File size
* Permissions (read/write/execute bits, owner, group)
* Timestamps (created, modified, accessed)
* Pointers (addresses) to the actual data blocks on disk

* Filenames live in a **directory entry** (a mapping from name → inode number).

### **Inode mapping**

* A directory is just a table mapping:

  ```
  filename → inode number
  ```
* Example:

  ```
  "notes.txt" → inode #12345
  "report.pdf" → inode #54321
  ```
* Multiple names (hard links) can map to the same inode.

---

### **Why “unlink”?**

* When you “delete” a file in Unix, you’re really just **removing the link** (the directory entry) that points to the inode.
* If other links (hard links) or processes still reference that inode, the data stays around.
* Only when the **link count** drops to zero *and* no process has the file open will the filesystem free the inode and reclaim the storage.

---


* **Inode** = the actual file cabinet drawer (where the data + info lives).
* **Filename** = the sticky note on the cabinet saying “Report.pdf → Drawer #12.”
* **unlink()** = removing the sticky note.
* If nobody else has a sticky note to Drawer #12, the cabinet can be emptied and reused.

---

* An **inode** is a filesystem structure with metadata + pointers to file data.
* A **directory entry** maps a name to an inode.
* **unlink()** removes that mapping; the inode and file data go away only when no names or processes reference it.



### 1. **Connect to SQLite**

```python
con = sqlite3.connect(db_path)
cur = con.cursor()
```

* Opens (or creates) the SQLite database at `db_path`.
* Creates a cursor object for executing SQL commands.

---

### 2. **Enable foreign keys**

```python
cur.execute("PRAGMA foreign_keys = ON;")
```

* SQLite doesn’t enforce foreign keys unless this is turned **ON** per connection.
* Ensures that values in the `prices.ticker` column must exist in the `meta.ticker` column.
* The `;` marks the end of an SQL statement.

  * Required when running multiple statements or in interactive/database shells.

  * Optional for a single query in many programming APIs (like Python’s `execute`).

---

### 3. **Enable WAL mode**

```python
cur.execute("PRAGMA journal_mode = WAL;")
```

* WAL = *Write-Ahead Logging*.
* Improves concurrency: one process can read while another writes.
* Not strictly necessary here, but helpful if you expect multiple readers/writers.
* It’s a journaling mode where **writes go to a separate log file first**, not directly to the main database file.
* In SQLite:

  * Normally (in a rollback-journal mode such as the default `DELETE`), when you write, the database has to **lock the whole DB**, make changes, and then update the main file.
  * In **WAL mode**, changes are appended to a `*.db-wal` log file. Later, they’re **checkpointed** (merged) back into the main DB file.

---

### 4. **Define schema (DDL (Data definition language) block)**

* **`prices` table** stores daily prices.
* `(ticker, date)` pair is the **primary key**, so each ticker has at most one entry per day.
* `adj_close` and `volume` have `CHECK` constraints to forbid negative values.
* `FOREIGN KEY (ticker)` ensures every price row corresponds to a `meta` row.
A foreign key is a column (or set of columns) in one table that references a primary key in another table.

In a table definition, `REFERENCES` links a column (the “child”) to a column in another table (the “parent”).

General form:

```sql
FOREIGN KEY (child_column)
    REFERENCES parent_table(parent_column)
```

---

## 2. What it means

* The `child_column` must always match an existing value in `parent_table.parent_column`.
* This enforces **referential integrity** → you cannot insert invalid values, and you cannot remove referenced parent rows unless you handle the child rows too.



```sql
CREATE INDEX IF NOT EXISTS idx_prices_date ON prices(date);
```

* Adds an index on `date` (across all tickers).
* Speeds up queries like *“give me all prices between 2020-01-01 and 2020-12-31”*.

---

### 5. **Execute + commit**

```python
cur.executescript(ddl)
con.commit()
```

* `executescript()` runs the multi-statement DDL block in one go.
* `commit()` saves changes to the database file.

---

**Summary**:
This code builds a small relational schema for stock data in SQLite.

* `meta` table = company metadata.
* `prices` table = daily stock prices, linked by foreign key to `meta`.
* Foreign keys and constraints protect data integrity.
* WAL mode and the date index improve performance.

`-- ISO 'YYYY-MM-DD'` SQL comment line

That line is creating a **database index** on the `date` column of your `prices` table. Let’s unpack it carefully:

```sql
CREATE INDEX IF NOT EXISTS idx_prices_date ON prices(date);
```
In SQLite, the index is stored inside the same database file (`.db`).

* **`CREATE INDEX`** → defines an index to speed up lookups.
* **`IF NOT EXISTS`** → create it only if it does not already exist; otherwise, do nothing.
* **`idx_prices_date`** → the chosen name of the index.
* **`ON prices(date)`** → builds the index on the `date` column of the `prices` table.

Normally, SQLite would scan the entire table row by row (**full table scan**).
* With the index, SQLite can **jump directly** to the rows with matching dates → much faster when `prices` has many rows.



## 1. `cur.close()`

* **What it does:** frees the resources associated with the **cursor object**.
* **Why:** Cursors are lightweight, but if you open many of them, it’s good practice to close ones you no longer use.
* **Effect on DB:** does **not** commit or close the connection. The database remains open and changes are still pending until you commit.
* **Analogy:** you put down your pen, but the notebook (connection) is still open on the desk.

---

## 2. `con.commit()`

* **What it does:** saves all pending changes in the current transaction (INSERT, UPDATE, DELETE, CREATE, etc.) to the database file permanently.
* **Why:** SQLite starts a transaction implicitly when you make changes. Without a `commit()`, those changes stay “pending” and will be rolled back when the connection closes.
* **Effect on DB:** flushes changes to disk, releases write locks.
* **Analogy:** you press “Save” in a document editor.

---

## 3. `con.close()`

* **What it does:** closes the entire **connection** to the database.
* **Why:** releases file locks, finalizes any open transactions (if uncommitted, SQLite rolls them back).
* **Effect on DB:** after this, you can’t use the connection or its cursors.
* **Analogy:** you close the notebook completely.

---

##  Relationships

* `cur.close()` → optional cleanup (especially if you open many cursors). It can be called before or after `con.commit()`; closing a cursor neither commits nor rolls back anything.
* `con.commit()` → required if you want to **save your changes**.
* `con.close()` → required when you’re **done** with the database.

---

### Example

```python
import sqlite3

con = sqlite3.connect("prices.db")
cur = con.cursor()

cur.execute("INSERT INTO logs(message) VALUES (?)", ("hello",))

# Commit the change so it’s saved
con.commit()

# Close cursor (optional but clean)
cur.close()

# Close connection (releases file and locks)
con.close()
```

If you had skipped `con.commit()`, the `"hello"` row would not be saved — it would vanish when `con.close()` rolled back the pending transaction.

---

**Rule of thumb:**

* Always `commit()` after writes.
* Always `close()` the connection when done.
* Close cursors if you’re finished with them, but it’s less critical (they close automatically when the connection closes).




In [None]:
import sqlite3, textwrap, os
from pathlib import Path

db_path = Path("data/prices.db")
# Make sure the parent folder exists so this cell also works on a fresh checkout.
db_path.parent.mkdir(parents=True, exist_ok=True)

# Start fresh for class; remove this in real life.
# WAL mode keeps "-wal" / "-shm" sidecar files next to the database. Delete them
# together with the main file so a stale log left by an earlier run is never
# paired with the newly created database.
for stale in (db_path, Path(f"{db_path}-wal"), Path(f"{db_path}-shm")):
    if stale.exists():
        stale.unlink()

con = sqlite3.connect(db_path)  # Opens (or creates) the SQLite database at db_path.
cur = con.cursor()              # Creates a cursor object for executing SQL commands.

# Turn on foreign keys (SQLite leaves them off unless enabled per connection).
# With this on, every prices.ticker value must exist in meta.ticker.
cur.execute("PRAGMA foreign_keys = ON;")

# (Optional) WAL = Write-Ahead Logging.
# Improves concurrency: one process can read while another writes.
# Not strictly necessary here, but helpful if you expect multiple readers/writers.
# NOTE(review): SQLite documents that WAL does not work on network filesystems;
# if the Drive-mounted folder shows locking problems, consider dropping this PRAGMA — confirm.
cur.execute("PRAGMA journal_mode = WAL;")

ddl = textwrap.dedent("""
CREATE TABLE meta (
  ticker TEXT PRIMARY KEY,
  name   TEXT,
  sector TEXT NOT NULL
);

CREATE TABLE prices (
  ticker     TEXT NOT NULL,
  date       TEXT NOT NULL,               -- ISO 'YYYY-MM-DD'
  adj_close  REAL NOT NULL CHECK (adj_close >= 0),
  volume     INTEGER NOT NULL CHECK (volume >= 0),
  log_return REAL NOT NULL,
  PRIMARY KEY (ticker, date),
  FOREIGN KEY (ticker) REFERENCES meta(ticker)
);

-- Index to speed up date-range scans across all tickers
CREATE INDEX IF NOT EXISTS idx_prices_date ON prices(date);
""")
# executescript runs the whole multi-statement DDL block in one call.
cur.executescript(ddl)
con.commit()
print("Created:", db_path)

Created: data/prices.db


`warnings.filterwarnings("ignore")` tells Python:
“Hide all warnings — don’t print them at all.”

In [None]:
import pandas as pd, numpy as np
import warnings
warnings.filterwarnings("ignore")  # hide all warnings from here on

# Ticker universe: use tickers_25.csv when it is present, otherwise take the
# tickers that appear in the raw prices file.
tickers_csv = "tickers_25.csv" if Path("tickers_25.csv").exists() else "data/raw/prices.csv"
tickers = pd.read_csv(tickers_csv)["ticker"].dropna().unique().tolist()

def fetch_sector_map(tickers):
    """Return a DataFrame with columns ``ticker``, ``name`` and ``sector``, one row per ticker.

    The lookup first tries yfinance: the name is ``shortName``, then ``longName``,
    then the ticker itself; a missing sector becomes ``"Unknown"``.

    If yfinance cannot be imported, or any lookup raises, the whole result falls
    back to a deterministic synthetic assignment that cycles through five sector
    labels in ticker order and uses the ticker as the name. The reason for the
    fallback is printed so it does not happen silently.

    Parameters
    ----------
    tickers : list of str
        Ticker symbols to look up.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``name``, ``sector`` (also present when ``tickers`` is empty).
    """
    columns = ["ticker", "name", "sector"]
    try:
        import yfinance as yf
        out=[]
        for t in tickers:
            info = yf.Ticker(t).info or {}
            name  = info.get("shortName") or info.get("longName") or t
            sector= info.get("sector") or "Unknown"
            out.append({"ticker": t, "name": name, "sector": sector})
        # Passing columns keeps the schema stable even when `tickers` is empty.
        return pd.DataFrame(out, columns=columns)
    except Exception as exc:
        # Best-effort lookup: any failure (missing package, network, API change)
        # switches to the synthetic mapping below, but say why.
        print(f"yfinance lookup unavailable ({type(exc).__name__}: {exc}); using synthetic sectors.")
    # Fallback: deterministic synthetic sectors
    sectors = ["Technology","Financials","Healthcare","Energy","Consumer"]
    return pd.DataFrame({
        "ticker": tickers,
        "name": tickers,
        "sector": [sectors[i % len(sectors)] for i in range(len(tickers))]
    })

# Build the per-ticker metadata frame (ticker, name, sector) and preview it.
meta_df = fetch_sector_map(tickers)
meta_df.head()

Unnamed: 0,ticker,name,sector
0,AAPL,Apple Inc.,Technology
1,MSFT,Microsoft Corporation,Technology
2,AMZN,"Amazon.com, Inc.",Consumer Cyclical
3,GOOGL,Alphabet Inc.,Communication Services
4,META,"Meta Platforms, Inc.",Communication Services


### 1. **Context manager for the connection**
**Note**: This may not work if an earlier write has not been closed (committed) yet. The code cell below therefore runs without it.
```python
with con:
```

* Opens a transaction automatically.
* If everything succeeds, it commits at the end.
* If an error happens, it rolls back.
* Cleaner than `con.commit()` / `con.rollback()` manually.

---

### 2. **Bulk insert with `executemany`**

```python
con.executemany(
    "INSERT INTO meta(ticker, name, sector) VALUES(?, ?, ?)",
    meta_df[["ticker","name","sector"]].itertuples(index=False, name=None)
)
```

* `executemany` runs the SQL statement once **for each row** in the provided sequence.
* The placeholders `?, ?, ?` are **parameter markers** → safe against SQL injection, and efficient.
* `meta_df[["ticker","name","sector"]].itertuples(index=False, name=None)`:

  * Takes only the `ticker`, `name`, `sector` columns from the DataFrame.
  * `itertuples(..., name=None)` yields each row as a plain tuple, e.g.:

    ```python
    ("AAPL", "Apple Inc.", "Technology")
    ("MSFT", "Microsoft Corp.", "Technology")
    ```


So: all rows in your pandas `DataFrame` get inserted into the `meta` table in **one batch**.


### 3. **Check the results**

```python
print(pd.read_sql_query("SELECT * FROM meta LIMIT 5;", con))
```

* Runs a quick SELECT to show the first 5 rows you just inserted.
* Uses `pandas.read_sql_query`, so you get the results as a DataFrame.


If you’re inserting **one** row into `meta`, the SQL is simply:

```sql
INSERT INTO meta (ticker, name, sector)
VALUES ('AAPL', 'Apple Inc.', 'Technology');
```

In Python with `sqlite3`, use a **parameterized** single-row insert (safer for quotes, etc.):

```python
con.execute(
    "INSERT INTO meta(ticker, name, sector) VALUES(?, ?, ?)",
    ("AAPL", "Apple Inc.", "Technology")
)
con.commit()  # or use `with con:` to auto-commit
```

With Jupyter SQL magic:
```python
t, n, s = "AAPL", "Apple Inc.", "Technology"
```

```python
%sql INSERT INTO meta(ticker, name, sector) VALUES (:t, :n, :s)
```

### Notes

* `executemany(...)` is just the **bulk** version of the same statement; it runs the single-row `INSERT` repeatedly for each tuple.
* Since `ticker` is the **PRIMARY KEY**, inserting a duplicate will error. If you want “upsert” behavior, use one of these:

  * **Ignore duplicates:**

    ```sql
    INSERT OR IGNORE INTO meta(ticker, name, sector)
    VALUES ('AAPL', 'Apple Inc.', 'Technology');
    ```
  * **Update on conflict (preferred upsert):**

    ```sql
    INSERT INTO meta(ticker, name, sector)
    VALUES ('AAPL', 'Apple Inc.', 'Technology')
    ON CONFLICT(ticker) DO UPDATE
      SET name = excluded.name,
          sector = excluded.sector;
    ```

  *(Avoid `INSERT OR REPLACE` with FKs; it performs a delete+insert under the hood and can clash with foreign keys.)*


* **`excluded`** is a special row alias that holds the values you *tried to insert* but that **conflicted** (here, on `ticker`).
* The statement says: “when there’s a PK conflict on `ticker`, **update the existing row** so its `name` and `sector` become the attempted values.”

### Useful variations

**Only update if something actually changed (null-safe):**

```sql
ON CONFLICT(ticker) DO UPDATE
SET name   = excluded.name,
    sector = excluded.sector
WHERE name   IS NOT excluded.name
   OR sector IS NOT excluded.sector;
```

**Preserve existing non-NULLs (only overwrite when new value is non-NULL):**

```sql
ON CONFLICT(ticker) DO UPDATE
SET name   = COALESCE(excluded.name,   meta.name),
    sector = COALESCE(excluded.sector, meta.sector);
```


In [None]:
# Load the metadata rows into `meta` with a parameterized INSERT (one "?" per column).
# The `with con:` wrapper is deliberately not used here, so nothing is committed
# in this cell; the explicit con.commit() comes in a later cell.
meta_rows = list(meta_df[["ticker","name","sector"]].itertuples(index=False, name=None))
con.executemany("INSERT INTO meta(ticker, name, sector) VALUES(?, ?, ?)", meta_rows)

# Peek at the first rows just inserted.
meta_preview = pd.read_sql_query("SELECT * FROM meta LIMIT 5;", con)
print(meta_preview)

  ticker                   name                  sector
0   AAPL             Apple Inc.              Technology
1   MSFT  Microsoft Corporation              Technology
2   AMZN       Amazon.com, Inc.       Consumer Cyclical
3  GOOGL          Alphabet Inc.  Communication Services
4   META   Meta Platforms, Inc.  Communication Services


* **Connection vs Cursor**

  * `con.execute()` / `con.executemany()` are **shortcuts** that use an internal cursor and return it.
  * `cur.execute()` / `cur.executemany()` let you manage your own cursor — better when you want multiple concurrent cursors or finer control.

* **Execute vs Executemany**

  * `.execute()` → one SQL with one parameter set.
  * `.executemany()` → same SQL repeated with many parameter sets.



In [None]:
# Sanity check: which database file is attached, which tables exist, and
# whether the connection currently has an open transaction.
attached_dbs = con.execute("PRAGMA database_list;").fetchall()
table_names = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
print("DBs attached:", attached_dbs)
print("Tables:", table_names)
print("In transaction?", con.in_transaction)


DBs attached: [(0, 'main', '/content/drive/MyDrive/dspt25/STAT4160/data/prices.db')]
Tables: [('meta',), ('prices',)]
In transaction? True


1. **`drop_duplicates(subset=["ticker","date"])`**

* Removes rows that have the same `(ticker, date)` pair.
* By default it **keeps the first** occurrence and drops later ones (`keep="first"`).
* You can change behavior: `keep="last"` or `keep=False` (drop *all* duplicates).

2. **`reset_index(drop=True)`**

* By default (`drop=False`), `reset_index()` moves the **index** back into regular **columns**, and replaces the index with a default **RangeIndex(0…N−1)**.
* If the index has a **name**, that name becomes the new column name; if unnamed, the column will be called `"index"`.

* `drop=True` discards the old index instead of adding it as a column.

### Common patterns

```python
# 1) Typical cleanup after filtering/dropping rows
df = df.reset_index(drop=True)       # discard old index, get 0..N-1

# 2) After groupby (turn group labels from index to columns)
out = df.groupby("sector")["adj_close"].mean().reset_index()

# 3) Only reset some levels of a MultiIndex
df = df.reset_index(level=["ticker"])  # bring just 'ticker' out as a column
```


In [None]:
# Read the raw prices and shape them to match the `prices` table:
#   1. dates rendered as ISO 'YYYY-MM-DD' text,
#   2. only the table's columns, in the table's column order,
#   3. at most one row per (ticker, date) so the primary key is respected.
prices = (
    pd.read_csv("data/raw/prices.csv", parse_dates=["date"])
      .assign(date=lambda d: d["date"].dt.strftime("%Y-%m-%d"))
      .loc[:, ["ticker","date","adj_close","volume","log_return"]]
      .drop_duplicates(subset=["ticker","date"])
      .reset_index(drop=True)
)
len(prices)

4500

**`IGNORE`** (in SQLite) is a **conflict resolution** policy that tells the engine to **skip the row that violates a constraint and continue**—no error is raised and nothing is written for that row.


* **Old-style clause on the statement:**

  ```sql
  INSERT OR IGNORE INTO meta(ticker, name, sector)
  VALUES ('AAPL', 'Apple Inc.', 'Technology');
  ```
* **UPSERT form (SQLite ≥ 3.24):**

  ```sql
  INSERT INTO meta(ticker, name, sector)
  VALUES ('AAPL', 'Apple Inc.', 'Technology')
  ON CONFLICT(ticker) DO NOTHING;   -- same effect as OR IGNORE
  ```

### What it applies to

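`INSERT OR IGNORE` suppresses errors for **constraint conflicts** on:

* `PRIMARY KEY` / `UNIQUE`
* `NOT NULL`
* `CHECK`

The row with the conflict is **discarded**; other rows in the same statement continue.

Two limits to keep in mind: `ON CONFLICT(...) DO NOTHING` only handles **uniqueness** conflicts (`PRIMARY KEY` / `UNIQUE`) on the named columns, and **foreign-key** violations are never ignored — they still raise an error.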


### Examples

**Skip duplicate primary key**

```sql
-- If 'AAPL' already exists, this inserts nothing and raises no error
INSERT OR IGNORE INTO meta(ticker, name, sector)
VALUES ('AAPL', 'Apple Inc.', 'Technology');
```

**Bulk insert: keep the non-duplicates**

```python
rows = [
    ("AAPL", "Apple Inc.", "Technology"),
    ("MSFT", "Microsoft", "Technology"),
    ("AAPL", "Apple Inc.", "Tech")  # duplicate PK -> ignored
]
con.executemany(
    "INSERT OR IGNORE INTO meta(ticker, name, sector) VALUES(?, ?, ?)", rows
)
```



In [None]:
# Bulk-load the price rows in the connection's implicit transaction (committed
# later with con.commit()). OR IGNORE silently skips rows that conflict with the
# primary key, e.g. a duplicate (ticker, date). It does NOT skip foreign-key
# violations: a ticker missing from `meta` still raises an error.
# The `with con:` wrapper is deliberately not used here.
insert_prices_sql = "INSERT OR IGNORE INTO prices(ticker,date,adj_close,volume,log_return) VALUES(?,?,?,?,?)"
price_rows = prices.itertuples(index=False, name=None)
con.executemany(insert_prices_sql, price_rows)

# Quick counts
nrows_df = pd.read_sql_query("SELECT COUNT(*) AS nrows FROM prices;", con)
print(nrows_df)


   nrows
0   4500


In [None]:
cur.close()  # optional cleanup: release the cursor created with the schema; closing it does not commit
con.commit()  # save the pending inserts and end the open write transaction before the read queries below

In [None]:
# Number of price rows per ticker, largest first (top five only).
rows_per_ticker_sql = """
SELECT ticker, COUNT(*) AS n
FROM prices
GROUP BY ticker
ORDER BY n DESC
LIMIT 5;
"""
print(pd.read_sql_query(rows_per_ticker_sql, con))

  ticker    n
0   AAPL  180
1   AMZN  180
2    BAC  180
3   CSCO  180
4    CVX  180


In [None]:
# Print the current working directory (shell command run through IPython's `!`).
!pwd

/content/drive/MyDrive/dspt25/STAT4160


In [None]:
# First and last date present in the raw CSV (read as plain strings here).
df = pd.read_csv("./data/raw/prices.csv")
(df["date"].min(), df["date"].max())

('2020-01-01', '2020-09-08')

In [None]:
# First five rows for one ticker inside a date window.
# The three "?" placeholders are filled, in order, from the params list.
q1 = """
SELECT ticker, date, adj_close, volume
FROM prices
WHERE ticker = ? AND date BETWEEN ? AND ?
ORDER BY date ASC  -- Ascending order
LIMIT 5;
"""
q1_params = ["AAPL","2020-01-01","2020-06-30"]
q1_result = pd.read_sql_query(q1, con, params=q1_params)
print(q1_result)

  ticker        date   adj_close   volume
0   AAPL  2020-01-01  100.001230  4457901
1   AAPL  2020-01-02  100.300426  2664190
2   AAPL  2020-01-03  100.025841  4100245
3   AAPL  2020-01-06   99.138974  4586613
4   AAPL  2020-01-07   98.689241  1556062


In [None]:
# The ten largest daily moves (by absolute log return) for a single ticker.
q2 = """
SELECT p.ticker, p.date, p.log_return, ABS(p.log_return) AS abs_move
FROM prices AS p
WHERE p.ticker = ?
ORDER BY abs_move DESC
LIMIT 10;
"""
q2_params = ["NVDA"]
q2_result = pd.read_sql_query(q2, con, params=q2_params)
print(q2_result)

  ticker        date  log_return  abs_move
0   NVDA  2020-03-11    0.028289  0.028289
1   NVDA  2020-06-08   -0.027803  0.027803
2   NVDA  2020-08-20    0.027055  0.027055
3   NVDA  2020-07-23    0.026553  0.026553
4   NVDA  2020-06-09    0.026508  0.026508
5   NVDA  2020-02-26    0.026133  0.026133
6   NVDA  2020-03-10   -0.025227  0.025227
7   NVDA  2020-04-28    0.023624  0.023624
8   NVDA  2020-06-25    0.023492  0.023492
9   NVDA  2020-03-25   -0.022028  0.022028


In [None]:
# Mean absolute return, mean return and standard deviation of daily log returns
# per sector over a date range.
# NOTE: SQLite has no STDDEV() aggregate by default, so this query is expected
# to fail here; the except branch shows the reason instead of hiding it.
q3 = """
SELECT m.sector,
       AVG(ABS(p.log_return)) AS mean_abs_return,
       AVG(p.log_return)      AS mean_return,
       STDDEV(p.log_return)   AS std_return
FROM prices p
JOIN meta   m ON p.ticker = m.ticker
WHERE p.date BETWEEN ? AND ?
GROUP BY m.sector
ORDER BY mean_abs_return DESC;
"""
try:
    print(pd.read_sql_query(q3, con, params=["2020-01-01","2020-06-30"]))
except Exception as exc:
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
    # through; printing `exc` tells the reader what actually went wrong.
    print("Error:", exc)


Error


# 1. `df.assign(...)`

* Adds new columns (or overwrites existing ones) in a **chainable** way.
* Returns a **new DataFrame** with the additional columns, leaving the original unchanged (unless you assign back).

### Syntax

```python
df.assign(new_col=some_expression)
```

* `new_col` can be:

  * A Series (`df["col"] * 2`)
  * A constant (e.g. `5`)
  * A lambda that takes the DataFrame (`lambda d: d["col1"] + d["col2"]`)

### Example

```python
df = pd.DataFrame({"x": [1, -2, 3]})

df2 = df.assign(
    y=lambda d: d["x"].abs(),   # new col "y" = abs of x
    z=10                        # new col "z" = constant 10
)
```

Result:

```
   x  y   z
0  1  1  10
1 -2  2  10
2  3  3  10
```

---

# 2. `.agg(...)` (aggregate)

### What it does

* Reduces values by applying **summary functions** (mean, sum, std, min, max, etc.).
* Often used **after `groupby`**.

### Syntax

```python
grouped.agg(new_col_name=("column", "function"))
```

* `"column"` → which column to aggregate.
* `"function"` → which aggregation (string like `"mean"`, `"sum"`, `"std"`, or a function).
* `new_col_name` → what to call the resulting column.

### Example

```python
df = pd.DataFrame({
    "sector": ["tech","tech","finance","finance"],
    "return": [0.02, -0.01, 0.03, -0.02]
})

(df.groupby("sector")
   .agg(
       mean_return=("return","mean"),
       std_return=("return","std"),
       max_return=("return","max")
   ))
```

Result:

```
         mean_return  std_return  max_return
sector                                    
finance        0.005    0.035355        0.03
tech           0.005    0.021213        0.02
```




In [None]:

# SQLite doesn't have STDDEV by default, so fetch the raw (sector, log_return)
# rows with SQL and compute the per-sector statistics in pandas instead.
sector_returns_sql = """
SELECT m.sector, p.log_return
FROM prices p JOIN meta m ON p.ticker = m.ticker
WHERE p.date BETWEEN ? AND ?;
"""
df = pd.read_sql_query(sector_returns_sql, con, params=["2020-01-01","2020-09-08"])  # two columns: sector, log_return

# Add |log_return| as a helper column, then summarise each sector.
per_sector = df.assign(abs=lambda d: d["log_return"].abs()).groupby("sector")
agg = per_sector.agg(
    mean_abs_return=("abs","mean"),
    mean_return=("log_return","mean"),
    std_return=("log_return","std"),
).sort_values("mean_abs_return", ascending=False)
agg

Unnamed: 0_level_0,mean_abs_return,mean_return,std_return
sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Technology,0.008058,-0.000303,0.010157
Healthcare,0.008015,0.000332,0.009978
Consumer Cyclical,0.007841,-0.000371,0.009917
Energy,0.007779,-0.000582,0.009748
Communication Services,0.007765,-0.000218,0.009825
Consumer Defensive,0.007547,-0.000534,0.009301
Financial Services,0.007533,0.000259,0.009595


### 1. `CREATE VIEW IF NOT EXISTS latest_prices AS …`

* Creates a **view** named `latest_prices`.
* A **view** is like a saved query — you can `SELECT` from it just like a table, but it doesn’t store extra data; it runs the underlying query each time. It’s a virtual table.
* `IF NOT EXISTS` means it won’t throw an error if you already created this view.

### 2. Subquery `t`
```sql
SELECT p.*
FROM prices p
JOIN (...) t
  ON p.ticker = t.ticker
 AND p.date   = t.max_date
```

* Joins the full `prices` table with the subquery `t`.
* Keeps only rows where:

  * `p.ticker` = `t.ticker`
  * `p.date` = the max date for that ticker.

# 3. **What is a subquery?**

* A **subquery** is a query nested inside another query.
* It can appear in the `SELECT`, `FROM`, or `WHERE` clause.
* The outer query uses the result of the subquery like a temporary table.

### Types of subqueries

**a. In the FROM clause (derived table):**

```sql
SELECT p.*
FROM prices p
JOIN (
  SELECT ticker, MAX(date) AS max_date
  FROM prices
  GROUP BY ticker
) t
ON p.ticker = t.ticker AND p.date = t.max_date;
```

Here, the inner query (`SELECT ticker, MAX(date) ...`) is the subquery, and we give it an alias `t`. It produces one row per ticker (ticker, max\_date), then the outer query joins it back to the main `prices` table.

**b. In the WHERE clause:**

```sql
SELECT *
FROM prices
WHERE date = (SELECT MAX(date) FROM prices);
```

This subquery finds the maximum date in the whole table, then the outer query uses it as a filter.




In [None]:
# View `latest_prices`: for every ticker, the price row on its most recent date.
latest_prices_ddl = """
    CREATE VIEW IF NOT EXISTS latest_prices AS
    SELECT p.*
    FROM prices p
    JOIN (
      SELECT ticker, MAX(date) AS max_date
      FROM prices
      GROUP BY ticker
    ) t ON p.ticker = t.ticker AND p.date = t.max_date;
    """
# `with con:` commits when the block succeeds and rolls back if it raises.
with con:
    con.execute(latest_prices_ddl)
pd.read_sql_query("SELECT * FROM latest_prices ORDER BY ticker LIMIT 10;", con)

Unnamed: 0,ticker,date,adj_close,volume,log_return
0,AAPL,2020-09-08,72.726179,2230585,-0.000338
1,AMZN,2020-09-08,102.514707,2220739,0.006825
2,BAC,2020-09-08,97.642936,2397146,-0.020196
3,CSCO,2020-09-08,95.553353,968313,-0.000247
4,CVX,2020-09-08,98.670635,2143326,-0.008319
5,DIS,2020-09-08,79.387093,2761486,-0.002141
6,GOOGL,2020-09-08,109.333197,3050838,-0.000267
7,HD,2020-09-08,96.886174,2394816,0.007197
8,INTC,2020-09-08,97.814039,470434,0.020387
9,JNJ,2020-09-08,89.457276,4057950,-0.005492


In [None]:
# Show the (ticker, date) primary key at work: re-inserting a row that is
# already in `prices` (plain INSERT, no OR IGNORE) must be rejected.
import sqlite3
row = pd.read_sql_query("SELECT * FROM prices LIMIT 1;", con).iloc[0].to_dict()
dup_values = tuple(row[col] for col in ("ticker", "date", "adj_close", "volume", "log_return"))
try:
    # `with con:` rolls the failed insert back automatically.
    with con:
        con.execute(
            "INSERT INTO prices(ticker,date,adj_close,volume,log_return) VALUES(?,?,?,?,?)",
            dup_values
        )
except sqlite3.IntegrityError as e:
    print("IntegrityError as expected:", e)
else:
    print("Unexpected: duplicate insert succeeded (should not).")

IntegrityError as expected: UNIQUE constraint failed: prices.ticker, prices.date


## A tiny SQL I/O helper for your project

**special import** you sometimes see at the top of modern Python files:

```python
from __future__ import annotations
```

It changes how **type annotations** (the things you write after `:` and `->`) are handled.

* **Normally**:
  In Python 3.7–3.9, if you annotate with a class that hasn’t been defined yet, you must use a string (a “forward reference”):

  ```python
  class Node:
      def __init__(self, next: "Node | None" = None):
          self.next = next
  ```

* **With `from __future__ import annotations`**:
  Annotations are **stored as strings automatically**, instead of being evaluated at runtime.
  So you can write the more natural:

  ```python
  from __future__ import annotations

  class Node:
      def __init__(self, next: Node | None = None):  # no quotes needed
          self.next = next
  ```

  and Python won’t complain that `Node` isn’t defined yet.

### 1. `@contextmanager`

* Comes from `contextlib.contextmanager`.
* Lets you write a generator function with `yield` that behaves like a context manager (`with ...:` block).
* Anything before `yield` is **setup**, anything after is **cleanup**.
* Ensures cleanup happens **even if an exception is raised** inside the `with` block.

---

### 2. Function signature

```python
def connect(db_path: str | Path = DB_PATH):
```

* `db_path`: the SQLite file to connect to.
* Uses Python 3.10’s `|` union type (`str | Path`).
* Default path is `DB_PATH` (probably defined elsewhere).

* `yield con`: passes the connection to the calling code inside the `with` block.
* `finally: con.close()`: guarantees the connection is closed, no matter what happens.

---

### 4. How you use it

```python
from src.projectname.sqlio import connect

with connect() as con:
    cur = con.cursor()
    cur.execute("SELECT * FROM prices LIMIT 5;")
    rows = cur.fetchall()
# ← when the block ends, con.close() runs automatically
```


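The next code cell stores the module's source code in a triple-quoted string that begins with `"""\` (note the trailing backslash):

```python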
sqlio_py = """\
from __future__ import annotations
...
"""
```
### Without the backslash

If you write:

```python
sqlio_py = """
from __future__ import annotations
...
"""
```

the string actually starts with a **newline character (`\n`)** right at the beginning.
So `sqlio_py` would contain:

```
"\nfrom __future__ import annotations\n..."
```

Notice the unwanted blank line at the top.

---

### With the backslash

When you add a backslash immediately after the opening quotes:

```python
sqlio_py = """\
from __future__ import annotations
...
"""
```

Python discards that initial newline.
So the string begins directly with `"from __future__..."` — no leading blank line.





In [None]:
from pathlib import Path
# Create the package directory; parents=True also creates "src" when it is missing.
Path("src/projectname").mkdir(parents=True, exist_ok=True)

# Source code of src/projectname/sqlio.py, kept in a string so this cell can write it out.
# The backslash right after the opening quotes drops the leading newline, so the file
# starts directly with the `from __future__` import.
sqlio_py = """\
from __future__ import annotations
import sqlite3
import pandas as pd
from contextlib import contextmanager
from pathlib import Path

DB_PATH = Path("data/prices.db")

@contextmanager
def connect(db_path: str | Path = DB_PATH):
    con = sqlite3.connect(str(db_path))
    con.execute("PRAGMA foreign_keys = ON;")
    try:
        yield con
    finally:
        con.close()

def query_df(sql: str, params: tuple | list | None = None, db_path: str | Path = DB_PATH) -> pd.DataFrame:
    with connect(db_path) as con:
        return pd.read_sql_query(sql, con, params=params)

def sector_summary(start: str, end: str, db_path: str | Path = DB_PATH) -> pd.DataFrame:
    sql = '''
    SELECT m.sector, p.log_return
    FROM prices p JOIN meta m ON p.ticker = m.ticker
    WHERE p.date BETWEEN ? AND ?;
    '''
    df = query_df(sql, [start, end], db_path)
    if df.empty:
        return df
    g = df.assign(abs=lambda d: d["log_return"].abs()).groupby("sector")
    return g.agg(mean_abs_return=("abs","mean"),
                 mean_return=("log_return","mean"),
                 std_return=("log_return","std")).reset_index()
"""
# write_text opens, writes and closes the file in one call (no dangling file handle),
# and the explicit encoding makes the result independent of the platform default.
Path("src/projectname/sqlio.py").write_text(sqlio_py, encoding="utf-8")
print("Wrote src/projectname/sqlio.py")

Wrote src/projectname/sqlio.py


In [None]:
!pwd

/content/drive/MyDrive/dspt25/STAT4160


In [None]:
import sys
# Make the repository root importable so that `src.projectname` resolves.
# CLONE_DIR is defined in the setup cell at the top of the notebook and holds the
# same path that was previously hard-coded here; the membership check keeps
# re-runs of this cell from appending duplicate entries to sys.path.
if CLONE_DIR not in sys.path:
    sys.path.append(CLONE_DIR)

from src.projectname.sqlio import sector_summary
sector_summary("2020-01-01","2020-08-01").head()

Unnamed: 0,sector,mean_abs_return,mean_return,std_return
0,Communication Services,0.007732,-0.000296,0.009866
1,Consumer Cyclical,0.007817,-0.000618,0.009942
2,Consumer Defensive,0.007491,-0.00079,0.009153
3,Energy,0.007629,-0.000692,0.00965
4,Financial Services,0.007428,-0.000142,0.009459


In [None]:
cur.close()  # need to close cursor first

In [None]:
con.commit()  # save the change

In [None]:
con.close()

In Jupyter/Colab you can use the **SQL magic** from the `ipython-sql` (or `jupysql`) extension.

### Quick setup (one-time)

```python
%pip install -q ipython-sql sqlalchemy
%load_ext sql
```

### Connect to your SQLite DB

Use an SQLAlchemy URL.

* **Relative path:** `sqlite:///data/prices.db`
* **Absolute path:** `sqlite:////content/drive/MyDrive/dspt25/STAT4160/data/prices.db`

```python
%sql sqlite:///data/prices.db
```

### Run queries

* **Line magic** (`%sql`) for one-liners:

```python
%sql SELECT COUNT(*) AS n FROM meta;
```

* **Cell magic** (`%%sql`) for multi-line SQL:

```sql
%%sql
SELECT ticker, COUNT(*) AS days
FROM prices
GROUP BY ticker
ORDER BY days DESC
LIMIT 5;
```

### Get results into pandas

```python
# Option A: assign the result, then convert
res = %sql SELECT * FROM meta LIMIT 5;
df = res.DataFrame()

# Option B: store directly into a DataFrame named df
%sql -o df SELECT ticker, sector FROM meta LIMIT 5;
```

### Use Python variables in your SQL

```python
sym = "AAPL"
%sql SELECT date, adj_close FROM prices WHERE ticker = :sym ORDER BY date DESC LIMIT 5;
```

### Persist a pandas DataFrame to SQLite (create/append a table)

```python
# Suppose you have a DataFrame named prices_df
%sql --persist prices_df     # creates a table named prices_df
# or explicitly:
# prices_df.to_sql("prices", sqlite3.connect("data/prices.db"), if_exists="append", index=False)
```

### Tips / gotchas

* The magic opens **its own DB connection**, separate from your `sqlite3` `con`. Commit your writes first to avoid “database is locked”.
* For **absolute paths**, use **four slashes** after `sqlite:` (e.g., `sqlite:////abs/path/to.db`).
* For an **in-memory DB**, use `sqlite:///:memory:` (note: it disappears when that connection closes).



* **need `ipython-sql`** to get the `%sql` / `%%sql` magics in Jupyter.
* **need `SQLAlchemy`** because `ipython-sql` uses it under the hood to connect to databases via URLs like `sqlite:///path/to.db`. (Installing `ipython-sql` usually pulls `sqlalchemy`, but explicitly installing both avoids version/dependency hiccups.)

`%load_ext sql` loads the **IPython extension** provided by `ipython-sql`. Loading it:

* **Registers** the `%sql` (line) and `%%sql` (cell) magics.
* After that, you can connect and run SQL right in cells.

Example:

```python
%load_ext sql
%sql sqlite:///data/prices.db          # open a connection
%sql SELECT COUNT(*) AS n FROM meta;   # run a one-line query
```

* The connection string you pass (`sqlite:///...`, `postgresql://...`, `mysql+pymysql://...`) is a **SQLAlchemy URL**.
* `ipython-sql` uses SQLAlchemy’s engines/dialects to handle connections and execute your SQL.



* **`%pip`** → IPython **magic**. **Recommended.** Runs `python -m pip` with the **same interpreter as the kernel**, and refreshes the environment so installs are available immediately (when possible).
* **`!pip`** → **shell command**. May call a **different** `pip` from your system `PATH`, so packages can end up in the wrong environment and not be importable in the notebook.

---

### Why `%pip` is safer

* Uses the **kernel’s Python** (same `sys.executable`) → installs land in the notebook’s environment.
* After install, IPython updates import paths so you can typically `import` right away.
* Same idea applies to **`%conda`** vs `!conda`.

### What `!pip` really does

* The leading `!` runs a **shell** command. It picks whichever `pip` is first on your `PATH` (could be system Python, not your kernel’s).
* That’s why you sometimes install a package and still get `ModuleNotFoundError` in the next cell.


### Quick sanity check snippets

```python
import sys, subprocess, shlex

print("Kernel Python:", sys.executable)
print("python -m pip ->", subprocess.check_output([sys.executable, "-m", "pip", "--version"]).decode().strip())
# Compare with the shell's pip:
# (May be different!)
# !pip --version
```



In [None]:
%pip install -q ipython-sql sqlalchemy



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.6 MB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m1.2/1.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25h

Note that you may hit a known incompatibility between the **%sql** magic and **PrettyTable ≥ 3.12**: PrettyTable moved its style constants, so `%sql`’s default style lookup for `"DEFAULT"` crashes with `KeyError: 'DEFAULT'`.

### Quick fixes (pick one)

**A) Set the old fallback style once per notebook**

```python
%config SqlMagic.style = '_DEPRECATED_DEFAULT'
```

Then re-run your `%%sql` cell. This is the simplest workaround.

**B) Pin PrettyTable to a pre-change version**

```python
%pip install "prettytable<3.12"
```

Restart the kernel, `%load_ext sql`, reconnect, and run your query. (The break came with PrettyTable 3.12.)



In [None]:
%reload_ext sql
# %reload_ext sql
%sql sqlite:///data/prices.db
# %sql sqlite:////content/drive/MyDrive/dspt25/STAT4160/data/prices.db

# %config SqlMagic.autocommit = True

# Create connection and cursor objects.
# con = sqlite3.connect("data/prices.db")
# cur = con.cursor()
# print(con)



In [None]:
%config SqlMagic.style = '_DEPRECATED_DEFAULT'


In [None]:
%%sql
SELECT sqlite_version();  -- check SQLite version


 * sqlite:///data/prices.db
Done.


sqlite_version()
3.37.2


In [None]:
%%sql
SELECT name FROM sqlite_master WHERE type='table'; -- list all tables


 * sqlite:///data/prices.db
Done.


name
meta
prices


In [None]:
%%sql
PRAGMA database_list;  -- list all databases

 * sqlite:///data/prices.db
Done.


seq,name,file
0,main,/content/drive/MyDrive/dspt25/STAT4160/data/prices.db


In [None]:
%%sql
SELECT sql FROM sqlite_master
WHERE type='table'
ORDER BY name;                      -- full CREATE TABLE statements

 * sqlite:///data/prices.db
Done.


sql
"CREATE TABLE meta (  ticker TEXT PRIMARY KEY,  name TEXT,  sector TEXT NOT NULL )"
"CREATE TABLE prices (  ticker TEXT NOT NULL,  date TEXT NOT NULL, -- ISO 'YYYY-MM-DD'  adj_close REAL NOT NULL CHECK (adj_close >= 0),  volume INTEGER NOT NULL CHECK (volume >= 0),  log_return REAL NOT NULL,  PRIMARY KEY (ticker, date),  FOREIGN KEY (ticker) REFERENCES meta(ticker) )"


In [None]:
%%sql
SELECT * FROM meta LIMIT 5;


 * sqlite:///data/prices.db
Done.


ticker,name,sector
AAPL,Apple Inc.,Technology
MSFT,Microsoft Corporation,Technology
AMZN,"Amazon.com, Inc.",Consumer Cyclical
GOOGL,Alphabet Inc.,Communication Services
META,"Meta Platforms, Inc.",Communication Services


In [None]:
%%sql
SELECT ticker, COUNT(*) AS days
FROM prices
GROUP BY ticker
ORDER BY days DESC
LIMIT 5;


 * sqlite:///data/prices.db
Done.


ticker,days
AAPL,180
AMZN,180
BAC,180
CSCO,180
CVX,180


## To close a SQL magic connection, first list all connections:

In [None]:
%sql --connections

{'sqlite:///data/prices.db': <sql.connection.Connection at 0x78026ed93aa0>}

In [None]:
%sql --close sqlite:///data/prices.db  # close a sql magic connection

Why close the cursor before committing?

* SQLite raises “**cannot commit transaction – SQL statements in progress**” when any cursor on the same connection still has an active statement/result set.
* Closing the cursor guarantees there’s no active statement before you commit.

Safe patterns:

```python
# WRITE-ONLY path (either order is fine, but this is safest)
cur = con.cursor()
cur.executemany("INSERT INTO meta(ticker,name,sector) VALUES(?, ?, ?)", rows)
cur.close()          # ensure no statements in progress
con.commit()
```


## When is a SQLite database “locked”?

SQLite is lightweight and uses file locks for concurrency:

* **Write lock**: Only one writer at a time. If a transaction is writing (`INSERT`, `UPDATE`, `DELETE`) and hasn’t been committed yet, the whole DB is locked for other writes.
* **Read lock**: Readers normally can coexist, but if a connection uses *exclusive* locking mode (`PRAGMA locking_mode = EXCLUSIVE`) or another reader hasn’t finished, conflicts can occur.
* **Common causes of “database is locked”**:

  * A previous connection started a transaction and never did `commit()` or `rollback()`.
  * You opened multiple connections (e.g. in different notebook cells) pointing to the same file, and one is still busy.
  * The DB file lives on a network/Google Drive mount, where file locking is flaky.

---

##  How to properly close connections

* **If you’re using the `sqlite3` module directly**:

  ```python
  import sqlite3
  conn = sqlite3.connect("data/prices.db")
  cur = conn.cursor()
  cur.execute("SELECT * FROM prices LIMIT 5;")
  rows = cur.fetchall()
  # Clean up
  cur.close()      # optional but recommended
  conn.close()     # IMPORTANT: frees the lock
  ```

  Closing the **connection** is what really releases the file lock.
  The cursor can be closed too, but it’s less critical; unclosed connections are the main cause of locks.


* **If you’re using Jupyter `ipython-sql` magic (`%%sql`)**:

  * Connections stay open for the session. To fully close:

    ```python
    %sql --close sqlite:///data/prices.db
    ```

    or restart the kernel if needed.

---

##  Do you need to close the cursor?

* **Cursor**: it’s good practice to close, but Python’s garbage collector will usually clean it up.
* **Connection**: must be closed to release file locks. This is the critical step.

---

**Summary:**

* A database is locked when another connection/transaction hasn’t finished.
* Always `commit()` (if you wrote) and `close()` your connection.
* Closing the cursor is optional but cleaner.
* With SQLAlchemy, use context managers (`with ...`) or `dispose()`.



###  Rule:

* **If you’ve made changes** (e.g. `INSERT`, `UPDATE`, `DELETE`):

  * Yes — you must `commit()` before `close()` to save them.
  * Otherwise SQLite will roll back uncommitted changes when you close.

* **If you only read** (`SELECT` queries, no modifications):

  * No — you don’t need `commit()`.
  * You can just `close()` the connection safely.

---

### Example: writing

```python
import sqlite3

conn = sqlite3.connect("data/prices.db")
cur = conn.cursor()

cur.execute("INSERT INTO logs(message) VALUES (?)", ("hello",))

conn.commit()   #  saves the row
cur.close()
conn.close()
```

If you skip `conn.commit()`, the inserted row disappears when you close the connection.

---

### Example: reading only

```python
import sqlite3

conn = sqlite3.connect("data/prices.db")
cur = conn.cursor()

cur.execute("SELECT * FROM prices LIMIT 5;")
rows = cur.fetchall()

cur.close()
conn.close()    #  commit not needed, nothing to save
```



# Homework

With `args, _ = ap.parse_known_args()`, `argparse` parses the arguments you care about (`--db`, `--tickers`, `--prices`) and silently ignores the `-f …` argument injected by Colab.

In [None]:
# scripts/build_db.py  # save this cell's code to that path; when saving, omit this line so the shebang below is the file's first line
#!/usr/bin/env python
import argparse, sys, textwrap, sqlite3
from pathlib import Path
import pandas as pd, numpy as np

# Schema script for the price database; main() runs it with executescript().
# - PRAGMA foreign_keys = ON turns on foreign-key enforcement for the connection
#   that executes the script.
# - meta:   one row per ticker (primary key); sector is required.
# - prices: one row per (ticker, date); adj_close and volume must be >= 0, and
#           ticker must reference an existing meta.ticker.
# - idx_prices_date: index on prices(date).
# Every CREATE uses IF NOT EXISTS, so running the script against an existing
# database does not fail.
DDL = textwrap.dedent("""
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS meta (
  ticker TEXT PRIMARY KEY,
  name   TEXT,
  sector TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS prices (
  ticker     TEXT NOT NULL,
  date       TEXT NOT NULL,
  adj_close  REAL NOT NULL CHECK (adj_close >= 0),
  volume     INTEGER NOT NULL CHECK (volume >= 0),
  log_return REAL NOT NULL,
  PRIMARY KEY (ticker,date),
  FOREIGN KEY (ticker) REFERENCES meta(ticker)
);
CREATE INDEX IF NOT EXISTS idx_prices_date ON prices(date);
""")

def load_meta(con, tickers_csv: Path):
    """Insert or refresh one ``meta`` row per unique ticker found in ``tickers_csv``.

    Args:
        con: open ``sqlite3`` connection whose database contains the ``meta`` table.
        tickers_csv: path to a CSV file with a ``ticker`` column.

    Raises:
        SystemExit: if ``tickers_csv`` does not exist.

    The ``name`` column is filled with the ticker itself, and sectors are
    assigned round-robin from a fixed list of five placeholder sectors.
    """
    # Guard clause: stop early when the input file is missing.
    if not tickers_csv.exists():
        raise SystemExit(f"tickers CSV not found: {tickers_csv}")

    symbols = pd.read_csv(tickers_csv)["ticker"].dropna().unique().tolist()
    sectors = ["Technology","Financials","Healthcare","Energy","Consumer"]
    n_sectors = len(sectors)
    # One (ticker, name, sector) tuple per symbol, cycling through the sector list.
    rows = [(sym, sym, sectors[pos % n_sectors]) for pos, sym in enumerate(symbols)]

    # `with con` commits the inserts on success and rolls back on error.
    with con:
        con.executemany("INSERT OR REPLACE INTO meta(ticker,name,sector) VALUES(?,?,?)",
                        rows)

def load_prices(con, prices_csv: Path):
    """Bulk-load price rows from ``prices_csv`` into the ``prices`` table.

    Args:
        con: open ``sqlite3`` connection whose database contains the ``prices`` table.
        prices_csv: path to a CSV with at least the columns ``ticker``, ``date``,
            ``adj_close``, ``volume`` and ``log_return``.

    Dates are stored as ``YYYY-MM-DD`` text. When the CSV repeats a
    (ticker, date) pair only the first occurrence is kept, and rows already in
    the table with the same key are replaced.
    """
    wanted = ["ticker","date","adj_close","volume","log_return"]
    insert_sql = (
        "INSERT OR REPLACE INTO prices(ticker,date,adj_close,volume,log_return) VALUES(?,?,?,?,?)"
    )

    raw = pd.read_csv(prices_csv, parse_dates=["date"])
    # Normalise the parsed timestamps to ISO date strings before storing them.
    raw["date"] = raw["date"].dt.strftime("%Y-%m-%d")
    unique_rows = raw.loc[:, wanted].drop_duplicates(subset=["ticker","date"])

    # `with con` commits the inserts on success and rolls back on error.
    with con:
        con.executemany(insert_sql, unique_rows.itertuples(index=False, name=None))

def main():
    """Build (or refresh) the SQLite database from the tickers and prices CSVs.

    Command-line options:
        --db       SQLite file to create or update (default ``data/prices.db``).
        --tickers  CSV with a ``ticker`` column (default ``tickers_25.csv``).
        --prices   CSV of price rows (default ``data/raw/prices.csv``).

    The parent directory of ``--db`` is created when missing. The schema in
    ``DDL`` is applied first, then ``meta`` and ``prices`` are loaded.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="data/prices.db")
    ap.add_argument("--tickers", default="tickers_25.csv")
    ap.add_argument("--prices", default="data/raw/prices.csv")
    # parse_known_args (instead of parse_args) ignores arguments this script does
    # not define, such as the "-f ..." that Colab puts on the command line.
    args, _ = ap.parse_known_args()

    Path(args.db).parent.mkdir(parents=True, exist_ok=True)
    con = sqlite3.connect(args.db)
    try:
        con.executescript(DDL)
        load_meta(con, Path(args.tickers))
        load_prices(con, Path(args.prices))
    finally:
        # Close even when a loader fails, so a failed build does not leave an
        # open connection to the database file behind.
        con.close()
    print("Built DB:", args.db)

if __name__ == "__main__":
    # main() is called directly rather than via sys.exit(main()): inside a
    # notebook cell sys.exit() would raise SystemExit in the kernel.
    main()

Built DB: data/prices.db


In [None]:
import os, stat, pathlib
# Add the owner-execute bit to the build script while keeping its other mode bits.
p = pathlib.Path("scripts/build_db.py")
current_mode = p.stat().st_mode
p.chmod(current_mode | stat.S_IXUSR)  # S_IXUSR is the same bit as S_IEXEC
print("Ready:", p)

Ready: scripts/build_db.py


Append to your Makefile. Every recipe line must start with a TAB, and the multi-line `sql-report` recipe only works when the whole recipe runs in one shell (i.e. the Makefile declares `.ONESHELL:`), because the heredoc spans several lines:
```
DB := data/prices.db

.PHONY: db sql-report
db: ## Build/refresh SQLite database from CSVs
	python scripts/build_db.py --db $(DB) --tickers tickers_25.csv --prices data/raw/prices.csv

sql-report: db ## Generate a simple SQL-driven CSV summary
	python - <<-'PY'
	import pandas as pd, sqlite3, os
	con = sqlite3.connect("data/prices.db")
	df = pd.read_sql_query("""
	SELECT m.sector,
	  COUNT(*) AS n_obs,
	  AVG(ABS(p.log_return)) AS mean_abs_return
	FROM prices p
	JOIN meta m ON p.ticker=m.ticker
	GROUP BY m.sector ORDER BY n_obs DESC;
	""", con)
	os.makedirs("reports", exist_ok=True)
	df.to_csv("reports/sql_sector_summary.csv", index=False)
	print(df.head())
	con.close()
	PY
```

### 1. The command wrapper

```bash
python - << 'PY'
...
PY
```

* `python -` tells Python to **read code from stdin** (instead of a `.py` file).
* The `<< 'PY' ... PY` part is a **Bash here-document**:
  it sends everything between the two `PY` markers into Python’s stdin.
  (The quotes `'PY'` mean no variable expansion or substitution will happen inside.)

So, you’re running an inline Python script directly from the shell.

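Inside a Makefile recipe every line must start with a TAB, so use `<<-'PY'` there instead: the `-` makes Bash strip the leading TABs from the heredoc body and from the closing `PY` marker.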

In [None]:
%%bash
# BACK UP FIRST
cp Makefile Makefile.bak
# Replace lines that BEGIN with 4 spaces by a single tab
perl -i -pe 's/^\h{4}(?=\S)/\t/' Makefile
cat Makefile

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.py --input $(D

In [None]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
make db
make sql-report

python scripts/build_db.py --db data/prices.db --tickers tickers_25.csv --prices data/raw/prices.csv
Built DB: data/prices.db
python scripts/build_db.py --db data/prices.db --tickers tickers_25.csv --prices data/raw/prices.csv
Built DB: data/prices.db
python - <<-'PY'
import pandas as pd, sqlite3, os
con = sqlite3.connect("data/prices.db")
df = pd.read_sql_query("""
SELECT m.sector,
COUNT(*) AS n_obs,
AVG(ABS(p.log_return)) AS mean_abs_return
FROM prices p
JOIN meta m ON p.ticker = m.ticker
GROUP BY m.sector
ORDER BY n_obs DESC;
""", con)
os.makedirs("reports", exist_ok=True)
df.to_csv("reports/sql_sector_summary.csv", index=False)
print(df.head())
con.close()
PY
       sector  n_obs  mean_abs_return
0  Technology    900         0.008182
1  Healthcare    900         0.007570
2  Financials    900         0.007768
3      Energy    900         0.007751
4    Consumer    900         0.007787


In [19]:
%load_ext sql
# %reload_ext sql
%sql sqlite:///data/prices.db

In [21]:
%config SqlMagic.style = '_DEPRECATED_DEFAULT'


In [22]:
%%sql
-- sql/sector_top_moves.sql
SELECT m.sector, p.ticker, p.date, p.log_return, ABS(p.log_return) AS abs_move
FROM prices p JOIN meta m ON p.ticker = m.ticker
ORDER BY abs_move DESC
LIMIT 10;

 * sqlite:///data/prices.db
Done.


sector,ticker,date,log_return,abs_move
Healthcare,DIS,2020-05-26,0.0406160631065191,0.0406160631065191
Energy,JNJ,2020-08-26,-0.0366108181020674,0.0366108181020674
Energy,NFLX,2020-02-24,-0.0352673134914223,0.0352673134914223
Healthcare,T,2020-04-22,-0.0324010609621243,0.0324010609621243
Consumer,INTC,2020-08-13,0.032125524475993,0.032125524475993
Financials,HD,2020-07-15,0.0319342315632713,0.0319342315632713
Energy,XOM,2020-06-09,0.0315028786157123,0.0315028786157123
Consumer,INTC,2020-07-23,-0.0312024480398154,0.0312024480398154
Energy,NFLX,2020-04-17,-0.0308633642099511,0.0308633642099511
Technology,CSCO,2020-08-03,0.0308342368173999,0.0308342368173999


In [26]:
from pathlib import Path

# Target file for the query; create its folder up front.
path = Path("sql/sector_top_moves.sql")
path.parent.mkdir(parents=True, exist_ok=True)

# Query text: the ten rows with the largest absolute log return, with each
# ticker's sector joined in from meta.
sql_code = """\
-- sql/sector_top_moves.sql
SELECT m.sector, p.ticker, p.date, p.log_return, ABS(p.log_return) AS abs_move
FROM prices p JOIN meta m ON p.ticker = m.ticker
ORDER BY abs_move DESC
LIMIT 10;
"""

# Write the SQL text as UTF-8; the with-block closes the file afterwards.
with path.open("w", encoding="utf-8") as sql_file:
    sql_file.write(sql_code)

print(f"SQL written to {path}")


SQL written to sql/sector_top_moves.sql


If using heredoc in bash:
```
mkdir -p sql
cat <<'SQL' > sql/sector_top_moves.sql
-- sql/sector_top_moves.sql
SELECT m.sector, p.ticker, p.date, p.log_return, ABS(p.log_return) AS abs_move
FROM prices p JOIN meta m ON p.ticker = m.ticker
ORDER BY abs_move DESC
LIMIT 10;
SQL

```

In [27]:
# scripts/run_sql.py
#!/usr/bin/env python
import argparse, sqlite3, pandas as pd
from pathlib import Path

def main():
    """Run the SQL stored in a file against a SQLite database and show the result.

    Command-line options:
        --db       SQLite database file (default ``data/prices.db``).
        --sqlfile  file containing the query (default ``sql/sector_top_moves.sql``).
        --params   zero or more values bound to the query's placeholders.
        --out      optional CSV path for the full result; nothing is written
                   when it is empty (the default).

    The first rows of the result are printed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="data/prices.db")
    # --sqlfile has a default (rather than required=True), presumably so the cell
    # also runs inside a notebook where no command-line arguments are supplied.
    ap.add_argument("--sqlfile", default="sql/sector_top_moves.sql")
    ap.add_argument("--params", nargs="*", default=[])
    ap.add_argument("--out", default="")
    # parse_known_args ignores arguments this script does not define
    # (e.g. the "-f ..." injected by Colab).
    args, _ = ap.parse_known_args()

    # Read with an explicit encoding so the result does not depend on the
    # platform default.
    sql = Path(args.sqlfile).read_text(encoding="utf-8")
    con = sqlite3.connect(args.db)
    try:
        # An empty --params list is passed as None, i.e. "no parameters".
        df = pd.read_sql_query(sql, con, params=args.params or None)
    finally:
        # Close the connection even when the query fails.
        con.close()
    if args.out:
        Path(args.out).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(args.out, index=False)
    print(df.head())

if __name__ == "__main__":
    main()

       sector ticker        date  log_return  abs_move
0  Healthcare    DIS  2020-05-26    0.040616  0.040616
1      Energy    JNJ  2020-08-26   -0.036611  0.036611
2      Energy   NFLX  2020-02-24   -0.035267  0.035267
3  Healthcare      T  2020-04-22   -0.032401  0.032401
4    Consumer   INTC  2020-08-13    0.032126  0.032126


In [29]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
python scripts/run_sql.py --sqlfile sql/sector_top_moves.sql --out reports/sector_top_moves.csv

       sector ticker        date  log_return  abs_move
0  Healthcare    DIS  2020-05-26    0.040616  0.040616
1      Energy    JNJ  2020-08-26   -0.036611  0.036611
2      Energy   NFLX  2020-02-24   -0.035267  0.035267
3  Healthcare      T  2020-04-22   -0.032401  0.032401
4    Consumer   INTC  2020-08-13    0.032126  0.032126
