In [12]:
# from google.colab import drive
# drive.flush_and_unmount()           # ignore errors if already unmounted

#If cannot remount, simply delete the mounted drive and then remount
# rm -rf /content/drive


In [13]:
# Colab cell: mount Google Drive at /content/drive (BASE_DIR in the next cell lives under it).
from google.colab import drive

# force_remount=True asks Colab to mount again even if the path is already mounted.
drive.mount('/content/drive', force_remount=True)



Mounted at /content/drive


In [14]:
# Adjust these two for YOUR repo
REPO_OWNER = "kadkins3880"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1
# Folder on the mounted Drive that holds the checkout.
BASE_DIR   = "/content/drive/MyDrive/dspt25"
# Derived values: local checkout path and the HTTPS clone URL.
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

# Alternative settings when running on the instructor's office computer (kept commented out)

# REPO_NAME  = "lectureNotes"   # e.g., on my office computer
# BASE_DIR = r"E:\OneDrive - Auburn University Montgomery\teaching\AUM\STAT 4160 Productivity Tools" # on my office computer
# CLONE_DIR  = f"{BASE_DIR}\{REPO_NAME}"

import os, pathlib
# Create BASE_DIR (and any missing parents); no error if it already exists.
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [15]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


A **REST API** (Representational State Transfer, Application Programming Interface) is a way for different programs or systems to communicate over the web using **HTTP requests and responses**. It’s one of the most common patterns used today for backend services, mobile apps, and integrations.

---

## 1. Key ideas

* **Resources**: Everything is modeled as a resource (e.g., `users`, `orders`, `stocks`).
* **Endpoints (URLs)**: Each resource has a unique URL.
  Example:

  * `/users` → all users
  * `/users/42` → user with ID 42
* **HTTP methods** define the action:

  * `GET` → Read
  * `POST` → Create
  * `PUT` or `PATCH` → Update
  * `DELETE` → Delete
* **Stateless**: Each request is independent (server doesn’t remember previous state).
* **Data formats**: Typically JSON (but XML, CSV, etc. are possible).

---

## 2. Example REST API design (for a TODO app)

| Method   | Endpoint   | Meaning                 |
| -------- | ---------- | ----------------------- |
| `GET`    | `/tasks`   | List all tasks          |
| `POST`   | `/tasks`   | Create a new task       |
| `GET`    | `/tasks/1` | Get task with ID = 1    |
| `PUT`    | `/tasks/1` | Update task 1 (replace) |
| `PATCH`  | `/tasks/1` | Update part of task 1   |
| `DELETE` | `/tasks/1` | Delete task 1           |

---

## 3. Sample request & response

**Request:**

```http
GET /tasks/1 HTTP/1.1
Host: api.example.com
Accept: application/json
```

**Response:**

```json
{
  "id": 1,
  "title": "Finish homework",
  "done": false
}
```

---

## 4. Example in Python (using Flask)

```python
from flask import Flask, jsonify, request

app = Flask(__name__)

tasks = [{"id": 1, "title": "Finish homework", "done": False}]

@app.route("/tasks", methods=["GET"])
def get_tasks():
    return jsonify(tasks)

@app.route("/tasks/<int:task_id>", methods=["GET"])
def get_task(task_id):
    task = next((t for t in tasks if t["id"] == task_id), None)
    if task:
        return jsonify(task)
    return jsonify({"error": "Not found"}), 404

@app.route("/tasks", methods=["POST"])
def create_task():
    new_task = request.json
    new_task["id"] = len(tasks) + 1
    tasks.append(new_task)
    return jsonify(new_task), 201

if __name__ == "__main__":
    app.run(debug=True)
```

* Start the server: `python app.py`
* Try `GET http://127.0.0.1:5000/tasks`

---

## 5. Where REST APIs are used

* Mobile apps talking to backends
* Web frontends talking to servers
* Integrations between services (e.g., Slack, GitHub, Google Maps API)
* Internal microservices communication

---
In short: A REST API is a **structured way of exposing functionality/data** over the web using URLs and HTTP verbs.



```python
from pathlib import Path

Path(".env.template").write_text("FRED_API_KEY=\n")
```

### Step by step

1. `Path(".env.template")`
   → Creates a `Path` object pointing to a file named `.env.template` in the current working directory.

2. `.write_text("FRED_API_KEY=\n")`
   → Opens the file for writing (creates it if it doesn’t exist, overwrites if it does).
   → Writes the string `"FRED_API_KEY=\n"` into it.
   → Returns the number of characters written (`14` here: 12 for `FRED_API_KEY`, plus `=` and the newline).

The file `.env.template` will contain:

```
FRED_API_KEY=
```

(blank after the `=`) — it’s a placeholder where the user is supposed to put their actual API key for the [FRED (Federal Reserve Economic Data)](https://fred.stlouisfed.org/) service.

### Why `.env.template`?

* Convention: you keep a **template** file (with empty placeholders) in your repo.
* Users copy it to `.env` and fill in secrets locally:

  ```bash
  cp .env.template .env
  ```
* Then `.env` is listed in `.gitignore` so your real keys never get committed.

So this line is just **setting up a boilerplate environment variable file** for API credentials.


```python
for line in [".env", ".cache/", "__pycache__/"]:
    if line not in gi_txt:
        gi_txt += ("\n" if not gi_txt.endswith("\n") else "") + line
```

1. **The list being looped over:**

   ```python
   [".env", ".cache/", "__pycache__/"]
   ```

   These are paths you usually don’t want tracked in Git:

   * `.env` → contains secrets / environment vars
   * `.cache/` → cache files/folders
   * `__pycache__/` → Python bytecode cache

2. **Check if already present:**

   ```python
   if line not in gi_txt:
   ```

   `gi_txt` is presumably a string with the current `.gitignore` content.
   This avoids adding duplicates.

3. **Add with newline handling:**

   ```python
   gi_txt += ("\n" if not gi_txt.endswith("\n") else "") + line
   ```

   * If `gi_txt` doesn’t end with a newline, add one first.
   * Then append the new ignore pattern.

---

### Example

Suppose `.gitignore` currently contains:

```
*.parquet
```

So `gi_txt = "*.parquet"`.

After running the loop:

```
*.parquet
.env
.cache/
__pycache__/
```







In [16]:
import os, pathlib, json, hashlib, time, sqlite3, pandas as pd, numpy as np
from pathlib import Path


def add_ignore_patterns(text, patterns):
    """Return `.gitignore` content `text` with every missing pattern appended.

    A pattern counts as present only when it matches a whole line (surrounding
    whitespace ignored). A substring test would wrongly treat `.env` as present
    when the file merely contains `.env.template`.

    Args:
        text: current `.gitignore` content (may be empty).
        patterns: iterable of ignore patterns to guarantee.

    Returns:
        The updated content. Every appended pattern ends with a newline, and no
        blank line is added at the start of an empty file.
    """
    present = {existing.strip() for existing in text.splitlines()}
    for pattern in patterns:
        if pattern in present:
            continue
        if text and not text.endswith("\n"):
            text += "\n"
        text += pattern + "\n"
        present.add(pattern)
    return text


# .env template for secrets
Path(".env.template").write_text("FRED_API_KEY=\n")
# Ensure .gitignore has secrets & cache
gi = Path(".gitignore")
gi_txt = gi.read_text() if gi.exists() else ""
gi_txt = add_ignore_patterns(gi_txt, [".env", ".cache/", "__pycache__/"])
gi.write_text(gi_txt)
print("Ready. Fill your FRED key in a local .env (do not commit).")

Ready. Fill your FRED key in a local .env (do not commit).


`load_dotenv()` is a function from the **[python-dotenv](https://pypi.org/project/python-dotenv/)** package.

It’s used to load environment variables from a `.env` file into your Python process, so you can access them with `os.getenv` or `os.environ`.

### Typical workflow

1. **Install**:

```bash
pip install python-dotenv
```

2. **Create a `.env` file** (not tracked in Git):

```
FRED_API_KEY=abc123
DB_USER=alice
DB_PASS=secret
```

3. **Load variables in Python**:

```python
import os
from dotenv import load_dotenv

# By default, looks for a file named `.env` in the current directory
load_dotenv()

# Access them like normal env vars
print(os.getenv("FRED_API_KEY"))  # abc123
```

---

### What happens inside

* Reads the `.env` file line by line.
* For each `KEY=VALUE`, it puts them into the process environment (`os.environ`).
* Does not override variables that are already set in the environment, unless you call:

  ```python
  load_dotenv(override=True)
  ```

---

### Custom file path

```python
from dotenv import load_dotenv
from pathlib import Path

load_dotenv(dotenv_path=Path("/path/to/.env.local"))
```

---


You created `.env.template` with `FRED_API_KEY=`. Once you copy it to `.env` and fill in your real key, `load_dotenv()` will make `FRED_API_KEY` available in Python.




The function `sess()` is building a **pre-configured `requests.Session`** object for making HTTP calls.


1. **`requests.Session()`**

   * Creates a reusable HTTP session.
   * Benefits:

     * Reuses TCP connections (faster).
     * Can hold default headers, cookies, etc.
     * Better performance for multiple requests to the same host.

2. **Custom `User-Agent`**

   ```python
   s.headers.update({"User-Agent": "dspt-class/1.0"})
   ```

   * Identifies your client when making requests.
   * Some APIs require this; good practice for rate-limited APIs like FRED.

3. **Mount adapter with retries**

   ```python
   s.mount("https://", HTTPAdapter(max_retries=Retry(...)))
   ```

   * Tells the session: “whenever making HTTPS requests, use this retry policy.”
   * `Retry(total=3, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])`:

     * **total=3** → up to 3 retry attempts.
     * **backoff_factor=0.5** → exponential backoff between retries:

       * 0.5s, 1s, 2s … (depending on attempt count).
     * **status_forcelist** → retry only on these status codes:

       * `429` (Too Many Requests, often rate limiting).
       * `500, 502, 503, 504` (internal server error / bad gateway / service unavailable / gateway timeout).



This snippet is a classic **cache key generator** for API requests.

```python
def cache_key(url, params):
    raw = url + "?" + "&".join(f"{k}={params[k]}" for k in sorted(params))
    return hashlib.sha1(raw.encode()).hexdigest()
```



### Step 1. `url + "?" + ...`

* Start with the base URL (e.g. `"https://api.stlouisfed.org/fred/series"`).
* Add a `?`, then build a query string from the parameters.

### Step 2. `"&".join(f"{k}={params[k]}" for k in sorted(params))`

* Loops over the keys in `params`, sorted alphabetically → ensures the order is consistent.

  * Without sorting, `{"a":1,"b":2}` vs `{"b":2,"a":1}` would produce different strings.
* Formats each key/value as `k=value`.
* Joins them with `&`.
* Example:

  ```python
  params = {"series_id":"GDP","api_key":"XYZ"}
  ```

  → `"api_key=XYZ&series_id=GDP"`

So `raw` might look like:

```
https://api.stlouisfed.org/fred/series?api_key=XYZ&series_id=GDP
```

### Step 3. `hashlib.sha1(raw.encode()).hexdigest()`

* Computes a **SHA-1 hash** of that string.
* The hashing functions in `hashlib` operate on bytes, not text. `.encode()` converts a string into a bytes object — a binary representation of the text.
* `.hexdigest()` gives a fixed-length hex string (40 characters).
* Example:

  ```
  "f3a21d84b5a31fe8afc86f9c834b31d25e6db852"
  ```

---

### Why hash it?

* File systems don’t like really long filenames with `?` and `&`.
* Hashing gives you a compact, unique identifier you can use as a cache key (e.g. for filenames in `.cache/`).

---

### Example in action

```python
import hashlib

def cache_key(url, params):
    raw = url + "?" + "&".join(f"{k}={params[k]}" for k in sorted(params))
    return hashlib.sha1(raw.encode()).hexdigest()

url = "https://api.stlouisfed.org/fred/series"
params = {"series_id": "GDP", "api_key": "XYZ"}
print(cache_key(url, params))
# → "f3a21d84b5a31fe8afc86f9c834b31d25e6db852"
```

---


This function takes a URL + parameters, makes a **deterministic string**, and returns a **SHA-1 hash** you can use as a stable cache key (for memoization, local disk cache, etc.).




A **hash code** is a (usually fixed-length) number produced by running some input data through a **hash function**. Think of it as a compact “fingerprint” of the data.

## Key properties

* **Deterministic:** same input → same output.
* **Fixed size:** output length doesn’t depend on input size (e.g., SHA-256 is always 256 bits).
* **Fast to compute.**
* **Sensitive to changes:** tiny input change → very different output (avalanche effect).
* **Collisions exist:** different inputs *can* share a hash (inevitable, but rare for good functions).

## Two big families

* **Cryptographic hash functions** (e.g., SHA-256, BLAKE3): designed to be collision-resistant and hard to invert. Used for integrity checks, digital signatures, content addressing.
* **Non-cryptographic hash functions** (e.g., MurmurHash, xxHash): very fast, good for hash tables and partitioning, but not secure for integrity/attacks.

## Common uses

* **Hash tables / dictionaries:** map keys to buckets quickly.
* **Caching & deduplication:** store by hash to avoid recomputing/duplicating.
* **Integrity checks:** verify downloads (hash matches = not corrupted).
* **Digital signatures & blockchain:** sign/chain fixed-length digests, not whole files.

## Python quick examples

```python
# Cryptographic (stable across runs)
import hashlib
data = b"hello"
print(hashlib.sha256(data).hexdigest())  # 64 hex chars (256 bits)

# Non-crypto: Python’s built-in hash() is for hash tables, but note:
# it's salted and changes each process by default (not stable across runs).
print(hash("hello"))  # varies between Python processes
```

## Terminology

* **Hash / digest / fingerprint** are often used interchangeably.
* **Checksum** (e.g., CRC32) is a simpler integrity check, faster but weaker than cryptographic hashes.



This snippet is a nice **API response caching helper**:

```python
def cached_get(url, params, ttl_hours=24):
    key = cache_key(url, params)
    path = Path(f".cache/api/{key}.json")
    if path.exists() and (time.time() - path.stat().st_mtime < ttl_hours*3600):
        return json.loads(path.read_text())
    s = session_with_retry()
    r = s.get(url, params=params, timeout=20)
    r.raise_for_status()
    data = r.json()
    path.write_text(json.dumps(data))
    return data
```

---

1. **Generate a cache key**

   ```python
   key = cache_key(url, params)
   path = Path(f".cache/api/{key}.json")
   ```

   * Uses your earlier `cache_key()` (SHA-1 hash of URL+params).
   * Stores responses under `.cache/api/<hash>.json`.
   * This avoids super long filenames with `?` and `&`.

2. **Check if a valid cached file exists**

   ```python
   if path.exists() and (time.time() - path.stat().st_mtime < ttl_hours*3600):
       return json.loads(path.read_text())
   ```

   * If the file exists **and** it’s fresh (modified less than `ttl_hours` ago):
     → Load it with `json.loads` and return immediately.
   * `path.stat().st_mtime` = last modified timestamp.
   * `ttl_hours*3600` converts hours → seconds.

3. **Otherwise, make a real API request**

   ```python
   s = session_with_retry()
   r = s.get(url, params=params, timeout=20)
   r.raise_for_status()
   data = r.json()
   ```

   * Uses your custom `session_with_retry()` (the one with retries & User-Agent).
   * `timeout=20` → don’t hang forever.
   * `r.raise_for_status()` → throw an error if HTTP code isn’t 2xx.
   * `r.json()` → parse the response body as JSON.

4. **Cache the response**

   ```python
   path.write_text(json.dumps(data))
   ```

   * Saves the JSON to disk for next time.

5. **Return the fresh data**

   ```python
   return data
   ```

---

### Example usage

```python
url = "https://api.stlouisfed.org/fred/series"
params = {"series_id": "GDP", "api_key": "...", "file_type": "json"}

gdp_data = cached_get(url, params, ttl_hours=12)
print(gdp_data.keys())
```

First call → fetches from FRED API and writes `.cache/api/<hash>.json`.
Second call (within 12h) → just loads the cached file (no API hit).

---

### Why this is useful

* **Faster**: repeated runs don’t hammer the API.
* **Resilient**: avoids hitting API rate limits (429).
* **Reproducible**: results are stored locally, so even if API changes, you can reproduce old runs.




In [17]:
import os, requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from dotenv import load_dotenv

load_dotenv()  # reads .env if present

def session_with_retry(total=3, backoff=0.5):
    """Build a `requests.Session` whose HTTPS requests are retried automatically.

    Args:
        total: value passed to `Retry(total=...)`.
        backoff: value passed to `Retry(backoff_factor=...)`.

    Returns:
        A session with a custom User-Agent header and, for every `https://` URL,
        an adapter that retries on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=total,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.mount("https://", https_adapter)
    session.headers.update({"User-Agent": "dspt-class/1.0 (+edu)"})
    return session

def cache_key(url, params):
    """Return a SHA-1 hex digest identifying a (url, params) request.

    The parameters are rendered as `key=value` pairs in sorted key order, so two
    dicts with the same items always give the same digest regardless of insertion
    order.
    """
    pairs = [f"{name}={params[name]}" for name in sorted(params)]
    query = "&".join(pairs)
    canonical = url + "?" + query
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()

def cached_get(url, params, ttl_hours=24):
    """GET `url` with `params` as JSON, reusing a fresh on-disk copy when available.

    Responses are stored under `.cache/api/<cache_key>.json`. A cached file is used
    when it was modified less than `ttl_hours` ago and still parses as JSON;
    otherwise the request is sent and the cache is rewritten.

    Args:
        url: endpoint to request.
        params: query parameters (also part of the cache key).
        ttl_hours: maximum age of a cache file, in hours.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.HTTPError: if the response status is not successful.
    """
    key = cache_key(url, params)
    path = Path(f".cache/api/{key}.json")
    # Ensure the parent directories exist
    path.parent.mkdir(parents=True, exist_ok=True)

    if path.exists() and (time.time() - path.stat().st_mtime < ttl_hours*3600):
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except ValueError:
            # Truncated or corrupt cache file (e.g. an interrupted write): refetch
            # instead of failing on every call until the TTL expires.
            pass

    # The context manager closes the session's pooled connections when done.
    with session_with_retry() as s:
        r = s.get(url, params=params, timeout=20)
        r.raise_for_status()
        data = r.json()

    # Write to a sibling temp file and rename it into place so a reader never sees
    # a half-written cache entry.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(data), encoding="utf-8")
    tmp_path.replace(path)
    return data


```python
obs = data.get("observations", [])
```

* In a typical FRED series response, one of the top-level keys is `"observations"`, which holds a list of daily/weekly/monthly data points:

  ```json
  {
    "realtime_start": "2024-01-01",
    "realtime_end": "2024-01-01",
    "observations": [
      {"date": "2010-01-01", "value": "123.4"},
      {"date": "2010-02-01", "value": "125.7"},
      ...
    ]
  }
  ```
* `.get("observations", [])` means:

  * Look for the `"observations"` key in `data`.
  * If it exists → return its value (a list of dicts).
  * If it does **not** exist → return an empty list `[]` (instead of throwing a `KeyError`).



To get FRED API key: https://fred.stlouisfed.org/docs/api/api_key.html

In [18]:
# !cp .env.template .env # only run this for the first time to create .env

In [19]:
import os
load_dotenv()
# Never display the key itself: cell outputs are saved inside the notebook file and
# would be committed along with it. Report only whether a key was found.
"FRED_API_KEY is set" if os.getenv("FRED_API_KEY", "").strip() else "FRED_API_KEY is missing"

'<redacted: API key removed from saved output — rotate this key>'

In [20]:
import os
# Load variables from .env (if present) into the process environment.
load_dotenv()
# Empty string when the variable is unset; surrounding whitespace is stripped.
API_KEY = os.getenv("FRED_API_KEY", "").strip()
if not API_KEY:
    # Only a warning: execution continues and the request itself may fail later.
    print("WARNING: No FRED_API_KEY in .env; continuing with unauthenticated request may fail on FRED. Add your key to use in class.")

# Endpoint used by fred_series() below.
FRED_SERIES_URL = "https://api.stlouisfed.org/fred/series/observations"

def fred_series(series_id, start="2010-01-01", end=None):
    """Download one FRED series as a tidy DataFrame.

    Args:
        series_id: FRED series identifier, e.g. "VIXCLS".
        start: value sent as `observation_start`.
        end: optional value sent as `observation_end`; omitted when None.

    Returns:
        DataFrame with columns `date` (datetime64), `value` (numeric) and
        `series_id`. Rows whose value cannot be parsed as a number are dropped.
        The frame is empty, but still has these columns, when the response
        contains no observations.
    """
    p = {"series_id":series_id, "api_key":API_KEY, "file_type":"json",
         "observation_start":start}
    if end is not None:
        p["observation_end"] = end
    data = cached_get(FRED_SERIES_URL, p, ttl_hours=24)
    obs = data.get("observations", [])
    # Passing columns= keeps the frame well-formed when `obs` is empty; selecting
    # [["date","value"]] from an empty frame would raise KeyError.
    df = pd.DataFrame(obs, columns=["date", "value"])
    df["date"] = pd.to_datetime(df["date"])
    # Non-numeric values become NaN here and are removed by dropna() below.
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df["series_id"] = series_id
    return df.dropna()

# Fetch the two macro series used in the rest of the notebook (served from .cache/ when fresh).
vix = fred_series("VIXCLS", start="2015-01-01")       # CBOE VIX
fed = fred_series("FEDFUNDS", start="2015-01-01")     # Effective Fed Funds

In [21]:
# NOTE: this cell demonstrates a pitfall. A plain `to_sql(..., if_exists="append")`
# does not handle rows that are already in the table, so running it more than once
# violates the (series_id, date) primary key. The fix is in the cell below.

# Write to SQLite
db = sqlite3.connect("data/prices.db")
try:
    db.execute("""CREATE TABLE IF NOT EXISTS macro_series(
    series_id TEXT NOT NULL, date TEXT NOT NULL, value REAL NOT NULL,
    PRIMARY KEY(series_id, date))""")
    for df in [vix, fed]:
        df.to_sql("macro_series", db, if_exists="append", index=False)
    db.commit()
except sqlite3.IntegrityError as exc:
    # Expected on re-runs: roll back any uncommitted rows and show the error
    # instead of leaving an uncaught exception (and an open connection) behind.
    db.rollback()
    print("IntegrityError:", exc)
finally:
    db.close()

vix.head(), fed.head()

IntegrityError: UNIQUE constraint failed: macro_series.series_id, macro_series.date

In [22]:
# Write to SQLite
db = sqlite3.connect("data/prices.db")
# One row per (series_id, date); the composite primary key is what makes
# INSERT OR IGNORE below skip rows that are already stored.
db.execute("""CREATE TABLE IF NOT EXISTS macro_series(
    series_id TEXT NOT NULL, date TEXT NOT NULL, value REAL NOT NULL,
    PRIMARY KEY(series_id, date))""")

def insert_or_ignore(db, df):
    """Insert rows of `df` into `macro_series`, skipping keys that already exist.

    Args:
        db: open sqlite3 connection; the caller is responsible for commit/close.
        df: frame with `series_id`, `date` and `value` columns. It is not modified.
    """
    # normalize and drop any in-batch duplicates
    df = df.copy()
    # Store dates as 'YYYY-MM-DD' text. Binding datetime.date objects would rely on
    # sqlite3's implicit default adapter, which is deprecated since Python 3.12.
    df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
    df = df.drop_duplicates(subset=["series_id","date"])
    rows = list(df[["series_id","date","value"]].itertuples(index=False, name=None))
    db.executemany(
        "INSERT OR IGNORE INTO macro_series (series_id, date, value) VALUES (?,?,?)",
        rows
    )

# Safe to re-run: rows whose (series_id, date) is already stored are skipped.
for df in [vix, fed]:
    insert_or_ignore(db, df)

# Persist the inserts, then release the connection.
db.commit(); db.close()
vix.head(), fed.head()


(        date  value series_id
 1 2015-01-02  17.79    VIXCLS
 2 2015-01-05  19.92    VIXCLS
 3 2015-01-06  21.12    VIXCLS
 4 2015-01-07  19.31    VIXCLS
 5 2015-01-08  17.01    VIXCLS,
         date  value series_id
 0 2015-01-01   0.11  FEDFUNDS
 1 2015-02-01   0.11  FEDFUNDS
 2 2015-03-01   0.11  FEDFUNDS
 3 2015-04-01   0.12  FEDFUNDS
 4 2015-05-01   0.12  FEDFUNDS)

In [23]:
import sqlite3, pandas as pd

KEY_COLS = ["series_id", "date"]

# Ensure consistent key types on *copies*: `vix` and `fed` keep their datetime64
# dates, so later cells behave the same whether or not this cell was run.
normalized = {
    name: df.assign(date=pd.to_datetime(df["date"]).dt.date,
                    series_id=df["series_id"].astype(str))
    for name, df in {"vix": vix, "fed": fed}.items()
}

# Check duplicates inside each df
print(normalized["vix"].duplicated(KEY_COLS).sum(),
      normalized["fed"].duplicated(KEY_COLS).sum())

# Check duplicates against what’s already in the DB
db = sqlite3.connect("data/prices.db")
try:
    existing = pd.read_sql("SELECT series_id, date FROM macro_series", db,
                           parse_dates=["date"])
finally:
    db.close()   # release the connection even if the query fails
existing["date"] = existing["date"].dt.date

for name, df in normalized.items():
    # Inner join on the key columns keeps exactly the rows already stored in the table.
    merged = df.merge(existing, on=KEY_COLS, how="inner")
    if not merged.empty:
        print(f"Already present in macro_series ({name}):")
        print(merged.head())


0 0
Already present in macro_series (vix):
         date  value series_id
0  2015-01-02  17.79    VIXCLS
1  2015-01-05  19.92    VIXCLS
2  2015-01-06  21.12    VIXCLS
3  2015-01-07  19.31    VIXCLS
4  2015-01-08  17.01    VIXCLS
Already present in macro_series (fed):
         date  value series_id
0  2015-01-01   0.11  FEDFUNDS
1  2015-02-01   0.11  FEDFUNDS
2  2015-03-01   0.11  FEDFUNDS
3  2015-04-01   0.12  FEDFUNDS
4  2015-05-01   0.12  FEDFUNDS


In [24]:
# `merged` is whatever the last iteration of the loop in the previous cell left behind.
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       129 non-null    object 
 1   value      129 non-null    float64
 2   series_id  129 non-null    object 
dtypes: float64(1), object(2)
memory usage: 3.2+ KB


```python
fed.rename(columns={"value": "fedfunds"}).drop(columns="series_id")
```

* FRED returns a column `"value"` for all series.
* You rename it to `"fedfunds"` to make the column self-describing.
* Example before/after:

| date       | value | series_id |
| ---------- | ----- | --------- |
| 2015-01-01 | 0.25  | FEDFUNDS  |

↓

| date       | fedfunds |
| ---------- | -------- |
| 2015-01-01 | 0.25     |

(`series_id` is gone because of `.drop(columns="series_id")`.)





### 1. Label each series

```python
vix.assign(var="vix").rename(columns={"value":"val"})
fed.assign(var="fedfunds").rename(columns={"value":"val"})
```

* `.assign(var="vix")` → adds a new column `var` to label the source.
* `.rename(columns={"value":"val"})` → standardize the measurement column name across series.

So now both look like:

**VIX:**

| date       | val  | series_id | var |
| ---------- | ---- | --------- | --- |
| 2015-01-01 | 19.5 | VIXCLS    | vix |

**Fed Funds:**

| date       | val  | series_id | var      |
| ---------- | ---- | --------- | -------- |
| 2015-01-01 | 0.25 | FEDFUNDS  | fedfunds |

---

### 2. Concatenate

```python
pd.concat([...])
```

Stacks the two DataFrames row-wise → one long table with a `var` column identifying which series each row came from.


### 3. Pivot to wide format

```python
.pivot_table(index="date", columns="var", values="val")
```

* `index="date"` → rows are unique dates.
* `columns="var"` → creates one column per series (`vix`, `fedfunds`).
* `values="val"` → fills cells with the actual numeric values.

Result:

| date       | fedfunds | vix  |
| ---------- | -------- | ---- |
| 2015-01-01 | 0.25     | 19.5 |
| 2015-02-01 | 0.25     | 16.2 |
| ...        | ...      | ...  |

---

### 4. Reset index

```python
.reset_index()
```

Moves `date` back to a normal column instead of index, giving a clean DataFrame.



In [25]:
# Load features (stop early with a clear error if the file is missing)
from pathlib import Path
fvpath = Path("data/processed/features_v1.parquet")
if not fvpath.exists():
    # A regular exception reads better in a notebook than SystemExit.
    raise FileNotFoundError("Missing features_v1.parquet — run Session 10 lab or homework.")

fv1 = pd.read_parquet(fvpath).sort_values(["ticker","date"])

# Pivot macro wide: stack both series in long form (var, val), then spread them
# into one column per series, giving one row per date.
macro_wide = (
    pd.concat([
        vix.assign(var="vix").rename(columns={"value":"val"}),
        fed.assign(var="fedfunds").rename(columns={"value":"val"}),
    ])
    .pivot_table(index="date", columns="var", values="val")
    .reset_index()
)

# ensure macro_wide date column is in pd.datetime format
macro_wide["date"] = pd.to_datetime(macro_wide["date"])

# validate= makes pandas raise if macro_wide ever holds duplicate dates, which
# would silently multiply feature rows.
enriched = fv1.merge(macro_wide, on="date", how="left", validate="many_to_one")
# NOTE(review): this is an exact-date join, so a series with fewer dates than the
# features (fedfunds in the displayed output) is NaN on most rows — confirm intended.
enriched[["vix","fedfunds"]] = enriched[["vix","fedfunds"]].astype("float32")
enriched.to_parquet("data/processed/features_v1_ext.parquet", compression="zstd", index=False)
print("Wrote data/processed/features_v1_ext.parquet", enriched.shape)
enriched.head(5)

Wrote data/processed/features_v1_ext.parquet (3975, 20)


Unnamed: 0,date,ticker,log_return,r_1d,weekday,month,lag1,lag2,lag3,roll_mean_20,roll_std_20,zscore_20,ewm_mean_20,ewm_std_20,exp_mean,exp_std,adj_close,volume,fedfunds,vix
0,2020-01-29,AAPL,-0.018417,-0.002351,2,1,-0.012895,-0.019012,-0.004576,-0.004086,0.008476,-1.69083,-0.005252,0.009304,-0.004086,0.008476,92.154846,1598707,,16.389999
1,2020-01-30,AAPL,-0.002351,-0.012675,3,1,-0.018417,-0.012895,-0.019012,-0.004353,0.008324,0.240455,-0.004976,0.008875,-0.004003,0.00827,91.938454,2992900,,15.49
2,2020-01-31,AAPL,-0.012675,0.002713,4,1,-0.002351,-0.018417,-0.012895,-0.004849,0.008517,-0.918756,-0.005709,0.008745,-0.004397,0.00828,90.780533,634335,,18.84
3,2020-02-03,AAPL,0.002713,0.001568,0,2,-0.012675,-0.002351,-0.018417,-0.004268,0.008622,0.809695,-0.004907,0.00869,-0.004088,0.008224,91.027122,913454,,17.969999
4,2020-02-04,AAPL,0.001568,-0.001869,1,2,0.002713,-0.012675,-0.002351,-0.003963,0.008719,0.634254,-0.004291,0.008486,-0.003852,0.008126,91.169922,662663,,16.049999


In [26]:
# Show the last rows of the enriched frame.
enriched.tail()

Unnamed: 0,date,ticker,log_return,r_1d,weekday,month,lag1,lag2,lag3,roll_mean_20,roll_std_20,zscore_20,ewm_mean_20,ewm_std_20,exp_mean,exp_std,adj_close,volume,fedfunds,vix
3970,2020-09-01,XOM,0.014174,-0.006229,1,9,0.006585,-0.005386,-0.013911,0.001183,0.009813,1.323795,0.000534,0.010391,-0.001136,0.00947,82.733025,1024247,0.09,26.120001
3971,2020-09-02,XOM,-0.006229,-0.00904,2,9,0.014174,0.006585,-0.005386,0.000485,0.009819,-0.683827,-0.00011,0.010091,-0.001165,0.00945,82.219261,2456402,,26.57
3972,2020-09-03,XOM,-0.00904,0.014518,3,9,-0.006229,0.014174,0.006585,2.4e-05,0.010048,-0.902052,-0.00096,0.009968,-0.001209,0.009442,81.479378,1828850,,33.599998
3973,2020-09-04,XOM,0.014518,-0.008826,4,9,-0.00904,-0.006229,0.014174,0.000511,0.010515,1.332045,0.000514,0.010566,-0.001121,0.009489,82.670891,1777769,,30.75
3974,2020-09-07,XOM,-0.008826,0.020736,0,9,0.014518,-0.00904,-0.006229,0.000405,0.010602,-0.870585,-0.000376,0.010436,-0.001164,0.00948,81.944473,1272137,,


# Homework

Save the following code to scripts/get_macro.py

In [27]:
#!/usr/bin/env python
import os, json, time, hashlib, pandas as pd, sqlite3
from pathlib import Path
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from dotenv import load_dotenv

# Load variables from .env (if present) before reading the key.
load_dotenv()
# Empty string when FRED_API_KEY is unset; fetch_series() refuses to run in that case.
API_KEY = os.getenv("FRED_API_KEY","").strip()
# Endpoint requested by fetch_series().
BASE = "https://api.stlouisfed.org/fred/series/observations"

def sess():
    """Return a `requests.Session` with a class User-Agent and HTTPS retries.

    HTTPS requests go through an adapter configured with
    `Retry(total=3, backoff_factor=0.5)` for status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "dspt-class/1.0"})
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return session

def ckey(url, params):
    """Return the SHA-1 hex digest of `url` plus its parameters in sorted key order."""
    pairs = [f"{name}={params[name]}" for name in sorted(params)]
    canonical = url + "?" + "&".join(pairs)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()

def cached_get(url, params, ttl=86400):
    """GET `url` with `params` as JSON, reusing a fresh on-disk copy when available.

    Args:
        url: endpoint to request.
        params: query parameters (also part of the cache key).
        ttl: maximum age of a cache file, in seconds.

    Returns:
        The decoded JSON payload, from `.cache/api/<key>.json` when that file is
        younger than `ttl` and parses as JSON, otherwise from a new request.

    Raises:
        requests.HTTPError: if the response status is not successful.
    """
    key = ckey(url, params)
    p = Path(f".cache/api/{key}.json")
    p.parent.mkdir(parents=True, exist_ok=True)
    if p.exists() and (time.time() - p.stat().st_mtime < ttl):
        try:
            return json.loads(p.read_text(encoding="utf-8"))
        except ValueError:
            # Truncated or corrupt cache file: refetch instead of failing until it expires.
            pass
    # The context manager closes the session's pooled connections when done.
    with sess() as s:
        r = s.get(url, params=params, timeout=20)
        r.raise_for_status()
        data = r.json()
    # Write to a temp file and rename it into place so a reader never sees a partial file.
    tmp = p.with_name(p.name + ".tmp")
    tmp.write_text(json.dumps(data), encoding="utf-8")
    tmp.replace(p)
    return data

def fetch_series(series_id, start="2015-01-01"):
    """Download one FRED series and shape it for the `macro_series` table.

    Args:
        series_id: FRED series identifier.
        start: value sent as `observation_start`.

    Returns:
        DataFrame with `date` as 'YYYY-MM-DD' text, numeric `value` and
        `series_id`; rows whose value is not numeric are dropped. Empty, but with
        these columns, when the response lists no observations.

    Raises:
        SystemExit: if no API key is configured.
        KeyError: if the payload has no "observations" entry.
    """
    if not API_KEY:
        raise SystemExit("Set FRED_API_KEY in .env")
    params = {"series_id":series_id, "api_key":API_KEY, "file_type":"json", "observation_start":start}
    data = cached_get(BASE, params)
    # columns= keeps the frame well-formed for an empty observation list; selecting
    # [["date","value"]] from an empty frame would raise KeyError.
    df = pd.DataFrame(data["observations"], columns=["date", "value"])
    df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")  # store as TEXT YYYY-MM-DD
    # Non-numeric values become NaN and are removed below.
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df["series_id"] = series_id
    # df = df.drop_duplicates(subset=["series_id","date"])
    return df.dropna(subset=["value"])

def ensure_schema(con):
    """Create the `macro_series` table and its date index if they do not exist yet.

    Both statements use IF NOT EXISTS, so calling this repeatedly is harmless.
    """
    create_table = """
        CREATE TABLE IF NOT EXISTS macro_series(
            series_id TEXT,
            date      TEXT,
            value     REAL,
            PRIMARY KEY(series_id, date)
        )
    """
    # Optional: index on date for faster slicing across all series
    create_date_index = "CREATE INDEX IF NOT EXISTS idx_macro_series_date ON macro_series(date)"
    for statement in (create_table, create_date_index):
        con.execute(statement)

def upsert_dataframe(con, df):
    """Insert the rows of `df` into `macro_series`, overwriting values of existing keys.

    Args:
        con: open sqlite3 connection; committing is left to the caller.
        df: frame with `series_id`, `date` and `value` columns.
    """
    upsert_sql = """
        INSERT INTO macro_series(series_id, date, value)
        VALUES (?, ?, ?)
        ON CONFLICT(series_id, date) DO UPDATE SET
            value = excluded.value  --replace teh existing row's value with the incoming row's value
    """
    ordered = df[["series_id","date","value"]]
    records = [tuple(record) for record in ordered.itertuples(index=False, name=None)]
    con.executemany(upsert_sql, records)

def main(series_id, start="2015-01-01"):
    """Fetch one FRED series and upsert it into `data/prices.db`.

    Args:
        series_id: FRED series identifier.
        start: first observation date to request.
    """
    df = fetch_series(series_id, start=start)
    db_path = Path("data/prices.db")
    # sqlite3 creates the database file but not its folder; a fresh checkout
    # without data/ would otherwise fail with "unable to open database file".
    db_path.parent.mkdir(parents=True, exist_ok=True)
    con = sqlite3.connect(db_path)
    try:
        ensure_schema(con)
        upsert_dataframe(con, df)
        con.commit()
    finally:
        # Close the connection even when the schema or upsert step raises.
        con.close()
    print(f"Upserted {series_id}: {len(df)} rows from {start}")

# Command-line entry point: python scripts/get_macro.py [--series-id ID] [--start YYYY-MM-DD]
if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--series-id", default="VIXCLS")
    ap.add_argument("--start", default="2015-01-01")
    # parse_known_args() ignores unrecognized arguments instead of exiting —
    # presumably so this also runs as a notebook cell, where argv carries kernel
    # flags (TODO confirm).
    args, _ = ap.parse_known_args()
    main(args.series_id, start=args.start)


Upserted VIXCLS: 2740 rows from 2015-01-01


In [28]:
%%bash
# Make the script executable, then run it for one more series.
chmod +x scripts/get_macro.py
python scripts/get_macro.py --series-id DGS10   # 10‑Year Treasury Constant Maturity Rate

Upserted DGS10: 2699 rows from 2015-01-01


In [29]:
import pandas as pd, sqlite3
fv = pd.read_parquet("data/processed/features_v1.parquet")

# Read every stored macro series; close the connection even if the query fails.
con = sqlite3.connect("data/prices.db")
try:
    macro = pd.read_sql_query("SELECT series_id, date, value FROM macro_series", con, parse_dates=["date"])
finally:
    con.close()

# One column per series_id, one row per date.
wide = macro.pivot_table(index="date", columns="series_id", values="value").reset_index()
# validate= makes pandas raise if `wide` ever holds duplicate dates, which would
# silently multiply feature rows.
out = fv.merge(wide, on="date", how="left", validate="many_to_one")
out.to_parquet("data/processed/features_v1_ext.parquet", compression="zstd", index=False)
print("Wrote features_v1_ext.parquet with extra series:", out.shape)

Wrote features_v1_ext.parquet with extra series: (3975, 21)


In [30]:
import pandas as pd
def test_enriched_has_macro():
    """The enriched parquet keeps its key columns and has at least one macro column."""
    enriched = pd.read_parquet("data/processed/features_v1_ext.parquet")
    for key_column in ("date", "ticker"):
        assert key_column in enriched
    macro_columns = enriched.filter(regex="^(VIXCLS|DGS10|FEDFUNDS)$")
    assert macro_columns.shape[1] >= 1

In [31]:
# See what pytest thinks it can collect (--collect-only lists the tests without running them):
!pytest -q --collect-only -vv

platform linux -- Python 3.12.12, pytest-8.4.2, pluggy-1.6.0
rootdir: /content/drive/MyDrive/dspt25/STAT4160
configfile: pytest.ini
testpaths: tests
plugins: anyio-4.11.0, typeguard-4.4.4, langsmith-0.4.35
collected 7 items                                                              [0m

<Dir STAT4160>
  <Dir tests>
    <Module test_dictionary_provenance.py>
      <Function test_provenance_and_dict>
    <Module test_enriched_has_macro.py>
      <Function test_enriched_has_macro>
    <Module test_health_outputs.py>
      <Function test_health_files_exist>
    <Module test_logging.py>
    <Module test_mathy.py>
      <Function test_moving_avg_basic>
      <Function test_moving_avg_bad_window>
    <Module test_no_lookahead.py>
      <Function test_features_no_lookahead>



In [32]:
%%bash
# Run only the macro-enrichment test module.
pytest  tests/test_enriched_has_macro.py

.                                                                        [100%]
1 passed in 0.65s
