In [3]:
#ed  # from google.colab import drive
# drive.flush_and_unmount()           # ignore errors if already unmounted

#If cannot remount, simply delete the mounted drive and then remount
# rm -rf /content/drive


In [4]:
# Colab cell
from google.colab import drive

drive.mount('/content/drive', force_remount=True)



Mounted at /content/drive


In [5]:
# Adjust these two for YOUR repo
REPO_OWNER = "kadkins3880"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1
BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

# if on my office computer

# REPO_NAME  = "lectureNotes"   # e.g., on my office computer
# BASE_DIR = r"E:\OneDrive - Auburn University Montgomery\teaching\AUM\STAT 4160 Productivity Tools" # on my office computer
# CLONE_DIR  = f"{BASE_DIR}\{REPO_NAME}"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [6]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


### What is `robots.txt`?

A tiny text file at a site’s root (one per host, e.g. `https://example.com/robots.txt`) that tells **web crawlers** (search engines, bots) which parts of the site they **shouldn’t crawl**. It’s a **voluntary** standard (not access control).

### “Check robots.txt; identify disallow rules”

* **Check robots.txt**: open the site’s robots file and read it.
* **Identify Disallow rules**: find lines under each `User-agent` that begin with `Disallow:`—these list URL path prefixes that bots should not crawl.

### Example

```txt
User-agent: *          # which crawler the rules apply to (* = all)
Disallow: /private/    # don't crawl anything under /private/
Allow: /private/help   # exception: this path is allowed
Sitemap: https://example.com/sitemap.xml #indented to help visitors or crawlers to find a page
```

* `Disallow: /` → **block all paths** for that user-agent.
* `Disallow:` (empty) → **allow all paths**.
* Rules are **path-prefix** matches; many crawlers also support wildcards:

  * `*` = any chars, `$` = end of URL (e.g., `Disallow: /*.pdf$`).
* Crawlers pick the **most specific** rule that matches their `User-agent` and the URL.

### Important caveats

* **Advisory, not security**: It doesn’t prevent access; it just asks bots not to crawl. Anyone (or a non-compliant bot) can still fetch the URL. For real blocking, use auth or server rules.
* **Indexing vs crawling**: `Disallow` stops crawling, but a URL can still be **indexed** if others link to it. Use `noindex` (meta header) or 401/403 to keep it out of indexes.
* **Scope**: It’s per **scheme + host** (each subdomain and `http` vs `https` needs its own robots.txt).

### Fetch and View

```bash
# Fetch and view
curl -s https://example.com/robots.txt. # -s: silent mode

# Grep disallow lines (case-insensitive)
curl -s https://example.com/robots.txt | grep -i '^disallow:'  #-i: case insensitive
```




# Use Python to Fetch robots.txt and Parse
```python
r=requests.get(urljoin(base, "/robots.txt"), headers=UA, timeout=20)
```

- **`requests.get(...)`**

   * From the `requests` library.
   * Sends an HTTP **GET** request to a URL.
   * Returns a `Response` object with `status_code`, `text`, `json()`, etc.

- **`headers=UA`**

   * Sets the request headers.
   * Typically `UA` is a dict like:

     ```python
     UA = {"User-Agent": "mycrawler/1.0"}
     ```
   * Websites look at the `User-Agent` string to know who’s crawling them.

- **`timeout=20`**

   * If the server doesn’t respond within **20 seconds**, raise a `Timeout` exception.
   * Prevents the program from hanging forever.



- **Split lines**

   * `lines = r.text.splitlines()` makes a list of strings, one per line of the robots.txt.

3. **Extract `Disallow:` rules**
```python
disallows = [ln.split(":")[1].strip() for ln in lines if ln.lower().startswith("disallow:")]
```

   * It picks only the lines that *start with* `Disallow:` (case-insensitive).
   * For each such line, it takes the part after the first `:` → e.g. `"/private/"`.
   * Leading/trailing whitespace is stripped.

   Example robots.txt:

   ```
   User-agent: *
   Disallow: /private/
   Disallow: /tmp
   ```

   becomes:

   ```python
   disallows = ["/private/", "/tmp"]
   ```
```
ln.split(":")[1].strip()
```
unpacking the text of a line like:

```
Disallow: /private/
```

- **`ln.split(":")`**

   * Splits the string on the colon `:`.
   * Example:

     ```python
     "Disallow: /private/".split(":")
     # → ["Disallow", " /private/"]
     ```

- **`[1]`**

   * Takes the second element (Python is 0-indexed).
   * That’s the part *after* the colon.
   * Example:

     ```python
     ["Disallow", " /private/"][1]
     # → " /private/"
     ```

- **`.strip()`**

   * Removes leading/trailing whitespace.
   * Example:

     ```python
     " /private/".strip()
     # → "/private/"
     ```
- **Check the path**

   * `return all(not path.startswith(d) for d in disallows)`
   * For every disallowed prefix `d`, it checks `path.startswith(d)`.
   * If **any** disallowed prefix matches the beginning of `path`, the function returns `False`.
   * Otherwise, it returns `True`.


---

### Limitations

This is a *very simplified* robots.txt parser:

* It ignores `User-agent` scoping (it assumes all rules apply to everyone).
* It ignores `Allow:` rules that override `Disallow:`.
* It doesn’t handle wildcards (`*`, `$`) or comments.
* It treats an empty `Disallow:` (which means “allow all”) as a block of the root path.

For production-grade parsing, use Python’s built-in `urllib.robotparser.RobotFileParser` or a library like `reppy`.



In [7]:
import os, pathlib, requests, time, hashlib, pandas as pd, numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

os.chdir(CLONE_DIR)
for p in [".cache/html","data/static","reports"]:
    pathlib.Path(p).mkdir(parents=True, exist_ok=True)

UA = {"User-Agent": "dspt-class/1.0 (+edu)"}
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

def allowed_by_robots(base, path="/wiki/"):
    r = requests.get(urljoin(base, "/robots.txt"), headers=UA, timeout=20)
    if r.status_code != 200: return True   #200: success. If no robots.txt, all are allowed.
    lines = r.text.splitlines()
    disallows = [ln.split(":")[1].strip() for ln in lines if ln.lower().startswith("disallow:")]
    return all(not path.startswith(d) for d in disallows)

print("Robots allows /wiki/?", allowed_by_robots("https://en.wikipedia.org"))

Robots allows /wiki/? False


# Python robot Parser

`urllib.robotparser.RobotFileParser` is Python’s built-in way to parse and respect `robots.txt`. It handles **User-agent**, `Disallow`, `Allow`, wildcards (`*`, `$`), etc.


* `rp.read()` downloads and parses `robots.txt`.
* `can_fetch(useragent, url)` returns `True` or `False`.

  * `True` → it’s allowed to crawl that URL.
  * `False` → it’s disallowed by robots.txt for that agent.



### More useful methods

* `rp.disallow_all` → `True` if `Disallow: /` (block everything).
* `rp.allow_all` → `True` if `robots.txt` allows everything.
* `rp.mtime()` → last time it fetched robots.txt.
* `rp.modified()` → manually set modification time.


In [8]:
from urllib.robotparser import RobotFileParser

# 1. Create the parser
rp = RobotFileParser()

# 2. Point it to the robots.txt of a site
rp.set_url("https://en.wikipedia.org/robots.txt")

# 3. Read and parse the file
rp.read()

# 4. Ask if a given user-agent can fetch a URL
print(rp.can_fetch("*", "https://en.wikipedia.org/wiki/Python_(programming_language)"))
print(rp.can_fetch("*", "https://en.wikipedia.org/wiki/Special:Random"))


False
False


# Python url Parser

```python
from urllib.parse import urlparse

url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
parts = urlparse(url)
print(parts)
```

Output:

```
ParseResult(
    scheme='https',
    netloc='en.wikipedia.org',
    path='/wiki/Python_(programming_language)',
    params='',
    query='',
    fragment=''
)
```

* `parts.scheme` → `"https"`
* `parts.netloc` → `"en.wikipedia.org"`
* `parts.path` → `"/wiki/Python_(programming_language)"`

---
# Construct the base address from the urlparse results

```python
base = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
```

* `urlparse(url).scheme` → `"https"`
* `urlparse(url).netloc` → `"en.wikipedia.org"`

Together:

```python
base = "https://en.wikipedia.org"
```





In [9]:
# Rewrite earlier function using python robotparser
def allowed_by_robots(url, user_agent="*"):
    from urllib.robotparser import RobotFileParser
    from urllib.parse import urljoin, urlparse

    base = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
    robots_url = urljoin(base, "/robots.txt")
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch(user_agent, url)

print("Robots allows /wiki/?", allowed_by_robots("https://en.wikipedia.org"))

Robots allows /wiki/? False


# Generate a hexadigest code (40 chars) for a (url+inquiry) (review)
This hexadigest code is to be used as a file name (keep file name in fixed length).

```python
import hashlib

url = "https://example.com/page?id=123"

# 1. Encode the string into bytes
data = url.encode()   # b'https://example.com/page?id=123'

# 2. Pass the bytes into SHA-1
h = hashlib.sha1(data)

# 3. Get the hex digest
digest = h.hexdigest()

print(digest)  # something like '8f67a339f4a4656bb72a1b0cc23e77ed2f1a59f7'
```


* **`url.encode()`** → SHA functions work on bytes, not Python strings. `.encode()` (default UTF-8) converts it.
* **`hashlib.sha1(...)`** → computes the SHA-1 cryptographic hash of those bytes.
* **`.hexdigest()`** → gives you a fixed-length **40-character hex string**


### Why use it here?


That produces a **unique ID** for a request (`url + params`).

* The SHA-1 digest is short, fixed length, and safe for filenames.
* Instead of trying to save a file named

  ```
  https://api.stlouisfed.org/fred/series/observations?series_id=VIXCLS&api_key=...
  ```

  you just save

  ```
  8f67a339f4a4656bb72a1b0cc23e77ed2f1a59f7.json
  ```



# Quick Introduciton to UTF-8

Unicode is a **universal character standard** that assigns every written symbol (letters, digits, punctuation, emojis, scripts, control chars) a unique **code point**—an abstract number like `U+0041` (“A”), `U+4F60` (“你”), or `U+1F600` (😀).

**UTF**-8 (Unicode Transformation Format-8 bit units) is a **character encoding**—a way to turn text ( like “A”, “é”, “😊”, “你”) into bytes (Unicode **code points**) so computers can store and transmit it.


* **Unicode-based:** It can represent every character in Unicode (over a million code points).
* **Variable-length:** Each character uses **1 to 4 bytes**:

  * ASCII (U+0000–U+007F) → **1 byte** (same as ASCII)
  * Latin accents, Greek, Cyrillic, etc. → **2 bytes**
  * Most CJK (中日韓) → **3 bytes**
  * Emojis/rare symbols → **4 bytes**

* **ASCII-compatible:** Plain English text (A–Z, digits, punctuation) is byte-for-byte identical to ASCII. That’s a huge reason UTF-8 won the internet.

* **Self-synchronizing:** You can detect character boundaries reliably, which helps with error recovery.
* **Ubiquitous:** Default on the web, in modern databases, programming languages, and APIs.



Key ideas:

* **Characters vs encodings:** Unicode defines the *characters* and their code points. Encodings like **UTF-8**, **UTF-16**, and **UTF-32** define how those code points are stored as bytes.


(**Advanced**)

* **Code point format:** `U+hhhh` (hex). Range is `U+0000`–`U+10FFFF`, organized into **17 planes**. Most common chars are in the **BMP** (Basic Multilingual Plane, `U+0000`–`U+FFFF`); others (many emojis, historic scripts) are in **supplementary planes**.
* **Combining marks & grapheme clusters:** Some visible “characters” are actually sequences, e.g. `e` + combining acute (`U+0301`) → “é”. What users see (a **grapheme**) may be multiple code points.
* **Normalization:** The same visual text can have different code-point sequences; Unicode defines forms like **NFC/NFD** to canonicalize them.
* **Directionality & scripts:** Includes metadata for right-to-left scripts (Arabic, Hebrew), line breaking, casing, collation, etc.
* **UTF-16 surrogate pairs:** Code points above `U+FFFF` are represented as pairs in UTF-16.

In short: **Unicode is the global map of characters; UTF-8/16/32 are the ways to put that map into bytes.**





# Making HTTP requests using Python pacakge rquests

### Example

```python
import requests

r = requests.get("https://httpbin.org/status/404")
print(r.status_code)  # 404
r.raise_for_status()
```
`r.raise_for_status()` is a **convenience method** on a `requests.Response` object.


* Checks the HTTP status code of the response `r`.
* If the status code is **4xx (client error)** or **5xx (server error)**, it raises a `requests.HTTPError` exception.
* If the status is **2xx (success)** or **3xx (redirect)**, it does nothing.

---
Output (of the previous code):

```
Traceback (most recent call last):
  ...
requests.exceptions.HTTPError: 404 Client Error: NOT FOUND for url: https://httpbin.org/status/404
```

But if you fetch a good page:

```python
r = requests.get("https://httpbin.org/status/200")
r.raise_for_status()   # no error
print("All good")
```

### Why use it?

* It’s a **fail-fast** mechanism: instead of silently getting bad responses, you immediately get an exception you can handle.
* Often paired with caching or retries:

  ```python
  r = requests.get(url, timeout=10)
  r.raise_for_status()
  data = r.json()
  ```

If you don’t call it, you’d have to check manually:

```python
if r.status_code != 200:
    # handle error
```



# Adding a **pause** between requests:

```python
time.sleep(1.0)  # be polite
```


* **`time.sleep(1.0)`** → makes the program wait for **1 second**.

### Why do this in web scraping?

* **Politeness:** Hitting a server with hundreds of rapid-fire requests can look like an attack. A short delay makes your crawler behave more like a human.
* **Avoid rate limiting:** Many sites (and APIs) block or throttle clients that send requests too quickly.
* **Respect `robots.txt`:** Some robots.txt files even suggest a `Crawl-delay` (though not all parsers honor it).

---

### Example

```python
import time, requests

urls = ["https://httpbin.org/get?a=1",
        "https://httpbin.org/get?a=2",
        "https://httpbin.org/get?a=3"]

for url in urls:
    r = requests.get(url)
    print(r.json())
    time.sleep(1.0)  # pause 1 sec before the next request
```




# Quick HTML Basics

Defines the **head section** of a webpage:

```html
<head>
  <title>Example</title>
</head>
```

* `<head> ... </head>`

  * The **head element** of an HTML document.
  * Contains metadata (info *about* the page, not shown as page content).
  * Common things inside `<head>`:

    * `<title>` → title text shown in the browser tab and search engine results.
    * `<meta>` → metadata like charset, description, viewport.
    * `<link>` → external CSS (Cascading Style Sheet), icons.
    * `<script>` → JavaScript includes (often deferred).

* `<title>Example</title>`

  * The **title of the page** = `"Example"`.
  * Appears in the browser tab and in search engines’ listings.
  * Does **not** appear in the visible page body.

---

### Mini full example

```html
<!DOCTYPE html>
<html>
<head>
  <title>Example</title>
  <meta charset="UTF-8">
  <meta name="description" content="A demo page.">
  <link rel="stylesheet" href="styles.css">
</head>
<body>
  <h1>Hello, world!</h1>
</body>
</html>
```

Note: `<!DOCTYPE html>`: DOCTYPE declaration: it tells teh web broswer what version of HTML the page is written in  and how to interprete the HTML code. In modern HTML, this means is in HTML5.


# Using **BeautifulSoup** (from the `bs4` library) to turn raw HTML into a structured object you can query.

```python
soup = BeautifulSoup(html, "html.parser")
```


1. **`html`**

   * A string containing raw HTML text (e.g. from `requests.get(url).text`).

2. **`BeautifulSoup(html, "html.parser")`**

   * Passes the HTML string into BeautifulSoup.
   * `"html.parser"` tells it which parser to use:

     * Built-in Python parser: `"html.parser"` (default, no extra install).
     * Faster/more lenient alternatives: `"lxml"`, `"html5lib"` (require extra packages).

3. **`soup`**

   * Now a **parse tree** — a nested structure of tags, attributes, and text that you can navigate and search.

---

### Example

```python
from bs4 import BeautifulSoup

html = """
<html><head><title>Example</title></head>
<body>
  <h1>Hello</h1>
  <p class="msg">This is a test.</p>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")

print(soup.title.text)       # "Example"
print(soup.h1.string)        # "Hello"
print(soup.find("p").text)   # "This is a test."
print(soup.find("p")["class"])  # ['msg']
```



```python
table = soup.find("table", {"id":"constituents"}) or soup.find("table", {"class":"wikitable"})
```


1. **`soup.find("table", {"id":"constituents"})`**

   * Looks for the **first `<table>` element** in the HTML that has `id="constituents"`.
   * Example:

     ```html
     <table id="constituents"> ... </table>
     ```

2. **`soup.find("table", {"class":"wikitable"})`**

   * If the first search fails (returns `None`), it tries again: find the **first `<table>` with class `wikitable`**.
   * Example:

     ```html
     <table class="wikitable"> ... </table>
     ```







```python
headers = [th.get_text(strip=True) for th in table.find("tr").find_all("th")]
```

* `table.find("tr")` → the **first row** (`<tr>`).
* `.find_all("th")` → all header cells in that row.
* `.get_text(strip=True)` → the visible text inside each cell, trimmed of whitespace.
* Result: a list of header names.
  Example:

  ```html
  <tr><th>Company</th><th>Symbol</th><th>Sector</th></tr>
  ```

  → `["Company", "Symbol", "Sector"]`


```python
for tr in table.find_all("tr")[1:]:
```

* `table.find_all("tr")` → every row in the table.
* `[1:]` → skip the first row (already used for headers).


```python
tds = [td.get_text(strip=True) for td in tr.find_all(["td","th"])]
```

* `tr.find_all(["td","th"])` → all data cells (`<td>`) in that row (and sometimes tables repeat headers with `<th>` inside).
* `.get_text(strip=True)` → extract the text.
* Result: a list of strings for the row’s values.

Example row:

  ```html
  <tr><td>Apple</td><td>AAPL</td><td>Technology</td></tr>
  ```

  → `["Apple", "AAPL", "Technology"]`


```python
if len(tds) == len(headers):
    rows.append(dict(zip(headers, tds)))
```

```python
rows.append(dict(zip(headers, tds)))
```


* `zip()` takes two (or more) iterables and pairs up elements **by position**.
* Example:

```python
headers = ["Company", "Symbol", "Sector"]
tds     = ["Apple", "AAPL", "Technology"]

pairs = zip(headers, tds)
print(list(pairs))
# [('Company', 'Apple'), ('Symbol', 'AAPL'), ('Sector', 'Technology')]
```

---

### Step 2: `dict(...)`

* Turns the list of pairs into a dictionary:

```python
dict([("Company", "Apple"), ("Symbol", "AAPL"), ("Sector", "Technology")])
# {"Company": "Apple", "Symbol": "AAPL", "Sector": "Technology"}
```



### End result

`rows` becomes a list of dictionaries — one per row:

```python
[
  {"Company": "Apple", "Symbol": "AAPL", "Sector": "Technology"},
  {"Company": "Microsoft", "Symbol": "MSFT", "Sector": "Technology"},
  ...
]
```




In [10]:
def get_html_cached(url, ttl_hours=24):
    key = hashlib.sha1(url.encode()).hexdigest()
    path = pathlib.Path(f".cache/html/{key}.html")
    if path.exists() and (time.time() - path.stat().st_mtime < ttl_hours * 3600):  #.st_mtime: last modified time in sec
        return path.read_text()
    r = requests.get(url, headers=UA, timeout=30)
    r.raise_for_status()
    path.write_text(r.text)
    time.sleep(1.0)  # be polite
    return r.text

html = get_html_cached(WIKI_URL)
# print(html)
soup = BeautifulSoup(html, "html.parser")

# Try soup table first; fallback to pandas.read_html
table = soup.find("table", {"id":"constituents"}) or soup.find("table", {"class":"wikitable"})
if table is not None:
    rows = []
    headers = [th.get_text(strip=True) for th in table.find("tr").find_all("th")]
    for tr in table.find_all("tr")[1:]:
        tds = [td.get_text(strip=True) for td in tr.find_all(["td","th"])]
        if len(tds) == len(headers):
            rows.append(dict(zip(headers, tds)))
    sp = pd.DataFrame(rows)
else:
    sp = pd.read_html(html)[0]

sp.head(3), sp.columns.tolist()

(  Symbol             Security   GICSSector         GICS Sub-Industry  \
 0    MMM                   3M  Industrials  Industrial Conglomerates   
 1    AOS          A. O. Smith  Industrials         Building Products   
 2    ABT  Abbott Laboratories  Health Care     Health Care Equipment   
 
      Headquarters Location  Date added         CIK Founded  
 0    Saint Paul, Minnesota  1957-03-04  0000066740    1902  
 1     Milwaukee, Wisconsin  2017-07-26  0000091142    1916  
 2  North Chicago, Illinois  1957-03-04  0000001800    1888  ,
 ['Symbol',
  'Security',
  'GICSSector',
  'GICS Sub-Industry',
  'Headquarters Location',
  'Date added',
  'CIK',
  'Founded'])

In [11]:
print(html)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>List of S&amp;P 500 companies - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enab

# UTC time

UTC = **Coordinated Universal Time** — the world’s primary civil time standard.

* **What it is:** A single, global clock based on highly accurate atomic time, kept close to Earth-rotation time (UT1). It doesn’t observe **time zones** or **daylight saving**.
* **Offset:** UTC is the reference; other zones are offsets from it (e.g., New York is UTC−05:00 in winter, UTC−04:00 in summer).
* **Notation:** In ISO-8601 timestamps, a trailing **`Z`** means “Zulu” = UTC (e.g., `2025-10-01T20:15:30Z`).
* **Leap seconds:** Occasionally a 1-second adjustment is inserted to keep UTC aligned with Earth’s rotation.

### 1. What is **Zulu** time?

* **"Zulu"** is the NATO phonetic alphabet word for the letter **Z**.
* In timekeeping, **Z** denotes **UTC (Coordinated Universal Time)**.
* So `"2025-10-01T20:15:30Z"` means *2025 Oct 1, 20:15:30 in UTC*.
* It’s used in aviation, military, astronomy, and international communication to avoid confusion with local time zones.
* `"Z"` = `"UTC+00:00"`.

### Why add `"Z"`?

* ISO 8601 allows multiple time zone notations.
* `"Z"` specifically means UTC (same as `+00:00`).
* Some APIs (e.g. JSON feeds) prefer `"Z"` for timestamps instead of offsets.


### Quick examples

* America/Chicago: CST = **UTC−06:00**, CDT = **UTC−05:00** (DST).
* 15:00 UTC = 10:00 CDT (summer in Chicago) or 09:00 CST (winter).

### In Python

Prefer timezone-aware datetimes:

```python
from datetime import datetime, timezone

now_utc = datetime.now(timezone.utc)     # aware UTC datetime
iso_utc = now_utc.isoformat().replace('+00:00', 'Z')  # ISO-8601 with Z
```


```python
{"source_url": WIKI_URL, "fetched_at_utc": datetime.utcnow().isoformat()+"Z"}
```



2. **`"fetched_at_utc": datetime.utcnow().isoformat()+"Z"`**

   * Key: `"fetched_at_utc"`
   * Value: current UTC timestamp in **ISO 8601** format with a `Z` suffix.
   * `datetime.utcnow()` → current time in UTC.
   * `.isoformat()` → convert to ISO 8601 string (`"2025-09-30T20:15:43.123456"`).
   * `+"Z"` → add the trailing `"Z"` to indicate **Zulu time** (UTC).
   * Example: `"2025-09-30T20:15:43.123456Z"`

3. **Whole dictionary**

   ```python
   {
     "source_url": "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
     "fetched_at_utc": "2025-09-30T20:15:43.123456Z"
   }
   ```


### Deprecation Warning

```
DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version.
Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC)
```

* `datetime.utcnow()` gives you a **naive** datetime object:

  * It has the right clock value for UTC.
  * But it has **no timezone info attached** (`tzinfo=None`).
* Python is moving towards **timezone-aware** datetimes, because naive ones can easily cause mistakes when you compare/convert time zones.

Better modern version:

```python
from datetime import datetime, UTC

src = {
    "source_url": WIKI_URL,
    "fetched_at_utc": datetime.now(UTC).isoformat().replace("+00:00", "Z")
}
```

* `datetime.now(UTC)` → aware UTC datetime.
* `.isoformat()` → string like `"2025-10-01T20:15:30+00:00"`.
* `.replace("+00:00", "Z")` → standardize to the `"Z"` form.





In [12]:
import re
def snake(s):
    s = re.sub(r"[^\w\s]", "_", s)
    s = re.sub(r"\s+", "_", s.strip().lower())
    return re.sub(r"_+", "_", s)

sp.columns = [snake(c) for c in sp.columns]
cand_cols = [c for c in sp.columns if "symbol" in c or "security" in c or "sector" in c] # if a col contains those words
sp = sp.rename(columns={c:"symbol" for c in sp.columns if "symbol" in c or c=="ticker"}) #rename c to "symbol"
sp = sp.rename(columns={c:"sector" for c in sp.columns if "sector" in c})
keep = [c for c in ["symbol","sector"] if c in sp.columns]
sp = sp[keep].dropna().drop_duplicates()
sp = sp.rename(columns={"symbol":"ticker"})
sp["ticker"] = sp["ticker"].str.strip()
sp["sector"] = sp["sector"].astype("category")

# Save with provenance
src = {"source_url": WIKI_URL, "fetched_at_utc": datetime.utcnow().isoformat()+"Z"}
sp.to_csv("data/static/sector_map.csv", index=False)
with open("data/static/sector_map.provenance.json","w") as f:
    import json; json.dump(src, f, indent=2)
print("Wrote data/static/sector_map.csv", sp.shape)
sp.head(5)

  src = {"source_url": WIKI_URL, "fetched_at_utc": datetime.utcnow().isoformat()+"Z"}


Wrote data/static/sector_map.csv (503, 2)


Unnamed: 0,ticker,sector
0,MMM,Industrials
1,AOS,Industrials
2,ABT,Health Care
3,ABBV,Health Care
4,ACN,Information Technology


In [13]:
src = {"source_url": WIKI_URL, "fetched_at_utc": datetime.utcnow().isoformat()+"Z"}
src

  src = {"source_url": WIKI_URL, "fetched_at_utc": datetime.utcnow().isoformat()+"Z"}


{'source_url': 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
 'fetched_at_utc': '2025-10-26T16:49:10.340847Z'}

In [14]:
from datetime import datetime, UTC

src = {
    "source_url": WIKI_URL,
    "fetched_at_utc": datetime.now(UTC).isoformat().replace("+00:00", "Z")
}
src

{'source_url': 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
 'fetched_at_utc': '2025-10-26T16:49:10.351518Z'}

In [15]:
from datetime import datetime, UTC
now = datetime.now(UTC)
print(now)
print(now.tzinfo)


2025-10-26 16:49:10.360059+00:00
UTC


In [16]:
now=datetime.utcnow()
# 2025-10-01 20:45:30.123456   (no timezone info!)
print(now.tzinfo)


None


  now=datetime.utcnow()


In [17]:
from pathlib import Path
pp = Path("data/processed/prices.parquet")
if not pp.exists():
    raise SystemExit("Need prices.parquet (Session 9).")

prices = pd.read_parquet(pp)
if "sector" not in prices.columns or prices["sector"].isna().all():
    prices2 = prices.merge(sp, on="ticker", how="left")
    prices2["sector"] = prices2["sector"].astype("category")
    prices2.to_parquet("data/processed/prices.parquet", compression="zstd", index=False)
    print("Updated prices.parquet with sector column.")
else:
    print("Sector already present; no merge needed.")

Sector already present; no merge needed.


# Homework

# Python JSON Basics

```python
json.loads(Path("data/static/sector_map.provenance.json").read_text())
```



1. **`Path("data/static/sector_map.provenance.json")`**

   * Creates a `Path` object (from `pathlib`) pointing to the file `data/static/sector_map.provenance.json`.

2. **`.read_text()`**

   * Opens the file and reads its contents as a single text string.
   * Example result:

     ```json
     {"source_url": "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
      "fetched_at_utc": "2025-10-01T20:15:43Z"}
     ```

3. **`json.loads(...)`**

   * Takes that JSON-formatted string and parses it into a **Python object**.
   * JSON object → Python `dict`
   * JSON array → Python `list`
   * JSON string → Python `str`
   * JSON number → Python `int` or `float`
   * JSON null → Python `None`

---

### Example

Suppose the file contains:

```json
{
  "source_url": "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
  "fetched_at_utc": "2025-10-01T20:15:43Z"
}
```

Then:

```python
from pathlib import Path
import json

data = json.loads(Path("data/static/sector_map.provenance.json").read_text())
print(data)
```

Output:

```python
{'source_url': 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
 'fetched_at_utc': '2025-10-01T20:15:43Z'}
```

Now `data["source_url"]` and `data["fetched_at_utc"]` are just normal Python strings.


## Alternatively: a more direct way:
```python
import json

with open("data/static/sector_map.provenance.json", "r", encoding="utf-8") as f:
    data = json.load(f)
```


# Write a Markdown file using Python

###  1. Add table rows

```python
for p in provenance:
    md.append(f"| {p['source_url']} | {p['fetched_at_utc']} |")
```

* Loops over each element in `provenance` (likely a list of dicts).
* For each dict `p`, it appends a table row with two columns:

  * The `source_url` (where the data came from)
  * The `fetched_at_utc` timestamp (when it was fetched)

Example if `provenance = [{"source_url":"https://en.wikipedia.org/wiki/List_of_S%26P_500_companies","fetched_at_utc":"2025-10-01T20:15:43Z"}]`, you’d get:

```
| https://en.wikipedia.org/wiki/List_of_S%26P_500_companies | 2025-10-01T20:15:43Z |
```

---

###  2. Write to file

```python
Path("reports/provenance.md").write_text("\n".join(md))
```

* `"\n".join(md)` → joins the list of strings with newline characters, making one big Markdown text block.
* `Path(...).write_text(...)` → saves it into the file `reports/provenance.md`.

---

### 3. Resulting file

The file `reports/provenance.md` will look like:

```markdown
# Data provenance

## Web sources

| Source | Fetched at |
|---|---|
| https://en.wikipedia.org/wiki/List_of_S%26P_500_companies | 2025-10-01T20:15:43Z |
```


In [18]:
# scripts/write_provenance.py
#!/usr/bin/env python
import json, pandas as pd
from pathlib import Path
Path("reports").mkdir(exist_ok=True)

provenance = []
if Path("data/static/sector_map.provenance.json").exists():
    provenance.append(json.loads(Path("data/static/sector_map.provenance.json").read_text()))
else:
    provenance.append({"source_url":"(none)","fetched_at_utc":"(n/a)"})

md = ["# Data provenance",
      "",
      "## Web sources",
      "",
      "| Source | Fetched at |",
      "|---|---|"]
for p in provenance:
    md.append(f"| {p['source_url']} | {p['fetched_at_utc']} |")

Path("reports/provenance.md").write_text("\n".join(md))
print("Wrote reports/provenance.md")

Wrote reports/provenance.md


## Saving the above file into scripts/write_proveance.py. Then run the following shell command.

In [19]:
%%bash
chmod +x scripts/write_provenance.py
python scripts/write_provenance.py

Wrote reports/provenance.md


# Pandas
```python
dtypes = df.dtypes.astype(str).to_dict()
```



1. **`df.dtypes`**

   * Returns a `Series` of column names → data types.
   * Example:

     ```python
     A    int64
     B    float64
     C    object
     dtype: object
     ```

2. **`.astype(str)`**

   * Converts each dtype object (like `int64`, `float64`, `object`) into a string.
   * Example:

     ```python
     A    "int64"
     B    "float64"
     C    "object"
     dtype: object
     ```

3. **`.to_dict()`**

   * Converts the Series into a Python dictionary.
   * Example:

     ```python
     {"A": "int64", "B": "float64", "C": "object"}
     ```

### Example

```python
import pandas as pd

df = pd.DataFrame({
    "A": [1, 2, 3],
    "B": [1.1, 2.2, 3.3],
    "C": ["x", "y", "z"]
})

dtypes = df.dtypes.astype(str).to_dict()
print(dtypes)
```

Output:

```python
{'A': 'int64', 'B': 'float64', 'C': 'object'}
```




```python
df.insert(0, "dataset", p.name)
```


1. **`df.insert(loc, column, value)`**

   * `loc=0` → position where to insert the new column (0 = first column).
   * `"dataset"` → the new column’s name.
   * `p.name` → the value(s) to fill the column with.

2. **`p.name`**

   * If `p` is a `Path` object (from `pathlib`), then `.name` gives the final component of the path (the filename).
   * Example:

     ```python
     from pathlib import Path
     p = Path("data/raw/prices.csv")
     print(p.name)  # "prices.csv"
     ```

3. **Effect on DataFrame**
   Suppose `df` originally looks like:

   ```text
      ticker   price
   0    AAPL    150
   1    MSFT    320
   ```

   After `df.insert(0, "dataset", p.name)`, it becomes:

   ```text
       dataset  ticker  price
   0  prices.csv   AAPL    150
   1  prices.csv   MSFT    320
   ```



In [20]:
# scripts/data_dictionary.py
#!/usr/bin/env python
import pandas as pd
from pathlib import Path

def describe_parquet(path):
    df = pd.read_parquet(path)
    dtypes = df.dtypes.astype(str).to_dict()
    return pd.DataFrame({"column": list(dtypes.keys()), "dtype": list(dtypes.values())})

def main():
    rows=[]
    for path in ["data/processed/prices.parquet",
                 "data/processed/returns.parquet",
                 "data/processed/features_v1.parquet",
                 "data/processed/features_v1_ext.parquet"]:
        p = Path(path)
        if p.exists():
            df = describe_parquet(p)
            df.insert(0, "dataset", p.name) # insert a new col as the first col
            rows.append(df)
    out = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=["dataset","column","dtype"])
    Path("reports").mkdir(exist_ok=True)
    out.to_csv("reports/data_dictionary.csv", index=False)
    print("Wrote reports/data_dictionary.csv")

if __name__ == "__main__":
    main()

Wrote reports/data_dictionary.csv


## Save the above file into scripts/data_dictionary.py, and then run the following shell comands

In [21]:
%%bash
chmod +x scripts/data_dictionary.py
python scripts/data_dictionary.py

Wrote reports/data_dictionary.csv


# Pytest

In [22]:
# tests/test_dictionary_provenance.py
import os, pandas as pd
def test_provenance_and_dict():
    assert os.path.exists("reports/provenance.md")
    assert os.path.exists("reports/data_dictionary.csv")
    df = pd.read_csv("reports/data_dictionary.csv")
    assert {"dataset","column","dtype"}.issubset(df.columns)

## Save the above file into tests/test_dictionary_provenance.py

In [23]:
%%bash
pytest -q tests/test_dictionary_provenance.py

.                                                                        [100%]
