In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
import re

In [2]:
import sys
sys.path.append('d:/myprojects/project_api_data_orchestrator/src')

%run ../src/project_api_data_orchestrator/core/config.py
%run ../src/project_api_data_orchestrator/db/connection.py

In [None]:

## Scrape Depth Chart Functions

# Root of the site being scraped, and the index page that links to each team's
# depth chart.
BASE_URL = "https://www.ourlads.com"
DEPTHCHARTS_INDEX = f"{BASE_URL}/nfldepthcharts/depthcharts.aspx"

# Position labels of interest. The LWR/RWR/SWR labels are collapsed to "WR" by
# parse_team_depthchart before it checks membership in this set.
TARGET_POS = {"QB", "RB", "TE"} | {"WR", "LWR", "RWR", "SWR"}

def get_team_links(timeout=30):
    """Collect the URL of every team depth-chart page linked from the index page.

    Args:
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        list[str]: Absolute URLs (``BASE_URL`` + href) for every anchor whose
        href starts with ``/nfldepthcharts/depthchart/``, de-duplicated and in
        the order they first appear on the page.

    Raises:
        requests.HTTPError: The index page answered with an error status.
        requests.RequestException: Connection failure or timeout.
    """
    resp = requests.get(
        DEPTHCHARTS_INDEX,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    team_urls = (
        BASE_URL + a["href"]
        for a in soup.find_all("a", href=True)
        if a["href"].startswith("/nfldepthcharts/depthchart/")
    )
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(team_urls))

# Trailing token stripped from a player's link text: whitespace followed by
# two letters + two digits, two digits "/" one digit, or 1-2 letters "/" 2-3
# letters (case-insensitive). Compiled once instead of on every player cell.
_PLAYER_SUFFIX_RE = re.compile(
    r"\s+(?:[A-Z]{2}\d{2}|\d{2}/\d|[A-Z]{1,2}/[A-Za-z]{2,3})$",
    flags=re.IGNORECASE,
)

# Position labels that are reported as plain "WR".
_WR_VARIANTS = ("LWR", "RWR", "SWR")


def _clean_player_name(raw_text):
    """Strip the trailing suffix token from a player's link text and title-case it.

    Note: ``str.title()`` also lowercases interior capitals (e.g. "McCaffrey"
    becomes "Mccaffrey"); that behavior is kept so existing output is unchanged.
    """
    return _PLAYER_SUFFIX_RE.sub("", raw_text).title().strip()


def parse_team_depthchart(team_url, timeout=30):
    """Download one team's depth-chart page and extract its target-position players.

    Args:
        team_url: Absolute URL of the team's depth-chart page.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the whole scrape.

    Returns:
        list[tuple[str, str, str, int]]: ``(player, team_abbrev, position, tier)``
        tuples. ``position`` has LWR/RWR/SWR collapsed to "WR"; ``tier`` is the
        1-based index of the player's column within its row. An empty list is
        returned when the depth-chart table or its ``<tbody>`` cannot be found.

    Raises:
        requests.HTTPError: The page answered with an error status.
        requests.RequestException: Connection failure or timeout.
    """
    resp = requests.get(
        team_url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    depwrapper = soup.find("div", id="ctl00_phContent_DepWrapper")

    # Team abbreviation: the text after "dt-" in the wrapper's first "dt-*" CSS
    # class; otherwise fall back to the upper-cased last segment of the URL.
    team_abbrev = None
    if depwrapper is not None:
        team_abbrev = next(
            (c[3:] for c in depwrapper.get("class", []) if c.startswith("dt-")),
            None,
        )
    if not team_abbrev:
        team_abbrev = team_url.rstrip("/").split("/")[-1].upper()

    # Prefer the table inside the wrapper, then any matching table on the page.
    table = None
    if depwrapper is not None:
        table = depwrapper.find("table", class_="table-bordered")
    if table is None:
        table = soup.find("table", class_="table-bordered")
    if table is None:
        return []

    # Look for the id-tagged body first, then any <tbody>. (The original tried
    # the unfiltered lookup first, which made the id-specific one unreachable.)
    tbody = table.find("tbody", id="ctl00_phContent_dcTBody") or table.find("tbody")
    if tbody is None:
        return []

    records = []
    for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        pos = tds[0].get_text(strip=True)
        pos_norm = "WR" if pos in _WR_VARIANTS else pos
        if pos_norm not in TARGET_POS:
            continue

        # After the position cell, cells come in (number, player) pairs, so the
        # player cells sit at indices 2, 4, 6, ... The tier advances with every
        # pair, including pairs whose player cell has no link text.
        for tier, player_idx in enumerate(range(2, len(tds), 2), start=1):
            a = tds[player_idx].find("a")
            if a is None:
                continue
            player_text = a.get_text(strip=True)
            if not player_text:
                continue
            records.append(
                (_clean_player_name(player_text), team_abbrev, pos_norm, tier)
            )

    return records

def scrape_all_to_dataframe():
    """Scrape every team's depth chart and return the combined, de-duplicated rows.

    Fetches the team links, parses each page in turn (pausing one second after
    every page), and reports progress with ``print``. A page that raises is
    reported and skipped rather than aborting the run.

    Returns:
        pandas.DataFrame: Columns ``Player``, ``Team``, ``Position``, ``Tier``,
        one row per unique record, in first-seen order.
    """
    team_links = get_team_links()
    print(f"Found {len(team_links)} team pages.")

    collected = []
    for link in team_links:
        try:
            collected.extend(parse_team_depthchart(link))
        except Exception as e:
            # Best effort: one broken page must not discard the other teams.
            print(f"Error parsing {link}: {e}")
        time.sleep(1)  # pause between page requests

    # Order-preserving de-duplication of the record tuples.
    unique_records = list(dict.fromkeys(collected))

    df = pd.DataFrame(
        unique_records,
        columns=["Player", "Team", "Position", "Tier"],
    )
    print(f"Scraped {len(df)} unique player-position records.")
    return df

In [4]:
# Run the full scrape (one request per team page, with a one-second pause after
# each) and preview the first rows of the resulting frame.
df = scrape_all_to_dataframe()
print(df.head())

Found 32 team pages.
Scraped 469 unique player-position records.
            Player Team Position  Tier
0   Palmer, Joshua  BUF       WR     1
1  Shavers, Tyrell  BUF       WR     2
2    Coleman, Keon  BUF       WR     1
3   Samuel, Curtis  BUF       WR     2
4   Shakir, Khalil  BUF       WR     1


In [11]:

# Distinct team abbreviations produced by the scrape, sorted for a visual check.
sorted(df['Team'].unique())

['ARZ',
 'ATL',
 'BAL',
 'BUF',
 'CAR',
 'CHI',
 'CIN',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GB',
 'HOU',
 'IND',
 'JAX',
 'KC',
 'LAC',
 'LAR',
 'LV',
 'MIA',
 'MIN',
 'NE',
 'NO',
 'NYG',
 'NYJ',
 'PHI',
 'PIT',
 'SEA',
 'SF',
 'TB',
 'TEN',
 'WAS']

In [6]:

# Load the reference teams table from the 'nfl_data' database into df_pg.
conn = get_connection('nfl_data')
query = "SELECT * FROM public.teams ORDER BY team_id ASC"
try:
    df_pg = pd.read_sql_query(query, conn)
finally:
    # Always release the connection, even when the query raises.
    conn.close()

  df_pg = pd.read_sql_query(query, conn)


In [None]:

## Check if Teams match
# True only when the scraped abbreviations and the `abbreviation` column loaded
# from public.teams contain exactly the same distinct values.
sorted(df['Team'].unique()) == sorted(df_pg['abbreviation'].unique())

True