In [None]:
# Quick check of existing tables
import sqlite3
conn = sqlite3.connect('../data/processed/nwsldata.db')
tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name").fetchall()
print(f"Current tables ({len(tables)}):")
for table in tables:
    print(f"  - {table[0]}")
conn.close()

In [None]:
import sqlite3
import pandas as pd

# Connect and check what tables actually exist
conn = sqlite3.connect('../data/processed/nwsldata.db')

print("🔍 DEBUGGING: WHAT TABLES ACTUALLY EXIST?")
print("=" * 50)

# Check all tables in sqlite_master
all_tables = pd.read_sql_query("""
    SELECT name, type, sql 
    FROM sqlite_master 
    WHERE type='table' 
    ORDER BY name
""", conn)

print(f"📊 Found {len(all_tables)} tables in sqlite_master:")
for _, row in all_tables.iterrows():
    print(f"  📋 {row['name']}")

# Check if our match report tables were created
missing_tables = []
expected_tables = ['match_event', 'match_team', 'match_player', 'match_shot']

print(f"\n🔍 Checking for match report tables:")
for table in expected_tables:
    try:
        conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
        print(f"✅ {table} exists")
    except:
        missing_tables.append(table)
        print(f"❌ {table} missing")

if missing_tables:
    print(f"\n🔧 Creating missing tables: {missing_tables}")
    
    # Create match_event table
    if 'match_event' in missing_tables:
        conn.execute("""
        CREATE TABLE match_event (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id INTEGER NOT NULL,
            minute INTEGER,
            event_type TEXT,
            player_name TEXT,
            team_name TEXT,
            description TEXT,
            FOREIGN KEY (match_id) REFERENCES match(id)
        )
        """)
        print("✅ Created match_event table")

    # Create match_team table  
    if 'match_team' in missing_tables:
        conn.execute("""
        CREATE TABLE match_team (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id INTEGER NOT NULL,
            team_name TEXT NOT NULL,
            formation TEXT,
            manager TEXT,
            captain TEXT,
            possession REAL,
            shots INTEGER,
            shots_on_target INTEGER,
            passes_completed INTEGER,
            passes_attempted INTEGER,
            FOREIGN KEY (match_id) REFERENCES match(id)
        )
        """)
        print("✅ Created match_team table")

    # Create match_player table
    if 'match_player' in missing_tables:
        conn.execute("""
        CREATE TABLE match_player (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id INTEGER NOT NULL,
            player_name TEXT NOT NULL,
            team_name TEXT NOT NULL,
            position TEXT,
            minutes_played INTEGER,
            goals INTEGER DEFAULT 0,
            assists INTEGER DEFAULT 0,
            yellow_cards INTEGER DEFAULT 0,
            red_cards INTEGER DEFAULT 0,
            substitution_time INTEGER,
            FOREIGN KEY (match_id) REFERENCES match(id)
        )
        """)
        print("✅ Created match_player table")

    # Create match_shot table
    if 'match_shot' in missing_tables:
        conn.execute("""
        CREATE TABLE match_shot (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id INTEGER NOT NULL,
            player_name TEXT NOT NULL,
            team_name TEXT NOT NULL,
            minute INTEGER,
            shot_type TEXT,
            outcome TEXT,
            xg_value REAL,
            distance REAL,
            angle REAL,
            body_part TEXT,
            FOREIGN KEY (match_id) REFERENCES match(id)
        )
        """)
        print("✅ Created match_shot table")

    conn.commit()

# Force WAL checkpoint to sync with DBeaver
print(f"\n🔄 Running WAL checkpoint to sync with DBeaver...")
conn.execute("PRAGMA wal_checkpoint")
conn.commit()

# Final verification
print(f"\n✅ FINAL CHECK - All tables in database:")
final_tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name", conn)
for _, row in final_tables.iterrows():
    print(f"  📋 {row['name']}")

print(f"\n🎯 Total tables: {len(final_tables)}")
print(f"Expected: 10 tables (match, player, season, team_seasons, teams + 4 match report tables)")

conn.close()
print(f"\n✅ Database connection closed. DBeaver should now show all 10 tables after refresh.")

In [None]:
# ══════════════════════════════════════════════════════════════════════════════
#  PROCESS ALL MATCH REPORTS - Using Improved Team Mapping
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
import re
from datetime import datetime
from tqdm import tqdm
import os

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Get expanded team mapping from database
team_mapping_query = """
SELECT team_name, team_id FROM teams
UNION ALL
SELECT team_name_short, team_id FROM teams WHERE team_name_short IS NOT NULL
UNION ALL  
SELECT team_name_alias_1, team_id FROM teams WHERE team_name_alias_1 IS NOT NULL
UNION ALL
SELECT team_name_alias_2, team_id FROM teams WHERE team_name_alias_2 IS NOT NULL
"""

team_mapping_df = pd.read_sql_query(team_mapping_query, conn)
# Create expanded mapping dictionary
expanded_team_mapping = dict(zip(team_mapping_df['team_name'], team_mapping_df['team_id']))

print(f"✅ Loaded expanded team mapping with {len(expanded_team_mapping)} variations")

# Add a few more problematic mappings discovered during testing
additional_mappings = {
    "San Diego Wave": "bf961da0",  # Sometimes appears without "FC"
    "OL Reign": "257fad2b",         # Historical name
    "Red Stars": "d976a235",        # Short version
    "Current": "6f666306"           # Very short version
}
expanded_team_mapping.update(additional_mappings)

print(f"📋 Team mapping includes: {len(expanded_team_mapping)} team name variations")

# 2) Create Match table if it doesn't exist
cur.execute("""
CREATE TABLE IF NOT EXISTS Match (
    match_id      INTEGER PRIMARY KEY AUTOINCREMENT,
    season_id     INTEGER,
    match_date    DATE,
    home_team_id  TEXT REFERENCES teams(team_id),
    away_team_id  TEXT REFERENCES teams(team_id),
    home_xg       REAL,
    away_xg       REAL,
    attendance    INTEGER,
    venue         TEXT,
    referee       TEXT,
    home_formation TEXT,
    away_formation TEXT,
    filename      TEXT,
    extraction_status TEXT
);
""")

# 3) Helper function to resolve team names
def resolve_team_name(team_name):
    """Resolve team name to team_id using expanded mapping"""
    if not team_name:
        return None
        
    # Direct match
    if team_name in expanded_team_mapping:
        return expanded_team_mapping[team_name]
    
    # Try without common suffixes
    clean_name = team_name.replace(" FC", "").replace(" SC", "").strip()
    if clean_name in expanded_team_mapping:
        return expanded_team_mapping[clean_name]
    
    return None

# 4) Enhanced match extraction function
def extract_match_data_from_html(html_path):
    """Extract match data from HTML with improved team mapping"""
    
    try:
        soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")
        
        # Extract headline
        headline = soup.find("h1")
        if not headline:
            return None, "No headline found"
            
        headline_text = headline.get_text(strip=True)
        
        # Parse teams and date from headline
        match_pattern = r"(.+?)\s+vs\.?\s+(.+?)\s+Match Report.*?(\w+)\s+(\w+)\s+(\d{1,2}),?\s+(\d{4})"
        match = re.search(match_pattern, headline_text)
        
        if not match:
            return None, f"Could not parse headline: {headline_text}"
            
        home_name, away_name, day_name, month, day, year = match.groups()
        
        # Clean team names
        home_name = home_name.strip()
        away_name = away_name.strip()
        
        # Resolve team IDs
        home_team_id = resolve_team_name(home_name)
        away_team_id = resolve_team_name(away_name)
        
        if not home_team_id:
            return None, f"Could not resolve home team: '{home_name}'"
        if not away_team_id:
            return None, f"Could not resolve away team: '{away_name}'"
        
        # Parse date
        try:
            match_date = datetime.strptime(f"{month} {day} {year}", "%B %d %Y").date()
        except ValueError:
            try:
                match_date = datetime.strptime(f"{month} {day} {year}", "%b %d %Y").date()
            except ValueError:
                return None, f"Could not parse date: {month} {day} {year}"
        
        # Extract xG data
        xg_text = soup.find(string=re.compile(r"xG"))
        home_xg = away_xg = None
        
        if xg_text:
            xg_numbers = re.findall(r"(\d+\.\d+)", str(xg_text))
            if len(xg_numbers) >= 2:
                home_xg, away_xg = float(xg_numbers[0]), float(xg_numbers[1])
        
        # Extract attendance
        attendance_text = soup.find(string=re.compile(r"Attendance:"))
        attendance = None
        if attendance_text:
            att_match = re.search(r"(\d[\d,]*)", attendance_text)
            if att_match:
                attendance = int(att_match.group(1).replace(",", ""))
        
        # Extract venue
        venue_text = soup.find(string=re.compile(r"Venue:"))
        venue = None
        if venue_text:
            venue_parts = venue_text.split(":", 1)
            if len(venue_parts) > 1:
                venue = venue_parts[1].strip()
        
        # Extract referee
        referee_text = soup.find(string=re.compile(r"Referee:"))
        referee = None
        if referee_text:
            ref_parts = referee_text.split(":", 1)
            if len(ref_parts) > 1:
                referee = ref_parts[1].strip()
        
        return {
            'match_date': match_date,
            'home_team_id': home_team_id,
            'away_team_id': away_team_id,
            'home_team_name': home_name,
            'away_team_name': away_name,
            'home_xg': home_xg,
            'away_xg': away_xg,
            'attendance': attendance,
            'venue': venue,
            'referee': referee,
            'season_year': int(year)
        }, "success"
        
    except Exception as e:
        return None, f"Exception: {str(e)}"

# 5) Get all HTML files to process
html_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages")
html_files = list(html_dir.glob("*.html"))

print(f"🔍 Found {len(html_files)} HTML files to process")

# 6) Process all files with progress tracking
successful_extractions = []
failed_extractions = []

print("🚀 Processing all match reports...")

for html_file in tqdm(html_files, desc="Processing matches"):
    match_data, status = extract_match_data_from_html(html_file)
    
    if match_data:
        match_data['filename'] = html_file.name
        match_data['extraction_status'] = status
        successful_extractions.append(match_data)
    else:
        failed_extractions.append({
            'filename': html_file.name,
            'error': status
        })

# 7) Results summary
total_files = len(html_files)
successful_count = len(successful_extractions)
failed_count = len(failed_extractions)

print(f"\n📊 Extraction Results:")
print(f"   ✅ Successful: {successful_count}/{total_files} ({successful_count/total_files*100:.1f}%)")
print(f"   ❌ Failed: {failed_count}/{total_files} ({failed_count/total_files*100:.1f}%)")

# 8) Insert successful extractions into database
if successful_extractions:
    print(f"📝 Inserting {successful_count} matches into database...")
    
    # Clear existing match data
    cur.execute("DELETE FROM Match")
    
    for match_data in successful_extractions:
        cur.execute("""
            INSERT INTO Match (
                season_id, match_date, home_team_id, away_team_id,
                home_xg, away_xg, attendance, venue, referee,
                filename, extraction_status
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            match_data['season_year'],  # Using year as season_id for now
            match_data['match_date'],
            match_data['home_team_id'],
            match_data['away_team_id'],
            match_data['home_xg'],
            match_data['away_xg'],
            match_data['attendance'],
            match_data['venue'],
            match_data['referee'],
            match_data['filename'],
            match_data['extraction_status']
        ))
    
    conn.commit()
    print(f"✅ Successfully inserted {successful_count} matches")

# 9) Show sample results
if successful_extractions:
    print(f"\n--- Sample Successful Extractions ---")
    sample_df = pd.DataFrame(successful_extractions[:5])
    print(sample_df[['match_date', 'home_team_name', 'away_team_name', 'home_xg', 'away_xg']])

# 10) Analyze failures
if failed_extractions:
    print(f"\n--- Failure Analysis ---")
    failure_df = pd.DataFrame(failed_extractions)
    
    # Group failures by error type
    error_counts = failure_df['error'].value_counts()
    print("Top failure reasons:")
    for error, count in error_counts.head(10).items():
        print(f"  {error}: {count} files")
    
    # Show some sample failures
    print(f"\n--- Sample Failed Files ---")
    print(failure_df.head())

# 11) Database verification
print(f"\n--- Database Verification ---")
matches_in_db = pd.read_sql_query("""
    SELECT COUNT(*) as total_matches,
           COUNT(DISTINCT home_team_id) + COUNT(DISTINCT away_team_id) as unique_teams,
           MIN(match_date) as earliest_match,
           MAX(match_date) as latest_match,
           COUNT(DISTINCT season_id) as seasons
    FROM Match
""", conn)
print(matches_in_db)

# Show sample matches from database
sample_matches = pd.read_sql_query("""
    SELECT m.match_date, 
           h.team_name as home_team,
           a.team_name as away_team,
           m.home_xg, m.away_xg,
           m.venue
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id
    JOIN teams a ON m.away_team_id = a.team_id
    ORDER BY m.match_date DESC
    LIMIT 10
""", conn)

print(f"\n--- Recent Matches in Database ---")
print(sample_matches)

conn.close()

print(f"\n🎉 Match extraction complete!")
print(f"   📊 {successful_count} matches successfully processed")
print(f"   🗂️ Ready for lineup and player stats extraction")

In [2]:
# --- Parse ONE saved FBref match-report HTML file --------------------------
#
# 1)  Set the path below to whichever file you want to inspect.
# 2)  Run the cell – it will:
#       • read the file
#       • find every <table> element inside (including all tabbed sections)
#       • convert each to a Pandas DataFrame
#       • print its index, shape, and the first 5 rows so you can sanity-check
#
# --------------------------------------------------------------------------

from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd

# 👉 EDIT ME -----------------------------------------------------------------
html_path = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages/Seattle Reign FC vs. Orlando Pride Match Report – Saturday April 12, 2025 _ FBref.com.html")
# ---------------------------------------------------------------------------

# 1) Load & parse
soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")

# 2) Extract every <table>
tables = soup.select("table")
print(f"✅ Found {len(tables)} <table> elements in {html_path.name}")

# 3) Convert & preview
for i, tbl in enumerate(tables):
    try:
        df = pd.read_html(str(tbl))[0]
    except ValueError:
        continue
    print(f"\n── table_{i:02d}  shape={df.shape} ──")
    display(df.head())

# 4) Optional: keep all dataframes in a list `tables_dfs` for later use tables_dfs = [pd.read_html(str(t))[0] for t in tables if not t.find_all("tr") == []]


✅ Found 20 <table> elements in Seattle Reign FC vs. Orlando Pride Match Report – Saturday April 12, 2025 _ FBref.com.html

── table_00  shape=(21, 2) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0,Reign (3-5-2),Reign (3-5-2).1
0,1,Claudia Dickey
1,2,Maddie Mercado
2,3,Lauren Barnes
3,5,Madeline Dahlien
4,9,Jordyn Huitema



── table_01  shape=(21, 2) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0,Pride (4-4-2),Pride (4-4-2).1
0,1,Anna Moorhouse
1,2,Haley Hanson
2,3,Kylie Strom
3,6,Emily Sams
4,10,Marta



── table_02  shape=(9, 2) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Reign,Pride
Unnamed: 0_level_1,Possession,Possession
0,42%,58%
1,Passing Accuracy,Passing Accuracy
2,289 of 382 — 76%,82% — 443 of 538
3,Shots on Target,Shots on Target
4,5 of 13 — 38%,57% — 4 of 7



── table_03  shape=(17, 31) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Performance,Performance,Performance,Performance,...,SCA,SCA,Passes,Passes,Passes,Passes,Carries,Carries,Take-Ons,Take-Ons
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,PKatt,...,SCA,GCA,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att,Succ
0,Jordyn Huitema,9.0,ca CAN,FW,23-339,45,0,0,0,0,...,1,0,15,17,88.2,4,13,0,1,0
1,Lynn Biyendolo,6.0,us USA,FW,31-326,45,0,0,0,0,...,6,0,16,18,88.9,5,12,0,1,0
2,Nérilia Mondésir,30.0,ht HAI,FW,26-085,90,0,0,0,0,...,3,0,8,21,38.1,0,21,3,4,2
3,Maddie Mercado,2.0,us USA,CM,24-011,68,0,0,0,0,...,3,0,20,27,74.1,2,24,2,1,1
4,Shae Holmes,25.0,us USA,CM,25-035,22,0,0,0,0,...,0,0,8,13,61.5,3,9,1,1,1



── table_04  shape=(17, 28) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Total,Total,Total,Total,...,Long,Long,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Cmp,Att,Cmp%,TotDist,...,Att,Cmp%,Ast,xAG,xA,KP,1/3,PPA,CrsPA,PrgP
0,Jordyn Huitema,9.0,ca CAN,FW,23-339,45,15,17,88.2,213,...,1,100.0,0,0.0,0.0,0,3,0,0,4
1,Lynn Biyendolo,6.0,us USA,FW,31-326,45,16,18,88.9,207,...,1,100.0,0,0.5,0.1,4,1,3,0,5
2,Nérilia Mondésir,30.0,ht HAI,FW,26-085,90,8,21,38.1,95,...,2,50.0,0,0.0,0.0,0,0,0,0,0
3,Maddie Mercado,2.0,us USA,CM,24-011,68,20,27,74.1,307,...,3,66.7,0,0.1,0.0,3,2,0,0,2
4,Shae Holmes,25.0,us USA,CM,25-035,22,8,13,61.5,133,...,2,0.0,0,0.0,0.0,0,2,0,0,3



── table_05  shape=(17, 21) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Corner Kicks,Corner Kicks,Corner Kicks,Outcomes,Outcomes,Outcomes
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Att,Live,Dead,FK,...,Sw,Crs,TI,CK,In,Out,Str,Cmp,Off,Blocks
0,Jordyn Huitema,9.0,ca CAN,FW,23-339,45,17,17,0,0,...,0,0,0,0,0,0,0,15,0,0
1,Lynn Biyendolo,6.0,us USA,FW,31-326,45,18,18,0,0,...,0,0,0,0,0,0,0,16,0,0
2,Nérilia Mondésir,30.0,ht HAI,FW,26-085,90,21,21,0,0,...,0,1,0,0,0,0,0,8,0,0
3,Maddie Mercado,2.0,us USA,CM,24-011,68,27,24,2,0,...,0,1,0,0,0,0,0,20,1,0
4,Shae Holmes,25.0,us USA,CM,25-035,22,13,12,1,0,...,0,0,1,0,0,0,0,8,0,1



── table_06  shape=(17, 22) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Tackles,Tackles,Tackles,Tackles,...,Challenges,Challenges,Challenges,Blocks,Blocks,Blocks,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Tkl,TklW,Def 3rd,Mid 3rd,...,Att,Tkl%,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err
0,Jordyn Huitema,9.0,ca CAN,FW,23-339,45,0,0,0,0,...,0,,0,0,0,0,0,0,0,0
1,Lynn Biyendolo,6.0,us USA,FW,31-326,45,1,0,0,0,...,0,,0,0,0,0,0,1,0,0
2,Nérilia Mondésir,30.0,ht HAI,FW,26-085,90,3,3,1,2,...,0,,0,0,0,0,1,4,0,0
3,Maddie Mercado,2.0,us USA,CM,24-011,68,0,0,0,0,...,0,,0,0,0,0,1,1,0,0
4,Shae Holmes,25.0,us USA,CM,25-035,22,1,0,0,1,...,0,,0,0,0,0,1,2,3,0



── table_07  shape=(17, 28) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Touches,Touches,Touches,Touches,...,Carries,Carries,Carries,Carries,Carries,Carries,Carries,Carries,Receiving,Receiving
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Touches,Def Pen,Def 3rd,Mid 3rd,...,Carries,TotDist,PrgDist,PrgC,1/3,CPA,Mis,Dis,Rec,PrgR
0,Jordyn Huitema,9.0,ca CAN,FW,23-339,45,21,0,0,14,...,13,51,24,0,1,0,2,0,16,1
1,Lynn Biyendolo,6.0,us USA,FW,31-326,45,23,0,0,8,...,12,27,10,0,0,0,1,2,15,1
2,Nérilia Mondésir,30.0,ht HAI,FW,26-085,90,30,0,2,13,...,21,193,99,3,2,1,2,2,25,9
3,Maddie Mercado,2.0,us USA,CM,24-011,68,33,0,2,20,...,24,150,66,2,2,1,4,0,27,6
4,Shae Holmes,25.0,us USA,CM,25-035,22,17,1,1,13,...,9,48,36,1,0,0,0,0,8,0



── table_08  shape=(17, 22) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Aerial Duels,Aerial Duels,Aerial Duels
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,Crs,Int,TklW,PKwon,PKcon,OG,Recov,Won,Lost,Won%
0,Jordyn Huitema,9.0,ca CAN,FW,23-339,45,0,0,0,1,...,0,0,0,0,0,0,2,2,2,50.0
1,Lynn Biyendolo,6.0,us USA,FW,31-326,45,0,0,0,0,...,0,0,0,0,0,0,5,1,2,33.3
2,Nérilia Mondésir,30.0,ht HAI,FW,26-085,90,0,0,0,3,...,1,1,3,0,0,0,5,1,1,50.0
3,Maddie Mercado,2.0,us USA,CM,24-011,68,0,0,0,1,...,1,1,0,0,0,0,2,1,1,50.0
4,Shae Holmes,25.0,us USA,CM,25-035,22,0,0,0,0,...,0,1,0,0,0,0,2,1,1,50.0



── table_09  shape=(1, 24) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Shot Stopping,Shot Stopping,Shot Stopping,Shot Stopping,Shot Stopping,Launched,...,Passes,Passes,Goal Kicks,Goal Kicks,Goal Kicks,Crosses,Crosses,Crosses,Sweeper,Sweeper
Unnamed: 0_level_1,Player,Nation,Age,Min,SoTA,GA,Saves,Save%,PSxG,Cmp,...,Launch%,AvgLen,Att,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,AvgDist
0,Claudia Dickey,us USA,25-096,90,4,1,3,75.0,1.5,8,...,23.1,28.5,10,70.0,38.9,7,0,0.0,0,1.0



── table_10  shape=(17, 31) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Performance,Performance,Performance,Performance,...,SCA,SCA,Passes,Passes,Passes,Passes,Carries,Carries,Take-Ons,Take-Ons
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,PKatt,...,SCA,GCA,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att,Succ
0,Barbra Banda,22.0,zm ZAM,FW,25-023,90,1,0,0,0,...,2,0,8,13,61.5,1,10,1,3,1
1,Marta,10.0,br BRA,FW,39-052,60,0,0,0,0,...,3,0,23,31,74.2,0,16,1,1,0
2,Prisca Chilufya,9.0,zm ZAM,FW,25-308,30,0,0,0,0,...,0,0,5,6,83.3,0,8,1,2,1
3,Summer Yates,28.0,us USA,LM,24-299,42,0,1,0,0,...,2,1,13,17,76.5,2,15,1,3,1
4,Angelina,15.0,br BRA,LM,25-076,48,0,0,0,0,...,1,0,23,31,74.2,3,14,2,0,0


  df = pd.read_html(str(tbl))[0]



── table_11  shape=(17, 28) ──


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Total,Total,Total,Total,...,Long,Long,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Cmp,Att,Cmp%,TotDist,...,Att,Cmp%,Ast,xAG,xA,KP,1/3,PPA,CrsPA,PrgP
0,Barbra Banda,22.0,zm ZAM,FW,25-023,90,8,13,61.5,126,...,2,50.0,0,0.0,0.0,0,1,0,0,1
1,Marta,10.0,br BRA,FW,39-052,60,23,31,74.2,458,...,8,62.5,0,0.0,0.0,1,1,0,0,0
2,Prisca Chilufya,9.0,zm ZAM,FW,25-308,30,5,6,83.3,79,...,0,,0,0.0,0.0,0,0,0,0,0
3,Summer Yates,28.0,us USA,LM,24-299,42,13,17,76.5,311,...,4,100.0,1,1.1,0.4,2,2,1,0,2
4,Angelina,15.0,br BRA,LM,25-076,48,23,31,74.2,349,...,2,50.0,0,0.0,0.0,0,1,0,0,3


  df = pd.read_html(str(tbl))[0]



── table_12  shape=(17, 21) ──


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Corner Kicks,Corner Kicks,Corner Kicks,Outcomes,Outcomes,Outcomes
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Att,Live,Dead,FK,...,Sw,Crs,TI,CK,In,Out,Str,Cmp,Off,Blocks
0,Barbra Banda,22.0,zm ZAM,FW,25-023,90,13,13,0,0,...,0,1,0,0,0,0,0,8,0,0
1,Marta,10.0,br BRA,FW,39-052,60,31,29,2,0,...,1,3,0,1,1,0,0,23,0,1
2,Prisca Chilufya,9.0,zm ZAM,FW,25-308,30,6,6,0,0,...,0,0,0,0,0,0,0,5,0,0
3,Summer Yates,28.0,us USA,LM,24-299,42,17,17,0,0,...,1,1,0,0,0,0,0,13,0,0
4,Angelina,15.0,br BRA,LM,25-076,48,31,26,4,0,...,0,1,4,0,0,0,0,23,1,1



── table_13  shape=(17, 22) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Tackles,Tackles,Tackles,Tackles,...,Challenges,Challenges,Challenges,Blocks,Blocks,Blocks,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Tkl,TklW,Def 3rd,Mid 3rd,...,Att,Tkl%,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err
0,Barbra Banda,22.0,zm ZAM,FW,25-023,90,1,0,0,0,...,2,0.0,2,1,0,1,0,1,0,0
1,Marta,10.0,br BRA,FW,39-052,60,0,0,0,0,...,0,,0,0,0,0,3,3,0,0
2,Prisca Chilufya,9.0,zm ZAM,FW,25-308,30,1,1,1,0,...,1,100.0,0,0,0,0,0,1,0,0
3,Summer Yates,28.0,us USA,LM,24-299,42,1,1,0,1,...,1,100.0,0,1,0,1,0,1,0,0
4,Angelina,15.0,br BRA,LM,25-076,48,1,1,0,1,...,1,100.0,0,1,0,1,1,2,0,0


  df = pd.read_html(str(tbl))[0]



── table_14  shape=(17, 28) ──


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Touches,Touches,Touches,Touches,...,Carries,Carries,Carries,Carries,Carries,Carries,Carries,Carries,Receiving,Receiving
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Touches,Def Pen,Def 3rd,Mid 3rd,...,Carries,TotDist,PrgDist,PrgC,1/3,CPA,Mis,Dis,Rec,PrgR
0,Barbra Banda,22.0,zm ZAM,FW,25-023,90,23,0,0,10,...,10,64,45,1,1,0,2,0,13,7
1,Marta,10.0,br BRA,FW,39-052,60,36,0,4,20,...,16,96,34,1,1,1,0,1,27,5
2,Prisca Chilufya,9.0,zm ZAM,FW,25-308,30,11,0,4,1,...,8,66,29,1,1,1,1,4,9,3
3,Summer Yates,28.0,us USA,LM,24-299,42,24,0,6,14,...,15,115,43,1,0,1,1,1,17,3
4,Angelina,15.0,br BRA,LM,25-076,48,40,0,2,19,...,14,58,27,2,2,0,1,4,27,7



── table_15  shape=(17, 22) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Aerial Duels,Aerial Duels,Aerial Duels
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,Crs,Int,TklW,PKwon,PKcon,OG,Recov,Won,Lost,Won%
0,Barbra Banda,22.0,zm ZAM,FW,25-023,90,0,0,0,2,...,1,0,0,0,0,0,3,0,0,
1,Marta,10.0,br BRA,FW,39-052,60,0,0,0,0,...,3,3,0,0,0,0,2,0,0,
2,Prisca Chilufya,9.0,zm ZAM,FW,25-308,30,0,0,0,0,...,0,0,1,0,0,0,2,0,1,0.0
3,Summer Yates,28.0,us USA,LM,24-299,42,0,0,0,0,...,1,0,1,0,0,0,2,0,0,
4,Angelina,15.0,br BRA,LM,25-076,48,0,0,0,1,...,1,1,1,0,0,0,3,1,2,33.3



── table_16  shape=(1, 24) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Shot Stopping,Shot Stopping,Shot Stopping,Shot Stopping,Shot Stopping,Launched,...,Passes,Passes,Goal Kicks,Goal Kicks,Goal Kicks,Crosses,Crosses,Crosses,Sweeper,Sweeper
Unnamed: 0_level_1,Player,Nation,Age,Min,SoTA,GA,Saves,Save%,PSxG,Cmp,...,Launch%,AvgLen,Att,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,AvgDist
0,Anna Moorhouse,eng ENG,30-013,90,5,0,5,100.0,1.3,3,...,9.4,25.8,11,27.3,25.5,9,0,0.0,0,9.0


  df = pd.read_html(str(tbl))[0]



── table_17  shape=(21, 13) ──


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,SCA 1,SCA 1,SCA 2,SCA 2
Unnamed: 0_level_1,Minute,Player,Squad,xG,PSxG,Outcome,Distance,Body Part,Notes,Player,Event,Player,Event
0,8.0,Samantha Meza,Reign,0.02,0.0,Blocked,35.0,Right Foot,,Maddie Mercado,Pass (Live),Jordyn Huitema,Pass (Live)
1,11.0,Jordyn Huitema,Reign,0.06,0.0,Off Target,25.0,Right Foot,,Jordyn Bugg,Pass (Live),Claudia Dickey,Pass (Live)
2,13.0,Ainsley Mccammon,Reign,0.06,0.0,Off Target,23.0,Right Foot,,Maddie Mercado,Pass (Live),Madeline Dahlien,Pass (Live)
3,14.0,Summer Yates,Pride,0.04,0.18,Saved,16.0,Right Foot,,Kerry Abello,Pass (Live),Marta,Pass (Live)
4,17.0,Summer Yates,Pride,0.06,0.0,Off Target,18.0,Left Foot,,Barbra Banda,Pass (Live),Marta,Pass (Live)



── table_18  shape=(14, 13) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,SCA 1,SCA 1,SCA 2,SCA 2
Unnamed: 0_level_1,Minute,Player,Squad,xG,PSxG,Outcome,Distance,Body Part,Notes,Player,Event,Player,Event
0,8.0,Samantha Meza,Reign,0.02,0.0,Blocked,35.0,Right Foot,,Maddie Mercado,Pass (Live),Jordyn Huitema,Pass (Live)
1,11.0,Jordyn Huitema,Reign,0.06,0.0,Off Target,25.0,Right Foot,,Jordyn Bugg,Pass (Live),Claudia Dickey,Pass (Live)
2,13.0,Ainsley Mccammon,Reign,0.06,0.0,Off Target,23.0,Right Foot,,Maddie Mercado,Pass (Live),Madeline Dahlien,Pass (Live)
3,22.0,Madeline Dahlien,Reign,0.03,0.24,Saved,17.0,Right Foot,,Madeline Dahlien,Take-On,Lauren Barnes,Pass (Live)
4,27.0,Ainsley Mccammon,Reign,0.05,0.0,Blocked,22.0,Right Foot,,Madison Curry,Pass (Live),Ainsley Mccammon,Pass (Live)



── table_19  shape=(8, 13) ──


  df = pd.read_html(str(tbl))[0]


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,SCA 1,SCA 1,SCA 2,SCA 2
Unnamed: 0_level_1,Minute,Player,Squad,xG,PSxG,Outcome,Distance,Body Part,Notes,Player,Event,Player,Event
0,14.0,Summer Yates,Pride,0.04,0.18,Saved,16.0,Right Foot,,Kerry Abello,Pass (Live),Marta,Pass (Live)
1,17.0,Summer Yates,Pride,0.06,0.0,Off Target,18.0,Left Foot,,Barbra Banda,Pass (Live),Marta,Pass (Live)
2,21.0,Barbra Banda,Pride,0.1,0.16,Saved,11.0,Right Foot,,Barbra Banda,Take-On,Summer Yates,Pass (Live)
3,23.0,Barbra Banda,Pride,0.31,0.0,Off Target,1.0,Head,,,,,
4,41.0,Barbra Banda,Pride,0.97,0.99,Goal,1.0,Left Foot,,Summer Yates,Pass (Live),Kerry Abello,Pass (Live)


In [4]:
# ------------------------------------------------------------------------------
# Ingest ONE FBref match page into a clean SQLite db (core-dimensions only)
# ------------------------------------------------------------------------------

import os, re, sqlite3, pandas as pd
from bs4 import BeautifulSoup

# ------------------------------------------------------------------ paths
html_file = ("/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages/Chicago Stars vs. Seattle Reign FC Match Report – Saturday June 14, 2025 _ FBref.com.html")
db_path   = "data/processed/nwsldata.db"

# ------------------------------------------------------------------ 1) reset db
os.makedirs(os.path.dirname(db_path), exist_ok=True)
if os.path.exists(db_path):
    os.remove(db_path)

conn = sqlite3.connect(db_path)
cur  = conn.cursor()

# ------------------------------------------------------------------ 2) schema
schema_sql = """
PRAGMA foreign_keys = ON;

CREATE TABLE Season (
    season_id   INTEGER PRIMARY KEY AUTOINCREMENT,
    season_year INTEGER UNIQUE,
    league_name TEXT
);

CREATE TABLE Team (
    team_id     INTEGER PRIMARY KEY AUTOINCREMENT,
    team_name   TEXT UNIQUE
);

CREATE TABLE Player (        -- left empty for this demo
    player_id   INTEGER PRIMARY KEY AUTOINCREMENT,
    player_name TEXT,
    dob         DATE,
    nationality TEXT
);

CREATE TABLE Match (
    match_id      INTEGER PRIMARY KEY AUTOINCREMENT,
    season_id     INTEGER REFERENCES Season(season_id),
    match_date    DATE,
    home_team_id  INTEGER REFERENCES Team(team_id),
    away_team_id  INTEGER REFERENCES Team(team_id),
    home_xg       REAL,
    away_xg       REAL,
    attendance    INTEGER,
    venue         TEXT
);
"""
cur.executescript(schema_sql)
conn.commit()

# ------------------------------------------------------------------ 3) quick-and-dirty page parser
def ingest_match(html_path: str, cxn: sqlite3.Connection):
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    # ---- pull headline “X vs Y Match Report – Saturday May 10, 2025”
    headline = soup.find("h1").get_text(strip=True)
    m = re.match(r"(.+?) vs\. (.+?) Match Report.* ([A-Za-z]+) (\d{1,2}), (\d{4})", headline)
    if not m:
        raise ValueError("Could not parse headline → teams & date")
    home_name, away_name, month, day, year = m.groups()
    match_date = pd.to_datetime(f"{month} {day} {year}").date()

    # ---- xG numbers (first 2 floats near the “xG” label)
    xg_label = soup.find(string=re.compile(r"\bxG\b"))
    nums     = re.findall(r"(\d+\.\d+)", xg_label or "")
    home_xg, away_xg = (map(float, nums[:2]) if len(nums) >= 2 else (None, None))

    # ---- attendance & venue (look for plain-text labels)
    att_text  = soup.find(string=re.compile(r"Attendance:"))
    attendance = int(re.search(r"(\d[\d,]*)", att_text).group(1).replace(",", "")) if att_text else None

    venue_text = soup.find(string=re.compile(r"Venue:"))
    venue      = venue_text.split(":", 1)[1].strip() if venue_text else None

    cur = cxn.cursor()

    # ---- helpers to *upsert* dimension rows --------------------------------
    def get_or_create(table: str, col: str, value):
        cur.execute(f"SELECT {table.lower()}_id FROM {table} WHERE {col}=?;", (value,))
        row = cur.fetchone()
        if row:
            return row[0]
        cur.execute(f"INSERT INTO {table}({col}) VALUES (?);", (value,))
        return cur.lastrowid

    season_id = get_or_create("Season", "season_year", int(year))
    home_id   = get_or_create("Team",   "team_name",  home_name)
    away_id   = get_or_create("Team",   "team_name",  away_name)

    # ---- fact row ----------------------------------------------------------
    cur.execute(
        """
        INSERT INTO Match (
            season_id, match_date,
            home_team_id, away_team_id,
            home_xg, away_xg, attendance, venue
        )
        VALUES (?,?,?,?,?,?,?,?)
        """,
        (
            season_id, match_date,
            home_id,   away_id,
            home_xg,   away_xg, attendance, venue
        ),
    )
    cxn.commit()
    print(f"✅ Ingested {os.path.basename(html_path)}")

# actually do it
ingest_match(html_file, conn)

# ------------------------------------------------------------------ 4) sanity-check what landed
print("\n--- Seasons ---")
display(pd.read_sql_query("SELECT * FROM Season", conn))

print("\n--- Teams ---")
display(pd.read_sql_query("SELECT * FROM Team", conn))

print("\n--- Matches ---")
display(pd.read_sql_query("SELECT * FROM Match", conn))

conn.close()


✅ Ingested Chicago Stars vs. Seattle Reign FC Match Report – Saturday June 14, 2025 _ FBref.com.html

--- Seasons ---


  cur.execute(


Unnamed: 0,season_id,season_year,league_name
0,1,2025,



--- Teams ---


Unnamed: 0,team_id,team_name
0,1,Chicago Stars
1,2,Seattle Reign FC



--- Matches ---


Unnamed: 0,match_id,season_id,match_date,home_team_id,away_team_id,home_xg,away_xg,attendance,venue
0,1,1,2025-06-14,1,2,1.9,1.28,,


In [6]:
# ⬇️ 1.  Raw block from your message
teams_txt = """
Orlando Pride               2a6178ac
Utah Royals                 d4c130bc
North Carolina Courage      85c458aa
Houston Dash                e813709a
San Diego Wave FC           bf961da0
Bay FC                      231a532f
Gotham FC                   8e306dc6
Chicago Stars FC            d976a235
Racing Louisville           da19ebd1
Kansas City Current         6f666306
Portland Thorns FC          df9a10a1
Washington Spirit           e442aad0
Seattle Reign FC            257fad2b
Angel City FC               ae38d267
""".strip()

# ⬇️ 2.  Parse every line → (team_name, external_id)
records = [ln.rsplit(maxsplit=1) for ln in teams_txt.splitlines()]
import pandas as pd
df_teams = pd.DataFrame(records, columns=["team_name", "external_id"])

# ⬇️ 3.  Connect to (clean/empty) DB and create the table if needed
import sqlite3, os
db_path = os.path.join("data", "processed", "nwsldata.db")
conn = sqlite3.connect(db_path)
cur  = conn.cursor()

cur.execute("""
CREATE TABLE IF NOT EXISTS teams (
    team_id     INTEGER PRIMARY KEY AUTOINCREMENT,
    team_name   TEXT UNIQUE NOT NULL,
    external_id TEXT NOT NULL
);
""")

# ⬇️ 4.  Insert, ignoring duplicates if we rerun the cell
cur.executemany(
    "INSERT OR IGNORE INTO teams (team_name, external_id) VALUES (?, ?);",
    df_teams.to_records(index=False)
)
conn.commit()

# ⬇️ 5.  Quick sanity check
print("✅  Loaded teams into", db_path)
print(pd.read_sql_query("SELECT * FROM teams ORDER BY team_id;", conn))

conn.close()



✅  Loaded teams into data/processed/nwsldata.db
    team_id               team_name external_id
0         1           Orlando Pride    2a6178ac
1         2             Utah Royals    d4c130bc
2         3  North Carolina Courage    85c458aa
3         4            Houston Dash    e813709a
4         5       San Diego Wave FC    bf961da0
5         6                  Bay FC    231a532f
6         7               Gotham FC    8e306dc6
7         8        Chicago Stars FC    d976a235
8         9       Racing Louisville    da19ebd1
9        10     Kansas City Current    6f666306
10       11      Portland Thorns FC    df9a10a1
11       12       Washington Spirit    e442aad0
12       13        Seattle Reign FC    257fad2b
13       14           Angel City FC    ae38d267


In [7]:
# ╔══════════════╗
# ║  TEAM SEEDS  ║
# ╚══════════════╝
import sqlite3, re, pandas as pd, pathlib

# ── 1) raw list → tidy DataFrame ───────────────────────────────────────────────
teams_txt = """
Orlando Pride               2a6178ac
Utah Royals                 d4c130bc
North Carolina Courage      85c458aa
Houston Dash                e813709a
San Diego Wave FC           bf961da0
Bay FC                      231a532f
Gotham FC                   8e306dc6
Chicago Stars FC            d976a235
Racing Louisville           da19ebd1
Kansas City Current         6f666306
Portland Thorns FC          df9a10a1
Washington Spirit           e442aad0
Seattle Reign FC            257fad2b
Angel City FC               ae38d267
"""

# split on 2 + spaces so embedded single spaces in names are preserved
records = [re.split(r'\s{2,}', line.strip()) for line in teams_txt.strip().splitlines()]
df_teams = pd.DataFrame(records, columns=["team_name", "team_id"])
df_teams["team_id"] = df_teams["team_id"].str.strip()       # just in case

# ── 2) (Re)create teams table with the exact ids you want ─────────────────────
db_path = pathlib.Path("data/processed/nwsldata.db")   # adjust if yours lives elsewhere
conn    = sqlite3.connect(db_path)

with conn:
    conn.execute("DROP TABLE IF EXISTS teams;")
    conn.execute("""
        CREATE TABLE teams (
            team_id   TEXT PRIMARY KEY,
            team_name TEXT NOT NULL
        );
    """)
    df_teams.to_sql("teams", conn, if_exists="append", index=False)

# ── 3) quick sanity-check ─────────────────────────────────────────────────────
print("✅ teams seeded\n")
print(pd.read_sql("SELECT * FROM teams ORDER BY team_name;", conn))

conn.close()


✅ teams seeded

     team_id               team_name
0   ae38d267           Angel City FC
1   231a532f                  Bay FC
2   d976a235        Chicago Stars FC
3   8e306dc6               Gotham FC
4   e813709a            Houston Dash
5   6f666306     Kansas City Current
6   85c458aa  North Carolina Courage
7   2a6178ac           Orlando Pride
8   df9a10a1      Portland Thorns FC
9   da19ebd1       Racing Louisville
10  bf961da0       San Diego Wave FC
11  257fad2b        Seattle Reign FC
12  d4c130bc             Utah Royals
13  e442aad0       Washington Spirit


In [8]:
import sqlite3, pandas as pd, os

# --- config -----------------------------------------------------------------
db_path = os.path.expanduser("data/processed/nwsldata.db")   # adjust if yours lives elsewhere
# ---------------------------------------------------------------------------

conn = sqlite3.connect(db_path)
cur  = conn.cursor()

# 1 ▪️ add the new column only if it doesn’t already exist
existing_cols = [col[1] for col in cur.execute("PRAGMA table_info(Team);")]
if "team_name_short" not in existing_cols:
    cur.execute("ALTER TABLE Team ADD COLUMN team_name_short TEXT;")

# 2 ▪️ map full names → short labels (your screenshot order)
abbr = {
    "Kansas City Current"     : "Current",
    "Orlando Pride"           : "Pride",
    "San Diego Wave FC"       : "Wave",
    "Washington Spirit"       : "Spirit",
    "Portland Thorns FC"      : "Thorns",
    "Seattle Reign FC"        : "Reign",
    "Racing Louisville"       : "Louisville",
    "Gotham FC"               : "Gotham FC",
    "North Carolina Courage"  : "Courage",
    "Bay FC"                  : "Bay FC",
    "Angel City FC"           : "Angel City",
    "Houston Dash"            : "Dash",
    "Chicago Stars FC"        : "Chicago Stars",
    "Utah Royals"             : "Royals",
}

# 3 ▪️ update rows
for full, short in abbr.items():
    cur.execute(
        "UPDATE Team SET team_name_short = ? WHERE team_name = ?;",
        (short, full)
    )

conn.commit()

# 4 ▪️ quick sanity-check
df = pd.read_sql_query("SELECT team_id, team_name, team_name_short FROM Team ORDER BY team_id;", conn)
display(df)

conn.close()


Unnamed: 0,team_id,team_name,team_name_short
0,1,Chicago Stars,
1,2,Seattle Reign FC,Reign


In [10]:
import sqlite3
import pandas as pd
from pathlib import Path

# --- path to your DB ---------------------------------------------------------
db_path = Path("data/processed/nwsldata.db")   # adjust if you moved it
# ------------------------------------------------------------------------------

with sqlite3.connect(db_path) as conn:
    # 1) Show column definitions for sanity-check
    coldef = pd.read_sql_query("PRAGMA table_info(teams);", conn)
    print("=== Column layout for teams ===")
    display(coldef)            # in Jupyter this renders nicely

    # 2) Pull the actual data
    teams = pd.read_sql_query(
        "SELECT * FROM teams ORDER BY team_id;",
        conn
    )

print("\n=== teams table ===")
display(teams)                 # or print(teams.to_string(index=False)) if you prefer plain text


=== Column layout for teams ===


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,team_id,TEXT,0,,1
1,1,team_name,TEXT,1,,0



=== teams table ===


Unnamed: 0,team_id,team_name
0,231a532f,Bay FC
1,257fad2b,Seattle Reign FC
2,2a6178ac,Orlando Pride
3,6f666306,Kansas City Current
4,85c458aa,North Carolina Courage
5,8e306dc6,Gotham FC
6,ae38d267,Angel City FC
7,bf961da0,San Diego Wave FC
8,d4c130bc,Utah Royals
9,d976a235,Chicago Stars FC


In [11]:
import sqlite3, pandas as pd
from pathlib import Path

# --- path to your DB ---------------------------------------------------------
db_path = Path("data/processed/nwsldata.db")   # adjust if you keep it elsewhere

# --- mapping: full name  ➜  short label -------------------------------------
short_map = {
    "Orlando Pride"           : "Pride",
    "Utah Royals"             : "Royals",
    "North Carolina Courage"  : "Courage",
    "Houston Dash"            : "Dash",
    "San Diego Wave FC"       : "Wave",
    "Bay FC"                  : "Bay FC",
    "Gotham FC"               : "Gotham FC",
    "Chicago Stars FC"        : "Chicago Stars",
    "Racing Louisville"       : "Louisville",
    "Kansas City Current"     : "Current",
    "Portland Thorns FC"      : "Thorns",
    "Washington Spirit"       : "Spirit",
    "Seattle Reign FC"        : "Reign",
    "Angel City FC"           : "Angel City",
}

with sqlite3.connect(db_path) as conn:
    cur = conn.cursor()

    # 1️⃣ add the column once (skip if it already exists)
    cur.execute("PRAGMA table_info(teams);")
    cols = {row[1] for row in cur.fetchall()}
    if "team_name_short" not in cols:
        cur.execute("ALTER TABLE teams ADD COLUMN team_name_short TEXT;")

    # 2️⃣ populate / update
    for full, short in short_map.items():
        cur.execute(
            "UPDATE teams SET team_name_short = ? WHERE team_name = ?;",
            (short, full),
        )

    conn.commit()

    # 3️⃣ quick check
    df = pd.read_sql_query(
        "SELECT team_id, team_name, team_name_short FROM teams ORDER BY team_id;",
        conn,
    )

df


Unnamed: 0,team_id,team_name,team_name_short
0,231a532f,Bay FC,Bay FC
1,257fad2b,Seattle Reign FC,Reign
2,2a6178ac,Orlando Pride,Pride
3,6f666306,Kansas City Current,Current
4,85c458aa,North Carolina Courage,Courage
5,8e306dc6,Gotham FC,Gotham FC
6,ae38d267,Angel City FC,Angel City
7,bf961da0,San Diego Wave FC,Wave
8,d4c130bc,Utah Royals,Royals
9,d976a235,Chicago Stars FC,Chicago Stars


In [12]:
import sqlite3, pandas as pd
from pathlib import Path

# --- path to your database ---
db_path = Path("data/processed/nwsldata.db")

# --- master list of team metadata ---
teams = [
    # (team_id,           team_name,              short,            alias_1,            alias_2)
    ("231a532f", "Bay FC",                   "Bay FC",         None,              None),
    ("257fad2b", "Seattle Reign FC",         "Reign",          "OL Reign",        "Reign FC"),
    ("2a6178ac", "Orlando Pride",            "Pride",          None,              None),
    ("6f666306", "Kansas City Current",      "Current",        "Kansas City",     None),
    ("85c458aa", "North Carolina Courage",   "Courage",        None,              None),
    ("8e306dc6", "Gotham FC",                "Gotham FC",      "Sky Blue FC",     None),
    ("d976a235", "Chicago Stars FC",         "Chicago Stars",  "Chicago Red Stars","Red Stars"),
    ("da19ebd1", "Racing Louisville",        "Louisville",     None,              None),
    ("df9a10a1", "Portland Thorns FC",       "Thorns",         None,              None),
    ("e442aad0", "Washington Spirit",        "Spirit",         None,              None),
    ("e813709a", "Houston Dash",             "Dash",           None,              None),
    ("d4c130bc", "Utah Royals",              "Royals",         "Utah Royals FC",  None),
    ("bf961da0", "San Diego Wave FC",        "Wave",           None,              None),
    ("ae38d267", "Angel City FC",            "Angel City",     None,              None),
    ("ab757728", "Boston Breakers",          "Breakers",       None,              None),
    ("5f911568", "Western New York Flash",   "WNY Flash",      "Flash",           None),
]

with sqlite3.connect(db_path) as conn:
    cur = conn.cursor()

    # ------------------------------------------------------------------
    # 1) add the new columns if they don't already exist
    # ------------------------------------------------------------------
    cur.execute("PRAGMA table_info(teams);")
    existing_cols = {row[1] for row in cur.fetchall()}

    for col in ("team_name_short", "team_name_alias_1", "team_name_alias_2"):
        if col not in existing_cols:
            cur.execute(f"ALTER TABLE teams ADD COLUMN {col} TEXT;")

    # ------------------------------------------------------------------
    # 2) insert any missing teams
    # ------------------------------------------------------------------
    cur.executemany(
        "INSERT OR IGNORE INTO teams (team_id, team_name) VALUES (?, ?);",
        [(tid, name) for tid, name, *_ in teams],
    )

    # ------------------------------------------------------------------
    # 3) update short & alias fields
    # ------------------------------------------------------------------
    cur.executemany(
        """
        UPDATE teams
           SET team_name_short  = ?,
               team_name_alias_1 = ?,
               team_name_alias_2 = ?
         WHERE team_id = ?;
        """,
        [
            (short, alias1, alias2, tid)
            for tid, _name, short, alias1, alias2 in teams
        ],
    )

    conn.commit()

    # ------------------------------------------------------------------
    # 4) sanity-check -- show the result
    # ------------------------------------------------------------------
    df = pd.read_sql_query(
        "SELECT team_id, team_name, team_name_short, "
        "       team_name_alias_1, team_name_alias_2 "
        "  FROM teams ORDER BY team_id;",
        conn,
    )

df


Unnamed: 0,team_id,team_name,team_name_short,team_name_alias_1,team_name_alias_2
0,231a532f,Bay FC,Bay FC,,
1,257fad2b,Seattle Reign FC,Reign,OL Reign,Reign FC
2,2a6178ac,Orlando Pride,Pride,,
3,5f911568,Western New York Flash,WNY Flash,Flash,
4,6f666306,Kansas City Current,Current,Kansas City,
5,85c458aa,North Carolina Courage,Courage,,
6,8e306dc6,Gotham FC,Gotham FC,Sky Blue FC,
7,ab757728,Boston Breakers,Breakers,,
8,ae38d267,Angel City FC,Angel City,,
9,bf961da0,San Diego Wave FC,Wave,,


In [18]:
# ══════════════════════════════════════════════════════════════════════════════
#  EXTRACT PLAYER DATA from NWSL Player Stats Page (from commented HTML)
# ══════════════════════════════════════════════════════════════════════════════

from pathlib import Path
from bs4 import BeautifulSoup, Comment
import pandas as pd
import sqlite3
import re

# 👉 Path to your player stats HTML file
html_path = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_player_pages/NWSL Player Stats _ FBref.com.html")

# 1) Load HTML source
html_content = html_path.read_text(encoding="utf-8")

# 2) The player table is inside HTML comments! Let's extract it
# Find the commented section containing the player stats table
comment_pattern = r'<!--\s*.*?<table[^>]*id="stats_standard".*?</table>.*?-->'
comment_match = re.search(comment_pattern, html_content, re.DOTALL)

if comment_match:
    # Extract the HTML content from inside the comment
    commented_html = comment_match.group(0)
    # Remove the comment markers
    table_html = commented_html.replace('<!--', '').replace('-->', '')

    # Parse the extracted HTML
    soup = BeautifulSoup(table_html, 'lxml')
    player_table = soup.find("table", id="stats_standard")

    if player_table:
        # 3) Convert to DataFrame
        df_players = pd.read_html(str(player_table))[0]

        # 4) Clean up column names (FBref often has multi-level headers)
        if isinstance(df_players.columns, pd.MultiIndex):
            # Flatten multi-level columns
            df_players.columns = ['_'.join(str(col).strip() for col in column if
str(col) != 'nan') for column in df_players.columns.values]

        # Clean up column names
        df_players.columns = [col.replace('Unnamed: ', '').strip('_') for col in
df_players.columns]

        print(f"✅ Found {len(df_players)} players")
        print(f"📊 Columns ({len(df_players.columns)}): {list(df_players.columns)}")
        print("\n--- Sample Data ---")
        display(df_players.head(10))

        # 5) Show key player info columns
        print(f"\n--- Key Player Data Available ---")
        key_cols = ['Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born']
        for col in key_cols:
            if col in df_players.columns:
                sample_data = df_players[col].dropna().head(5).tolist()
                print(f"{col}: {sample_data}")

        # 6) Show data types and missing values
        print(f"\n--- Data Quality Check ---")
        print(f"Total rows: {len(df_players)}")
        print(f"Missing data by column:")
        missing_data = df_players.isnull().sum()
        for col, missing in missing_data.items():
            if missing > 0:
                print(f"  {col}: {missing} missing ({missing/len(df_players)*100:.1f}%)")

    else:
        print("❌ Could not find player stats table in commented HTML")

else:
    print("❌ Could not find commented player stats section")
    print("Let's check what HTML comments exist...")
    soup = BeautifulSoup(html_content, 'lxml')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    print(f"Found {len(comments)} HTML comments")
    for i, comment in enumerate(comments[:5]):  # Show first 5
        preview = str(comment)[:100].replace('\n', ' ')
        print(f"  Comment {i}: {preview}...")

✅ Found 322 players
📊 Columns (37): ['0_level_0_Rk', '1_level_0_Player', '2_level_0_Nation', '3_level_0_Pos', '4_level_0_Squad', '5_level_0_Age', '6_level_0_Born', 'Playing Time_MP', 'Playing Time_Starts', 'Playing Time_Min', 'Playing Time_90s', 'Performance_Gls', 'Performance_Ast', 'Performance_G+A', 'Performance_G-PK', 'Performance_PK', 'Performance_PKatt', 'Performance_CrdY', 'Performance_CrdR', 'Expected_xG', 'Expected_npxG', 'Expected_xAG', 'Expected_npxG+xAG', 'Progression_PrgC', 'Progression_PrgP', 'Progression_PrgR', 'Per 90 Minutes_Gls', 'Per 90 Minutes_Ast', 'Per 90 Minutes_G+A', 'Per 90 Minutes_G-PK', 'Per 90 Minutes_G+A-PK', 'Per 90 Minutes_xG', 'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG', 'Per 90 Minutes_npxG', 'Per 90 Minutes_npxG+xAG', '36_level_0_Matches']

--- Sample Data ---


  df_players = pd.read_html(str(player_table))[0]


Unnamed: 0,0_level_0_Rk,1_level_0_Player,2_level_0_Nation,3_level_0_Pos,4_level_0_Squad,5_level_0_Age,6_level_0_Born,Playing Time_MP,Playing Time_Starts,Playing Time_Min,...,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG,36_level_0_Matches
0,1,Kerry Abello,us USA,DF,Pride,25-309,1999,13,10,923,...,0.0,0.0,0.0,0.0,0.03,0.05,0.08,0.03,0.08,Matches
1,2,Emeri Adames,us USA,"FW,MF",Reign,19-111,2006,10,4,420,...,0.0,0.86,0.64,0.64,0.53,0.11,0.64,0.37,0.49,Matches
2,3,Marie-Yasmine Alidou,ca CAN,"MF,FW",Thorns,30-086,1995,10,4,405,...,0.22,0.44,0.22,0.44,0.19,0.06,0.25,0.19,0.25,Matches
3,4,Aline,br BRA,"FW,MF",Courage,20-016,2005,6,1,118,...,0.0,0.76,0.76,0.76,0.13,0.17,0.3,0.13,0.3,Matches
4,5,Emmie Allen,us USA,GK,Bay FC,22-271,2002,2,2,180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
5,6,Michelle Alozie,ng NGA,"DF,MF",Dash,28-086,1997,10,1,293,...,0.0,0.0,0.0,0.0,0.02,0.03,0.05,0.02,0.05,Matches
6,7,Angelina Anderson,us USA,GK,Angel City,24-123,2001,13,13,1170,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
7,8,Hannah Anderson,us USA,DF,Chicago Stars,24-111,2001,8,3,371,...,0.0,0.0,0.0,0.0,0.09,0.0,0.1,0.09,0.1,Matches
8,9,Joelle Anderson,us USA,DF,Bay FC,26-290,1998,11,8,727,...,0.25,0.37,0.12,0.37,0.06,0.02,0.08,0.06,0.08,Matches
9,10,Angelina,br BRA,MF,Pride,25-178,2000,12,10,890,...,0.1,0.1,0.0,0.1,0.12,0.22,0.34,0.12,0.34,Matches



--- Key Player Data Available ---

--- Data Quality Check ---
Total rows: 322
Missing data by column:
  2_level_0_Nation: 7 missing (2.2%)
  5_level_0_Age: 7 missing (2.2%)
  6_level_0_Born: 7 missing (2.2%)


In [24]:
# ═══════════════════════════════════════════════════════════════════════
#  CREATE PLAYER TABLE - Pure Dimension Table (No Squad Column)
# ═══════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re
from pathlib import Path

# Connect to your database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Drop existing Player table if it exists (to recreate with proper schema)
cur.execute("DROP TABLE IF EXISTS Player;")

# 2) Create Player table with ONLY unchanging attributes
cur.execute("""
    CREATE TABLE Player (
    player_id    TEXT PRIMARY KEY,
    player_name  TEXT NOT NULL,
    nationality  TEXT,
    pos          TEXT,
    dob          DATE,
    preferred_foot TEXT 
);
""")

print("✅ Created Player table with clean schema (no Squad column)")

# 3) Parse the commented HTML to extract player data
html_content = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_player_pages/NWSL Player Stats _ FBref.com.html").read_text(encoding="utf-8")

comment_pattern = r'<!--\s*.*?<table[^>]*id="stats_standard".*?</table>.*?-->'
comment_match = re.search(comment_pattern, html_content, re.DOTALL)

if comment_match:
    commented_html = comment_match.group(0)
    table_html = commented_html.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(table_html, 'lxml')

    # 4) Extract ONLY the core player dimensional data
    players_data = []

    # Find all table rows with player data
    rows = soup.find_all('tr')

    for row in rows:
        # Look for the player cell with data-append-csv attribute (contains player ID)
        player_cell = row.find('td', {'data-stat': 'player'})
        if player_cell:
            # Extract player ID from data-append-csv
            player_id = player_cell.get('data-append-csv')

            # Extract player name from the link text
            player_link = player_cell.find('a')
            player_name = player_link.get_text(strip=True) if player_link else None

            # Extract nationality
            nationality_cell = row.find('td', {'data-stat': 'nationality'})
            nationality = None
            if nationality_cell:
                # Extract country code (e.g., "USA" from "us USA")
                nat_text = nationality_cell.get_text(strip=True)
                # Get the 2-3 letter country code at the end
                nat_match = re.search(r'[A-Z]{2,3}$', nat_text)
                nationality = nat_match.group() if nat_match else nat_text

            # Extract position
            position_cell = row.find('td', {'data-stat': 'position'})
            position = position_cell.get_text(strip=True) if position_cell else None

            # Extract birth year and convert to DOB
            birth_year_cell = row.find('td', {'data-stat': 'birth_year'})
            birth_year = birth_year_cell.get_text(strip=True) if birth_year_cell else None
            dob = f"{birth_year}-01-01" if birth_year and birth_year.isdigit() else None

            # Only add if we have both player_id and player_name
            if player_id and player_name:
                players_data.append({
                    'player_id': player_id,
                    'player_name': player_name,
                    'nationality': nationality,
                    'pos': position,
                    'dob': dob,
                    'preferred_foot': None  # Not available in this dataset
                })

    # 5) Convert to DataFrame and display sample
    df_players = pd.DataFrame(players_data)

    print(f"✅ Extracted {len(df_players)} players with core dimensions only")
    print(f"\n--- Sample Player Data (No Squad) ---")
    display(df_players.head(10))

    # 6) Insert into database
    print(f"📝 Inserting {len(df_players)} players into clean dimension table...")

    for _, row in df_players.iterrows():
        cur.execute("""
            INSERT OR REPLACE INTO Player (player_id, player_name, nationality, pos, dob, preferred_foot)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (
            row['player_id'],
            row['player_name'],
            row['nationality'],
            row['pos'],
            row['dob'],
            row['preferred_foot']
        ))

    conn.commit()
    print(f"✅ Successfully inserted all players")

    # 7) Verification queries
    print(f"\n--- Player Table Summary ---")
    summary = pd.read_sql_query("""
        SELECT 
            COUNT(*) as total_players,
            COUNT(DISTINCT nationality) as unique_countries,
            COUNT(DISTINCT pos) as unique_positions,
            COUNT(dob) as players_with_dob,
            COUNT(preferred_foot) as players_with_foot_data
        FROM Player
    """, conn)
    display(summary)

    print(f"\n--- Sample Players from Database ---")
    sample = pd.read_sql_query("""
        SELECT player_id, player_name, nationality, pos, dob
        FROM Player 
        ORDER BY player_name
        LIMIT 10
    """, conn)
    display(sample)

    # 8) Verify Kerry Abello specifically
    print(f"\n--- Verify Kerry Abello (ID: 45419c74) ---")
    kerry = pd.read_sql_query("""
        SELECT * FROM Player WHERE player_id = '45419c74'
    """, conn)
    display(kerry)

    # 9) Show some interesting breakdowns
    print(f"\n--- Players by Country ---")
    by_country = pd.read_sql_query("""
        SELECT nationality, COUNT(*) as player_count
        FROM Player 
        WHERE nationality IS NOT NULL
        GROUP BY nationality
        ORDER BY player_count DESC
        LIMIT 10
    """, conn)
    display(by_country)

    print(f"\n--- Players by Position ---")
    by_position = pd.read_sql_query("""
        SELECT pos, COUNT(*) as player_count
        FROM Player 
        WHERE pos IS NOT NULL
        GROUP BY pos
        ORDER BY player_count DESC
    """, conn)
    display(by_position)

else:
    print("❌ Could not find commented player stats section")

conn.close()
print(f"\n🎉 Player dimension table created successfully!")
print(f"   - Contains only unchanging player attributes")
print(f"   - Squad/team relationships will come from match data")
print(f"   - Ready for your relational model!")

✅ Created Player table with clean schema (no Squad column)
✅ Extracted 310 players with core dimensions only

--- Sample Player Data (No Squad) ---


Unnamed: 0,player_id,player_name,nationality,pos,dob,preferred_foot
0,45419c74,Kerry Abello,USA,DF,1999-01-01,
1,e64b3c35,Emeri Adames,USA,"FW,MF",2006-01-01,
2,a4c093fc,Marie-Yasmine Alidou,CAN,"MF,FW",1995-01-01,
3,51e267dc,Aline,BRA,"FW,MF",2005-01-01,
4,f3e2324d,Emmie Allen,USA,GK,2002-01-01,
5,b03e80f6,Michelle Alozie,NGA,"DF,MF",1997-01-01,
6,06115ef0,Angelina Anderson,USA,GK,2001-01-01,
7,53cef818,Hannah Anderson,USA,DF,2001-01-01,
8,edb4b4b1,Joelle Anderson,USA,DF,1998-01-01,
9,49d6d4a8,Angelina,BRA,MF,2000-01-01,


📝 Inserting 310 players into clean dimension table...
✅ Successfully inserted all players

--- Player Table Summary ---


Unnamed: 0,total_players,unique_countries,unique_positions,players_with_dob,players_with_foot_data
0,309,37,10,302,0



--- Sample Players from Database ---


Unnamed: 0,player_id,player_name,nationality,pos,dob
0,52b58405,Abby Dahlkemper,USA,DF,1993-01-01
1,947d22fb,Abby Smith,USA,GK,1993-01-01
2,15c7bfb7,Adriana Leon,CAN,"FW,MF",1992-01-01
3,220b9b36,Ainsley Mccammon,USA,MF,2007-01-01
4,abf96cef,Aisha Solórzano,GUA,FW,1998-01-01
5,7ca7c385,Alana Cook,USA,DF,1997-01-01
6,8252bc62,Alanna Kennedy,AUS,MF,1995-01-01
7,ea429c45,Alex Loera,USA,MF,1999-01-01
8,0567dc96,Alexa Spaanstra,USA,"FW,MF",2000-01-01
9,51e267dc,Aline,BRA,"FW,MF",2005-01-01



--- Verify Kerry Abello (ID: 45419c74) ---


Unnamed: 0,player_id,player_name,nationality,pos,dob,preferred_foot
0,45419c74,Kerry Abello,USA,DF,1999-01-01,



--- Players by Country ---


Unnamed: 0,nationality,player_count
0,USA,195
1,BRA,14
2,CAN,13
3,JPN,7
4,,7
5,NGA,6
6,ESP,6
7,FRA,5
8,ZAM,4
9,MEX,4



--- Players by Position ---


Unnamed: 0,pos,player_count
0,DF,78
1,MF,69
2,"FW,MF",54
3,"MF,FW",30
4,FW,26
5,GK,22
6,"DF,MF",14
7,"DF,FW",8
8,"MF,DF",6
9,"FW,DF",2



🎉 Player dimension table created successfully!
   - Contains only unchanging player attributes
   - Squad/team relationships will come from match data
   - Ready for your relational model!


In [26]:
# ══════════════════════════════════════════════════════════════════════════════
#  CREATE PLAYER TABLE - Pure Biographical Data (No Position Column)
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re
from pathlib import Path

# Connect to your database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Drop existing Player table if it exists (to recreate with proper schema)
cur.execute("DROP TABLE IF EXISTS Player;")

# 2) Create Player table with ONLY unchanging biographical attributes
cur.execute("""
CREATE TABLE Player (
    player_id    TEXT PRIMARY KEY, 
    player_name  TEXT NOT NULL,
    nationality  TEXT,
    dob          DATE,
    preferred_foot TEXT            
);
""")

print("✅ Created Player table - pure biographical data (no position column)")

# 3) Parse the commented HTML to extract player data
html_content = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_player_pages/NWSL Player Stats _ FBref.com.html").read_text(encoding="utf-8")

comment_pattern = r'<!--\s*.*?<table[^>]*id="stats_standard".*?</table>.*?-->'
comment_match = re.search(comment_pattern, html_content, re.DOTALL)

if comment_match:
    commented_html = comment_match.group(0)
    table_html = commented_html.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(table_html, 'lxml')

    # 4) Extract ONLY the core biographical data (no position)
    players_data = []

    # Find all table rows with player data
    rows = soup.find_all('tr')

    for row in rows:
        # Look for the player cell with data-append-csv attribute (contains player ID)
        player_cell = row.find('td', {'data-stat': 'player'})
        if player_cell:
            # Extract player ID from data-append-csv
            player_id = player_cell.get('data-append-csv')

            # Extract player name from the link text
            player_link = player_cell.find('a')
            player_name = player_link.get_text(strip=True) if player_link else None

            # Extract nationality
            nationality_cell = row.find('td', {'data-stat': 'nationality'})
            nationality = None
            if nationality_cell:
                # Extract country code (e.g., "USA" from "us USA")
                nat_text = nationality_cell.get_text(strip=True)
                # Get the 2-3 letter country code at the end
                nat_match = re.search(r'[A-Z]{2,3}$', nat_text)
                nationality = nat_match.group() if nat_match else nat_text

            # Extract birth year and convert to DOB
            birth_year_cell = row.find('td', {'data-stat': 'birth_year'})
            birth_year = birth_year_cell.get_text(strip=True) if birth_year_cell else None
            dob = f"{birth_year}-01-01" if birth_year and birth_year.isdigit() else None

            # Only add if we have both player_id and player_name
            if player_id and player_name:
                players_data.append({
                    'player_id': player_id,
                    'player_name': player_name,
                    'nationality': nationality,
                    'dob': dob,
                    'preferred_foot': None  # Not available in this dataset
                })

    # 5) Convert to DataFrame and display sample
    df_players = pd.DataFrame(players_data)

    print(f"✅ Extracted {len(df_players)} players with biographical data only")
    print(f"\n--- Sample Player Data (Biographical Only) ---")
    display(df_players.head(10))

    # 6) Insert into database
    print(f"📝 Inserting {len(df_players)} players into biographical dimension table...")

    for _, row in df_players.iterrows():
        cur.execute("""
            INSERT OR REPLACE INTO Player (player_id, player_name, nationality, dob, preferred_foot)
            VALUES (?, ?, ?, ?, ?)
        """, (
            row['player_id'],
            row['player_name'],
            row['nationality'],
            row['dob'],
            row['preferred_foot']
        ))

    conn.commit()
    print(f"✅ Successfully inserted all players")

    # 7) Verification queries
    print(f"\n--- Player Table Summary ---")
    summary = pd.read_sql_query("""
        SELECT 
            COUNT(*) as total_players,
            COUNT(DISTINCT nationality) as unique_countries,
            COUNT(dob) as players_with_dob,
            COUNT(preferred_foot) as players_with_foot_data
        FROM Player
    """, conn)
    display(summary)

    print(f"\n--- Sample Players from Database ---")
    sample = pd.read_sql_query("""
        SELECT player_id, player_name, nationality, dob
        FROM Player 
        ORDER BY player_name
        LIMIT 10
    """, conn)
    display(sample)

    # 8) Verify Kerry Abello specifically
    print(f"\n--- Verify Kerry Abello (ID: 45419c74) ---")
    kerry = pd.read_sql_query("""
        SELECT * FROM Player WHERE player_id = '45419c74'
    """, conn)
    display(kerry)

    # 9) Show nationality breakdown
    print(f"\n--- Players by Country ---")
    by_country = pd.read_sql_query("""
        SELECT nationality, COUNT(*) as player_count
        FROM Player 
        WHERE nationality IS NOT NULL
        GROUP BY nationality
        ORDER BY player_count DESC
        LIMIT 10
    """, conn)
    display(by_country)

else:
    print("❌ Could not find commented player stats section")

conn.close()
print(f"\n🎉 Clean biographical Player table created!")
print(f"   - Contains only unchanging attributes")
print(f"   - Position data will come from match lineups")
print(f"   - Ready for proper relational model!")

✅ Created Player table - pure biographical data (no position column)
✅ Extracted 310 players with biographical data only

--- Sample Player Data (Biographical Only) ---


Unnamed: 0,player_id,player_name,nationality,dob,preferred_foot
0,45419c74,Kerry Abello,USA,1999-01-01,
1,e64b3c35,Emeri Adames,USA,2006-01-01,
2,a4c093fc,Marie-Yasmine Alidou,CAN,1995-01-01,
3,51e267dc,Aline,BRA,2005-01-01,
4,f3e2324d,Emmie Allen,USA,2002-01-01,
5,b03e80f6,Michelle Alozie,NGA,1997-01-01,
6,06115ef0,Angelina Anderson,USA,2001-01-01,
7,53cef818,Hannah Anderson,USA,2001-01-01,
8,edb4b4b1,Joelle Anderson,USA,1998-01-01,
9,49d6d4a8,Angelina,BRA,2000-01-01,


📝 Inserting 310 players into biographical dimension table...
✅ Successfully inserted all players

--- Player Table Summary ---


Unnamed: 0,total_players,unique_countries,players_with_dob,players_with_foot_data
0,309,37,302,0



--- Sample Players from Database ---


Unnamed: 0,player_id,player_name,nationality,dob
0,52b58405,Abby Dahlkemper,USA,1993-01-01
1,947d22fb,Abby Smith,USA,1993-01-01
2,15c7bfb7,Adriana Leon,CAN,1992-01-01
3,220b9b36,Ainsley Mccammon,USA,2007-01-01
4,abf96cef,Aisha Solórzano,GUA,1998-01-01
5,7ca7c385,Alana Cook,USA,1997-01-01
6,8252bc62,Alanna Kennedy,AUS,1995-01-01
7,ea429c45,Alex Loera,USA,1999-01-01
8,0567dc96,Alexa Spaanstra,USA,2000-01-01
9,51e267dc,Aline,BRA,2005-01-01



--- Verify Kerry Abello (ID: 45419c74) ---


Unnamed: 0,player_id,player_name,nationality,dob,preferred_foot
0,45419c74,Kerry Abello,USA,1999-01-01,



--- Players by Country ---


Unnamed: 0,nationality,player_count
0,USA,195
1,BRA,14
2,CAN,13
3,JPN,7
4,,7
5,NGA,6
6,ESP,6
7,FRA,5
8,ZAM,4
9,MEX,4



🎉 Clean biographical Player table created!
   - Contains only unchanging attributes
   - Position data will come from match lineups
   - Ready for proper relational model!


In [27]:
# ══════════════════════════════════════════════════════════════════════════════
#  EXPORT PLAYER DATA TO CSV for Manual DOB Research
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# Extract all players with their current "age" data from the original HTML
html_content = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_player_pages/NWSL Player Stats _ FBref.com.html").read_text(encoding="utf-8")

# Parse to get age data
from bs4 import BeautifulSoup
import re

comment_pattern = r'<!--\s*.*?<table[^>]*id="stats_standard".*?</table>.*?-->'
comment_match = re.search(comment_pattern, html_content, re.DOTALL)

players_export = []

if comment_match:
    commented_html = comment_match.group(0)
    table_html = commented_html.replace('<!--', '').replace('-->', '')
    soup = BeautifulSoup(table_html, 'lxml')

    rows = soup.find_all('tr')

    for row in rows:
        player_cell = row.find('td', {'data-stat': 'player'})
        if player_cell:
            # Extract player ID
            player_id = player_cell.get('data-append-csv')

            # Extract player name
            player_link = player_cell.find('a')
            player_name = player_link.get_text(strip=True) if player_link else None

            # Extract age in YY-DDD format
            age_cell = row.find('td', {'data-stat': 'age'})
            age_formatted = age_cell.get_text(strip=True) if age_cell else None

            # Extract birth year
            birth_year_cell = row.find('td', {'data-stat': 'birth_year'})
            birth_year = birth_year_cell.get_text(strip=True) if birth_year_cell else None

            # Extract nationality
            nationality_cell = row.find('td', {'data-stat': 'nationality'})
            nationality = None
            if nationality_cell:
                nat_text = nationality_cell.get_text(strip=True)
                nat_match = re.search(r'[A-Z]{2,3}$', nat_text)
                nationality = nat_match.group() if nat_match else nat_text

            if player_id and player_name:
                players_export.append({
                    'player_id': player_id,
                    'player_name': player_name,
                    'nationality': nationality,
                    'birth_year': birth_year,
                    'age_formatted': age_formatted,  
                    'notes': '' 
                })

# Create DataFrame and export to CSV
df_export = pd.DataFrame(players_export)

# Sort by player name for easier manual lookup
df_export = df_export.sort_values('player_name')

# Export to project root
csv_path = "players_for_dob_research.csv"
df_export.to_csv(csv_path, index=False)

print(f"✅ Exported {len(df_export)} players to {csv_path}")
print(f"\n--- Sample of exported data ---")
print(df_export.head(10))

print(f"\n📋 CSV Columns:")
print(f"   - player_id: FBref hex ID")
print(f"   - player_name: Full name")
print(f"   - nationality: Country code")
print(f"   - birth_year: Year from FBref")
print(f"   - age_formatted: Current age (YY-DDD format)")
print(f"   - notes: Empty for your research notes")

print(f"\n🔍 Next steps:")
print(f"   1. Open {csv_path} in Excel/Google Sheets")
print(f"   2. Research actual DOBs (Wikipedia, team rosters, etc.)")
print(f"   3. Add DOB in YYYY-MM-DD format to a new 'dob' column")
print(f"   4. Return the updated CSV and we'll import it!")

✅ Exported 310 players to players_for_dob_research.csv

--- Sample of exported data ---
    player_id       player_name nationality birth_year age_formatted notes
67   52b58405   Abby Dahlkemper         USA       1993        32-071      
267  947d22fb        Abby Smith         USA       1993        31-292      
160  15c7bfb7      Adriana Leon         CAN       1992        32-294      
184  220b9b36  Ainsley Mccammon         USA       2007        17-341      
269  abf96cef   Aisha Solórzano         GUA       1998        27-101      
56   7ca7c385        Alana Cook         USA       1997        28-103      
145  8252bc62    Alanna Kennedy         AUS       1995        30-183      
163  ea429c45        Alex Loera         USA       1999        26-034      
272  0567dc96   Alexa Spaanstra         USA       2000        25-172      
3    51e267dc             Aline         BRA       2005        20-016      

📋 CSV Columns:
   - player_id: FBref hex ID
   - player_name: Full name
   - nationali

In [29]:
# ══════════════════════════════════════════════════════════════════════════════
#  SCRAPE INDIVIDUAL PLAYER PAGES for Real DOBs
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from pathlib import Path

# Connect to database to get our player list
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# Get all players from our database
players_df = pd.read_sql_query("""
    SELECT player_id, player_name 
    FROM Player 
    ORDER BY player_name
""", conn)

print(f"📋 Found {len(players_df)} players to scrape DOBs for")

# Function to convert player name to URL format
def name_to_url_format(name):
    """Convert 'Kerry Abello' to 'Kerry-Abello'"""
    return name.replace(' ', '-').replace("'", "")

# Function to scrape DOB from individual player page
def scrape_player_dob(player_id, player_name):
    """Scrape DOB from individual FBref player page"""

    # Build URL
    url_name = name_to_url_format(player_name)
    url = f"https://fbref.com/en/players/{player_id}/{url_name}"

    try:
        # Add headers to look like a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for DOB in various common locations on FBref player pages
        # Method 1: Look for "Born:" text
        born_text = soup.find(string=re.compile(r'Born:'))
        if born_text:
            # Extract date after "Born:"
            parent = born_text.parent
            if parent:
                text = parent.get_text()
                # Look for date pattern like "Born: May 15, 1999"
                date_match = re.search(r'Born:\s*([A-Za-z]+ \d{1,2}, \d{4})', text)
                if date_match:
                    return date_match.group(1)

        # Method 2: Look in meta tags
        dob_meta = soup.find('meta', {'name': 'description'})
        if dob_meta:
            content = dob_meta.get('content', '')
            date_match = re.search(r'born ([A-Za-z]+ \d{1,2}, \d{4})', content, re.IGNORECASE)
            if date_match:
                return date_match.group(1)

        # Method 3: Look for span with birth info
        for span in soup.find_all('span'):
            text = span.get_text()
            if 'born' in text.lower():
                date_match = re.search(r'([A-Za-z]+ \d{1,2}, \d{4})', text)
                if date_match:
                    return date_match.group(1)

        return None

    except Exception as e:
        print(f"❌ Error scraping {player_name} ({player_id}): {str(e)}")
        return None

# Test with Kerry Abello first
print("🧪 Testing with Kerry Abello...")
test_dob = scrape_player_dob('45419c74', 'Kerry Abello')
print(f"Kerry Abello DOB: {test_dob}")

if test_dob:
    print("✅ Test successful! Proceeding with batch scraping...")

    # Scrape all players (with rate limiting)
    dob_results = []

    for i, row in players_df.head(10).iterrows():  # Start with first 10 for testing
        player_id = row['player_id']
        player_name = row['player_name']

        print(f"🔍 Scraping {i+1}/{len(players_df)}: {player_name}")

        dob = scrape_player_dob(player_id, player_name)

        dob_results.append({
            'player_id': player_id,
            'player_name': player_name,
            'scraped_dob': dob,
            'status': 'success' if dob else 'failed'
        })

        # Rate limiting - be nice to FBref
        time.sleep(2)  # 2 second delay between requests

    # Save results
    results_df = pd.DataFrame(dob_results)
    results_df.to_csv('scraped_dobs_test.csv', index=False)

    print(f"\n📊 Scraping Results (First 10):")
    print(f"✅ Successful: {len(results_df[results_df['status'] == 'success'])}")
    print(f"❌ Failed: {len(results_df[results_df['status'] == 'failed'])}")

    print(f"\n--- Sample Results ---")
    print(results_df[['player_name', 'scraped_dob', 'status']])

else:
    print("❌ Test failed. Need to adjust scraping method.")

conn.close()

📋 Found 309 players to scrape DOBs for
🧪 Testing with Kerry Abello...
Kerry Abello DOB: None
❌ Test failed. Need to adjust scraping method.


In [31]:
# ════════════════════════════════════════════════════════════════════
#  DEBUG: Inspect Kerry Abello's Page Structure
# ════════════════════════════════════════════════════════════════════

import requests
from bs4 import BeautifulSoup
import re

def debug_player_page(player_id, player_name):
    """Debug what's actually on a player's page"""

    url_name = player_name.replace(' ', '-').replace("'", "")
    url = f"https://fbref.com/en/players/{player_id}/{url_name}"

    print(f"🔍 Fetching: {url}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        print(f"📡 Response status: {response.status_code}")

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Save the page content to inspect
            with open('kerry_abello_debug.html', 'w',
encoding='utf-8') as f:
                f.write(str(soup))
            print("💾 Saved page content to 'kerry_abello_debug.html'")

            # Look for any text containing "born" or dates
            print("\n🔍 Searching for birth-related text...")

            # Method 1: Look for any text containing "born"
            born_elements = soup.find_all(string=re.compile(r'born',
re.IGNORECASE))
            if born_elements:
                print(f"Found {len(born_elements)} elements with 'born':")
                for i, elem in enumerate(born_elements[:5]):  # Show first 5
                    print(f"  {i+1}: {elem.strip()}")

            # Method 2: Look for date patterns anywhere on page
            all_text = soup.get_text()
            date_patterns = re.findall(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b', all_text)
            if date_patterns:
                print(f"\n📅 Found date patterns:")
                for date in date_patterns[:5]:  # Show first 5
                    print(f"  {date}")

            # Method 3: Look in meta tags
            meta_desc = soup.find('meta', {'name': 'description'})
            if meta_desc:
                print(f"\n📝 Meta description: {meta_desc.get('content', '')}")

            # Method 4: Look for specific bio section
            bio_sections = soup.find_all(['div', 'p', 'span'], string=re.compile(r'born', re.IGNORECASE))
            if bio_sections:
                print(f"\n👤 Bio sections:")
                for section in bio_sections[:3]:
                    print(f"  {section.get_text().strip()}")

        else:
            print(f"❌ Failed to fetch page: HTTP {response.status_code}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

# Debug Kerry Abello's page
debug_player_page('45419c74', 'Kerry Abello')

🔍 Fetching: https://fbref.com/en/players/45419c74/Kerry-Abello
📡 Response status: 200
💾 Saved page content to 'kerry_abello_debug.html'

🔍 Searching for birth-related text...
Found 1 elements with 'born':
  1: Born:

📅 Found date patterns:
  September 17, 1999
  August 03, 2025
  June 13, 2018
  July 21, 2025
  July 22, 2025


In [33]:
# ══════════════════════════════════════════════════════════════════════════════
#  FULL DOB SCRAPING - All 309 Players
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from datetime import datetime
from tqdm import tqdm  # For progress bar

print("🚀 Starting full DOB scraping for all 309 players...")
print("⏱️  Estimated time: ~10-15 minutes with rate limiting")
print("💾 Results will be saved periodically in case of interruption")

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

players_df = pd.read_sql_query("""
    SELECT player_id, player_name 
    FROM Player 
    ORDER BY player_name
""", conn)

def scrape_player_dob_final(player_id, player_name):
    """Final DOB scraping function"""

    url_name = player_name.replace(' ', '-').replace("'", "")
    url = f"https://fbref.com/en/players/{player_id}/{url_name}"

    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return None, f"HTTP {response.status_code}"

        soup = BeautifulSoup(response.content, 'html.parser')

        # Check for "Born:" text
        born_text = soup.find(string=re.compile(r'Born:', re.IGNORECASE))
        if not born_text:
            return None, "No 'Born:' text found"

        # Extract date patterns
        all_text = soup.get_text()
        date_patterns = re.findall(
            r'\b((?:January|February|March|April|May|June|July|August|'
            r'September|October|November|December)\s+\d{1,2},\s+\d{4})\b',
            all_text
        )

        if date_patterns:
            for date_str in date_patterns:
                try:
                    date_obj = datetime.strptime(date_str, '%B %d, %Y')
                    if 1985 <= date_obj.year <= 2010:
                        return date_str, "success"
                except:
                    continue

        return None, "No valid birth date found"

    except Exception as e:
        return None, f"Error: {str(e)}"

# Initialize results list
all_results = []
checkpoint_interval = 50  # Save progress every 50 players

# Scrape all players with progress bar
for i, row in tqdm(players_df.iterrows(), total=len(players_df), desc="Scraping DOBs"):
    player_id = row['player_id']
    player_name = row['player_name']

    # Scrape DOB
    dob, status = scrape_player_dob_final(player_id, player_name)

    result = {
        'player_id': player_id,
        'player_name': player_name,
        'scraped_dob': dob,
        'status': status,
        'success': status == "success"
    }

    all_results.append(result)

    # Save checkpoint every 50 players
    if (i + 1) % checkpoint_interval == 0:
        checkpoint_df = pd.DataFrame(all_results)
        checkpoint_df.to_csv(f'dob_scraping_checkpoint_{i+1}.csv', index=False)

        success_so_far = len(checkpoint_df[checkpoint_df['success']])
        print(f"\n📊 Checkpoint {i+1}/{len(players_df)}: {success_so_far}/{i+1} successful "
              f"({success_so_far/(i+1)*100:.1f}%)")

    # Rate limiting - be respectful to FBref
    time.sleep(2)

# Final results
final_results_df = pd.DataFrame(all_results)

# Save final results
final_results_df.to_csv('player_dobs_final.csv', index=False)

# Analysis
total_players = len(final_results_df)
successful_scrapes = len(final_results_df[final_results_df['success']])
failed_scrapes = total_players - successful_scrapes

print(f"\n🎉 SCRAPING COMPLETE!")
print(f"📊 Final Results:")
print(f"   ✅ Successful: {successful_scrapes}/{total_players} "
      f"({successful_scrapes/total_players*100:.1f}%)")
print(f"   ❌ Failed: {failed_scrapes}/{total_players} "
      f"({failed_scrapes/total_players*100:.1f}%)")

# Show successful scrapes
successful_df = final_results_df[final_results_df['success']]
print(f"\n--- Sample Successful DOBs ---")
print(successful_df[['player_name', 'scraped_dob']].head(10))

# Show failure reasons
if failed_scrapes > 0:
    print(f"\n--- Failure Analysis ---")
    failure_reasons = final_results_df[~final_results_df['success']]['status'].value_counts()
    print(failure_reasons)

print(f"\n💾 Results saved to 'player_dobs_final.csv'")
print(f"🔄 Ready to import DOBs into database!")

conn.close()


🚀 Starting full DOB scraping for all 309 players...
⏱️  Estimated time: ~10-15 minutes with rate limiting
💾 Results will be saved periodically in case of interruption


Scraping DOBs:  16%|█▌        | 49/309 [01:48<09:02,  2.09s/it]


📊 Checkpoint 50/309: 20/50 successful (40.0%)


Scraping DOBs:  32%|███▏      | 99/309 [03:32<07:19,  2.09s/it]


📊 Checkpoint 100/309: 21/100 successful (21.0%)


Scraping DOBs:  48%|████▊     | 149/309 [05:17<05:34,  2.09s/it]


📊 Checkpoint 150/309: 21/150 successful (14.0%)


Scraping DOBs:  64%|██████▍   | 199/309 [07:01<03:49,  2.09s/it]


📊 Checkpoint 200/309: 21/200 successful (10.5%)


Scraping DOBs:  81%|████████  | 249/309 [08:46<02:05,  2.09s/it]


📊 Checkpoint 250/309: 21/250 successful (8.4%)


Scraping DOBs:  97%|█████████▋| 299/309 [10:31<00:20,  2.10s/it]


📊 Checkpoint 300/309: 21/300 successful (7.0%)


Scraping DOBs: 100%|██████████| 309/309 [10:51<00:00,  2.11s/it]


🎉 SCRAPING COMPLETE!
📊 Final Results:
   ✅ Successful: 21/309 (6.8%)
   ❌ Failed: 288/309 (93.2%)

--- Sample Successful DOBs ---
        player_name       scraped_dob
0   Abby Dahlkemper      May 13, 1993
1        Abby Smith   October 4, 1993
2      Adriana Leon   October 2, 1992
3  Ainsley Mccammon   August 16, 2007
4   Aisha Solórzano    April 13, 1998
5        Alana Cook    April 11, 1997
6    Alanna Kennedy  January 21, 1995
7        Alex Loera     June 19, 1999
8   Alexa Spaanstra  February 1, 2000
9             Aline      July 7, 2005

--- Failure Analysis ---
status
HTTP 429    281
HTTP 403      7
Name: count, dtype: int64

💾 Results saved to 'player_dobs_final.csv'
🔄 Ready to import DOBs into database!





In [34]:
# ══════════════════════════════════════════════════════════════════════════════
#  UPDATE PLAYER TABLE - Add DOB Column and Import Successful DOBs
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from datetime import datetime

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Check current Player table structure
print("📋 Current Player table structure:")
table_info = pd.read_sql_query("PRAGMA table_info(Player);", conn)
print(table_info)

# 2) Check sample DOBs from the Player table
cur.execute("""
    SELECT player_id, player_name, dob 
    FROM Player 
    WHERE player_id IN ('45419c74', '52b58405', '947d22fb') 
    LIMIT 3
""")
sample_current = cur.fetchall()
print(f"\n📊 Sample current DOBs:")
for row in sample_current:
    print(f"  {row[1]}: {row[2]}")

# 3) Load the scraped DOB results
scraped_dobs_df = pd.read_csv('player_dobs_final.csv')
successful_dobs = scraped_dobs_df[scraped_dobs_df['success'] == True].copy()

print(f"\n✅ Found {len(successful_dobs)} successful DOB scrapes to import")
print("\n--- Successful DOBs to Import ---")
print(successful_dobs[['player_name', 'scraped_dob']].head())

# 4) Convert scraped dates to proper YYYY-MM-DD format
def convert_dob_format(date_str):
    """Convert 'September 17, 1999' to '1999-09-17'"""
    try:
        date_obj = datetime.strptime(date_str, '%B %d, %Y')
        return date_obj.strftime('%Y-%m-%d')
    except:
        return None

successful_dobs['dob_formatted'] = successful_dobs['scraped_dob'].apply(convert_dob_format)

print("\n--- DOBs in Database Format ---")
print(successful_dobs[['player_name', 'scraped_dob', 'dob_formatted']].head())

# 5) Update the Player table with real DOBs
print(f"\n📝 Updating Player table with {len(successful_dobs)} real DOBs...")

updated_count = 0
for _, row in successful_dobs.iterrows():
    player_id = row['player_id']
    formatted_dob = row['dob_formatted']

    if formatted_dob:
        cur.execute("""
            UPDATE Player 
            SET dob = ? 
            WHERE player_id = ?
        """, (formatted_dob, player_id))
        updated_count += 1

conn.commit()
print(f"✅ Successfully updated {updated_count} player DOBs")

# 6) Verification - show the updated records
print(f"\n--- Verification: Updated Player DOBs ---")
verification_query = """
    SELECT player_id, player_name, dob
    FROM Player 
    WHERE player_id IN ({})
    ORDER BY player_name
""".format(','.join([f"'{pid}'" for pid in successful_dobs['player_id']]))

updated_players = pd.read_sql_query(verification_query, conn)
print(updated_players.head())

# 7) Calculate some derived data from the real DOBs
print(f"\n--- Age Analysis from Real DOBs ---")
today = datetime.now()

age_analysis = []
for _, row in updated_players.iterrows():
    if row['dob']:
        try:
            birth_date = datetime.strptime(row['dob'], '%Y-%m-%d')
            age_years = today.year - birth_date.year
            if (today.month, today.day) < (birth_date.month, birth_date.day):
                age_years -= 1
            age_analysis.append({
                'player_name': row['player_name'],
                'dob': row['dob'],
                'current_age': age_years,
                'birth_year': birth_date.year
            })
        except:
            continue

age_df = pd.DataFrame(age_analysis)
if not age_df.empty:
    print(f"\n📊 Age Statistics for Updated Players:")
    youngest = age_df.loc[age_df['current_age'].idxmin()]
    oldest = age_df.loc[age_df['current_age'].idxmax()]
    print(f"   Youngest: {youngest['current_age']} years ({youngest['player_name']})")
    print(f"   Oldest: {oldest['current_age']} years ({oldest['player_name']})")
    print(f"   Average age: {age_df['current_age'].mean():.1f} years")
    print(age_df.sort_values('current_age').head())

# 8) Summary statistics
print(f"\n📈 Player Table Summary:")
summary = pd.read_sql_query("""
    SELECT 
        COUNT(*) as total_players,
        COUNT(dob) as players_with_real_dob,
        COUNT(*) - COUNT(dob) as players_missing_dob
    FROM Player
""", conn)
print(summary)

print(f"\n🎉 DOB Import Complete!")
print(f"   ✅ {updated_count} players now have real DOBs")
print(f"   📅 Ready to calculate current ages dynamically")
print(f"   🔄 {309 - updated_count} players still need DOB research")

conn.close()


📋 Current Player table structure:
   cid            name  type  notnull dflt_value  pk
0    0       player_id  TEXT        0       None   1
1    1     player_name  TEXT        1       None   0
2    2     nationality  TEXT        0       None   0
3    3             dob  DATE        0       None   0
4    4  preferred_foot  TEXT        0       None   0

📊 Sample current DOBs:
  Kerry Abello: 1999-01-01
  Abby Dahlkemper: 1993-01-01
  Abby Smith: 1993-01-01

✅ Found 21 successful DOB scrapes to import

--- Successful DOBs to Import ---
        player_name      scraped_dob
0   Abby Dahlkemper     May 13, 1993
1        Abby Smith  October 4, 1993
2      Adriana Leon  October 2, 1992
3  Ainsley Mccammon  August 16, 2007
4   Aisha Solórzano   April 13, 1998

--- DOBs in Database Format ---
        player_name      scraped_dob dob_formatted
0   Abby Dahlkemper     May 13, 1993    1993-05-13
1        Abby Smith  October 4, 1993    1993-10-04
2      Adriana Leon  October 2, 1992    1992-10-02
3  

In [36]:
# ══════════════════════════════════════════════════════════════════════════════
#  EXPLORE MATCH REPORT STRUCTURE - Build from Local Data
# ══════════════════════════════════════════════════════════════════════════════

from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd

# Pick one match report to examine
match_files = list(Path("data/raw_match_pages").glob("*.html"))
print(f"📁 Found {len(match_files)} match report files")

# Use known sample file with data
sample_match = Path(
    "/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages/match_report_4acb5c69.html"
)

print(f"🔍 Examining: {sample_match.name}")

# Load and parse the HTML file
try:
    soup = BeautifulSoup(sample_match.read_text(encoding="utf-8"), "lxml")
except Exception as e:
    print(f"❌ Failed to read HTML file: {e}")
    exit(1)

# Find all tables in this match report
tables = soup.select("table")
print(f"✅ Found {len(tables)} tables in this match report")

# Extract each table and show structure
for i, table in enumerate(tables):
    try:
        df = pd.read_html(str(table))[0]
        table_id = table.get('id', f'no-id-{i}')
        print(f"\n── Table {i:02d}: {table_id} (shape: {df.shape}) ──")
        print(f"Columns: {list(df.columns)}")
        if not df.empty:
            print("Sample data:")
            print(df.head(3))
    except Exception as e:
        print(f"── Table {i:02d}: Could not parse ({str(e)[:50]}) ──")

print("\n🎯 Next: Build extractors for key tables (Match, Lineup, Stats, Events)")


📁 Found 0 match report files
🔍 Examining: match_report_4acb5c69.html
✅ Found 20 tables in this match report

── Table 00: no-id-0 (shape: (21, 2)) ──
Columns: ['Current (4-2-3-1)', 'Current (4-2-3-1).1']
Sample data:
  Current (4-2-3-1) Current (4-2-3-1).1
0                 1       Almuth Schult
1                 4         Hailie Mace
2                 6      Temwa Chaŵinga

── Table 01: no-id-1 (shape: (21, 2)) ──
Columns: ['Gotham FC (4-1-4-1)', 'Gotham FC (4-1-4-1).1']
Sample data:
  Gotham FC (4-1-4-1) Gotham FC (4-1-4-1).1
0                  30     Ann-Katrin Berger
1                   2     Jenna Nighswonger
2                   6         Emily Sonnett

── Table 02: no-id-2 (shape: (9, 2)) ──
Columns: [('Current', 'Possession'), ('Gotham FC', 'Possession')]
Sample data:
            Current         Gotham FC
         Possession        Possession
0               42%               58%
1  Passing Accuracy  Passing Accuracy
2  294 of 407 — 72%  77% — 438 of 568

── Table 03: stats_6f66

  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


In [37]:
# ══════════════════════════════════════════════════════════════════════════════
#  EXTRACT CORE MATCH DATA from Local HTML Files
# ══════════════════════════════════════════════════════════════════════════════

from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import re
from datetime import datetime

def extract_match_data_from_html(html_path):
    """Extract core match information from a match report HTML file"""

    soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")

    # Extract match metadata from the page title/headers
    h1_title = soup.find('h1')
    title = h1_title.get_text(strip=True) if h1_title else html_path.name

    # Parse teams and date from title like:
    # "Current vs Gotham FC Match Report – Saturday April 12, 2025"
    match_pattern = (
        r'(.+?)\s+vs\.?\s+(.+?)\s+Match Report.*?'
        r'([A-Za-z]+)\s+([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})'
    )
    match_info = re.search(match_pattern, title)

    if match_info:
        home_team = match_info.group(1).strip()
        away_team = match_info.group(2).strip()
        day = match_info.group(3)
        month = match_info.group(4)
        date_num = match_info.group(5)
        year = match_info.group(6)

        try:
            match_date = pd.to_datetime(f"{month} {date_num}, {year}").strftime('%Y-%m-%d')
        except:
            match_date = None
    else:
        home_team = away_team = match_date = None

    # Find the lineup tables (first two tables are usually lineups)
    tables = soup.select("table")
    home_lineup = None
    away_lineup = None

    if len(tables) >= 2:
        try:
            home_lineup_df = pd.read_html(str(tables[0]))[0]
            away_lineup_df = pd.read_html(str(tables[1]))[0]

            home_lineup = {
                'formation': home_lineup_df.columns[0] if len(home_lineup_df.columns) > 0 else None,
                'players': home_lineup_df.iloc[:, 1].dropna().tolist() if home_lineup_df.shape[1] > 1 else []
            }

            away_lineup = {
                'formation': away_lineup_df.columns[0] if len(away_lineup_df.columns) > 0 else None,
                'players': away_lineup_df.iloc[:, 1].dropna().tolist() if away_lineup_df.shape[1] > 1 else []
            }
        except Exception as e:
            print(f"⚠️ Could not parse lineup tables: {e}")

    # Find player stats tables (look for table IDs like 'stats_[teamid]_summary')
    player_stats_tables = []
    for table in tables:
        table_id = table.get('id', '')
        if 'stats_' in table_id and '_summary' in table_id:
            try:
                stats_df = pd.read_html(str(table))[0]
                team_id_match = re.search(r'stats_([a-f0-9]{8})_', table_id)
                team_id = team_id_match.group(1) if team_id_match else None

                player_stats_tables.append({
                    'team_id': team_id,
                    'table_id': table_id,
                    'data': stats_df
                })
            except Exception as e:
                print(f"⚠️ Could not parse stats table {table_id}: {e}")

    return {
        'file_name': html_path.name,
        'home_team': home_team,
        'away_team': away_team,
        'match_date': match_date,
        'home_lineup': home_lineup,
        'away_lineup': away_lineup,
        'player_stats_tables': player_stats_tables,
        'title': title
    }

# Test with the sample file we examined
sample_file = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages/match_report_4acb5c69.html")

if sample_file.exists():
    print("🔍 Testing match data extraction...")

    match_data = extract_match_data_from_html(sample_file)

    print(f"\n📊 Extracted Match Data:")
    print(f"   File: {match_data['file_name']}")
    print(f"   Title: {match_data['title']}")
    print(f"   Home Team: {match_data['home_team']}")
    print(f"   Away Team: {match_data['away_team']}")
    print(f"   Match Date: {match_data['match_date']}")

    if match_data['home_lineup']:
        print(f"   Home Formation: {match_data['home_lineup']['formation']}")
        print(f"   Home Players: {len(match_data['home_lineup']['players'])} players")
        print(f"   Sample: {match_data['home_lineup']['players'][:3]}")

    if match_data['away_lineup']:
        print(f"   Away Formation: {match_data['away_lineup']['formation']}")
        print(f"   Away Players: {len(match_data['away_lineup']['players'])} players")

    print(f"   Player Stats Tables: {len(match_data['player_stats_tables'])}")
    for table_info in match_data['player_stats_tables']:
        print(f"     - {table_info['table_id']}: Team {table_info['team_id']} "
              f"({table_info['data'].shape})")

else:
    print("❌ Sample file not found. Let's check available files:")
    match_files = list(Path("data/raw_match_pages").glob("*.html"))
    print(f"Available files: {[f.name for f in match_files[:5]]}")


🔍 Testing match data extraction...

📊 Extracted Match Data:
   File: match_report_4acb5c69.html
   Title: Kansas City Current vs. Gotham FC Match Report – Saturday September 28, 2024
   Home Team: Kansas City Current
   Away Team: Gotham FC
   Match Date: 2024-09-28
   Home Formation: Current (4-2-3-1)
   Home Players: 21 players
   Sample: ['Almuth Schult', 'Hailie Mace', 'Temwa Chaŵinga']
   Away Formation: Gotham FC (4-1-4-1)
   Away Players: 21 players
   Player Stats Tables: 2
     - stats_6f666306_summary: Team 6f666306 ((16, 31))
     - stats_8e306dc6_summary: Team 8e306dc6 ((16, 31))


  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]


In [38]:
# ══════════════════════════════════════════════════════════════════════════════
#  CREATE MATCH TABLE and Populate from Local HTML Files
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path
import hashlib
from io import StringIO

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Create Match table according to your schema
cur.execute("DROP TABLE IF EXISTS Match;")

cur.execute("""
CREATE TABLE Match (
    match_id      TEXT PRIMARY KEY,        -- Generate from team+date hash
    season_id     INTEGER,                 -- Will add Season table later
    match_date    DATE,
    match_time    TIME,                    -- NULL for now
    home_team_id  TEXT REFERENCES teams(team_id),
    away_team_id  TEXT REFERENCES teams(team_id),
    home_goals    INTEGER,                 -- Will extract from stats
    away_goals    INTEGER,                 -- Will extract from stats
    home_xg       REAL,                    -- Will extract from stats
    away_xg       REAL,                    -- Will extract from stats
    attendance    INTEGER,                 -- NULL for now
    venue         TEXT,                    -- NULL for now
    referee       TEXT,                    -- NULL for now
    temperature   REAL                     -- NULL for now
);
""")

print("✅ Created Match table")

# 2) Helper function to generate match_id
def generate_match_id(home_team_id, away_team_id, match_date):
    """Generate unique match_id from team IDs and date"""
    combined = f"{home_team_id}_{away_team_id}_{match_date}"
    return hashlib.md5(combined.encode()).hexdigest()[:8]

# 3) Map team names to team IDs (from your existing teams table)
team_mapping = pd.read_sql_query("SELECT team_id, team_name FROM teams", conn)
team_name_to_id = dict(zip(team_mapping['team_name'], team_mapping['team_id']))

print(f"📋 Loaded {len(team_name_to_id)} teams for mapping")

# 4) Process all match files
match_files = list(Path("data/raw_match_pages").glob("*.html"))
print(f"📁 Found {len(match_files)} match report files to process")

matches_to_insert = []
failed_extractions = []

# Process first 5 files as test
for i, html_file in enumerate(match_files[:5]):
    print(f"🔍 Processing {i+1}/5: {html_file.name}")

    try:
        # Extract match data using your extraction function
        match_data = extract_match_data_from_html(html_file)

        home_team_id = team_name_to_id.get(match_data['home_team'])
        away_team_id = team_name_to_id.get(match_data['away_team'])

        if home_team_id and away_team_id and match_data['match_date']:
            match_id = generate_match_id(home_team_id, away_team_id, match_data['match_date'])

            matches_to_insert.append({
                'match_id': match_id,
                'match_date': match_data['match_date'],
                'home_team_id': home_team_id,
                'away_team_id': away_team_id,
                'home_team_name': match_data['home_team'],
                'away_team_name': match_data['away_team'],
                'file_name': html_file.name
            })
        else:
            failed_extractions.append({
                'file': html_file.name,
                'reason': f"Missing mapping - home: {match_data['home_team']} ({home_team_id}), "
                          f"away: {match_data['away_team']} ({away_team_id})"
            })

    except Exception as e:
        failed_extractions.append({
            'file': html_file.name,
            'reason': f"Extraction error: {str(e)[:50]}"
        })

# 5) Insert matches into database
print(f"\n📝 Inserting {len(matches_to_insert)} matches...")

for match in matches_to_insert:
    cur.execute("""
        INSERT OR REPLACE INTO Match (
            match_id, match_date, home_team_id, away_team_id,
            season_id, match_time, home_goals, away_goals, 
            home_xg, away_xg, attendance, venue, referee, temperature
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        match['match_id'],
        match['match_date'],
        match['home_team_id'],
        match['away_team_id'],
        None,  # season_id
        None,  # match_time
        None,  # home_goals
        None,  # away_goals
        None,  # home_xg
        None,  # away_xg
        None,  # attendance
        None,  # venue
        None,  # referee
        None   # temperature
    ))

conn.commit()
print(f"✅ Successfully inserted {len(matches_to_insert)} matches")

# 6) Show failures
if failed_extractions:
    print(f"\n❌ Failed extractions ({len(failed_extractions)}):")
    for failure in failed_extractions:
        print(f"   {failure['file']}: {failure['reason']}")

# 7) Verify inserted matches
print(f"\n--- Verification: Matches in Database ---")
verification = pd.read_sql_query("""
    SELECT 
        m.match_id,
        m.match_date,
        ht.team_name as home_team,
        at.team_name as away_team
    FROM Match m
    JOIN teams ht ON m.home_team_id = ht.team_id
    JOIN teams at ON m.away_team_id = at.team_id
    ORDER BY m.match_date DESC
""", conn)

print(verification)

print(f"\n🎉 Match table foundation complete!")
print("   - Ready to add scores, xG, and other match details")
print("   - Ready to extract lineup and player stats data")

conn.close()


✅ Created Match table
📋 Loaded 16 teams for mapping
📁 Found 0 match report files to process

📝 Inserting 0 matches...
✅ Successfully inserted 0 matches

--- Verification: Matches in Database ---
Empty DataFrame
Columns: [match_id, match_date, home_team, away_team]
Index: []

🎉 Match table foundation complete!
   - Ready to add scores, xG, and other match details
   - Ready to extract lineup and player stats data


In [39]:
# ══════════════════════════════════════════════════════════════════════════════
#  CREATE MATCH TABLE and Populate from Local HTML Files (FIXED PATHS)
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path
import hashlib

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Check that Match table exists (should be created already)
print("✅ Using existing Match table")

# 2) Helper function to generate match_id
def generate_match_id(home_team_id, away_team_id, match_date):
    """Generate unique match_id from team IDs and date"""
    combined = f"{home_team_id}_{away_team_id}_{match_date}"
    return hashlib.md5(combined.encode()).hexdigest()[:8]

# 3) Map team names to team IDs (from your existing teams table)
team_mapping = pd.read_sql_query("SELECT team_id, team_name FROM teams", conn)
team_name_to_id = dict(zip(team_mapping['team_name'], team_mapping['team_id']))

print(f"📋 Loaded {len(team_name_to_id)} teams for mapping")

# 4) Use the correct absolute path to your match files
match_files_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages")
print(f"🔍 Looking in: {match_files_dir}")

# Find all HTML and HTM files
html_files = list(match_files_dir.glob("*.html"))
htm_files = list(match_files_dir.glob("*.htm"))
all_match_files = html_files + htm_files

print(f"📁 Found {len(html_files)} .html files and {len(htm_files)} .htm files")
print(f"📁 Total match files: {len(all_match_files)}")

if all_match_files:
    print("📋 Sample files:")
    for i, file in enumerate(all_match_files[:3]):
        print(f"   {i + 1}. {file.name}")

# 5) Process first 10 files as test
matches_to_insert = []
failed_extractions = []

print("\n🔍 Processing first 10 files...")

for i, html_file in enumerate(all_match_files[:10]):
    print(f"   Processing {i + 1}/10: {html_file.name}")

    try:
        # Extract match data using your extractor
        match_data = extract_match_data_from_html(html_file)

        home_team_id = team_name_to_id.get(match_data['home_team'])
        away_team_id = team_name_to_id.get(match_data['away_team'])

        if home_team_id and away_team_id and match_data['match_date']:
            match_id = generate_match_id(home_team_id, away_team_id, match_data['match_date'])

            matches_to_insert.append({
                'match_id': match_id,
                'match_date': match_data['match_date'],
                'home_team_id': home_team_id,
                'away_team_id': away_team_id,
                'home_team_name': match_data['home_team'],
                'away_team_name': match_data['away_team'],
                'file_name': html_file.name
            })

            print(f"     ✅ {match_data['home_team']} vs {match_data['away_team']} on {match_data['match_date']}")
        else:
            failed_extractions.append({
                'file': html_file.name,
                'reason': (
                    f"Missing mapping - home: {match_data['home_team']} ({home_team_id}), "
                    f"away: {match_data['away_team']} ({away_team_id})"
                )
            })
            print(f"     ❌ Failed: {match_data['home_team']} vs {match_data['away_team']}")

    except Exception as e:
        failed_extractions.append({
            'file': html_file.name,
            'reason': f"Extraction error: {str(e)[:50]}"
        })
        print(f"     ❌ Error: {str(e)[:50]}")

# 6) Insert matches into database
print(f"\n📝 Inserting {len(matches_to_insert)} matches...")

for match in matches_to_insert:
    cur.execute("""
        INSERT OR REPLACE INTO Match (
            match_id, match_date, home_team_id, away_team_id,
            season_id, match_time, home_goals, away_goals, 
            home_xg, away_xg, attendance, venue, referee, temperature
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        match['match_id'],
        match['match_date'],
        match['home_team_id'],
        match['away_team_id'],
        None,  # season_id
        None,  # match_time
        None,  # home_goals
        None,  # away_goals
        None,  # home_xg
        None,  # away_xg
        None,  # attendance
        None,  # venue
        None,  # referee
        None   # temperature
    ))

conn.commit()
print(f"✅ Successfully inserted {len(matches_to_insert)} matches")

# 7) Show results
if failed_extractions:
    print(f"\n❌ Failed extractions ({len(failed_extractions)}):")
    for failure in failed_extractions:
        print(f"   {failure['file']}: {failure['reason']}")

# 8) Verify inserted matches
print("\n--- Verification: Matches in Database ---")
verification = pd.read_sql_query("""
    SELECT 
        m.match_id,
        m.match_date,
        ht.team_name AS home_team,
        at.team_name AS away_team
    FROM Match m
    JOIN teams ht ON m.home_team_id = ht.team_id
    JOIN teams at ON m.away_team_id = at.team_id
    ORDER BY m.match_date DESC
""", conn)

print(verification)

print(f"\n🎉 Match extraction complete!")
print(f"   ✅ Processed {len(all_match_files)} total files")
print(f"   ✅ Successfully inserted {len(matches_to_insert)} matches")
print(f"   ❌ Failed: {len(failed_extractions)} files")

conn.close()


✅ Using existing Match table
📋 Loaded 16 teams for mapping
🔍 Looking in: /Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages
📁 Found 1537 .html files and 0 .htm files
📁 Total match files: 1537
📋 Sample files:
   1. match_report_ec7c8e61.html
   2. match_report_2023_0b11cd8c.html
   3. match_report_2021_bce84cc5.html

🔍 Processing first 10 files...
   Processing 1/10: match_report_ec7c8e61.html
     ✅ Portland Thorns FC vs Washington Spirit on 2024-05-04
   Processing 2/10: match_report_2023_0b11cd8c.html


  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]
  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]


     ❌ Failed: Gotham FC vs Chicago Red Stars
   Processing 3/10: match_report_2021_bce84cc5.html
     ❌ Failed: Gotham FC vs Kansas City
   Processing 4/10: match_report_2019_e6c77eac.html
     ❌ Failed: None vs None
   Processing 5/10: match_report_2016_a8e80918.html
     ❌ Failed: None vs None
   Processing 6/10: match_report_2021_7cdce4c7.html


  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]
  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]
  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]


     ✅ Houston Dash vs Orlando Pride on 2021-06-26
   Processing 7/10: match_report_2021_ed763b80.html
     ❌ Failed: OL Reign vs Houston Dash
   Processing 8/10: match_report_2015_fe8efcf6.html
     ❌ Failed: None vs None
   Processing 9/10: match_report_2023_57b3750d.html
     ❌ Failed: Racing Louisville vs San Diego Wave
   Processing 10/10: match_report_f1855d1d.html
     ✅ North Carolina Courage vs Washington Spirit on 2024-11-02

📝 Inserting 3 matches...
✅ Successfully inserted 3 matches

❌ Failed extractions (7):
   match_report_2023_0b11cd8c.html: Missing mapping - home: Gotham FC (8e306dc6), away: Chicago Red Stars (None)
   match_report_2021_bce84cc5.html: Missing mapping - home: Gotham FC (8e306dc6), away: Kansas City (None)
   match_report_2019_e6c77eac.html: Missing mapping - home: None (None), away: None (None)
   match_report_2016_a8e80918.html: Missing mapping - home: None (None), away: None (None)
   match_report_2021_ed763b80.html: Missing mapping - home: OL Reign (No

  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]
  home_lineup_df = pd.read_html(str(tables[0]))[0]
  away_lineup_df = pd.read_html(str(tables[1]))[0]
  stats_df = pd.read_html(str(table))[0]
  stats_df = pd.read_html(str(table))[0]


In [40]:
# ══════════════════════════════════════════════════════════════════════════════
#  IMPROVED TEAM MAPPING - Handle Historical/Alternative Team Names  
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 1) Get current team mapping
team_mapping = pd.read_sql_query("SELECT team_id, team_name FROM teams", conn)
team_name_to_id = dict(zip(team_mapping['team_name'], team_mapping['team_id']))

print("📋 Current team names in database:")
for name in sorted(team_mapping['team_name']):
    print(f"   - {name}")

# 2) Create expanded mapping with historical/alternative names
def create_expanded_team_mapping():
    """Create mapping that includes historical and alternative team names"""

    # Start with current team names
    expanded_mapping = team_name_to_id.copy()

    # Add historical/alternative names → current team IDs
    historical_names = {
        # Chicago Stars variations
        "Chicago Red Stars": team_name_to_id.get("Chicago Stars FC"),
        "Red Stars": team_name_to_id.get("Chicago Stars FC"),

        # Kansas City variations  
        "Kansas City": team_name_to_id.get("Kansas City Current"),
        "KC Current": team_name_to_id.get("Kansas City Current"),

        # Seattle Reign variations
        "OL Reign": team_name_to_id.get("Seattle Reign FC"),
        "Reign FC": team_name_to_id.get("Seattle Reign FC"),
        "Seattle Reign": team_name_to_id.get("Seattle Reign FC"),

        # San Diego Wave variations
        "San Diego Wave": team_name_to_id.get("San Diego Wave FC"),
        "Wave FC": team_name_to_id.get("San Diego Wave FC"),

        # Other common variations
        "Portland Thorns": team_name_to_id.get("Portland Thorns FC"),
        "Washington": team_name_to_id.get("Washington Spirit"),
        "Orlando": team_name_to_id.get("Orlando Pride"),
        "Houston": team_name_to_id.get("Houston Dash"),
        "North Carolina": team_name_to_id.get("North Carolina Courage"),
        "NC Courage": team_name_to_id.get("North Carolina Courage"),
        "Angel City": team_name_to_id.get("Angel City FC"),
        "Bay Area": team_name_to_id.get("Bay FC"),
        "Utah": team_name_to_id.get("Utah Royals"),
        "Racing": team_name_to_id.get("Racing Louisville"),
        "Louisville": team_name_to_id.get("Racing Louisville"),
        "Gotham": team_name_to_id.get("Gotham FC"),
        "Sky Blue": team_name_to_id.get("Gotham FC"),  # Historical name
        "Sky Blue FC": team_name_to_id.get("Gotham FC"),
    }

    # Add valid mappings
    for alt_name, team_id in historical_names.items():
        if team_id:
            expanded_mapping[alt_name] = team_id

    return expanded_mapping

# 3) Create the expanded mapping
expanded_team_mapping = create_expanded_team_mapping()

print(f"\n📈 Expanded mapping: {len(team_name_to_id)} → {len(expanded_team_mapping)} team name variations")

# 4) Test the mapping with known problem cases
failed_teams = [
    "Chicago Red Stars", "Kansas City", "OL Reign",
    "San Diego Wave", "Racing Louisville"
]

print(f"\n🧪 Testing failed team names:")
for team_name in failed_teams:
    team_id = expanded_team_mapping.get(team_name)
    status = "✅ Found" if team_id else "❌ Missing"
    print(f"   {team_name} → {team_id} ({status})")

print(f"\n🔧 Ready to rerun extraction with improved team mapping!")

conn.close()


📋 Current team names in database:
   - Angel City FC
   - Bay FC
   - Boston Breakers
   - Chicago Stars FC
   - Gotham FC
   - Houston Dash
   - Kansas City Current
   - North Carolina Courage
   - Orlando Pride
   - Portland Thorns FC
   - Racing Louisville
   - San Diego Wave FC
   - Seattle Reign FC
   - Utah Royals
   - Washington Spirit
   - Western New York Flash

📈 Expanded mapping: 16 → 39 team name variations

🧪 Testing failed team names:
   Chicago Red Stars → d976a235 (✅ Found)
   Kansas City → 6f666306 (✅ Found)
   OL Reign → 257fad2b (✅ Found)
   San Diego Wave → bf961da0 (✅ Found)
   Racing Louisville → da19ebd1 (✅ Found)

🔧 Ready to rerun extraction with improved team mapping!


In [42]:
# ══════════════════════════════════════════════════════════════════════════════
#  PROCESS ALL MATCH REPORTS - Using Improved Team Mapping
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
import re
from datetime import datetime
from tqdm import tqdm

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Get expanded team mapping from database
team_mapping_query = """
SELECT team_name, team_id FROM teams
UNION ALL
SELECT team_name_short, team_id FROM teams WHERE team_name_short IS NOT NULL
UNION ALL
SELECT team_name_alias_1, team_id FROM teams WHERE team_name_alias_1 IS NOT NULL
UNION ALL
SELECT team_name_alias_2, team_id FROM teams WHERE team_name_alias_2 IS NOT NULL
"""
team_mapping_df = pd.read_sql_query(team_mapping_query, conn)
expanded_team_mapping = dict(zip(team_mapping_df['team_name'], team_mapping_df['team_id']))

# Add additional mappings
additional_mappings = {
    "San Diego Wave": "bf961da0",
    "OL Reign": "257fad2b",
    "Red Stars": "d976a235",
    "Current": "6f666306"
}
expanded_team_mapping.update(additional_mappings)

print(f"✅ Loaded expanded team mapping with {len(expanded_team_mapping)} variations")

# 2) Ensure Match table exists
cur.execute("""
CREATE TABLE IF NOT EXISTS Match (
    match_id INTEGER PRIMARY KEY AUTOINCREMENT,
    season_id INTEGER,
    match_date DATE,
    home_team_id TEXT REFERENCES teams(team_id),
    away_team_id TEXT REFERENCES teams(team_id),
    home_xg REAL,
    away_xg REAL,
    attendance INTEGER,
    venue TEXT,
    referee TEXT,
    home_formation TEXT,
    away_formation TEXT,
    filename TEXT,
    extraction_status TEXT
)
""")

# 3) Resolve team name
def resolve_team_name(team_name):
    if not team_name:
        return None
    if team_name in expanded_team_mapping:
        return expanded_team_mapping[team_name]
    clean_name = team_name.replace(" FC", "").replace(" SC", "").strip()
    return expanded_team_mapping.get(clean_name)

# 4) Match extraction function
def extract_match_data_from_html(html_path):
    try:
        soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")
        headline = soup.find("h1")
        if not headline:
            return None, "No headline found"

        headline_text = headline.get_text(strip=True)
        match_pattern = r"(.+?)\s+vs\.?\s+(.+?)\s+Match Report.*?(\w+)\s+(\w+)\s+(\d{1,2}),?\s+(\d{4})"
        match = re.search(match_pattern, headline_text)
        if not match:
            return None, f"Could not parse headline: {headline_text}"

        home_name, away_name, _, month, day, year = match.groups()
        home_name = home_name.strip()
        away_name = away_name.strip()

        home_team_id = resolve_team_name(home_name)
        away_team_id = resolve_team_name(away_name)

        if not home_team_id:
            return None, f"Could not resolve home team: '{home_name}'"
        if not away_team_id:
            return None, f"Could not resolve away team: '{away_name}'"

        try:
            match_date = datetime.strptime(f"{month} {day} {year}", "%B %d %Y").date()
        except ValueError:
            try:
                match_date = datetime.strptime(f"{month} {day} {year}", "%b %d %Y").date()
            except ValueError:
                return None, f"Could not parse date: {month} {day} {year}"

        # xG
        xg_text = soup.find(string=re.compile(r"xG"))
        home_xg = away_xg = None
        if xg_text:
            xg_numbers = re.findall(r"(\d+\.\d+)", str(xg_text))
            if len(xg_numbers) >= 2:
                home_xg, away_xg = float(xg_numbers[0]), float(xg_numbers[1])

        # Attendance
        attendance = None
        attendance_text = soup.find(string=re.compile(r"Attendance:"))
        if attendance_text:
            att_match = re.search(r"(\d[\d,]*)", attendance_text)
            if att_match:
                attendance = int(att_match.group(1).replace(",", ""))

        # Venue
        venue = None
        venue_text = soup.find(string=re.compile(r"Venue:"))
        if venue_text and ":" in venue_text:
            venue = venue_text.split(":", 1)[1].strip()

        # Referee
        referee = None
        referee_text = soup.find(string=re.compile(r"Referee:"))
        if referee_text and ":" in referee_text:
            referee = referee_text.split(":", 1)[1].strip()

        return {
            'match_date': match_date,
            'home_team_id': home_team_id,
            'away_team_id': away_team_id,
            'home_team_name': home_name,
            'away_team_name': away_name,
            'home_xg': home_xg,
            'away_xg': away_xg,
            'attendance': attendance,
            'venue': venue,
            'referee': referee,
            'season_year': int(year)
        }, "success"

    except Exception as e:
        return None, f"Exception: {str(e)}"

# 5) Process HTML files
html_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_match_pages")
html_files = list(html_dir.glob("*.html"))
print(f"🔍 Found {len(html_files)} HTML files to process")

# 6) Process and extract
successful_extractions = []
failed_extractions = []

print("🚀 Processing all match reports...")

for html_file in tqdm(html_files, desc="Processing matches"):
    match_data, status = extract_match_data_from_html(html_file)
    if match_data:
        match_data['filename'] = html_file.name
        match_data['extraction_status'] = status
        successful_extractions.append(match_data)
    else:
        failed_extractions.append({
            'filename': html_file.name,
            'error': status
        })

# 7) Results summary
total_files = len(html_files)
successful_count = len(successful_extractions)
failed_count = len(failed_extractions)

print(f"\n📊 Extraction Results:")
print(f"   ✅ Successful: {successful_count}/{total_files} ({successful_count/total_files*100:.1f}%)")
print(f"   ❌ Failed: {failed_count}/{total_files} ({failed_count/total_files*100:.1f}%)")

# 8) Insert into DB
if successful_extractions:
    print(f"📝 Inserting {successful_count} matches into database...")

    cur.execute("DELETE FROM Match")

    for match_data in successful_extractions:
        cur.execute("""
            INSERT INTO Match (
                season_id, match_date, home_team_id, away_team_id,
                home_xg, away_xg, attendance, venue, referee,
                filename, extraction_status
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            match_data['season_year'],
            match_data['match_date'],
            match_data['home_team_id'],
            match_data['away_team_id'],
            match_data['home_xg'],
            match_data['away_xg'],
            match_data['attendance'],
            match_data['venue'],
            match_data['referee'],
            match_data['filename'],
            match_data['extraction_status']
        ))

    conn.commit()
    print(f"✅ Successfully inserted {successful_count} matches")

# 9) Sample results
if successful_extractions:
    print("\n--- Sample Successful Extractions ---")
    df_sample = pd.DataFrame(successful_extractions[:5])
    print(df_sample[['match_date', 'home_team_name', 'away_team_name', 'home_xg', 'away_xg']])

# 10) Analyze failures
if failed_extractions:
    print("\n--- Failure Analysis ---")
    failure_df = pd.DataFrame(failed_extractions)
    print("Top failure reasons:")
    print(failure_df['error'].value_counts().head(10))
    print("\n--- Sample Failed Files ---")
    print(failure_df.head())

# 11) Verify database
print("\n--- Database Verification ---")
matches_in_db = pd.read_sql_query("""
    SELECT COUNT(*) AS total_matches,
           COUNT(DISTINCT home_team_id) + COUNT(DISTINCT away_team_id) AS unique_teams,
           MIN(match_date) AS earliest_match,
           MAX(match_date) AS latest_match,
           COUNT(DISTINCT season_id) AS seasons
    FROM Match
""", conn)
print(matches_in_db)

# Show recent matches
sample_matches = pd.read_sql_query("""
    SELECT m.match_date,
           h.team_name AS home_team,
           a.team_name AS away_team,
           m.home_xg, m.away_xg,
           m.venue
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id
    JOIN teams a ON m.away_team_id = a.team_id
    ORDER BY m.match_date DESC
    LIMIT 10
""", conn)
print("\n--- Recent Matches in Database ---")
print(sample_matches)

conn.close()

print("\n🎉 Match extraction complete!")
print(f"   📊 {successful_count} matches successfully processed")
print("   🗂️ Ready for lineup and player stats extraction")


✅ Loaded expanded team mapping with 39 variations
🔍 Found 1537 HTML files to process
🚀 Processing all match reports...


Processing matches: 100%|██████████| 1537/1537 [01:54<00:00, 13.39it/s]


📊 Extraction Results:
   ✅ Successful: 746/1537 (48.5%)
   ❌ Failed: 791/1537 (51.5%)
📝 Inserting 746 matches into database...





OperationalError: table Match has no column named filename

In [43]:
# ══════════════════════════════════════════════════════════════════════════════
#  FIX MATCH TABLE STRUCTURE AND COMPLETE INSERTION
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

# 1) Check current table structure
print("📋 Current Match table structure:")
table_info = pd.read_sql_query("PRAGMA table_info(Match);", conn)
print(table_info)

# 2) Add missing columns
missing_columns = ['filename', 'extraction_status']
existing_columns = table_info['name'].tolist()

for col in missing_columns:
    if col not in existing_columns:
        print(f"➕ Adding missing column: {col}")
        cur.execute(f"ALTER TABLE Match ADD COLUMN {col} TEXT;")

conn.commit()
print("✅ Table structure updated")

# 3) Verify new structure
print("\n📋 Updated Match table structure:")
updated_info = pd.read_sql_query("PRAGMA table_info(Match);", conn)
print(updated_info)

conn.close()


📋 Current Match table structure:
    cid          name     type  notnull dflt_value  pk
0     0      match_id     TEXT        0       None   1
1     1     season_id  INTEGER        0       None   0
2     2    match_date     DATE        0       None   0
3     3    match_time     TIME        0       None   0
4     4  home_team_id     TEXT        0       None   0
5     5  away_team_id     TEXT        0       None   0
6     6    home_goals  INTEGER        0       None   0
7     7    away_goals  INTEGER        0       None   0
8     8       home_xg     REAL        0       None   0
9     9       away_xg     REAL        0       None   0
10   10    attendance  INTEGER        0       None   0
11   11         venue     TEXT        0       None   0
12   12       referee     TEXT        0       None   0
13   13   temperature     REAL        0       None   0
➕ Adding missing column: filename


OperationalError: database is locked

In [46]:
# ══════════════════════════════════════════════════════════════════════════════
#  FIX DATABASE LOCK AND COMPLETE MATCH INSERTION
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
import time

# 1) Close any existing connections first
try:
    conn.close()
    print("✅ Closed existing connection")
except:
    pass

# 2) Wait briefly for any locks to clear
time.sleep(1)

# 3) Connect fresh to the database with timeout
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path, timeout=30)
cur = conn.cursor()

# 4) Validate that 'successful_extractions' exists
try:
    print(f"📝 Ready to insert {len(successful_extractions)} matches")
except NameError:
    print("❌ 'successful_extractions' not found. Run the extraction code first.")
    conn.close()
    raise

# 5) Safely clear and insert match data
try:
    conn.execute("BEGIN")
    cur.execute("DELETE FROM Match")
    conn.commit()
    print("✅ Cleared existing match data")

    for i, match_data in enumerate(successful_extractions):
        cur.execute("""
            INSERT INTO Match (
                season_id, match_date, home_team_id, away_team_id,
                home_xg, away_xg, attendance, venue, referee,
                filename, extraction_status
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            match_data['season_year'],
            match_data['match_date'],
            match_data['home_team_id'],
            match_data['away_team_id'],
            match_data['home_xg'],
            match_data['away_xg'],
            match_data['attendance'],
            match_data['venue'],
            match_data['referee'],
            match_data['filename'],
            match_data['extraction_status']
        ))

        if (i + 1) % 100 == 0:
            conn.commit()
            print(f"  ✅ Inserted {i + 1}/{len(successful_extractions)} matches...")

    conn.commit()
    print(f"✅ Successfully inserted {len(successful_extractions)} matches")

except Exception as e:
    conn.rollback()
    print(f"❌ Error during insertion: {e}")
    raise

# 6) Database verification
print("\n--- Database Verification ---")
matches_in_db = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS total_matches,
        COUNT(DISTINCT home_team_id) + COUNT(DISTINCT away_team_id) AS unique_teams,
        MIN(match_date) AS earliest_match,
        MAX(match_date) AS latest_match,
        COUNT(DISTINCT season_id) AS seasons
    FROM Match
""", conn)
print(matches_in_db)

# 7) Show recent match samples
sample_matches = pd.read_sql_query("""
    SELECT 
        m.match_date, 
        h.team_name AS home_team,
        a.team_name AS away_team,
        m.home_xg, m.away_xg,
        m.venue
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id
    JOIN teams a ON m.away_team_id = a.team_id
    ORDER BY m.match_date DESC
    LIMIT 10
""", conn)

print("\n--- Recent Matches in Database ---")
print(sample_matches)

# 8) Final summary
print(f"\n🎉 Match extraction complete!")
print(f"   📊 {len(successful_extractions)} matches successfully processed")
print(f"   📅 Covering {matches_in_db.iloc[0]['seasons']} seasons")
print(f"   🏟️  From {matches_in_db.iloc[0]['earliest_match']} to {matches_in_db.iloc[0]['latest_match']}")
print("   🗂️  Ready for lineup and player stats extraction")

# 9) Close DB connection
conn.close()
print("✅ Database connection closed")



✅ Closed existing connection
📝 Ready to insert 746 matches
❌ Error during insertion: database is locked


OperationalError: database is locked

In [47]:
# ══════════════════════════════════════════════════════════════════════════════
#  FORCE UNLOCK DATABASE AND COMPLETE INSERTION
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
import time
import gc

# 1) Force cleanup of any existing connections
gc.collect()  # Trigger garbage collection

# 2) Try unlocking by touching and closing a quick connection
db_path = "data/processed/nwsldata.db"
try:
    temp_conn = sqlite3.connect(db_path, timeout=1)
    temp_conn.close()
except Exception:
    pass

time.sleep(2)  # Give OS time to release locks

# 3) Connect with WAL mode (more robust for concurrent access)
conn = sqlite3.connect(db_path, timeout=60)
conn.execute("PRAGMA journal_mode=WAL;")         # Enable Write-Ahead Logging
conn.execute("PRAGMA synchronous=NORMAL;")       # Balance performance vs durability
cur = conn.cursor()

print("✅ Connected with WAL mode enabled")

# 4) Check for required variable
try:
    print(f"📝 Ready to insert {len(successful_extractions)} matches")
except NameError:
    print("❌ 'successful_extractions' not found. Restart your kernel and re-run extraction.")
    conn.close()
    raise

# 5) Use INSERT OR REPLACE to avoid DELETE lock
print("📝 Using INSERT OR REPLACE approach to avoid DELETE lock...")

inserted_count = 0
for i, match_data in enumerate(successful_extractions):
    try:
        cur.execute("""
            INSERT OR REPLACE INTO Match (
                season_id, match_date, home_team_id, away_team_id,
                home_xg, away_xg, attendance, venue, referee,
                filename, extraction_status
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            match_data['season_year'],
            match_data['match_date'],
            match_data['home_team_id'],
            match_data['away_team_id'],
            match_data['home_xg'],
            match_data['away_xg'],
            match_data['attendance'],
            match_data['venue'],
            match_data['referee'],
            match_data['filename'],
            match_data['extraction_status']
        ))
        inserted_count += 1

        # Commit periodically to reduce lock duration
        if (i + 1) % 50 == 0:
            conn.commit()
            print(f"  ✅ Inserted {i + 1}/{len(successful_extractions)} matches...")

    except Exception as e:
        print(f"❌ Failed to insert match {i + 1}: {e}")
        continue

# Final commit
conn.commit()
print(f"✅ Successfully inserted {inserted_count} matches")

# 6) Database verification
print("\n--- Database Stats ---")
matches_in_db = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS total_matches,
        MIN(match_date) AS earliest_match,
        MAX(match_date) AS latest_match,
        COUNT(DISTINCT season_id) AS seasons
    FROM Match
""", conn)
print(matches_in_db)

# 7) Cleanup
conn.close()
print("✅ Database connection closed")
print(f"🎉 {inserted_count} matches now in database!")


✅ Connected with WAL mode enabled
📝 Ready to insert 746 matches
📝 Using INSERT OR REPLACE approach to avoid DELETE lock...
❌ Failed to insert match 1: table Match has no column named filename
❌ Failed to insert match 2: table Match has no column named filename
❌ Failed to insert match 3: table Match has no column named filename
❌ Failed to insert match 4: table Match has no column named filename
❌ Failed to insert match 5: table Match has no column named filename
❌ Failed to insert match 6: table Match has no column named filename
❌ Failed to insert match 7: table Match has no column named filename
❌ Failed to insert match 8: table Match has no column named filename
❌ Failed to insert match 9: table Match has no column named filename
❌ Failed to insert match 10: table Match has no column named filename
❌ Failed to insert match 11: table Match has no column named filename
❌ Failed to insert match 12: table Match has no column named filename
❌ Failed to insert match 13: table Match has n

In [48]:
# ══════════════════════════════════════════════════════════════════════════════
#  FIX MATCH TABLE STRUCTURE – Add Missing Columns
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path, timeout=60)
cur = conn.cursor()

# 1) Check current table structure
print("📋 Current Match table structure:")
table_info = pd.read_sql_query("PRAGMA table_info(Match);", conn)
print(table_info)

# 2) Add missing columns if necessary
missing_columns = ['filename', 'extraction_status']
existing_columns = table_info['name'].tolist()

for col in missing_columns:
    if col not in existing_columns:
        print(f"➕ Adding missing column: {col}")
        cur.execute(f"ALTER TABLE Match ADD COLUMN {col} TEXT;")
    else:
        print(f"✅ Column '{col}' already exists")

conn.commit()

# 3) Verify updated structure
print("\n📋 Updated Match table structure:")
updated_info = pd.read_sql_query("PRAGMA table_info(Match);", conn)
print(updated_info)

conn.close()
print("✅ Table structure updated successfully!")


📋 Current Match table structure:
    cid          name     type  notnull dflt_value  pk
0     0      match_id     TEXT        0       None   1
1     1     season_id  INTEGER        0       None   0
2     2    match_date     DATE        0       None   0
3     3    match_time     TIME        0       None   0
4     4  home_team_id     TEXT        0       None   0
5     5  away_team_id     TEXT        0       None   0
6     6    home_goals  INTEGER        0       None   0
7     7    away_goals  INTEGER        0       None   0
8     8       home_xg     REAL        0       None   0
9     9       away_xg     REAL        0       None   0
10   10    attendance  INTEGER        0       None   0
11   11         venue     TEXT        0       None   0
12   12       referee     TEXT        0       None   0
13   13   temperature     REAL        0       None   0
➕ Adding missing column: filename
➕ Adding missing column: extraction_status

📋 Updated Match table structure:
    cid               name     

In [49]:
# ══════════════════════════════════════════════════════════════════════════════
#  INSERT MATCH DATA WITH PROPER COLUMNS
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database with WAL mode for concurrency safety
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path, timeout=60)
conn.execute("PRAGMA journal_mode=WAL;")
cur = conn.cursor()

# Check if successful_extractions is available
try:
    print(f"📝 Ready to insert {len(successful_extractions)} matches")
except NameError:
    print("❌ 'successful_extractions' not found. Please re-run the extraction code first.")
    conn.close()
    raise

# Insert match data with error handling
inserted_count = 0
for i, match_data in enumerate(successful_extractions):
    try:
        cur.execute("""
            INSERT OR REPLACE INTO Match (
                season_id, match_date, home_team_id, away_team_id,
                home_xg, away_xg, attendance, venue, referee,
                filename, extraction_status
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            match_data['season_year'],
            match_data['match_date'],
            match_data['home_team_id'],
            match_data['away_team_id'],
            match_data['home_xg'],
            match_data['away_xg'],
            match_data['attendance'],
            match_data['venue'],
            match_data['referee'],
            match_data['filename'],
            match_data['extraction_status']
        ))

        inserted_count += 1

        if (i + 1) % 100 == 0:
            conn.commit()
            print(f"  ✅ Inserted {i + 1}/{len(successful_extractions)} matches...")

    except Exception as e:
        print(f"❌ Failed to insert match {i + 1}: {e}")
        break  # Stop early on error to investigate

# Final commit
conn.commit()
print(f"\n🎉 Successfully inserted {inserted_count} matches!")

# Verification
matches_count = pd.read_sql_query("SELECT COUNT(*) AS total FROM Match", conn)
print(f"📊 Total matches in database: {matches_count.iloc[0]['total']}")

# Close connection
conn.close()
print("✅ Database connection closed")


📝 Ready to insert 746 matches
  ✅ Inserted 100/746 matches...
  ✅ Inserted 200/746 matches...
  ✅ Inserted 300/746 matches...
  ✅ Inserted 400/746 matches...
  ✅ Inserted 500/746 matches...
  ✅ Inserted 600/746 matches...
  ✅ Inserted 700/746 matches...

🎉 Successfully inserted 746 matches!
📊 Total matches in database: 749
✅ Database connection closed


In [50]:
# ══════════════════════════════════════════════════════════════════════════════
#  MATCH TABLE DIAGNOSTIC CHECKS
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from datetime import datetime

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 COMPREHENSIVE MATCH TABLE DIAGNOSTICS\n")

# 1) Basic table info
print("=" * 60)
print("📊 BASIC TABLE INFORMATION")
print("=" * 60)

table_info = pd.read_sql_query("PRAGMA table_info(Match);", conn)
print("Table Structure:")
print(table_info)

basic_stats = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS total_matches,
        COUNT(DISTINCT season_id) AS seasons,
        COUNT(DISTINCT home_team_id) AS home_teams,
        COUNT(DISTINCT away_team_id) AS away_teams,
        MIN(match_date) AS earliest_match,
        MAX(match_date) AS latest_match
    FROM Match
""", conn)
print("\nBasic Statistics:")
print(basic_stats)

# 2) Sample match records
print("\n" + "=" * 60)
print("📝 SAMPLE MATCHES")
print("=" * 60)

sample_matches = pd.read_sql_query("""
    SELECT 
        m.match_date,
        h.team_name AS home_team,
        a.team_name AS away_team,
        m.home_xg,
        m.away_xg,
        m.attendance,
        m.venue,
        m.referee
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id  
    JOIN teams a ON m.away_team_id = a.team_id
    ORDER BY m.match_date DESC
    LIMIT 10
""", conn)
print("Recent Matches:")
print(sample_matches.to_string(index=False))

# 3) Data quality checks
print("\n" + "=" * 60)
print("🔬 DATA QUALITY ANALYSIS")
print("=" * 60)

missing_data = pd.read_sql_query("""
    SELECT 
        'home_xg' AS field,
        SUM(CASE WHEN home_xg IS NULL THEN 1 ELSE 0 END) AS missing_count,
        ROUND(SUM(CASE WHEN home_xg IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS missing_pct
    FROM Match
    UNION ALL
    SELECT 
        'away_xg',
        SUM(CASE WHEN away_xg IS NULL THEN 1 ELSE 0 END),
        ROUND(SUM(CASE WHEN away_xg IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1)
    FROM Match
    UNION ALL
    SELECT 
        'attendance',
        SUM(CASE WHEN attendance IS NULL THEN 1 ELSE 0 END),
        ROUND(SUM(CASE WHEN attendance IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1)
    FROM Match
    UNION ALL
    SELECT 
        'venue',
        SUM(CASE WHEN venue IS NULL THEN 1 ELSE 0 END),
        ROUND(SUM(CASE WHEN venue IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1)
    FROM Match
    UNION ALL
    SELECT 
        'referee',
        SUM(CASE WHEN referee IS NULL THEN 1 ELSE 0 END),
        ROUND(SUM(CASE WHEN referee IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1)
    FROM Match
""", conn)
print("Missing Data Analysis:")
print(missing_data.to_string(index=False))

# 4) Season breakdown
print("\n" + "=" * 60)
print("📅 MATCHES BY SEASON")
print("=" * 60)

season_breakdown = pd.read_sql_query("""
    SELECT 
        season_id AS season,
        COUNT(*) AS match_count,
        COUNT(DISTINCT home_team_id) + COUNT(DISTINCT away_team_id) AS unique_teams,
        MIN(match_date) AS season_start,
        MAX(match_date) AS season_end,
        ROUND(AVG(home_xg), 2) AS avg_home_xg,
        ROUND(AVG(away_xg), 2) AS avg_away_xg
    FROM Match
    WHERE season_id IS NOT NULL
    GROUP BY season_id
    ORDER BY season_id DESC
""", conn)
print(season_breakdown.to_string(index=False))

# 5) Team participation analysis
print("\n" + "=" * 60)
print("⚽ TEAM PARTICIPATION ANALYSIS")
print("=" * 60)

team_participation = pd.read_sql_query("""
    WITH team_matches AS (
        SELECT home_team_id AS team_id FROM Match
        UNION ALL 
        SELECT away_team_id AS team_id FROM Match
    )
    SELECT 
        t.team_name,
        COUNT(tm.team_id) AS total_matches,
        SUM(CASE WHEN m1.home_team_id = t.team_id THEN 1 ELSE 0 END) AS home_matches,
        SUM(CASE WHEN m2.away_team_id = t.team_id THEN 1 ELSE 0 END) AS away_matches
    FROM teams t
    LEFT JOIN team_matches tm ON t.team_id = tm.team_id
    LEFT JOIN Match m1 ON t.team_id = m1.home_team_id
    LEFT JOIN Match m2 ON t.team_id = m2.away_team_id
    GROUP BY t.team_id, t.team_name
    HAVING total_matches > 0
    ORDER BY total_matches DESC
""", conn)
print(team_participation.to_string(index=False))

# 6) xG statistics
print("\n" + "=" * 60)
print("⚽ xG STATISTICS")
print("=" * 60)

xg_stats = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS matches_with_xg,
        ROUND(MIN(home_xg), 2) AS min_home_xg,
        ROUND(MAX(home_xg), 2) AS max_home_xg,
        ROUND(AVG(home_xg), 2) AS avg_home_xg,
        ROUND(MIN(away_xg), 2) AS min_away_xg,  
        ROUND(MAX(away_xg), 2) AS max_away_xg,
        ROUND(AVG(away_xg), 2) AS avg_away_xg,
        ROUND(AVG(home_xg + away_xg), 2) AS avg_total_xg
    FROM Match 
    WHERE home_xg IS NOT NULL AND away_xg IS NOT NULL
""", conn)
print("xG Statistics:")
print(xg_stats.to_string(index=False))

# 7) Attendance analysis
print("\n" + "=" * 60)
print("🏟️ ATTENDANCE ANALYSIS")
print("=" * 60)

attendance_stats = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS matches_with_attendance,
        MIN(attendance) AS min_attendance,
        MAX(attendance) AS max_attendance,
        ROUND(AVG(attendance), 0) AS avg_attendance,
        COUNT(CASE WHEN attendance > 10000 THEN 1 END) AS matches_over_10k,
        COUNT(CASE WHEN attendance > 20000 THEN 1 END) AS matches_over_20k
    FROM Match 
    WHERE attendance IS NOT NULL
""", conn)
print("Attendance Statistics:")
print(attendance_stats.to_string(index=False))

# Highest attended matches
high_attendance = pd.read_sql_query("""
    SELECT 
        m.match_date,
        h.team_name AS home_team,
        a.team_name AS away_team,
        m.attendance,
        m.venue
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id
    JOIN teams a ON m.away_team_id = a.team_id
    WHERE m.attendance IS NOT NULL
    ORDER BY m.attendance DESC
    LIMIT 5
""", conn)
print("\nHighest Attendance Matches:")
print(high_attendance.to_string(index=False))

# 8) Venue analysis
print("\n" + "=" * 60)
print("🏟️ VENUE ANALYSIS")
print("=" * 60)

venue_stats = pd.read_sql_query("""
    SELECT 
        venue,
        COUNT(*) AS match_count,
        ROUND(AVG(attendance), 0) AS avg_attendance
    FROM Match 
    WHERE venue IS NOT NULL
    GROUP BY venue
    HAVING match_count >= 3
    ORDER BY match_count DESC
    LIMIT 10
""", conn)
print("Most Used Venues (3+ matches):")
print(venue_stats.to_string(index=False))

# 9) Data integrity checks
print("\n" + "=" * 60)
print("🔍 DATA INTEGRITY CHECKS")
print("=" * 60)

# Duplicate check
duplicates = pd.read_sql_query("""
    SELECT 
        match_date, 
        home_team_id, 
        away_team_id, 
        COUNT(*) AS duplicate_count
    FROM Match
    GROUP BY match_date, home_team_id, away_team_id
    HAVING COUNT(*) > 1
""", conn)
if len(duplicates) > 0:
    print("⚠️ Duplicate matches found:")
    print(duplicates.to_string(index=False))
else:
    print("✅ No duplicate matches found")

# Invalid team references
team_check = pd.read_sql_query("""
    SELECT COUNT(*) AS invalid_teams
    FROM Match m
    WHERE m.home_team_id NOT IN (SELECT team_id FROM teams)
       OR m.away_team_id NOT IN (SELECT team_id FROM teams)
""", conn)
if team_check.iloc[0]['invalid_teams'] > 0:
    print(f"⚠️ Found {team_check.iloc[0]['invalid_teams']} matches with invalid team references")
else:
    print("✅ All team references are valid")

conn.close()

print("\n" + "=" * 60)
print("🎉 DIAGNOSTIC COMPLETE")
print("=" * 60)


🔍 COMPREHENSIVE MATCH TABLE DIAGNOSTICS

📊 BASIC TABLE INFORMATION
Table Structure:
    cid               name     type  notnull dflt_value  pk
0     0           match_id     TEXT        0       None   1
1     1          season_id  INTEGER        0       None   0
2     2         match_date     DATE        0       None   0
3     3         match_time     TIME        0       None   0
4     4       home_team_id     TEXT        0       None   0
5     5       away_team_id     TEXT        0       None   0
6     6         home_goals  INTEGER        0       None   0
7     7         away_goals  INTEGER        0       None   0
8     8            home_xg     REAL        0       None   0
9     9            away_xg     REAL        0       None   0
10   10         attendance  INTEGER        0       None   0
11   11              venue     TEXT        0       None   0
12   12            referee     TEXT        0       None   0
13   13        temperature     REAL        0       None   0
14   14         

In [51]:
# ══════════════════════════════════════════════════════════════════════════════
#  CLEAR INCORRECT xG DATA FROM MATCH TABLE
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

print("🧹 CLEARING INCORRECT xG DATA\n")

# 1) Check current xG values before clearing
print("📊 Before clearing:")
xg_before = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS total_matches,
        COUNT(CASE WHEN home_xg IS NOT NULL THEN 1 END) AS matches_with_home_xg,
        COUNT(CASE WHEN away_xg IS NOT NULL THEN 1 END) AS matches_with_away_xg,
        home_xg, away_xg
    FROM Match
    WHERE home_xg IS NOT NULL OR away_xg IS NOT NULL
    LIMIT 5
""", conn)
print(xg_before)

# 2) Clear the xG values
print("\n🔄 Setting all home_xg and away_xg values to NULL...")

cur.execute("UPDATE Match SET home_xg = NULL")
cur.execute("UPDATE Match SET away_xg = NULL")
conn.commit()

print("✅ xG values cleared")

# 3) Verify the change
print("\n📊 After clearing:")
xg_after = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS total_matches,
        COUNT(CASE WHEN home_xg IS NOT NULL THEN 1 END) AS matches_with_home_xg,
        COUNT(CASE WHEN away_xg IS NOT NULL THEN 1 END) AS matches_with_away_xg
    FROM Match
""", conn)
print(xg_after)

# 4) Quick verification of cleaned sample
print("\n📋 Updated Match Table Sample:")
clean_sample = pd.read_sql_query("""
    SELECT 
        m.match_date,
        h.team_name AS home_team,
        a.team_name AS away_team,
        m.home_xg,
        m.away_xg,
        m.season_id
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id  
    JOIN teams a ON m.away_team_id = a.team_id
    ORDER BY m.match_date DESC
    LIMIT 5
""", conn)
print(clean_sample.to_string(index=False))

conn.close()

print("\n🎉 xG data successfully cleared!")
print("   💡 These values will be calculated later from shot-level data")
print("   ✅ Match table now contains clean core match information")


🧹 CLEARING INCORRECT xG DATA

📊 Before clearing:
   total_matches  matches_with_home_xg  matches_with_away_xg  home_xg  away_xg
0            746                   746                   746      1.9     1.28

🔄 Setting all home_xg and away_xg values to NULL...
✅ xG values cleared

📊 After clearing:
   total_matches  matches_with_home_xg  matches_with_away_xg
0            749                     0                     0

📋 Updated Match Table Sample:
match_date              home_team         away_team home_xg away_xg  season_id
2025-06-22      San Diego Wave FC Washington Spirit    None    None       2025
2025-06-21              Gotham FC            Bay FC    None    None       2025
2025-06-21            Utah Royals  Seattle Reign FC    None    None       2025
2025-06-21 North Carolina Courage      Houston Dash    None    None       2025
2025-06-21     Portland Thorns FC  Chicago Stars FC    None    None       2025

🎉 xG data successfully cleared!
   💡 These values will be calculated late

In [52]:
# ══════════════════════════════════════════════════════════════════════════════
#  CREATE TEAM_SEASONS TABLE FROM REFERENCE DATA
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)
cur = conn.cursor()

print("🏗️ CREATING TEAM_SEASONS REFERENCE TABLE\n")

# 1) Create the team_seasons table
cur.execute("DROP TABLE IF EXISTS team_seasons")
cur.execute("""
    CREATE TABLE team_seasons (
        season INTEGER,
        team_short_name TEXT,
        team_id TEXT,
        FOREIGN KEY (team_id) REFERENCES teams(team_id)
    )
""")

# 2) Manual mapping from teams_seasons.md to team_id
team_name_mapping = {
    # Current/recent names
    'Angel City': 'ae38d267',
    'Bay FC': '231a532f',
    'Chicago Stars': 'd976a235',  # 2025 rename
    'Courage': '85c458aa',
    'Current': '6f666306',        # Kansas City Current
    'Dash': 'e813709a',
    'Gotham FC': '8e306dc6',
    'Louisville': 'da19ebd1',
    'Pride': '2a6178ac',
    'Reign': '257fad2b',
    'Royals': 'd4c130bc',
    'Spirit': 'e442aad0',
    'Thorns': 'df9a10a1',
    'Wave': 'bf961da0',

    # Historical names
    'Red Stars': 'd976a235',         # Before 2025
    'Kansas City': '6f666306',       # Before 2022
    'Sky Blue FC': '8e306dc6',       # Became Gotham FC
    'Boston Breakers': 'ab757728',
    'WNY Flash': '5f911568'
}

# 3) Season-team participation data
seasons_data = {
    2013: ['Boston Breakers', 'Kansas City', 'Red Stars', 'Reign', 'Sky Blue FC', 'Spirit', 'Thorns', 'WNY Flash'],
    2014: ['Boston Breakers', 'Dash', 'Kansas City', 'Red Stars', 'Reign', 'Sky Blue FC', 'Spirit', 'Thorns', 'WNY Flash'],
    2015: ['Boston Breakers', 'Dash', 'Kansas City', 'Red Stars', 'Reign', 'Sky Blue FC', 'Spirit', 'Thorns', 'WNY Flash'],
    2016: ['Boston Breakers', 'Dash', 'Kansas City', 'Pride', 'Red Stars', 'Reign', 'Sky Blue FC', 'Spirit', 'Thorns', 'WNY Flash'],
    2017: ['Boston Breakers', 'Courage', 'Dash', 'Kansas City', 'Pride', 'Red Stars', 'Reign', 'Sky Blue FC', 'Spirit', 'Thorns'],
    2018: ['Courage', 'Dash', 'Pride', 'Red Stars', 'Reign', 'Royals', 'Sky Blue FC', 'Spirit', 'Thorns'],
    2019: ['Courage', 'Dash', 'Pride', 'Red Stars', 'Reign', 'Royals', 'Sky Blue FC', 'Spirit', 'Thorns'],
    2020: ['Courage', 'Dash', 'Pride', 'Red Stars', 'Reign', 'Royals', 'Sky Blue FC', 'Spirit', 'Thorns'],
    2021: ['Courage', 'Dash', 'Gotham FC', 'Kansas City', 'Louisville', 'Pride', 'Red Stars', 'Reign', 'Spirit', 'Thorns'],
    2022: ['Angel City', 'Courage', 'Current', 'Dash', 'Gotham FC', 'Louisville', 'Pride', 'Red Stars', 'Reign', 'Spirit', 'Thorns', 'Wave'],
    2023: ['Angel City', 'Courage', 'Current', 'Dash', 'Gotham FC', 'Louisville', 'Pride', 'Red Stars', 'Reign', 'Spirit', 'Thorns', 'Wave'],
    2024: ['Angel City', 'Bay FC', 'Courage', 'Current', 'Dash', 'Gotham FC', 'Louisville', 'Pride', 'Red Stars', 'Reign', 'Royals', 'Spirit', 'Thorns', 'Wave'],
    2025: ['Angel City', 'Bay FC', 'Chicago Stars', 'Courage', 'Current', 'Dash', 'Gotham FC', 'Louisville', 'Pride', 'Reign', 'Royals', 'Spirit', 'Thorns', 'Wave']
}

# 4) Insert records
total_inserted = 0
for season, teams in seasons_data.items():
    for team_short in teams:
        if team_short in team_name_mapping:
            team_id = team_name_mapping[team_short]
            cur.execute("INSERT INTO team_seasons VALUES (?, ?, ?)", (season, team_short, team_id))
            total_inserted += 1
        else:
            print(f"⚠️ Warning: No mapping found for '{team_short}' in season {season}")

conn.commit()
print(f"✅ Inserted {total_inserted} team-season records")

# 5) Verify summary
print("\n📊 Team-Season Summary:")
summary = pd.read_sql_query("""
    SELECT 
        season,
        COUNT(*) AS team_count,
        GROUP_CONCAT(team_short_name, ', ') AS teams
    FROM team_seasons
    GROUP BY season
    ORDER BY season
""", conn)

for _, row in summary.iterrows():
    print(f"{row['season']}: {row['team_count']} teams - {row['teams']}")

# 6) Corrected season analysis
print(f"\n🔧 CORRECTED SEASON ANALYSIS:")
corrected_analysis = pd.read_sql_query("""
    SELECT 
        m.season_id AS season,
        COUNT(*) AS match_count,
        ts.team_count AS actual_teams_in_season,
        MIN(m.match_date) AS season_start,
        MAX(m.match_date) AS season_end
    FROM Match m
    LEFT JOIN (
        SELECT season, COUNT(*) AS team_count 
        FROM team_seasons 
        GROUP BY season
    ) ts ON m.season_id = ts.season
    WHERE m.season_id IS NOT NULL
    GROUP BY m.season_id, ts.team_count
    ORDER BY m.season_id DESC
""", conn)

print(corrected_analysis.to_string(index=False))

conn.close()
print("\n🎉 Team-seasons reference table created successfully!")


🏗️ CREATING TEAM_SEASONS REFERENCE TABLE

✅ Inserted 135 team-season records

📊 Team-Season Summary:
2013: 8 teams - Boston Breakers, Kansas City, Red Stars, Reign, Sky Blue FC, Spirit, Thorns, WNY Flash
2014: 9 teams - Boston Breakers, Dash, Kansas City, Red Stars, Reign, Sky Blue FC, Spirit, Thorns, WNY Flash
2015: 9 teams - Boston Breakers, Dash, Kansas City, Red Stars, Reign, Sky Blue FC, Spirit, Thorns, WNY Flash
2016: 10 teams - Boston Breakers, Dash, Kansas City, Pride, Red Stars, Reign, Sky Blue FC, Spirit, Thorns, WNY Flash
2017: 10 teams - Boston Breakers, Courage, Dash, Kansas City, Pride, Red Stars, Reign, Sky Blue FC, Spirit, Thorns
2018: 9 teams - Courage, Dash, Pride, Red Stars, Reign, Royals, Sky Blue FC, Spirit, Thorns
2019: 9 teams - Courage, Dash, Pride, Red Stars, Reign, Royals, Sky Blue FC, Spirit, Thorns
2020: 9 teams - Courage, Dash, Pride, Red Stars, Reign, Royals, Sky Blue FC, Spirit, Thorns
2021: 10 teams - Courage, Dash, Gotham FC, Kansas City, Louisville, Pr

In [53]:
# ══════════════════════════════════════════════════════════════════════════════
#  INVESTIGATE MATCH CATEGORIZATION POSSIBILITIES
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
import re

# Connect to database
db_path = "data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 INVESTIGATING MATCH CATEGORIZATION POSSIBILITIES\n")

# 1) Analyze filenames for clues
print("=" * 60)
print("📂 ANALYZING FILENAMES FOR MATCH TYPE CLUES")
print("=" * 60)

filename_sample = pd.read_sql_query("""
    SELECT filename, match_date, season_id
    FROM Match 
    WHERE filename IS NOT NULL
    ORDER BY match_date DESC
    LIMIT 20
""", conn)

print("Sample filenames (recent matches):")
for _, row in filename_sample.iterrows():
    print(f"{row['match_date']}: {row['filename']}")

# 2) Search for playoff-related keywords in filenames
print("\n" + "=" * 60)
print("🔍 SEARCHING FOR PLAYOFF INDICATORS IN FILENAMES")
print("=" * 60)

playoff_keywords = ['playoff', 'final', 'semi', 'quarter', 'championship', 'cup']
playoff_matches = []

for keyword in playoff_keywords:
    matches = pd.read_sql_query(f"""
        SELECT filename, match_date, season_id
        FROM Match 
        WHERE filename LIKE '%{keyword}%'
           OR filename LIKE '%{keyword.title()}%'
           OR filename LIKE '%{keyword.upper()}%'
        ORDER BY match_date DESC
        LIMIT 10
    """, conn)

    if not matches.empty:
        print(f"\n🔍 Files containing '{keyword}':")
        for _, row in matches.iterrows():
            print(f"  {row['match_date']}: {row['filename']}")
        playoff_matches.extend(matches.to_dict('records'))

# 3) Analyze timing patterns by month
print("\n" + "=" * 60)
print("📅 ANALYZING MATCH TIMING PATTERNS")
print("=" * 60)

timing_analysis = pd.read_sql_query("""
    SELECT 
        season_id,
        strftime('%m', match_date) AS month,
        COUNT(*) AS match_count,
        MIN(match_date) AS first_match,
        MAX(match_date) AS last_match
    FROM Match
    WHERE season_id IN (2021, 2022, 2023, 2024)
    GROUP BY season_id, month
    ORDER BY season_id DESC, month DESC
""", conn)

print("Matches by month (recent seasons):")
print(timing_analysis.to_string(index=False))

# 4) Examine matches in Oct-Dec for potential playoffs
print("\n" + "=" * 60)
print("🏆 POTENTIAL PLAYOFF MATCHES (Late Season)")
print("=" * 60)

potential_playoffs = pd.read_sql_query("""
    SELECT 
        m.match_date,
        h.team_name AS home_team,
        a.team_name AS away_team,
        m.season_id,
        m.filename
    FROM Match m
    JOIN teams h ON m.home_team_id = h.team_id
    JOIN teams a ON m.away_team_id = a.team_id
    WHERE strftime('%m', m.match_date) IN ('10', '11', '12')
      AND m.season_id >= 2021
    ORDER BY m.season_id DESC, m.match_date DESC
    LIMIT 20
""", conn)

print("Late-season matches (Oct–Dec, likely playoffs):")
print(potential_playoffs[['match_date', 'home_team', 'away_team', 'season_id']].to_string(index=False))

# 5) Analyze filenames of late-season matches for playoff tags
print("\n" + "=" * 60)
print("🔍 OTHER METADATA ANALYSIS")
print("=" * 60)

sample_with_context = pd.read_sql_query("""
    SELECT filename, match_date, season_id
    FROM Match
    WHERE strftime('%m', match_date) IN ('10', '11')
      AND season_id >= 2022
    ORDER BY season_id DESC, match_date DESC
    LIMIT 10
""", conn)

print("Sample late-season filename analysis:")
for _, row in sample_with_context.iterrows():
    filename = row['filename']
    indicators = []

    if 'final' in filename.lower(): indicators.append('FINAL')
    if 'semi' in filename.lower(): indicators.append('SEMI')
    if 'quarter' in filename.lower(): indicators.append('QUARTER')
    if 'playoff' in filename.lower(): indicators.append('PLAYOFF')
    if 'championship' in filename.lower(): indicators.append('CHAMPIONSHIP')

    tag = f" [{', '.join(indicators)}]" if indicators else ""
    print(f"{row['match_date']}: {filename}{tag}")

# 6) Count total matches per season and split by month
print("\n" + "=" * 60)
print("📊 SEASON MATCH TOTALS (Estimate Regular vs Playoff)")
print("=" * 60)

season_totals = pd.read_sql_query("""
    SELECT 
        season_id,
        COUNT(*) AS total_matches,
        COUNT(CASE WHEN strftime('%m', match_date) <= '09' THEN 1 END) AS likely_regular_season,
        COUNT(CASE WHEN strftime('%m', match_date) >= '10' THEN 1 END) AS likely_playoffs
    FROM Match
    WHERE season_id >= 2021
    GROUP BY season_id
    ORDER BY season_id DESC
""", conn)

print(season_totals.to_string(index=False))

conn.close()

# 7) Summary & Recommendations
print("\n" + "=" * 60)
print("💡 RECOMMENDATIONS")
print("=" * 60)
print("Based on this analysis:")
print("- Use keywords in filenames (e.g., 'final', 'semifinal') to flag playoffs")
print("- Assume matches in Oct–Dec as likely playoffs if keywords are missing")
print("- Cross-reference season totals to estimate playoff count vs regular season")
print("- Consider adding a 'match_type' column with values like 'regular', 'playoff', 'championship'")


🔍 INVESTIGATING MATCH CATEGORIZATION POSSIBILITIES

📂 ANALYZING FILENAMES FOR MATCH TYPE CLUES
Sample filenames (recent matches):
2025-06-22: San Diego Wave vs. Washington Spirit Match Report – Sunday June 22, 2025 _ FBref.com.html
2025-06-21: Gotham FC vs. Bay FC Match Report – Saturday June 21, 2025 _ FBref.com.html
2025-06-21: Utah Royals vs. Seattle Reign FC Match Report – Saturday June 21, 2025 _ FBref.com.html
2025-06-21: North Carolina Courage vs. Houston Dash Match Report – Saturday June 21, 2025 _ FBref.com.html
2025-06-21: Portland Thorns FC vs. Chicago Stars Match Report – Saturday June 21, 2025 _ FBref.com.html
2025-06-21: 6be469ca.html
2025-06-20: Kansas City Current vs. Angel City FC Match Report – Friday June 20, 2025 _ FBref.com.html
2025-06-20: Racing Louisville vs. Orlando Pride Match Report – Friday June 20, 2025 _ FBref.com.html
2025-06-15: Portland Thorns FC vs. Washington Spirit Match Report – Sunday June 15, 2025 _ FBref.com.html
2025-06-14: Angel City FC vs. Nor

In [54]:
# ══════════════════════════════════════════════════════════════════════════════
#  EXPLORE SEASON OVERVIEW PAGES FOR MATCH CATEGORIZATION
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
import re

print("🔍 EXPLORING SEASON OVERVIEW PAGES FOR MATCH TYPES\n")

# 1) Get all season overview files
season_pages_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages")
season_files = list(season_pages_dir.glob("*.html"))

print(f"📂 Found {len(season_files)} season overview files:")
for file in sorted(season_files):
    print(f"   {file.name}")

# 2) Extract match data from a single overview file
def extract_season_matches(html_path):
    """Extract all matches from a season overview page"""

    print(f"\n🔍 Analyzing: {html_path.name}")
    soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")

    fixtures_table = soup.find("table", {"id": "sched_all"})
    if not fixtures_table:
        print("   ❌ No fixtures table found")
        return []

    matches = []
    rows = fixtures_table.find_all("tr")

    for row in rows[1:]:  # Skip header row
        cells = row.find_all(["td", "th"])
        if len(cells) >= 10:
            try:
                round_cell = row.find(attrs={"data-stat": "round"})
                date_cell = row.find(attrs={"data-stat": "date"})
                home_cell = row.find(attrs={"data-stat": "home_team"})
                away_cell = row.find(attrs={"data-stat": "away_team"})
                gameweek_cell = row.find(attrs={"data-stat": "gameweek"})

                if round_cell and date_cell and home_cell and away_cell:
                    match_round = round_cell.get_text(strip=True)
                    match_date = date_cell.get_text(strip=True)
                    home_team = home_cell.get_text(strip=True)
                    away_team = away_cell.get_text(strip=True)
                    gameweek = gameweek_cell.get_text(strip=True) if gameweek_cell else ""

                    matches.append({
                        'round': match_round,
                        'date': match_date,
                        'home_team': home_team,
                        'away_team': away_team,
                        'gameweek': gameweek,
                        'source_file': html_path.name
                    })
            except Exception:
                continue

    print(f"   ✅ Extracted {len(matches)} matches")
    return matches

# 3) Extract from selected recent files
test_files = [
    "2024 NWSL Scores & Fixtures _ FBref.com.html",
    "2023 NWSL Scores & Fixtures _ FBref.com.html",
    "2022 NWSL Scores & Fixtures _ FBref.com.html"
]

all_matches = []
for filename in test_files:
    file_path = season_pages_dir / filename
    if file_path.exists():
        matches = extract_season_matches(file_path)
        all_matches.extend(matches)

# 4) Convert to DataFrame and analyze
if all_matches:
    df_matches = pd.DataFrame(all_matches)

    print(f"\n📊 EXTRACTED MATCH SUMMARY")
    print(f"Total matches extracted: {len(df_matches)}")

    # Analyze round types
    print(f"\n🏆 ROUND TYPES FOUND:")
    round_counts = df_matches['round'].value_counts()
    for round_type, count in round_counts.items():
        print(f"   {round_type}: {count} matches")

    # Show sample matches by round type
    print(f"\n📝 SAMPLE MATCHES BY ROUND TYPE:")
    for round_type in round_counts.index[:5]:
        print(f"\n--- {round_type} ---")
        sample_matches = df_matches[df_matches['round'] == round_type].head(3)
        for _, match in sample_matches.iterrows():
            print(f"   {match['date']}: {match['home_team']} vs {match['away_team']}")

    # Look for playoff indicators
    print(f"\n🔍 SEARCHING FOR PLAYOFF MATCHES:")
    playoff_keywords = ['playoff', 'final', 'semi', 'quarter', 'championship']

    playoff_matches = df_matches[
        df_matches['round'].str.lower().str.contains('|'.join(playoff_keywords), na=False)
    ]

    if not playoff_matches.empty:
        print(f"   ✅ Found {len(playoff_matches)} playoff matches!")
        print("\n   Playoff Round Types:")
        for round_type in playoff_matches['round'].unique():
            count = len(playoff_matches[playoff_matches['round'] == round_type])
            print(f"      {round_type}: {count} matches")
    else:
        print("   ❌ No explicit playoff matches found in extracted data")
        print("   💡 Playoff data might be loaded dynamically via JavaScript")

else:
    print("❌ No matches extracted from any files")

# 5) Recommendations
print("\n" + "=" * 60)
print("💡 NEXT STEPS")
print("=" * 60)
print("Based on this analysis, we can determine the best approach for match categorization:")
print("- If we found playoff data: Use the 'round' field directly")
print("- If no playoff data: Use temporal classification (month-based)")
print("- Consider extracting more season files or using JavaScript rendering for dynamic tables")


🔍 EXPLORING SEASON OVERVIEW PAGES FOR MATCH TYPES

📂 Found 13 season overview files:
   2013 NWSL Scores & Fixtures _ FBref.com.html
   2014 NWSL Scores & Fixtures _ FBref.com.html
   2015 NWSL Scores & Fixtures _ FBref.com.html
   2016 NWSL Scores & Fixtures _ FBref.com.html
   2017 NWSL Scores & Fixtures _ FBref.com.html
   2018 NWSL Scores & Fixtures _ FBref.com.html
   2019 NWSL Scores & Fixtures _ FBref.com.html
   2020 NWSL Scores & Fixtures _ FBref.com.html
   2021 NWSL Scores & Fixtures _ FBref.com.html
   2022 NWSL Scores & Fixtures _ FBref.com.html
   2023 NWSL Scores & Fixtures _ FBref.com.html
   2024 NWSL Scores & Fixtures _ FBref.com.html
   NWSL Scores & Fixtures _ FBref.com.html

🔍 Analyzing: 2024 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Extracted 218 matches

🔍 Analyzing: 2023 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Extracted 159 matches

🔍 Analyzing: 2022 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Extracted 142 matches

📊 EXTRACTED MATCH SUMMARY
Total match

In [55]:
# ══════════════════════════════════════════════════════════════════════════════
#  DIAGNOSE AND FIX SEASON OVERVIEW PARSING ISSUES
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

print("🔧 DIAGNOSING SEASON OVERVIEW PARSING ISSUES\n")

# 1) Enhanced extraction function with debugging
def extract_season_matches_detailed(html_path):
    """Extract matches with detailed debugging info"""

    print(f"\n🔍 Deep analysis: {html_path.name}")

    soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")

    # Try multiple ways to locate the fixtures table
    possible_tables = [
        soup.find("table", {"id": "sched_all"}),
        soup.find("table", {"class": "stats_table"}),
        soup.find("table", attrs={"data-cols-to-freeze": True}),
    ]

    fixtures_table = next((table for table in possible_tables if table), None)

    if not fixtures_table:
        print("   ❌ No fixtures table found with any method")
        return [], []

    print(f"   ✅ Found fixtures table")
    rows = fixtures_table.find_all("tr")
    print(f"   📊 Total rows found: {len(rows)}")

    if rows:
        headers = [th.get_text(strip=True) for th in rows[0].find_all(["th", "td"])]
        print(f"   📋 Table headers: {headers[:8]}...")

    good_matches = []
    problem_matches = []

    for i, row in enumerate(rows[1:], 1):  # Skip header row
        cells = row.find_all(["td", "th"])
        if len(cells) < 8:
            continue

        try:
            # Method 1: Use data-stat attributes
            round_cell = row.find(attrs={"data-stat": "round"})
            date_cell = row.find(attrs={"data-stat": "date"})
            home_cell = row.find(attrs={"data-stat": "home_team"})
            away_cell = row.find(attrs={"data-stat": "away_team"})
            gameweek_cell = row.find(attrs={"data-stat": "gameweek"})

            # Fallback to column positions if missing
            if not round_cell and len(cells) > 0:
                round_cell = cells[0]
            if not date_cell and len(cells) > 3:
                date_cell = cells[3]
            if not home_cell and len(cells) > 5:
                home_cell = cells[5]
            if not away_cell and len(cells) > 9:
                away_cell = cells[9]

            # Extract text safely
            match_data = {
                'round': round_cell.get_text(strip=True) if round_cell else "",
                'date': date_cell.get_text(strip=True) if date_cell else "",
                'home_team': home_cell.get_text(strip=True) if home_cell else "",
                'away_team': away_cell.get_text(strip=True) if away_cell else "",
                'gameweek': gameweek_cell.get_text(strip=True) if gameweek_cell else "",
                'source_file': html_path.name,
                'row_number': i,
                'issues': []
            }

            # Validation
            is_good = True
            if not match_data['round']:
                match_data['issues'].append("empty_round")
                is_good = False
            if not match_data['date'] or match_data['date'] == "Date":
                match_data['issues'].append("empty_date")
                is_good = False
            if not match_data['home_team'] or match_data['home_team'] in ["Home", ""]:
                match_data['issues'].append("empty_home_team")
                is_good = False
            if not match_data['away_team'] or match_data['away_team'] in ["Away", ""]:
                match_data['issues'].append("empty_away_team")
                is_good = False

            if is_good:
                good_matches.append(match_data)
            else:
                problem_matches.append(match_data)

        except Exception as e:
            problem_matches.append({
                'round': 'ERROR',
                'date': 'ERROR',
                'home_team': 'ERROR',
                'away_team': 'ERROR',
                'gameweek': '',
                'source_file': html_path.name,
                'row_number': i,
                'issues': [f'exception: {str(e)}']
            })

    print(f"   ✅ Good matches: {len(good_matches)}")
    print(f"   ⚠️  Problem matches: {len(problem_matches)}")

    if problem_matches:
        print("   🔍 Problem match issues:")
        issue_counts = {}
        for match in problem_matches:
            for issue in match['issues']:
                issue_counts[issue] = issue_counts.get(issue, 0) + 1
        for issue, count in issue_counts.items():
            print(f"      {issue}: {count} matches")

    return good_matches, problem_matches

# 2) Run test on a sample file
test_file = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages/2024 NWSL Scores & Fixtures _ FBref.com.html")

good_matches, problem_matches = extract_season_matches_detailed(test_file)

# 3) Show detailed problem match analysis
if problem_matches:
    print(f"\n🔍 DETAILED PROBLEM ANALYSIS")
    print("First 5 problem matches:")
    for match in problem_matches[:5]:
        print(f"\n   Row {match['row_number']}: {match['issues']}")
        print(f"   Round: '{match['round']}'")
        print(f"   Date: '{match['date']}'")
        print(f"   Teams: '{match['home_team']}' vs '{match['away_team']}'")

# 4) Show sample good matches
print(f"\n✅ SAMPLE GOOD MATCHES")
for match in good_matches[:3]:
    print(f"   {match['date']}: {match['home_team']} vs {match['away_team']} ({match['round']})")

print("\n" + "=" * 60)
print("🔧 NEXT: Based on this analysis, we'll fix the parsing logic")
print("=" * 60)


🔧 DIAGNOSING SEASON OVERVIEW PARSING ISSUES


🔍 Deep analysis: 2024 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Found fixtures table
   📊 Total rows found: 219
   📋 Table headers: ['Round', 'Wk', 'Day', 'Date', 'Time', 'Home', 'xG', 'Score']...
   ✅ Good matches: 189
   ⚠️  Problem matches: 29
   🔍 Problem match issues:
      empty_round: 29 matches
      empty_date: 29 matches
      empty_home_team: 29 matches
      empty_away_team: 29 matches

🔍 DETAILED PROBLEM ANALYSIS
First 5 problem matches:

   Row 7: ['empty_round', 'empty_date', 'empty_home_team', 'empty_away_team']
   Round: ''
   Date: ''
   Teams: '' vs ''

   Row 15: ['empty_round', 'empty_date', 'empty_home_team', 'empty_away_team']
   Round: ''
   Date: ''
   Teams: '' vs ''

   Row 23: ['empty_round', 'empty_date', 'empty_home_team', 'empty_away_team']
   Round: ''
   Date: ''
   Teams: '' vs ''

   Row 31: ['empty_round', 'empty_date', 'empty_home_team', 'empty_away_team']
   Round: ''
   Date: ''
   Teams: '' vs ''



In [56]:
# ══════════════════════════════════════════════════════════════════════════════
#  FIXED SEASON OVERVIEW EXTRACTION – SKIP EMPTY ROWS
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

def extract_season_matches_clean(html_path):
    """Extract matches, automatically skipping empty HTML artifacts"""

    print(f"🔍 Processing: {html_path.name}")

    soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "lxml")
    fixtures_table = soup.find("table", {"id": "sched_all"})

    if not fixtures_table:
        print("   ❌ No fixtures table found")
        return []

    matches = []
    rows = fixtures_table.find_all("tr")[1:]  # Skip header

    for row in rows:
        # Extract match data using data-stat attributes
        round_cell = row.find(attrs={"data-stat": "round"})
        date_cell = row.find(attrs={"data-stat": "date"})
        home_cell = row.find(attrs={"data-stat": "home_team"})
        away_cell = row.find(attrs={"data-stat": "away_team"})
        gameweek_cell = row.find(attrs={"data-stat": "gameweek"})

        # Get text content
        match_round = round_cell.get_text(strip=True) if round_cell else ""
        match_date = date_cell.get_text(strip=True) if date_cell else ""
        home_team = home_cell.get_text(strip=True) if home_cell else ""
        away_team = away_cell.get_text(strip=True) if away_cell else ""
        gameweek = gameweek_cell.get_text(strip=True) if gameweek_cell else ""

        # KEY FIX: Skip completely empty rows (HTML artifacts)
        if not any([match_round, match_date, home_team, away_team]):
            continue

        # Keep only rows with required match data
        if match_date and home_team and away_team:
            matches.append({
                'round': match_round,
                'date': match_date,
                'home_team': home_team,
                'away_team': away_team,
                'gameweek': gameweek,
                'source_file': html_path.name
            })

    print(f"   ✅ Extracted {len(matches)} clean matches")
    return matches

# Test the fix on known files
test_files = [
    "2024 NWSL Scores & Fixtures _ FBref.com.html",
    "2023 NWSL Scores & Fixtures _ FBref.com.html",
    "2022 NWSL Scores & Fixtures _ FBref.com.html"
]

season_pages_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages")
all_clean_matches = []

for filename in test_files:
    file_path = season_pages_dir / filename
    if file_path.exists():
        matches = extract_season_matches_clean(file_path)
        all_clean_matches.extend(matches)

# Analyze cleaned results
if all_clean_matches:
    df_clean = pd.DataFrame(all_clean_matches)

    print(f"\n🎉 CLEAN EXTRACTION RESULTS")
    print(f"Total clean matches: {len(df_clean)}")

    print(f"\n🏆 ROUND TYPES (CLEAN):")
    round_counts = df_clean['round'].value_counts()
    for round_type, count in round_counts.items():
        print(f"   {round_type}: {count} matches")

    # Look for playoff match indicators
    playoff_matches = df_clean[
        df_clean['round'].str.contains('final|playoff|quarter|semi', case=False, na=False)
    ]

    print(f"\n🏆 PLAYOFF MATCHES FOUND: {len(playoff_matches)}")
    for _, match in playoff_matches.head(10).iterrows():
        print(f"   {match['date']}: {match['home_team']} vs {match['away_team']} ({match['round']})")

print("\n✅ Empty rows filtered out – data is now clean!")


🔍 Processing: 2024 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Extracted 189 clean matches
🔍 Processing: 2023 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Extracted 137 clean matches
🔍 Processing: 2022 NWSL Scores & Fixtures _ FBref.com.html
   ✅ Extracted 142 clean matches

🎉 CLEAN EXTRACTION RESULTS
Total clean matches: 468

🏆 ROUND TYPES (CLEAN):
   Regular Season: 446 matches
   Semifinals: 6 matches
   Round: 5 matches
   Quarterfinals: 4 matches
   First Round: 4 matches
   Final: 3 matches

🏆 PLAYOFF MATCHES FOUND: 13
   2024-11-09: Current vs Courage (Quarterfinals)
   2024-11-08: Pride vs Red Stars (Quarterfinals)
   2024-11-10: Spirit vs Bay FC (Quarterfinals)
   2024-11-10: Gotham FC vs Thorns (Quarterfinals)
   2024-11-16: Spirit vs Gotham FC (Semifinals)
   2024-11-17: Pride vs Current (Semifinals)
   2024-11-23: Pride vs Spirit (Final)
   2023-11-05: Thorns vs Gotham FC (Semifinals)
   2023-11-05: Wave vs Reign (Semifinals)
   2023-11-11: Reign vs Gotham FC (Final)

✅ Em

In [57]:
# ══════════════════════════════════════════════════════════════════════════════
#  SIMPLE BINARY CLASSIFICATION: REGULAR SEASON vs NON-REGULAR SEASON
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

print("🎯 SIMPLE MATCH CLASSIFICATION: Regular Season vs Non-Regular Season\n")

def extract_with_simple_classification():
    """Extract matches with simple regular/non-regular classification"""
    
    season_pages_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages")
    season_files = list(season_pages_dir.glob("*NWSL Scores*.html"))

    all_matches = []

    for file_path in sorted(season_files):
        if "Scores & Fixtures" in file_path.name:
            year_match = file_path.name.split()[0]
            if not year_match.isdigit():
                continue
            season_year = int(year_match)

            print(f"📂 Processing {season_year}...")

            soup = BeautifulSoup(file_path.read_text(encoding="utf-8"), "lxml")
            fixtures_table = soup.find("table", {"id": "sched_all"})

            if not fixtures_table:
                print(f"   ❌ No table found")
                continue

            regular_count = 0
            non_regular_count = 0
            rows = fixtures_table.find_all("tr")[1:]  # Skip header row

            for row in rows:
                round_cell = row.find(attrs={"data-stat": "round"})
                date_cell = row.find(attrs={"data-stat": "date"})
                home_cell = row.find(attrs={"data-stat": "home_team"})
                away_cell = row.find(attrs={"data-stat": "away_team"})

                match_round = round_cell.get_text(strip=True) if round_cell else ""
                match_date = date_cell.get_text(strip=True) if date_cell else ""
                home_team = home_cell.get_text(strip=True) if home_cell else ""
                away_team = away_cell.get_text(strip=True) if away_cell else ""

                # Skip empty rows
                if not any([match_round, match_date, home_team, away_team]):
                    continue

                # Only process valid match rows
                if match_date and home_team and away_team:
                    is_regular_season = (match_round.lower().strip() == 'regular season')
                    match_data = {
                        'season': season_year,
                        'date': match_date,
                        'home_team': home_team,
                        'away_team': away_team,
                        'round': match_round,
                        'is_regular_season': is_regular_season,
                        'match_type': 'regular_season' if is_regular_season else 'non_regular_season'
                    }

                    all_matches.append(match_data)

                    if is_regular_season:
                        regular_count += 1
                    else:
                        non_regular_count += 1

            total = regular_count + non_regular_count
            regular_pct = (regular_count / total * 100) if total > 0 else 0

            print(f"   ✅ {total} matches: {regular_count} regular ({regular_pct:.1f}%), {non_regular_count} non-regular")

    return all_matches

# Run classification
all_matches = extract_with_simple_classification()

if all_matches:
    df = pd.DataFrame(all_matches)

    print(f"\n📊 OVERALL SUMMARY")
    print(f"Total matches extracted: {len(df)}")

    # Summary by match type
    print(f"\n🎯 MATCH TYPE BREAKDOWN:")
    type_summary = df['match_type'].value_counts()
    total_matches = len(df)

    for match_type, count in type_summary.items():
        pct = count / total_matches * 100
        print(f"   {match_type}: {count} matches ({pct:.1f}%)")

    # Summary by season
    print(f"\n📅 BY SEASON:")
    season_summary = df.groupby(['season', 'match_type']).size().unstack(fill_value=0)
    season_summary['total'] = season_summary.sum(axis=1)
    season_summary['regular_pct'] = (season_summary.get('regular_season', 0) / season_summary['total'] * 100).round(1)
    print(season_summary)

    # Round types for non-regular matches
    print(f"\n⚠️ NON-REGULAR SEASON ROUND TYPES:")
    non_regular = df[df['match_type'] == 'non_regular_season']
    round_types = non_regular['round'].value_counts()

    for round_type, count in round_types.items():
        print(f"   '{round_type}': {count} matches")

    # Database comparison (static for now)
    print(f"\n🔍 DATABASE COMPARISON:")
    print(f"Season overview regular season matches: {len(df[df['match_type'] == 'regular_season'])}")
    print(f"Season overview non-regular matches: {len(df[df['match_type'] == 'non_regular_season'])}")
    print(f"Our database total matches: 749")

    print(f"\n💡 This will help us understand what we're missing!")

else:
    print("❌ No matches extracted")


🎯 SIMPLE MATCH CLASSIFICATION: Regular Season vs Non-Regular Season

📂 Processing 2013...
   ✅ 94 matches: 88 regular (93.6%), 6 non-regular
📂 Processing 2014...
   ✅ 115 matches: 108 regular (93.9%), 7 non-regular
📂 Processing 2015...
   ✅ 96 matches: 90 regular (93.8%), 6 non-regular
📂 Processing 2016...
   ✅ 107 matches: 100 regular (93.5%), 7 non-regular
📂 Processing 2017...
   ✅ 127 matches: 120 regular (94.5%), 7 non-regular
📂 Processing 2018...
   ✅ 115 matches: 108 regular (93.9%), 7 non-regular
📂 Processing 2019...
   ✅ 115 matches: 108 regular (93.9%), 7 non-regular
📂 Processing 2020...
   ❌ No table found
📂 Processing 2021...
   ✅ 130 matches: 120 regular (92.3%), 10 non-regular
📂 Processing 2022...
   ✅ 142 matches: 132 regular (93.0%), 10 non-regular
📂 Processing 2023...
   ✅ 137 matches: 132 regular (96.4%), 5 non-regular
📂 Processing 2024...
   ✅ 189 matches: 182 regular (96.3%), 7 non-regular

📊 OVERALL SUMMARY
Total matches extracted: 1367

🎯 MATCH TYPE BREAKDOWN:
   r

In [58]:
# ══════════════════════════════════════════════════════════════════════════════
#  4-CATEGORY NWSL MATCH CLASSIFICATION SYSTEM
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from datetime import datetime

print("🏆 NWSL 4-CATEGORY MATCH CLASSIFICATION\n")

def categorize_nwsl_match(round_text, match_date, season):
    """
    Classify NWSL matches into exactly 4 categories:
    1. regular_season
    2. playoffs
    3. challenge_cup
    4. fall_series (2020 only)
    """
    if not round_text:
        return 'unknown'

    round_lower = round_text.lower().strip()

    try:
        if isinstance(match_date, str):
            date_obj = datetime.strptime(match_date, "%Y-%m-%d")
            month = date_obj.month
        else:
            month = match_date.month
    except:
        month = 0

    if 'regular season' in round_lower:
        return 'regular_season'

    if season == 2020:
        if month >= 9 and 'challenge' not in round_lower:
            if round_lower in ['round', ''] or 'fall' in round_lower:
                return 'fall_series'

    if season >= 2020:
        challenge_indicators = [
            'group stage', 'knockout stage', 'preliminary', 'group',
            'first round', 'round'
        ]
        if any(ind in round_lower for ind in challenge_indicators):
            return 'challenge_cup'
        if month <= 7 and round_lower in ['round', 'first round']:
            return 'challenge_cup'

    playoff_indicators = ['quarterfinal', 'semifinal', 'final', 'championship']
    if any(ind in round_lower for ind in playoff_indicators):
        return 'challenge_cup' if (season >= 2020 and month <= 7) else 'playoffs'

    if month >= 10:
        return 'playoffs'
    elif season >= 2020 and month <= 7:
        return 'challenge_cup'

    return 'unknown'

def extract_all_nwsl_matches_categorized():
    """Extract all NWSL matches with 4-category classification"""
    season_pages_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages")
    season_files = list(season_pages_dir.glob("*NWSL Scores*.html"))
    all_matches = []

    for file_path in sorted(season_files):
        if "Scores & Fixtures" not in file_path.name:
            continue

        year_match = file_path.name.split()[0]
        if not year_match.isdigit():
            continue

        season_year = int(year_match)
        print(f"📂 Processing {season_year}...")

        soup = BeautifulSoup(file_path.read_text(encoding="utf-8"), "lxml")
        fixtures_table = soup.find("table", {"id": "sched_all"})

        if not fixtures_table:
            print("   ❌ No table found")
            continue

        category_counts = {
            'regular_season': 0,
            'playoffs': 0,
            'challenge_cup': 0,
            'fall_series': 0,
            'unknown': 0
        }

        rows = fixtures_table.find_all("tr")[1:]

        for row in rows:
            round_cell = row.find(attrs={"data-stat": "round"})
            date_cell = row.find(attrs={"data-stat": "date"})
            home_cell = row.find(attrs={"data-stat": "home_team"})
            away_cell = row.find(attrs={"data-stat": "away_team"})

            match_round = round_cell.get_text(strip=True) if round_cell else ""
            match_date = date_cell.get_text(strip=True) if date_cell else ""
            home_team = home_cell.get_text(strip=True) if home_cell else ""
            away_team = away_cell.get_text(strip=True) if away_cell else ""

            if not any([match_round, match_date, home_team, away_team]):
                continue

            if match_date and home_team and away_team:
                category = categorize_nwsl_match(match_round, match_date, season_year)
                category_counts[category] += 1

                match_data = {
                    'season': season_year,
                    'date': match_date,
                    'home_team': home_team,
                    'away_team': away_team,
                    'round': match_round,
                    'category': category
                }
                all_matches.append(match_data)

        total = sum(category_counts.values())
        print(f"   ✅ {total} matches categorized:")
        for cat, count in category_counts.items():
            if count > 0:
                pct = count / total * 100
                print(f"      {cat}: {count} ({pct:.1f}%)")

    return all_matches

# Extract and categorize all matches
print("🔄 Extracting and categorizing all NWSL matches...")
all_categorized_matches = extract_all_nwsl_matches_categorized()

if all_categorized_matches:
    df_all = pd.DataFrame(all_categorized_matches)

    print(f"\n📊 COMPLETE NWSL MATCH CATEGORIZATION")
    print(f"Total matches: {len(df_all)}")

    print(f"\n🏆 OVERALL CATEGORY BREAKDOWN:")
    category_summary = df_all['category'].value_counts()
    for category, count in category_summary.items():
        pct = count / len(df_all) * 100
        print(f"   {category}: {count} matches ({pct:.1f}%)")

    print(f"\n📅 CATEGORY BY SEASON MATRIX:")
    season_category_matrix = df_all.groupby(['season', 'category']).size().unstack(fill_value=0)
    season_category_matrix['TOTAL'] = season_category_matrix.sum(axis=1)
    print(season_category_matrix)

    # Validation checks
    print(f"\n🔍 VALIDATION CHECKS:")

    # Fall Series
    fall_series_seasons = df_all[df_all['category'] == 'fall_series']['season'].unique()
    print(f"   Fall Series found in seasons: {list(fall_series_seasons)}")
    if list(fall_series_seasons) == [2020]:
        print("   ✅ Fall Series correctly limited to 2020")
    else:
        print("   ⚠️  Fall Series found outside 2020!")

    # Challenge Cup
    challenge_cup_seasons = df_all[df_all['category'] == 'challenge_cup']['season'].unique()
    pre_2020_challenge = [s for s in challenge_cup_seasons if s < 2020]
    print(f"   Challenge Cup seasons: {sorted(challenge_cup_seasons)}")
    if not pre_2020_challenge:
        print("   ✅ Challenge Cup correctly starts from 2020")
    else:
        print(f"   ⚠️  Challenge Cup found before 2020: {pre_2020_challenge}")

    # Regular and playoffs per season
    seasons_with_regular = df_all[df_all['category'] == 'regular_season']['season'].unique()
    seasons_with_playoffs = df_all[df_all['category'] == 'playoffs']['season'].unique()
    print(f"   Regular season in {len(seasons_with_regular)} seasons: {sorted(seasons_with_regular)}")
    print(f"   Playoffs in {len(seasons_with_playoffs)} seasons: {sorted(seasons_with_playoffs)}")

    # Samples
    print(f"\n📝 SAMPLE MATCHES BY CATEGORY:")
    for category in ['regular_season', 'playoffs', 'challenge_cup', 'fall_series']:
        cat_matches = df_all[df_all['category'] == category]
        if not cat_matches.empty:
            print(f"\n--- {category.upper()} ({len(cat_matches)} total) ---")
            for _, match in cat_matches.head(3).iterrows():
                print(f"   {match['season']} {match['date']}: {match['home_team']} vs {match['away_team']} ({match['round']})")

else:
    print("❌ No matches extracted")

print("\n🎯 Ready for validation and database insertion!")


🏆 NWSL 4-CATEGORY MATCH CLASSIFICATION

🔄 Extracting and categorizing all NWSL matches...
📂 Processing 2013...
   ✅ 94 matches categorized:
      regular_season: 88 (93.6%)
      playoffs: 3 (3.2%)
      unknown: 3 (3.2%)
📂 Processing 2014...
   ✅ 115 matches categorized:
      regular_season: 108 (93.9%)
      playoffs: 3 (2.6%)
      unknown: 4 (3.5%)
📂 Processing 2015...
   ✅ 96 matches categorized:
      regular_season: 90 (93.8%)
      playoffs: 3 (3.1%)
      unknown: 3 (3.1%)
📂 Processing 2016...
   ✅ 107 matches categorized:
      regular_season: 100 (93.5%)
      playoffs: 3 (2.8%)
      unknown: 4 (3.7%)
📂 Processing 2017...
   ✅ 127 matches categorized:
      regular_season: 120 (94.5%)
      playoffs: 3 (2.4%)
      unknown: 4 (3.1%)
📂 Processing 2018...
   ✅ 115 matches categorized:
      regular_season: 108 (93.9%)
      playoffs: 3 (2.6%)
      unknown: 4 (3.5%)
📂 Processing 2019...
   ✅ 115 matches categorized:
      regular_season: 108 (93.9%)
      playoffs: 3 (2.6%)


In [59]:
# ══════════════════════════════════════════════════════════════════════════════
#  INVESTIGATE UNKNOWN MATCHES FOR EASY CLASSIFICATION FIXES
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from datetime import datetime

print("🔍 INVESTIGATING THE 26 UNKNOWN MATCHES\n")

def extract_unknown_matches_detailed():
    """Extract all matches marked as 'unknown' with full context"""

    season_pages_dir = Path("/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages")
    season_files = list(season_pages_dir.glob("*NWSL Scores*.html"))

    unknown_matches = []

    for file_path in sorted(season_files):
        if "Scores & Fixtures" not in file_path.name:
            continue

        year_match = file_path.name.split()[0]
        if not year_match.isdigit():
            continue
        season_year = int(year_match)

        soup = BeautifulSoup(file_path.read_text(encoding="utf-8"), "lxml")
        fixtures_table = soup.find("table", {"id": "sched_all"})

        if not fixtures_table:
            continue

        rows = fixtures_table.find_all("tr")[1:]

        for row in rows:
            round_cell = row.find(attrs={"data-stat": "round"})
            date_cell = row.find(attrs={"data-stat": "date"})
            home_cell = row.find(attrs={"data-stat": "home_team"})
            away_cell = row.find(attrs={"data-stat": "away_team"})
            gameweek_cell = row.find(attrs={"data-stat": "gameweek"})

            match_round = round_cell.get_text(strip=True) if round_cell else ""
            match_date = date_cell.get_text(strip=True) if date_cell else ""
            home_team = home_cell.get_text(strip=True) if home_cell else ""
            away_team = away_cell.get_text(strip=True) if away_cell else ""
            gameweek = gameweek_cell.get_text(strip=True) if gameweek_cell else ""

            if not any([match_round, match_date, home_team, away_team]):
                continue

            def categorize_nwsl_match(round_text, match_date, season):
                if not round_text:
                    return 'unknown'

                round_lower = round_text.lower().strip()

                try:
                    if isinstance(match_date, str):
                        date_obj = datetime.strptime(match_date, "%Y-%m-%d")
                        month = date_obj.month
                    else:
                        month = match_date.month
                except:
                    month = 0

                if 'regular season' in round_lower:
                    return 'regular_season'

                if season == 2020 and month >= 9 and 'challenge' not in round_lower:
                    if round_lower in ['round', ''] or 'fall' in round_lower:
                        return 'fall_series'

                if season >= 2020:
                    challenge_indicators = [
                        'group stage', 'knockout stage', 'preliminary', 'group',
                        'first round', 'round'
                    ]
                    if any(indicator in round_lower for indicator in challenge_indicators):
                        return 'challenge_cup'
                    if month <= 7 and round_lower in ['round', 'first round']:
                        return 'challenge_cup'

                playoff_indicators = ['quarterfinal', 'semifinal', 'final', 'championship']
                if any(indicator in round_lower for indicator in playoff_indicators):
                    return 'challenge_cup' if (season >= 2020 and month <= 7) else 'playoffs'

                if month >= 10:
                    return 'playoffs'
                elif season >= 2020 and month <= 7:
                    return 'challenge_cup'

                return 'unknown'

            category = categorize_nwsl_match(match_round, match_date, season_year)

            if category == 'unknown':
                try:
                    date_obj = datetime.strptime(match_date, "%Y-%m-%d")
                    month = date_obj.month
                    month_name = date_obj.strftime("%B")
                except:
                    month = 0
                    month_name = "Unknown"

                unknown_matches.append({
                    'season': season_year,
                    'date': match_date,
                    'month': month,
                    'month_name': month_name,
                    'home_team': home_team,
                    'away_team': away_team,
                    'round': match_round,
                    'gameweek': gameweek
                })

    return unknown_matches

# Extract all unknown matches
unknown_matches = extract_unknown_matches_detailed()

if unknown_matches:
    df_unknown = pd.DataFrame(unknown_matches)

    print(f"📊 FOUND {len(df_unknown)} UNKNOWN MATCHES")

    print(f"\n🔍 UNKNOWN MATCHES BY SEASON:")
    for season, count in df_unknown['season'].value_counts().sort_index().items():
        print(f"   {season}: {count} unknown matches")

    print(f"\n🔍 UNKNOWN MATCHES BY ROUND TEXT:")
    for round_text, count in df_unknown['round'].value_counts().items():
        print(f"   '{round_text}': {count} matches")

    print(f"\n🔍 UNKNOWN MATCHES BY MONTH:")
    for month, count in df_unknown['month_name'].value_counts().items():
        print(f"   {month}: {count} matches")

    print(f"\n📅 DETAILED BREAKDOWN BY SEASON:")
    for season in sorted(df_unknown['season'].unique()):
        season_matches = df_unknown[df_unknown['season'] == season]
        print(f"\n--- {season} ({len(season_matches)} unknown matches) ---")
        for _, match in season_matches.iterrows():
            print(f"   {match['date']} ({match['month_name']}): {match['home_team']} vs {match['away_team']}")
            print(f"      Round: '{match['round']}' | Gameweek: '{match['gameweek']}'")

    print(f"\n💡 PATTERNS TO INVESTIGATE:")
    print("1. Are these 'First Round' playoff matches that need better detection?")
    print("2. Are these Challenge Cup matches with different round names?")
    print("3. Are these other tournament types we haven't accounted for?")
    print("4. Are these data quality issues (missing round text)?")

    print(f"\n🔧 READY FOR MANUAL INVESTIGATION!")
    print("Look up a few of these matches on FBref to determine their actual category.")
else:
    print("✅ No unknown matches found!")


🔍 INVESTIGATING THE 26 UNKNOWN MATCHES

📊 FOUND 26 UNKNOWN MATCHES

🔍 UNKNOWN MATCHES BY SEASON:
   2013: 3 unknown matches
   2014: 4 unknown matches
   2015: 3 unknown matches
   2016: 4 unknown matches
   2017: 4 unknown matches
   2018: 4 unknown matches
   2019: 4 unknown matches

🔍 UNKNOWN MATCHES BY ROUND TEXT:
   'Round': 26 matches

🔍 UNKNOWN MATCHES BY MONTH:
   Unknown: 26 matches

📅 DETAILED BREAKDOWN BY SEASON:

--- 2013 (3 unknown matches) ---
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''

--- 2014 (4 unknown matches) ---
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''
   Date (Unknown): Home vs Away
      Round: 'Round' | Gameweek: ''

--- 2015 (3 unknown matches

In [61]:
import sqlite3
import pandas as pd

# Connect to database
conn = sqlite3.connect('/Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db')

# Check what tables exist
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("Available tables:")
for table in tables:
    print(f"  - {table[0]}")

# If there are tables, print the schema of the first one
if tables:
    table_name = tables[0][0]
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = cursor.fetchall()

    print(f"\nSchema for '{table_name}':")
    for col in columns:
        print(f"  {col[1]} ({col[2]})")

conn.close()



Available tables:


In [62]:
import sqlite3
import pandas as pd
import os

# Check if the database file exists and its size
db_path = '/Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db'

if os.path.exists(db_path):
    file_size = os.path.getsize(db_path)
    print(f"✅ Database file exists: {db_path}")
    print(f"📦 File size: {file_size:,} bytes")
else:
    print(f"❌ Database file NOT found at: {db_path}")

# Try connecting to the database and inspecting its contents
try:
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Method 1: Get all table names
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(f"\n🔍 Method 1 - Found {len(tables)} tables:")
    for table in tables:
        print(f"  - {table[0]}")

    # Method 2: Get table schemas
    cursor.execute("SELECT sql FROM sqlite_master WHERE type='table';")
    schemas = cursor.fetchall()
    print(f"\n🧩 Method 2 - Found {len(schemas)} table schemas")

    # Method 3: Try querying a known table
    try:
        cursor.execute("SELECT COUNT(*) FROM Match")
        count = cursor.fetchone()[0]
        print(f"\n📊 Match table has {count} records")
    except Exception as e:
        print(f"\n⚠️ Couldn't query Match table: {e}")

    conn.close()

except Exception as e:
    print(f"\n❌ Database connection error: {e}")

# Check for any other .db files in the project directory
print("\n🗂️ Looking for other .db files in project...")
for root, dirs, files in os.walk('/Users/thomasmcmillan/projects/nwsl_data'):
    for file in files:
        if file.endswith('.db'):
            full_path = os.path.join(root, file)
            size = os.path.getsize(full_path)
            print(f"  📁 Found: {full_path} ({size:,} bytes)")


✅ Database file exists: /Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db
📦 File size: 0 bytes

🔍 Method 1 - Found 0 tables:

🧩 Method 2 - Found 0 table schemas

⚠️ Couldn't query Match table: no such table: Match

🗂️ Looking for other .db files in project...
  📁 Found: /Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db (0 bytes)
  📁 Found: /Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db (172,032 bytes)


In [63]:
import sqlite3
import pandas as pd

# Use the correct database path with actual data
conn = sqlite3.connect('/Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db')

# Check what tables are available
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("Available tables:")
for table in tables:
    print(f"  - {table[0]}")

# Now get the 2014 unknown matches to check for duplicates
query = """
SELECT match_id, season, round, 
       match_date, home_team, away_team, 
       home_score, away_score, 
       filename, extraction_status
FROM Match 
WHERE season = 2014 AND match_category = 'unknown'
ORDER BY match_date, home_team
"""

unknown_2014 = pd.read_sql_query(query, conn)
print(f"\nFound {len(unknown_2014)} unknown matches in 2014:")
print(unknown_2014.to_string(index=False))

# Check for potential duplicates (same date + teams)
print("\nChecking for duplicates...")
duplicates = unknown_2014.groupby(['match_date', 'home_team', 'away_team']).size()
duplicates = duplicates[duplicates > 1]

if len(duplicates) > 0:
    print("⚠️ Found potential duplicates:")
    print(duplicates)
else:
    print("✅ No exact duplicates found by date and teams")

# Show unique combinations for manual inspection
print("\nUnique match combinations:")
for _, row in unknown_2014.iterrows():
    print(f"{row['match_date']}: {row['home_team']} vs {row['away_team']} "
          f"({row['home_score']}-{row['away_score']})")

conn.close()


Available tables:
  - Season
  - sqlite_sequence
  - Team
  - teams
  - Player
  - Match
  - team_seasons


DatabaseError: Execution failed on sql '
SELECT match_id, season, round, 
       match_date, home_team, away_team, 
       home_score, away_score, 
       filename, extraction_status
FROM Match 
WHERE season = 2014 AND match_category = 'unknown'
ORDER BY match_date, home_team
': no such column: season

In [64]:
import sqlite3
import pandas as pd

# Connect to the known populated database
conn = sqlite3.connect('/Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db')
cursor = conn.cursor()

# 1. Inspect Match table schema
cursor.execute("PRAGMA table_info(Match)")
match_columns = cursor.fetchall()

print("📋 MATCH TABLE SCHEMA:")
for col in match_columns:
    print(f"  {col[1]} ({col[2]})")

# 2. Record count
cursor.execute("SELECT COUNT(*) FROM Match")
match_count = cursor.fetchone()[0]
print(f"\n📊 Match table has {match_count} records")

# 3. Sample records
cursor.execute("SELECT * FROM Match LIMIT 5")
sample_matches = cursor.fetchall()
print("\n🔍 Sample Match records:")
for i, match in enumerate(sample_matches):
    print(f"  Record {i + 1}: {match}")

# 4. Inspect other relevant tables
for table_name in ['Player', 'teams', 'team_seasons']:
    try:
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        count = cursor.fetchone()[0]
        print(f"\n📊 {table_name} table has {count} records")
    except Exception as e:
        print(f"\n⚠️ Error checking {table_name}: {e}")

conn.close()


📋 MATCH TABLE SCHEMA:
  match_id (TEXT)
  season_id (INTEGER)
  match_date (DATE)
  match_time (TIME)
  home_team_id (TEXT)
  away_team_id (TEXT)
  home_goals (INTEGER)
  away_goals (INTEGER)
  home_xg (REAL)
  away_xg (REAL)
  attendance (INTEGER)
  venue (TEXT)
  referee (TEXT)
  temperature (REAL)
  filename (TEXT)
  extraction_status (TEXT)

📊 Match table has 749 records

🔍 Sample Match records:
  Record 1: ('a8f83687', None, '2024-05-04', None, 'df9a10a1', 'e442aad0', None, None, None, None, None, None, None, None, None, None)
  Record 2: ('c2b1f0dd', None, '2021-06-26', None, 'e813709a', '2a6178ac', None, None, None, None, None, None, None, None, None, None)
  Record 3: ('d5c1d8d0', None, '2024-11-02', None, '85c458aa', 'e442aad0', None, None, None, None, None, None, None, None, None, None)
  Record 4: (None, 2024, '2024-05-04', None, 'df9a10a1', 'e442aad0', None, None, None, None, None, None, None, None, 'match_report_ec7c8e61.html', 'success')
  Record 5: (None, 2023, '2023-06-

In [65]:
import sqlite3
import pandas as pd
import os

# 1. Look for database or data files across known directories
data_dirs = [
    '/Users/thomasmcmillan/projects/nwsl_data/data',
    '/Users/thomasmcmillan/projects/nwsl_data/notebooks',
    '/Users/thomasmcmillan/projects/nwsl_data'
]

print("🔍 SEARCHING FOR ALL DATA FILES...")
for data_dir in data_dirs:
    if os.path.exists(data_dir):
        for root, dirs, files in os.walk(data_dir):
            for file in files:
                if file.endswith(('.db', '.csv', '.json', '.pkl')):
                    full_path = os.path.join(root, file)
                    size = os.path.getsize(full_path)
                    print(f"  📁 {full_path} ({size:,} bytes)")

print("\n" + "=" * 50)
print("🧪 ANALYZING CURRENT DATABASE CONTENT...")

# 2. Connect to the known populated database
conn = sqlite3.connect('/Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db')
cursor = conn.cursor()

# 3. Check for seasons in Match table
cursor.execute("""
    SELECT season_id, COUNT(*) 
    FROM Match 
    WHERE season_id IS NOT NULL 
    GROUP BY season_id 
    ORDER BY season_id
""")
seasons = cursor.fetchall()

print("📅 SEASONS IN DATABASE:")
for season, count in seasons:
    print(f"  {season}: {count} matches")

# 4. Sample filenames from the Match table
cursor.execute("""
    SELECT filename, COUNT(*) 
    FROM Match 
    WHERE filename IS NOT NULL 
    GROUP BY filename 
    LIMIT 10
""")
files = cursor.fetchall()

print(f"\n📄 SAMPLE FILENAMES ({len(files)} unique):")
for filename, count in files[:5]:
    print(f"  {filename}: {count} matches")

conn.close()


🔍 SEARCHING FOR ALL DATA FILES...
  📁 /Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db (0 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/dob_scraping_checkpoint_150.csv (6,221 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/dob_scraping_checkpoint_50.csv (2,295 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/dob_scraping_checkpoint_250.csv (10,137 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/player_dobs_final.csv (12,449 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/dob_scraping_checkpoint_100.csv (4,275 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/dob_scraping_checkpoint_300.csv (12,086 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/dob_scraping_checkpoint_200.csv (8,148 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db (172,032 bytes)
  📁 /Users/thomasmcmillan/projects/nwsl_data/.venv/lib/python3.13/site-packages/debugpy

In [66]:
import sqlite3
import pandas as pd
import os

# Directory containing raw season overview HTML files
season_pages_dir = '/Users/thomasmcmillan/projects/nwsl_data/data/raw_season_pages'

print("🔍 CHECKING FOR SEASON OVERVIEW SOURCE FILES...")

if os.path.exists(season_pages_dir):
    files = [f for f in os.listdir(season_pages_dir) if f.endswith('.html')]
    print(f"✅ Found {len(files)} season overview HTML files:")
    for file in sorted(files):
        file_path = os.path.join(season_pages_dir, file)
        size = os.path.getsize(file_path)
        print(f"  📄 {file} ({size:,} bytes)")
else:
    print(f"❌ Directory not found: {season_pages_dir}")

# Manual summary of known data state
print(f"\n📊 CURRENT DATABASE STATUS:")
print(f"✅ We have the base infrastructure (Player, teams, team_seasons tables)")
print(f"⚠️  Match table has 749 individual match reports (missing season overview data)")
print(f"❌ Missing match_category column for our 4-category classification")
print(f"❌ Missing comprehensive 2013–2016 data")

print(f"\n🎯 RECOMMENDED NEXT STEPS:")
print("1. Re-extract ALL matches from season overview pages (should get us back to ~1,367)")
print("2. Re-implement 4-category classification system")
print("3. Continue investigating the 2014 duplicate issue")
print("4. Ensure we don't lose the existing 749 individual match reports")


🔍 CHECKING FOR SEASON OVERVIEW SOURCE FILES...
✅ Found 13 season overview HTML files:
  📄 2013 NWSL Scores & Fixtures _ FBref.com.html (333,134 bytes)
  📄 2014 NWSL Scores & Fixtures _ FBref.com.html (383,243 bytes)
  📄 2015 NWSL Scores & Fixtures _ FBref.com.html (335,048 bytes)
  📄 2016 NWSL Scores & Fixtures _ FBref.com.html (358,909 bytes)
  📄 2017 NWSL Scores & Fixtures _ FBref.com.html (417,402 bytes)
  📄 2018 NWSL Scores & Fixtures _ FBref.com.html (385,191 bytes)
  📄 2019 NWSL Scores & Fixtures _ FBref.com.html (417,019 bytes)
  📄 2020 NWSL Scores & Fixtures _ FBref.com.html (255,577 bytes)
  📄 2021 NWSL Scores & Fixtures _ FBref.com.html (456,647 bytes)
  📄 2022 NWSL Scores & Fixtures _ FBref.com.html (492,687 bytes)
  📄 2023 NWSL Scores & Fixtures _ FBref.com.html (520,190 bytes)
  📄 2024 NWSL Scores & Fixtures _ FBref.com.html (673,392 bytes)
  📄 NWSL Scores & Fixtures _ FBref.com.html (337,158 bytes)

📊 CURRENT DATABASE STATUS:
✅ We have the base infrastructure (Player, tea

In [67]:
import sqlite3
import pandas as pd
import os

# Function to analyze a database
def analyze_database(db_path):
    print(f"\n=== ANALYZING DATABASE: {db_path} ===")
    
    if not os.path.exists(db_path):
        print(f"Database not found at {db_path}")
        return None
    
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # Get all tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    
    print(f"Found {len(tables)} tables:")
    for table in tables:
        print(f"  - {table[0]}")
    
    db_info = {}
    
    for table_name in [t[0] for t in tables]:
        print(f"\n--- TABLE: {table_name} ---")
        
        # Get table schema
        cursor.execute(f"PRAGMA table_info({table_name});")
        schema = cursor.fetchall()
        
        print("Schema:")
        for col in schema:
            print(f"  {col[1]} ({col[2]}) - PK: {bool(col[5])}, Not Null: {bool(col[3])}, Default: {col[4]}")
        
        # Get row count
        cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
        row_count = cursor.fetchone()[0]
        print(f"Row count: {row_count}")
        
        # Get sample data
        if row_count > 0:
            df = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 10", conn)
            print(f"Sample data (first {min(10, row_count)} rows):")
            print(df.to_string(index=False))
        else:
            print("No data in table")
        
        # Get indexes
        cursor.execute(f"PRAGMA index_list({table_name});")
        indexes = cursor.fetchall()
        if indexes:
            print("Indexes:")
            for idx in indexes:
                cursor.execute(f"PRAGMA index_info({idx[1]});")
                idx_info = cursor.fetchall()
                cols = [col[2] for col in idx_info]
                print(f"  {idx[1]} on columns: {', '.join(cols)} (unique: {bool(idx[2])})")
        
        # Get foreign keys
        cursor.execute(f"PRAGMA foreign_key_list({table_name});")
        fks = cursor.fetchall()
        if fks:
            print("Foreign Keys:")
            for fk in fks:
                print(f"  {fk[3]} -> {fk[2]}.{fk[4]}")
        
        # Store info for comparison
        db_info[table_name] = {
            'schema': schema,
            'row_count': row_count,
            'indexes': indexes,
            'foreign_keys': fks
        }
    
    conn.close()
    return db_info

# Analyze the notebooks database
notebooks_db_path = "/Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db"
notebooks_info = analyze_database(notebooks_db_path)


=== ANALYZING DATABASE: /Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db ===
Found 7 tables:
  - Season
  - sqlite_sequence
  - Team
  - teams
  - Player
  - Match
  - team_seasons

--- TABLE: Season ---
Schema:
  season_id (INTEGER) - PK: True, Not Null: False, Default: None
  season_year (INTEGER) - PK: False, Not Null: False, Default: None
  league_name (TEXT) - PK: False, Not Null: False, Default: None
Row count: 1
Sample data (first 1 rows):
 season_id  season_year league_name
         1         2025        None
Indexes:
  sqlite_autoindex_Season_1 on columns: season_year (unique: True)

--- TABLE: sqlite_sequence ---
Schema:
  name () - PK: False, Not Null: False, Default: None
  seq () - PK: False, Not Null: False, Default: None
Row count: 2
Sample data (first 2 rows):
  name  seq
Season    1
  Team    2

--- TABLE: Team ---
Schema:
  team_id (INTEGER) - PK: True, Not Null: False, Default: None
  team_name (TEXT) - PK: False, Not Null: False, Defaul

In [68]:
# Analyze the main database
main_db_path = "/Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db"
main_info = analyze_database(main_db_path)


=== ANALYZING DATABASE: /Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db ===
Found 0 tables:


In [69]:
# Check if the main database file exists
import os

main_db_path = "/Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db"
notebooks_db_path = "/Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db"

print("Database file existence check:")
print(f"Main database ({main_db_path}): {os.path.exists(main_db_path)}")
print(f"Notebooks database ({notebooks_db_path}): {os.path.exists(notebooks_db_path)}")

if os.path.exists(main_db_path):
    print(f"Main database file size: {os.path.getsize(main_db_path)} bytes")
else:
    print("Main database file does not exist")

if os.path.exists(notebooks_db_path):
    print(f"Notebooks database file size: {os.path.getsize(notebooks_db_path)} bytes")
else:
    print("Notebooks database file does not exist")

Database file existence check:
Main database (/Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db): True
Notebooks database (/Users/thomasmcmillan/projects/nwsl_data/notebooks/data/processed/nwsldata.db): True
Main database file size: 0 bytes
Notebooks database file size: 172032 bytes


In [70]:
# Detailed analysis summary
print("=== DATABASE COMPARISON SUMMARY ===\n")

print("1. FILE STATUS:")
print(f"   - Main database: EXISTS but is EMPTY (0 bytes)")
print(f"   - Notebooks database: EXISTS and contains data (172,032 bytes)")

print("\n2. NOTEBOOKS DATABASE ANALYSIS:")
print("   Contains 7 tables with the following structure:")

# Table summary
table_summary = {
    'Season': {'rows': 1, 'description': 'Single 2025 season record'},
    'Team': {'rows': 2, 'description': 'Two teams (Chicago Stars, Seattle Reign FC)'},
    'teams': {'rows': 16, 'description': 'Complete team registry with IDs and aliases'},
    'Player': {'rows': 309, 'description': 'Player records with basic info'},
    'Match': {'rows': 749, 'description': 'Match records across multiple seasons'},
    'team_seasons': {'rows': 135, 'description': 'Team participation by season'},
    'sqlite_sequence': {'rows': 2, 'description': 'SQLite internal sequence tracking'}
}

for table, info in table_summary.items():
    print(f"   - {table}: {info['rows']} rows - {info['description']}")

print("\n3. KEY RELATIONSHIPS:")
print("   - Foreign Keys:")
print("     * Match.home_team_id -> teams.team_id")
print("     * Match.away_team_id -> teams.team_id") 
print("     * team_seasons.team_id -> teams.team_id")

print("\n4. DATA QUALITY OBSERVATIONS:")
print("   - Match table has mixed data: some records have match_id, others have season_id")
print("   - Many Match records have NULL values for scores, xG, attendance, etc.")
print("   - Player table uses proper UUID-style IDs")
print("   - Team naming inconsistencies between 'Team' and 'teams' tables")

print("\n5. SCHEMA HIGHLIGHTS:")
print("   - teams table uses TEXT primary keys (UUIDs)")
print("   - Match table has comprehensive match data structure")
print("   - Proper indexing on unique constraints")
print("   - Date/time fields for temporal data")

print("\n6. DIFFERENCES BETWEEN DATABASES:")
print("   - Main database: COMPLETELY EMPTY")
print("   - Notebooks database: FULLY POPULATED with NWSL data")
print("   - No overlap or comparison possible due to empty main database")

=== DATABASE COMPARISON SUMMARY ===

1. FILE STATUS:
   - Main database: EXISTS but is EMPTY (0 bytes)
   - Notebooks database: EXISTS and contains data (172,032 bytes)

2. NOTEBOOKS DATABASE ANALYSIS:
   Contains 7 tables with the following structure:
   - Season: 1 rows - Single 2025 season record
   - Team: 2 rows - Two teams (Chicago Stars, Seattle Reign FC)
   - teams: 16 rows - Complete team registry with IDs and aliases
   - Player: 309 rows - Player records with basic info
   - Match: 749 rows - Match records across multiple seasons
   - team_seasons: 135 rows - Team participation by season
   - sqlite_sequence: 2 rows - SQLite internal sequence tracking

3. KEY RELATIONSHIPS:
   - Foreign Keys:
     * Match.home_team_id -> teams.team_id
     * Match.away_team_id -> teams.team_id
     * team_seasons.team_id -> teams.team_id

4. DATA QUALITY OBSERVATIONS:
   - Match table has mixed data: some records have match_id, others have season_id
   - Many Match records have NULL values f

In [71]:
# ══════════════════════════════════════════════════════════════════════════════
#  VERIFY DATABASE MIGRATION – ALL FILES MOVED SUCCESSFULLY
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path

print("🔍 VERIFYING DATABASE MIGRATION\n")

# Define database paths
db_path = "../../data/processed/nwsldata.db"
db_file = Path(db_path)
shm_file = Path(f"{db_path}-shm")
wal_file = Path(f"{db_path}-wal")

# 1. Check if DB, WAL, and SHM files exist
print("📁 File existence check:")
for file_path, label in [(db_file, "Main DB"), (shm_file, "Shared Memory"), (wal_file, "WAL file")]:
    if file_path.exists():
        size = file_path.stat().st_size
        print(f"   ✅ {label}: {size:,} bytes")
    else:
        print(f"   ❌ {label}: Missing")

# 2. Test connection and functionality
print(f"\n🔌 Testing database connection...")
try:
    conn = sqlite3.connect(db_path)

    # a. Check journal mode (should be WAL)
    wal_mode = pd.read_sql_query("PRAGMA journal_mode", conn)
    print(f"   📝 Journal mode: {wal_mode.iloc[0, 0]}")

    # b. List all tables
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
    table_names = tables['name'].tolist()
    print(f"   📊 Found {len(table_names)} tables: {', '.join(table_names)}")

    # c. Quick row counts for critical tables
    print(f"\n📈 Row counts:")
    row_counts = {}
    for table in ['teams', 'Player', 'Match', 'team_seasons']:
        try:
            count = pd.read_sql_query(f"SELECT COUNT(*) AS count FROM {table}", conn).iloc[0]['count']
            row_counts[table] = count
        except Exception:
            row_counts[table] = "ERROR"

    for table, count in row_counts.items():
        print(f"   {table}: {count}")

    # d. Join query test to confirm relational integrity
    print(f"\n🧪 Testing table relationships...")
    test_query = """
        SELECT 
            m.match_date,
            h.team_name AS home_team,
            a.team_name AS away_team,
            m.season_id
        FROM Match m
        JOIN teams h ON m.home_team_id = h.team_id
        JOIN teams a ON m.away_team_id = a.team_id
        ORDER BY m.match_date DESC
        LIMIT 3
    """
    sample_data = pd.read_sql_query(test_query, conn)
    print("   ✅ Sample recent matches:")
    print(sample_data.to_string(index=False))

    conn.close()

    # e. Final confirmation summary
    print(f"\n🎉 DATABASE MIGRATION SUCCESSFUL!")
    print(f"   ✅ All files moved correctly")
    print(f"   ✅ Database connection works")
    print(f"   ✅ WAL mode preserved")
    print(f"   ✅ Data integrity confirmed")
    print(f"   ✅ Table relationships functional")

except Exception as e:
    print(f"   ❌ Error: {e}")


🔍 VERIFYING DATABASE MIGRATION

📁 File existence check:
   ❌ Main DB: Missing
   ❌ Shared Memory: Missing
   ❌ WAL file: Missing

🔌 Testing database connection...
   ❌ Error: unable to open database file


In [72]:
# ══════════════════════════════════════════════════════════════════════════════
#  FIXED: VERIFY DATABASE MIGRATION WITH CORRECT PATH
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd
from pathlib import Path

# CORRECTED PATH: Only go up one level from notebooks/
db_path = "../data/processed/nwsldata.db"
db_file = Path(db_path)

print("🔍 VERIFYING DATABASE MIGRATION (CORRECTED PATH)")
print(f"Database path: {db_path}")
print(f"Absolute path: {db_file.resolve()}")

if db_file.exists():
    size = db_file.stat().st_size
    print(f"✅ Database found: {size:,} bytes")

    # Check for companion WAL and SHM files
    shm_file = Path(str(db_file) + "-shm")
    wal_file = Path(str(db_file) + "-wal")

    print(f"✅ WAL files:")
    if shm_file.exists():
        print(f"   SHM: Present ({shm_file.stat().st_size:,} bytes)")
    else:
        print(f"   SHM: ❌ Missing")
    if wal_file.exists():
        print(f"   WAL: Present ({wal_file.stat().st_size:,} bytes)")
    else:
        print(f"   WAL: ❌ Missing")

    # Test connection and verify tables
    conn = sqlite3.connect(db_path)

    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
    print(f"✅ Found {len(tables)} tables: {', '.join(tables['name'])}")

    # Row counts for key tables
    match_count = pd.read_sql_query("SELECT COUNT(*) AS count FROM Match", conn).iloc[0]['count']
    team_count = pd.read_sql_query("SELECT COUNT(*) AS count FROM teams", conn).iloc[0]['count']
    player_count = pd.read_sql_query("SELECT COUNT(*) AS count FROM Player", conn).iloc[0]['count']

    print(f"✅ Data check: {match_count} matches, {team_count} teams, {player_count} players")

    # Sample join query to test relational integrity
    sample_match = pd.read_sql_query("""
        SELECT 
            m.match_date, 
            h.team_name AS home_team, 
            a.team_name AS away_team
        FROM Match m
        JOIN teams h ON m.home_team_id = h.team_id
        JOIN teams a ON m.away_team_id = a.team_id
        ORDER BY m.match_date DESC
        LIMIT 3
    """, conn)

    print(f"\n📝 Sample recent matches:")
    print(sample_match.to_string(index=False))

    conn.close()
    print("\n🎉 MIGRATION SUCCESSFUL!")

else:
    print(f"❌ Database not found at {db_path}")
    print(f"   Full path attempted: {db_file.resolve()}")


🔍 VERIFYING DATABASE MIGRATION (CORRECTED PATH)
Database path: ../data/processed/nwsldata.db
Absolute path: /Users/thomasmcmillan/projects/nwsl_data/data/processed/nwsldata.db
✅ Database found: 172,032 bytes
✅ WAL files:
   SHM: Present (32,768 bytes)
   WAL: Present (0 bytes)
✅ Found 7 tables: Season, sqlite_sequence, Team, teams, Player, Match, team_seasons
✅ Data check: 749 matches, 16 teams, 309 players

📝 Sample recent matches:
match_date         home_team         away_team
2025-06-22 San Diego Wave FC Washington Spirit
2025-06-21         Gotham FC            Bay FC
2025-06-21       Utah Royals  Seattle Reign FC

🎉 MIGRATION SUCCESSFUL!


In [73]:
# ══════════════════════════════════════════════════════════════════════════════
#  DATABASE DIAGNOSTICS: TABLE INVENTORY
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

# Connect to database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 DATABASE DIAGNOSTICS: TABLE INVENTORY")
print("=" * 60)

# 1) Get all tables from sqlite_master
tables_query = """
    SELECT 
        name AS table_name,
        type,
        sql
    FROM sqlite_master 
    WHERE type='table'
    ORDER BY name
"""

tables_info = pd.read_sql_query(tables_query, conn)
print(f"📊 Found {len(tables_info)} tables:\n")

# 2) Loop through tables to count rows and preview schema
for i, row in tables_info.iterrows():
    table_name = row['table_name']
    print(f"  {i + 1}. {table_name}")

    try:
        # Count rows
        count_query = f"SELECT COUNT(*) as count FROM {table_name}"
        row_count = pd.read_sql_query(count_query, conn).iloc[0]['count']
        print(f"     Rows: {row_count:,}")

        # Preview schema
        create_sql = row['sql']
        if create_sql and '(' in create_sql:
            columns_part = create_sql.split('(', 1)[1].rsplit(')', 1)[0]
            preview = columns_part.strip().replace('\n', ' ')
            print(f"     Schema: {preview[:100]}..." if len(preview) > 100 else f"     Schema: {preview}")
        else:
            print(f"     Schema: Not available")

    except Exception as e:
        print(f"     ❌ Error reading table: {e}")

    print()

conn.close()

print("=" * 60)
print("✅ Table inventory complete!")


🔍 DATABASE DIAGNOSTICS: TABLE INVENTORY
📊 Found 7 tables:

  1. Match
     Rows: 749
     Schema: match_id      TEXT PRIMARY KEY,        -- Generate from team+date hash     season_id     INTEGER,   ...

  2. Player
     Rows: 309
     Schema: player_id    TEXT PRIMARY KEY,      player_name  TEXT NOT NULL,     nationality  TEXT,     dob      ...

  3. Season
     Rows: 1
     Schema: season_id   INTEGER PRIMARY KEY AUTOINCREMENT,     season_year INTEGER UNIQUE,     league_name TEXT

  4. Team
     Rows: 2
     Schema: team_id     INTEGER PRIMARY KEY AUTOINCREMENT,     team_name   TEXT UNIQUE , team_name_short TEXT

  5. sqlite_sequence
     Rows: 2
     Schema: name,seq

  6. team_seasons
     Rows: 135
     Schema: season INTEGER,         team_short_name TEXT,         team_id TEXT,         FOREIGN KEY (team_id) RE...

  7. teams
     Rows: 16
     Schema: team_id   TEXT PRIMARY KEY,             team_name TEXT NOT NULL         , team_name_short TEXT, team...

✅ Table inventory complete!

In [74]:
# ══════════════════════════════════════════════════════════════════════════════
#  DATABASE DIAGNOSTICS: DETAILED TABLE ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 DATABASE DIAGNOSTICS: DETAILED ANALYSIS")
print("=" * 70)

# 1) TEAM TABLES COMPARISON (Team vs teams)
print("🚨 ISSUE 1: DUPLICATE TEAM TABLES")
print("-" * 40)

print("Team table (legacy, expected 2 rows):")
try:
    team_old = pd.read_sql_query("SELECT * FROM Team", conn)
    print(team_old.to_string(index=False))
except Exception as e:
    print(f"   ❌ Could not read Team table: {e}")

print("\nteams table (new, expected ~16 rows):")
try:
    teams_new = pd.read_sql_query("SELECT * FROM teams LIMIT 5", conn)
    print(teams_new.to_string(index=False))
    print("   ... and more rows not shown")
except Exception as e:
    print(f"   ❌ Could not read teams table: {e}")

# 2) SEASON TABLE ANALYSIS
print(f"\n📅 SEASON DATA")
print("-" * 40)
try:
    seasons = pd.read_sql_query("SELECT * FROM Season", conn)
    print("Season table:")
    print(seasons.to_string(index=False))
except Exception as e:
    print(f"   ❌ Could not read Season table: {e}")

# 3) MATCH TABLE DATA QUALITY
print(f"\n⚽ MATCH DATA QUALITY")
print("-" * 40)

match_quality = pd.read_sql_query("""
    SELECT 
        COUNT(*) as total_matches,
        COUNT(season_id) as matches_with_season,
        COUNT(home_goals) as matches_with_goals,
        COUNT(home_xg) as matches_with_xg,
        COUNT(attendance) as matches_with_attendance,
        COUNT(venue) as matches_with_venue,
        COUNT(referee) as matches_with_referee,
        MIN(match_date) as earliest_match,
        MAX(match_date) as latest_match
    FROM Match
""", conn)

print("Match data completeness:")
for col in match_quality.columns:
    value = match_quality.iloc[0][col]
    if col.startswith("matches_with") or col.startswith("count("):
        total = match_quality.iloc[0]["total_matches"]
        pct = (value / total * 100) if total > 0 else 0
        print(f"  {col}: {value} ({pct:.1f}%)")
    else:
        print(f"  {col}: {value}")

# 4) PLAYER DATA QUALITY
print(f"\n👥 PLAYER DATA QUALITY")
print("-" * 40)

player_quality = pd.read_sql_query("""
    SELECT 
        COUNT(*) as total_players,
        COUNT(nationality) as players_with_nationality,
        COUNT(dob) as players_with_dob,
        COUNT(preferred_foot) as players_with_foot_data
    FROM Player
""", conn)

print("Player data completeness:")
for col in player_quality.columns:
    value = player_quality.iloc[0][col]
    if col != 'total_players':
        total = player_quality.iloc[0]['total_players']
        pct = (value / total * 100) if total > 0 else 0
        print(f"  {col}: {value} ({pct:.1f}%)")
    else:
        print(f"  {col}: {value}")

# 5) FOREIGN KEY INTEGRITY CHECK
print(f"\n🔗 FOREIGN KEY INTEGRITY")
print("-" * 40)

integrity_check = pd.read_sql_query("""
    SELECT 
        COUNT(*) as total_matches,
        COUNT(CASE WHEN h.team_id IS NOT NULL THEN 1 END) as valid_home_teams,
        COUNT(CASE WHEN a.team_id IS NOT NULL THEN 1 END) as valid_away_teams
    FROM Match m
    LEFT JOIN teams h ON m.home_team_id = h.team_id
    LEFT JOIN teams a ON m.away_team_id = a.team_id
""", conn)

print("Foreign key integrity:")
for col in integrity_check.columns:
    value = integrity_check.iloc[0][col]
    if col != 'total_matches':
        total = integrity_check.iloc[0]['total_matches']
        pct = (value / total * 100) if total > 0 else 0
        print(f"  {col}: {value} ({pct:.1f}%)")
    else:
        print(f"  {col}: {value}")

# 6) TEAM_SEASONS COVERAGE
print(f"\n📊 TEAM-SEASON COVERAGE")
print("-" * 40)

season_coverage = pd.read_sql_query("""
    SELECT 
        COUNT(DISTINCT season) as seasons_covered,
        MIN(season) as earliest_season,
        MAX(season) as latest_season,
        COUNT(*) as total_team_season_records
    FROM team_seasons
""", conn)

print("Team-season data:")
print(season_coverage.to_string(index=False))

conn.close()

# Summary
print("=" * 70)
print("📋 DIAGNOSTIC SUMMARY:")
print("🚨 CRITICAL ISSUES:")
print("   1. Duplicate team tables (Team vs teams)")
print("   2. Missing match statistics (goals, xG, attendance)")
print("   3. Only 1 season in Season table vs many team-season records")

print("\n✅ GOOD ASPECTS:")
print("   1. All foreign keys are valid")
print("   2. Good player nationality coverage")
print("   3. Comprehensive team-season historical data")


🔍 DATABASE DIAGNOSTICS: DETAILED ANALYSIS
🚨 ISSUE 1: DUPLICATE TEAM TABLES
----------------------------------------
Team table (legacy, expected 2 rows):
 team_id        team_name team_name_short
       1    Chicago Stars            None
       2 Seattle Reign FC           Reign

teams table (new, expected ~16 rows):
 team_id              team_name team_name_short team_name_alias_1 team_name_alias_2
2a6178ac          Orlando Pride           Pride              None              None
d4c130bc            Utah Royals          Royals    Utah Royals FC              None
85c458aa North Carolina Courage         Courage              None              None
e813709a           Houston Dash            Dash              None              None
bf961da0      San Diego Wave FC            Wave              None              None
   ... and more rows not shown

📅 SEASON DATA
----------------------------------------
Season table:
 season_id  season_year league_name
         1         2025        None

⚽ M

In [75]:
# ══════════════════════════════════════════════════════════════════════════════
#  PHASE 2A.1: SAFELY DROP LEGACY TEAM TABLE
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🗑️  PHASE 2A.1: SAFELY DROPPING LEGACY TEAM TABLE")
print("=" * 60)

# 1) Safety check: Verify no foreign keys reference the old Team table
print("🔍 Step 1: Foreign Key Safety Check")

fk_check_queries = [
    "SELECT COUNT(*) as count FROM Match WHERE home_team_id IN (SELECT team_id FROM Team)",
    "SELECT COUNT(*) as count FROM Match WHERE away_team_id IN (SELECT team_id FROM Team)"
]

legacy_references = 0
for i, query in enumerate(fk_check_queries, 1):
    try:
        result = pd.read_sql_query(query, conn)
        count = result.iloc[0]['count']
        legacy_references += count
        print(f"   Check {i}: {count} references found")
    except Exception as e:
        print(f"   Check {i}: Error - {e}")

if legacy_references > 0:
    print(f"⚠️  WARNING: Found {legacy_references} references to legacy Team table!")
    print("   Cannot safely drop. Investigation needed.")
else:
    print("✅ Safe to proceed: No foreign key references to legacy Team table")

# 2) Show contents of legacy Team table
print(f"\n📋 Step 2: Legacy Team Table Contents")
try:
    legacy_team = pd.read_sql_query("SELECT * FROM Team", conn)
    print("About to delete:")
    print(legacy_team.to_string(index=False))
except Exception as e:
    print(f"❌ Unable to fetch Team table: {e}")
    legacy_team = pd.DataFrame()

# 3) Backup the legacy data
print(f"\n💾 Step 3: Creating Backup")
try:
    conn.execute("CREATE TEMPORARY TABLE Team_backup AS SELECT * FROM Team")
    print("✅ Legacy Team data backed up to temporary table")
except Exception as e:
    print(f"❌ Backup failed: {e}")

# 4) Drop the legacy table if safe
if legacy_references == 0 and not legacy_team.empty:
    print(f"\n🗑️  Step 4: Dropping Legacy Team Table")

    try:
        conn.execute("DROP TABLE Team")
        conn.commit()
        print("✅ Legacy Team table successfully dropped!")

        # Verify remaining team-related tables
        tables_after = pd.read_sql_query("""
            SELECT name FROM sqlite_master 
            WHERE type='table' AND (name LIKE '%team%' OR name LIKE '%Team%')
            ORDER BY name
        """, conn)

        print(f"\n📊 Remaining team-related tables:")
        for table in tables_after['name']:
            print(f"   - {table}")

    except Exception as e:
        print(f"❌ Error dropping table: {e}")
        print("Attempting to restore from backup...")
        try:
            conn.execute("CREATE TABLE Team AS SELECT * FROM Team_backup")
            conn.commit()
            print("✅ Restored from backup")
        except Exception as restore_err:
            print(f"❌ Backup restore failed: {restore_err}")

else:
    print(f"\n🛑 SKIPPING: Table drop cancelled due to foreign key references or missing Team table")

# 5) Final verification
print(f"\n🔍 Step 5: Final Verification")

total_tables = pd.read_sql_query(
    "SELECT COUNT(*) as count FROM sqlite_master WHERE type='table'", conn)
print(f"Total tables remaining: {total_tables.iloc[0]['count']}")

# Verify Match-to-teams foreign key still works
try:
    match_test = pd.read_sql_query("""
        SELECT COUNT(*) as count 
        FROM Match m
        JOIN teams t ON m.home_team_id = t.team_id
        LIMIT 1
    """, conn)
    print(f"✅ Match-teams relationship still working: {match_test.iloc[0]['count']} matches verified")
except Exception as e:
    print(f"❌ Match-teams relationship broken: {e}")

conn.close()

print("=" * 60)
print("🎉 LEGACY TEAM TABLE CLEANUP COMPLETE!")


🗑️  PHASE 2A.1: SAFELY DROPPING LEGACY TEAM TABLE
🔍 Step 1: Foreign Key Safety Check
   Check 1: 0 references found
   Check 2: 0 references found
✅ Safe to proceed: No foreign key references to legacy Team table

📋 Step 2: Legacy Team Table Contents
About to delete:
 team_id        team_name team_name_short
       1    Chicago Stars            None
       2 Seattle Reign FC           Reign

💾 Step 3: Creating Backup
✅ Legacy Team data backed up to temporary table

🗑️  Step 4: Dropping Legacy Team Table
✅ Legacy Team table successfully dropped!

📊 Remaining team-related tables:
   - team_seasons
   - teams

🔍 Step 5: Final Verification
Total tables remaining: 6
✅ Match-teams relationship still working: 749 matches verified
🎉 LEGACY TEAM TABLE CLEANUP COMPLETE!


In [76]:
# ══════════════════════════════════════════════════════════════════════════════
#  EXAMINE TEAMS TABLE – COMPREHENSIVE ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3
import pandas as pd

db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🏟️  TEAMS TABLE ANALYSIS")
print("=" * 70)

# 1) Full teams table structure and data
print("📋 Step 1: Complete Teams Table")
teams_full = pd.read_sql_query("SELECT * FROM teams ORDER BY team_name", conn)
print(f"Total teams: {len(teams_full)}")
print("\nAll teams data:")
print(teams_full.to_string(index=False))

# 2) Teams with aliases
print(f"\n🏷️  Step 2: Teams with Aliases")
teams_with_aliases = pd.read_sql_query("""
    SELECT team_name, team_name_short, team_name_alias_1, team_name_alias_2
    FROM teams 
    WHERE team_name_alias_1 IS NOT NULL OR team_name_alias_2 IS NOT NULL
    ORDER BY team_name
""", conn)

if len(teams_with_aliases) > 0:
    print(f"Teams with historical aliases ({len(teams_with_aliases)}):")
    print(teams_with_aliases.to_string(index=False))
else:
    print("No teams have historical aliases")

# 3) Team ID format analysis
print(f"\n🔑 Step 3: Team ID Format Analysis")
team_id_analysis = pd.read_sql_query("""
    SELECT 
        LENGTH(team_id) AS id_length,
        COUNT(*) AS count,
        GROUP_CONCAT(team_name, ', ') AS teams
    FROM teams
    GROUP BY LENGTH(team_id)
    ORDER BY id_length
""", conn)

print("Team ID formats:")
print(team_id_analysis.to_string(index=False))

# 4) Short name coverage
print(f"\n📝 Step 4: Short Name Coverage")
short_name_coverage = pd.read_sql_query("""
    SELECT 
        COUNT(*) AS total_teams,
        COUNT(team_name_short) AS teams_with_short_names,
        COUNT(CASE WHEN team_name_short IS NULL THEN 1 END) AS teams_without_short_names
    FROM teams
""", conn)

print("Short name coverage:")
total_teams = short_name_coverage.iloc[0]['total_teams']
for col in short_name_coverage.columns:
    value = short_name_coverage.iloc[0][col]
    if col != 'total_teams':
        pct = (value / total_teams * 100) if total_teams > 0 else 0
        print(f"  {col}: {value} ({pct:.1f}%)")
    else:
        print(f"  {col}: {value}")

# 5) Teams missing short names
teams_no_short = pd.read_sql_query("""
    SELECT team_name, team_id 
    FROM teams 
    WHERE team_name_short IS NULL
    ORDER BY team_name
""", conn)

if len(teams_no_short) > 0:
    print(f"\n📌 Teams missing short names:")
    print(teams_no_short.to_string(index=False))

# 6) Match usage
print(f"\n⚽ Step 5: Team Usage in Matches")
team_usage = pd.read_sql_query("""
    SELECT 
        t.team_name,
        t.team_name_short,
        COUNT(DISTINCT CASE WHEN m.home_team_id = t.team_id THEN m.match_id END) AS home_matches,
        COUNT(DISTINCT CASE WHEN m.away_team_id = t.team_id THEN m.match_id END) AS away_matches,
        COUNT(DISTINCT CASE WHEN m.home_team_id = t.team_id OR m.away_team_id = t.team_id THEN m.match_id END) AS total_matches
    FROM teams t
    LEFT JOIN Match m ON t.team_id = m.home_team_id OR t.team_id = m.away_team_id
    GROUP BY t.team_id, t.team_name, t.team_name_short
    ORDER BY total_matches DESC, t.team_name
""", conn)

print("Team usage in matches:")
print(team_usage.to_string(index=False))

# 7) Unused teams
unused_teams = team_usage[team_usage['total_matches'] == 0]
if len(unused_teams) > 0:
    print(f"\n⚠️  Teams not used in any matches:")
    print(unused_teams[['team_name', 'team_name_short']].to_string(index=False))
else:
    print(f"\n✅ All teams are used in matches")

conn.close()

# Final summary
print("=" * 70)
print("🎯 TEAMS TABLE SUMMARY:")
print(f"   📊 {len(teams_full)} total teams")
print(f"   🏷️  {len(teams_with_aliases)} teams with historical aliases")
print(f"   ⚽ {len(team_usage[team_usage['total_matches'] > 0])} teams used in matches")
print(f"   📝 {short_name_coverage.iloc[0]['teams_with_short_names']}/{total_teams} teams have short names")


🏟️  TEAMS TABLE ANALYSIS
📋 Step 1: Complete Teams Table
Total teams: 16

All teams data:
 team_id              team_name team_name_short team_name_alias_1 team_name_alias_2
ae38d267          Angel City FC      Angel City              None              None
231a532f                 Bay FC          Bay FC              None              None
ab757728        Boston Breakers        Breakers              None              None
d976a235       Chicago Stars FC   Chicago Stars Chicago Red Stars         Red Stars
8e306dc6              Gotham FC       Gotham FC       Sky Blue FC              None
e813709a           Houston Dash            Dash              None              None
6f666306    Kansas City Current         Current       Kansas City              None
85c458aa North Carolina Courage         Courage              None              None
2a6178ac          Orlando Pride           Pride              None              None
df9a10a1     Portland Thorns FC          Thorns              None      

In [77]:
import sqlite3
import pandas as pd
from pathlib import Path

# Connect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 1. View contents of the Season table
print("🔍 Current Season table:")
season_current = pd.read_sql_query("SELECT * FROM Season ORDER BY season_year", conn)
print(season_current)
print(f"\n📊 Total rows: {len(season_current)}")

# 2. Show Season table schema
print("\n📋 Season table schema:")
schema = pd.read_sql_query("PRAGMA table_info(Season)", conn)
print(schema)

conn.close()


🔍 Current Season table:
   season_id  season_year league_name
0          1         2025        None

📊 Total rows: 1

📋 Season table schema:
   cid         name     type  notnull dflt_value  pk
0    0    season_id  INTEGER        0       None   1
1    1  season_year  INTEGER        0       None   0
2    2  league_name     TEXT        0       None   0


In [78]:
import pandas as pd
import sqlite3

# Replace with your actual database file if needed
# conn = sqlite3.connect("your_database.db")

# 🔍 Check what seasons are currently in the team_seasons table
print("🔍 Seasons referenced in team_seasons table:")
seasons_used = pd.read_sql_query("""
    SELECT DISTINCT season 
    FROM team_seasons 
    ORDER BY season
""", conn)
print(seasons_used)

# 📝 Insert missing seasons (2013-2024) into Season table
print("\n📝 Inserting missing seasons...")
missing_seasons = [(year, year, "NWSL") for year in range(2013, 2025)]

conn.executemany("""
    INSERT OR IGNORE INTO Season (season_id, season_year, league_name) 
    VALUES (?, ?, ?)
""", missing_seasons)
conn.commit()

# ✅ Verify the Season table update
print("\n✅ Updated Season table:")
season_updated = pd.read_sql_query("""
    SELECT * 
    FROM Season 
    ORDER BY season_year
""", conn)
print(season_updated)
print(f"\nTotal rows: {len(season_updated)}")


🔍 Seasons referenced in team_seasons table:


ProgrammingError: Cannot operate on a closed database.

In [79]:
import sqlite3
import pandas as pd

# 🚨 Reconnect to the database at the start of each code block
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 🔍 Check what seasons are currently in the team_seasons table
print("🔍 Seasons referenced in team_seasons table:")
seasons_used = pd.read_sql_query("""
    SELECT DISTINCT season 
    FROM team_seasons 
    ORDER BY season
""", conn)
print(seasons_used)

# 📝 Insert missing seasons (2013–2024) into Season table
print("\n📝 Inserting missing seasons...")
missing_seasons = [(year, year, "NWSL") for year in range(2013, 2025)]

conn.executemany("""
    INSERT OR IGNORE INTO Season (season_id, season_year, league_name) 
    VALUES (?, ?, ?)
""", missing_seasons)
conn.commit()

# ✅ Verify the Season table update
print("\n✅ Updated Season table:")
season_updated = pd.read_sql_query("""
    SELECT * 
    FROM Season 
    ORDER BY season_year
""", conn)
print(season_updated)
print(f"\nTotal rows: {len(season_updated)}")


🔍 Seasons referenced in team_seasons table:
    season
0     2013
1     2014
2     2015
3     2016
4     2017
5     2018
6     2019
7     2020
8     2021
9     2022
10    2023
11    2024
12    2025

📝 Inserting missing seasons...

✅ Updated Season table:
    season_id  season_year league_name
0        2013         2013        NWSL
1        2014         2014        NWSL
2        2015         2015        NWSL
3        2016         2016        NWSL
4        2017         2017        NWSL
5        2018         2018        NWSL
6        2019         2019        NWSL
7        2020         2020        NWSL
8        2021         2021        NWSL
9        2022         2022        NWSL
10       2023         2023        NWSL
11       2024         2024        NWSL
12          1         2025        None

Total rows: 13


In [80]:
import sqlite3
import pandas as pd

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 🔧 Fix the 2025 season to match the consistent pattern
print("🔧 Fixing season 2025 to match the consistent pattern...")

conn.execute("""
    UPDATE Season 
    SET season_id = 2025, league_name = 'NWSL' 
    WHERE season_year = 2025
""")
conn.commit()

# ✅ Verify the fix
print("\n✅ Fixed Season table:")
season_fixed = pd.read_sql_query("""
    SELECT * 
    FROM Season 
    ORDER BY season_year
""", conn)
print(season_fixed)
print(f"\nTotal rows: {len(season_fixed)}")

# 🔍 Check for consistency
all_ids_match = all(row['season_id'] == row['season_year'] for _, row in season_fixed.iterrows())
all_leagues_nwsl = all(row['league_name'] == 'NWSL' for _, row in season_fixed.iterrows())

print(f"\n🔍 All seasons have consistent format: {all_ids_match}")
print(f"🔍 All seasons have NWSL league_name: {all_leagues_nwsl}")


🔧 Fixing season 2025 to match the consistent pattern...

✅ Fixed Season table:
    season_id  season_year league_name
0        2013         2013        NWSL
1        2014         2014        NWSL
2        2015         2015        NWSL
3        2016         2016        NWSL
4        2017         2017        NWSL
5        2018         2018        NWSL
6        2019         2019        NWSL
7        2020         2020        NWSL
8        2021         2021        NWSL
9        2022         2022        NWSL
10       2023         2023        NWSL
11       2024         2024        NWSL
12       2025         2025        NWSL

Total rows: 13

🔍 All seasons have consistent format: True
🔍 All seasons have NWSL league_name: True


In [81]:
import sqlite3
import pandas as pd

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# ✅ Enable foreign key constraints
conn.execute("PRAGMA foreign_keys = ON")

print("🔍 DATABASE INTEGRITY VERIFICATION")
print("=" * 50)

# 1️⃣ Foreign Key Constraint Check
print("\n1️⃣ Foreign Key Constraint Check:")
fk_check = conn.execute("PRAGMA foreign_key_check").fetchall()
if fk_check:
    print(f"❌ Found {len(fk_check)} foreign key violations:")
    for violation in fk_check:
        print(f"   {violation}")
else:
    print("✅ All foreign key constraints are valid")

# 2️⃣ General Integrity Check
print("\n2️⃣ Database Integrity Check:")
integrity = conn.execute("PRAGMA integrity_check").fetchone()[0]
print(f"✅ {integrity}")

# 3️⃣ Table Relationship Verification
print("\n3️⃣ Table Relationship Verification:")

# Match → Season foreign key validation
print("\n   Match → Season references:")
season_refs = pd.read_sql_query("""
    SELECT m.season_id, COUNT(*) as match_count
    FROM Match m
    LEFT JOIN Season s ON m.season_id = s.season_id
    WHERE s.season_id IS NULL
    GROUP BY m.season_id
""", conn)
if not season_refs.empty:
    print("❌ Found matches with invalid season references:")
    print(season_refs)
else:
    print("✅ All matches reference valid seasons")

# team_seasons → Season foreign key validation
print("\n   team_seasons → Season references:")
team_season_refs = pd.read_sql_query("""
    SELECT ts.season, COUNT(*) as team_season_count
    FROM team_seasons ts
    LEFT JOIN Season s ON ts.season = s.season_year
    WHERE s.season_id IS NULL
    GROUP BY ts.season
""", conn)
if not team_season_refs.empty:
    print("❌ Found team_seasons with invalid season references:")
    print(team_season_refs)
else:
    print("✅ All team_seasons reference valid seasons")

# 4️⃣ Table Row Counts Summary
print("\n4️⃣ Data Consistency Summary:")
tables = pd.read_sql_query("""
    SELECT name 
    FROM sqlite_master 
    WHERE type='table'
""", conn)

for table in tables['name']:
    count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table}", conn)['count'][0]
    print(f"   {table}: {count} rows")


🔍 DATABASE INTEGRITY VERIFICATION

1️⃣ Foreign Key Constraint Check:
✅ All foreign key constraints are valid

2️⃣ Database Integrity Check:
✅ ok

3️⃣ Table Relationship Verification:

   Match → Season references:
❌ Found matches with invalid season references:
  season_id  match_count
0      None            3

   team_seasons → Season references:
✅ All team_seasons reference valid seasons

4️⃣ Data Consistency Summary:
   Season: 13 rows
   sqlite_sequence: 1 rows
   teams: 16 rows
   Player: 309 rows
   Match: 749 rows
   team_seasons: 135 rows


In [82]:
import sqlite3
import pandas as pd

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 TEAMS TABLE ANALYSIS")
print("=" * 30)

# 📋 Get all teams
teams_full = pd.read_sql_query("""
    SELECT * 
    FROM teams 
    ORDER BY team_name
""", conn)
print(f"\n📊 Total teams: {len(teams_full)}")
print(teams_full)

# 🏈 Check which teams are used in matches
print("\n🏈 Teams used in matches:")
teams_in_matches = pd.read_sql_query("""
    SELECT DISTINCT t.team_id, t.team_name, t.team_name_short
    FROM teams t
    WHERE t.team_id IN (
        SELECT DISTINCT home_team_id FROM Match WHERE home_team_id IS NOT NULL
        UNION 
        SELECT DISTINCT away_team_id FROM Match WHERE away_team_id IS NOT NULL
    )
    ORDER BY t.team_name
""", conn)
print(f"Teams in matches: {len(teams_in_matches)}")
print(teams_in_matches)

# 🚫 Show teams NOT used in matches
print("\n🚫 Teams NOT used in matches:")
teams_not_in_matches = pd.read_sql_query("""
    SELECT t.team_id, t.team_name, t.team_name_short
    FROM teams t
    WHERE t.team_id NOT IN (
        SELECT DISTINCT home_team_id FROM Match WHERE home_team_id IS NOT NULL
        UNION 
        SELECT DISTINCT away_team_id FROM Match WHERE away_team_id IS NOT NULL
    )
    ORDER BY t.team_name
""", conn)
print(f"Teams not in matches: {len(teams_not_in_matches)}")
print(teams_not_in_matches)


🔍 TEAMS TABLE ANALYSIS

📊 Total teams: 16
     team_id               team_name team_name_short  team_name_alias_1  \
0   ae38d267           Angel City FC      Angel City               None   
1   231a532f                  Bay FC          Bay FC               None   
2   ab757728         Boston Breakers        Breakers               None   
3   d976a235        Chicago Stars FC   Chicago Stars  Chicago Red Stars   
4   8e306dc6               Gotham FC       Gotham FC        Sky Blue FC   
5   e813709a            Houston Dash            Dash               None   
6   6f666306     Kansas City Current         Current        Kansas City   
7   85c458aa  North Carolina Courage         Courage               None   
8   2a6178ac           Orlando Pride           Pride               None   
9   df9a10a1      Portland Thorns FC          Thorns               None   
10  da19ebd1       Racing Louisville      Louisville               None   
11  bf961da0       San Diego Wave FC            Wave      

In [83]:
import sqlite3
import pandas as pd

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 2013 MATCHES ANALYSIS")
print("=" * 30)

# 1️⃣ Get matches with season_id = 2013
print("1️⃣ Matches with season_id = 2013:")
matches_2013_by_id = pd.read_sql_query("""
    SELECT match_id, match_date, home_team_id, away_team_id, 
           home_goals, away_goals, season_id
    FROM Match 
    WHERE season_id = 2013
    ORDER BY match_date
""", conn)
print(f"Count: {len(matches_2013_by_id)}")
if not matches_2013_by_id.empty:
    print(matches_2013_by_id.head(10))

# 2️⃣ Get matches from 2013 by date prefix
print("\n2️⃣ Matches from 2013 by date (regardless of season_id):")
matches_2013_by_date = pd.read_sql_query("""
    SELECT match_id, match_date, home_team_id, away_team_id, 
           home_goals, away_goals, season_id
    FROM Match 
    WHERE match_date LIKE '2013%'
    ORDER BY match_date
""", conn)
print(f"Count: {len(matches_2013_by_date)}")
if not matches_2013_by_date.empty:
    print(matches_2013_by_date.head(10))

# 📊 Expected match count vs actual
expected_matches = 88
actual_matches = len(matches_2013_by_date)
print(f"\n📊 Expected from games_season.md: {expected_matches} regular season matches")
print(f"📊 Found in database: {actual_matches} matches with 2013 dates")

# 📅 Check date range
if not matches_2013_by_date.empty:
    min_date = matches_2013_by_date['match_date'].min()
    max_date = matches_2013_by_date['match_date'].max()
    print(f"\n📅 Date range: {min_date} to {max_date}")


🔍 2013 MATCHES ANALYSIS
1️⃣ Matches with season_id = 2013:
Count: 0

2️⃣ Matches from 2013 by date (regardless of season_id):
Count: 0

📊 Expected from games_season.md: 88 regular season matches
📊 Found in database: 0 matches with 2013 dates


In [84]:
import sqlite3
import pandas as pd

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔍 CURRENT MATCH DATA COVERAGE")
print("=" * 40)

# 📅 Check what years we have match data for
matches_by_year = pd.read_sql_query("""
    SELECT 
        SUBSTR(match_date, 1, 4) AS year,
        COUNT(*) AS match_count,
        MIN(match_date) AS earliest_date,
        MAX(match_date) AS latest_date
    FROM Match 
    WHERE match_date IS NOT NULL
    GROUP BY SUBSTR(match_date, 1, 4)
    ORDER BY year
""", conn)

print("📊 Matches by year:")
print(matches_by_year)

# 📆 Check distribution of matches by season_id
season_distribution = pd.read_sql_query("""
    SELECT 
        season_id,
        COUNT(*) AS match_count
    FROM Match 
    GROUP BY season_id
    ORDER BY season_id
""", conn)

print("\n📊 Matches by season_id:")
print(season_distribution)


🔍 CURRENT MATCH DATA COVERAGE
📊 Matches by year:
   year  match_count earliest_date latest_date
0  2017           32    2017-04-29  2017-10-14
1  2020           29    2020-04-18  2020-10-22
2  2021          126    2021-05-15  2021-11-20
3  2022          137    2022-04-29  2022-10-29
4  2023          137    2023-03-25  2023-11-11
5  2024          191    2024-03-16  2024-11-23
6  2025           97    2025-03-14  2025-06-22

📊 Matches by season_id:
   season_id  match_count
0        NaN            3
1     2017.0           32
2     2020.0           29
3     2021.0          125
4     2022.0          137
5     2023.0          137
6     2024.0          189
7     2025.0           97


In [85]:
import os

print("🔍 AVAILABLE DATA SOURCES")
print("=" * 30)

# 🔎 Look for common data file formats
data_extensions = ['.csv', '.json', '.html', '.xlsx']
for root, dirs, files in os.walk("../"):
    for file in files:
        if any(file.lower().endswith(ext) for ext in data_extensions):
            print(f"📄 {os.path.join(root, file)}")

# 📅 Look for files referencing 2013
print("\n📅 Files mentioning 2013:")
for root, dirs, files in os.walk("../"):
    for file in files:
        if '2013' in file:
            print(f"📄 {os.path.join(root, file)}")


🔍 AVAILABLE DATA SOURCES
📄 ../.venv/lib/python3.13/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/_debug_adapter/debugProtocol.json
📄 ../.venv/lib/python3.13/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/_debug_adapter/debugProtocolCustom.json
📄 ../.venv/lib/python3.13/site-packages/decorator-5.2.1.dist-info/pbr.json
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-log2.csv
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-arcsinh.csv
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-arctanh.csv
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-sin.csv
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-cos.csv
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-cbrt.csv
📄 ../.venv/lib/python3.13/site-packages/numpy/_core/tests/data/umath-validation-set-arctan.csv
📄 ../.ven

In [86]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import os
from pathlib import Path

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🎯 EXTRACTING 2013 MATCHES FROM EXISTING HTML FILES")
print("=" * 55)

# 📄 Path to the 2013 season fixtures HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

if os.path.exists(season_file):
    print(f"📄 Reading: {season_file}")
    with open(season_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # 🔍 Find the fixtures table by ID
    fixtures_table = soup.find('table', {'id': 'sched_2013_10_1'})
    if fixtures_table:
        # 📊 Convert the HTML table to a DataFrame
        fixtures_df = pd.read_html(str(fixtures_table))[0]
        print(f"📊 Found {len(fixtures_df)} fixture rows")
        print(fixtures_df.head())
    else:
        print("❌ Fixtures table not found in the HTML")
else:
    print("❌ 2013 season file not found")


🎯 EXTRACTING 2013 MATCHES FROM EXISTING HTML FILES
📄 Reading: ../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html
❌ Fixtures table not found in the HTML


In [87]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import os

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 season HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("🔍 EXPLORING 2013 HTML STRUCTURE")
print("=" * 40)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 📊 Find all tables and their attributes
print("📊 All tables found:")
tables = soup.find_all('table')
for i, table in enumerate(tables):
    table_id = table.get('id', 'no-id')
    table_class = table.get('class', 'no-class')
    print(f"  Table {i}: id='{table_id}' class='{table_class}'")

print(f"\n🎯 Total tables found: {len(tables)}")

# 🔍 If there are tables, inspect the first one
if tables:
    print(f"\n📄 First table structure:")
    first_table = tables[0]
    print(f"ID: {first_table.get('id')}")
    print(f"Classes: {first_table.get('class')}")

    # 🧪 Try parsing the first table to preview its content
    try:
        df_preview = pd.read_html(str(first_table))[0]
        print(f"Shape: {df_preview.shape}")
        print(df_preview.head())
    except Exception as e:
        print("❌ Could not parse first table as DataFrame")
        print(f"Error: {e}")


🔍 EXPLORING 2013 HTML STRUCTURE
📊 All tables found:
  Table 0: id='sched_all' class='['stats_table', 'sortable', 'min_width']'
  Table 1: id='sched_2013_182_1' class='['stats_table', 'sortable', 'min_width']'
  Table 2: id='sched_2013_182_2' class='['stats_table', 'sortable', 'min_width']'

🎯 Total tables found: 3

📄 First table structure:
ID: sched_all
Classes: ['stats_table', 'sortable', 'min_width']
Shape: (94, 12)
            Round  Day        Date   Time             Home Score       Away  \
0  Regular Season  Sun  2013-04-14  17:00        Red Stars   1–1      Reign   
1  Regular Season  Sun  2013-04-14  18:00      Sky Blue FC   1–0  WNY Flash   
2  Regular Season  Sun  2013-04-14  18:30  Boston Breakers   1–1     Spirit   
3  Regular Season  Sat  2013-04-13  19:35      Kansas City   1–1     Thorns   
4  Regular Season  Sat  2013-04-20  19:00           Spirit   1–1  WNY Flash   

  Attendance                                        Venue         Referee  \
0       1255  Village of L

  df_preview = pd.read_html(str(first_table))[0]


In [88]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 season HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("📊 LOADING ALL 2013 MATCHES FOR EXAMINATION")
print("=" * 45)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 🔍 Find the full match schedule table
fixtures_table = soup.find('table', {'id': 'sched_all'})
matches_2013 = pd.read_html(str(fixtures_table))[0]

# 📈 Summary of matches
print(f"📈 Total 2013 matches loaded: {len(matches_2013)}")
print(f"📋 Columns: {matches_2013.columns.tolist()}")

# 🗂️ Show the full dataset
print(f"\n📄 All 2013 matches:")
print(matches_2013.to_string(index=False))

# 🏆 Summary by Round type
print(f"\n🏆 Matches by Round:")
round_counts = matches_2013['Round'].value_counts()
print(round_counts)

# 📅 Check match date range
print(f"\n📅 Date range:")
print(f"First match: {matches_2013['Date'].min()}")
print(f"Last match: {matches_2013['Date'].max()}")


📊 LOADING ALL 2013 MATCHES FOR EXAMINATION
📈 Total 2013 matches loaded: 94
📋 Columns: ['Round', 'Day', 'Date', 'Time', 'Home', 'Score', 'Away', 'Attendance', 'Venue', 'Referee', 'Match Report', 'Notes']

📄 All 2013 matches:
         Round Day       Date  Time            Home Score            Away Attendance                                       Venue           Referee Match Report               Notes
Regular Season Sun 2013-04-14 17:00       Red Stars   1–1           Reign       1255 Village of Lisle-Benedictine University ...      Josh Wilkens Match Report                 NaN
Regular Season Sun 2013-04-14 18:00     Sky Blue FC   1–0       WNY Flash       2611                                Yurcak Field    John McCloskey Match Report                 NaN
Regular Season Sun 2013-04-14 18:30 Boston Breakers   1–1          Spirit       2634                              Dilboy Stadium    Hernan Aguilar Match Report                 NaN
Regular Season Sat 2013-04-13 19:35     Kansas City   1–

  matches_2013 = pd.read_html(str(fixtures_table))[0]


In [89]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 season HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("🧹 CLEANING 2013 MATCHES DATA")
print("=" * 35)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 🔍 Extract the fixtures table
fixtures_table = soup.find('table', {'id': 'sched_all'})
matches_2013 = pd.read_html(str(fixtures_table))[0]

print(f"📊 Original matches: {len(matches_2013)}")

# 🧽 Remove duplicate header rows (e.g., when 'Round' == 'Round')
clean_matches = matches_2013[matches_2013['Round'] != 'Round'].copy()

print(f"📊 After removing duplicate headers: {len(clean_matches)}")

# 🏆 Show match counts by round
print(f"\n🏆 Matches by Round (cleaned):")
round_counts = clean_matches['Round'].value_counts()
print(round_counts)

# 📄 Preview cleaned data
print(f"\n📄 First few cleaned matches:")
print(clean_matches.head())

print(f"\n📄 Last few cleaned matches:")
print(clean_matches.tail())

# ✅ Verify expected regular season match count
regular_season_count = len(clean_matches[clean_matches['Round'] == 'Regular Season'])
print(f"\n✅ Regular Season matches: {regular_season_count} (expected: 88)")


🧹 CLEANING 2013 MATCHES DATA
📊 Original matches: 94
📊 After removing duplicate headers: 91

🏆 Matches by Round (cleaned):
Round
Regular Season    88
Semifinals         2
Final              1
Name: count, dtype: int64

📄 First few cleaned matches:
            Round  Day        Date   Time             Home Score       Away  \
0  Regular Season  Sun  2013-04-14  17:00        Red Stars   1–1      Reign   
1  Regular Season  Sun  2013-04-14  18:00      Sky Blue FC   1–0  WNY Flash   
2  Regular Season  Sun  2013-04-14  18:30  Boston Breakers   1–1     Spirit   
3  Regular Season  Sat  2013-04-13  19:35      Kansas City   1–1     Thorns   
4  Regular Season  Sat  2013-04-20  19:00           Spirit   1–1  WNY Flash   

  Attendance                                        Venue         Referee  \
0       1255  Village of Lisle-Benedictine University ...    Josh Wilkens   
1       2611                                 Yurcak Field  John McCloskey   
2       2634                               Dilb

  matches_2013 = pd.read_html(str(fixtures_table))[0]


In [90]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("🎯 EXTRACTING MATCH IDs FROM 2013 DATA")
print("=" * 40)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 🔍 Locate and parse the fixtures table
fixtures_table = soup.find('table', {'id': 'sched_all'})
matches_2013 = pd.read_html(str(fixtures_table))[0]

# 🧹 Clean out duplicate header rows
clean_matches = matches_2013[matches_2013['Round'] != 'Round'].copy()
print(f"📊 Clean matches: {len(clean_matches)}")

# 🔗 Find all match report links in the table
match_links = fixtures_table.find_all('a', href=True)
match_report_links = [link for link in match_links if 'matches' in link.get('href', '')]
print(f"🔗 Found {len(match_report_links)} match report links")

# 🆔 Extract 8-digit alphanumeric match IDs from links
match_ids = []
for link in match_report_links:
    href = link.get('href', '')
    match_id_pattern = r'/matches/([a-f0-9]{8})/'
    match = re.search(match_id_pattern, href)
    match_ids.append(match.group(1) if match else None)

print(f"📋 Extracted {len(match_ids)} match IDs")
print(f"📄 First few match IDs: {match_ids[:5]}")

# ➕ Add match IDs to the DataFrame if counts match
if len(match_ids) == len(clean_matches):
    clean_matches['match_id'] = match_ids
    print(f"✅ Successfully added match_id column")

    # 🖼️ Show preview of enriched data
    print(f"\n📊 Sample matches with IDs:")
    print(clean_matches[['Date', 'Home', 'Score', 'Away', 'Round', 'match_id']].head())
else:
    print(f"❌ Mismatch: {len(match_ids)} IDs vs {len(clean_matches)} matches")


🎯 EXTRACTING MATCH IDs FROM 2013 DATA
📊 Clean matches: 91
🔗 Found 273 match report links
📋 Extracted 273 match IDs
📄 First few match IDs: [None, '6aee226c', '6aee226c', None, '5c187984']
❌ Mismatch: 273 IDs vs 91 matches


  matches_2013 = pd.read_html(str(fixtures_table))[0]


In [91]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 season HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("🎯 EXTRACTING MATCH IDs FROM 2013 DATA (REVISED)")
print("=" * 50)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 🔍 Locate the specific match schedule table
fixtures_table = soup.find('table', {'id': 'sched_all'})

# 🧾 Extract all data rows from <tbody>, excluding headers
table_rows = fixtures_table.find('tbody').find_all('tr') if fixtures_table.find('tbody') else fixtures_table.find_all('tr')[1:]
print(f"📊 Found {len(table_rows)} table rows")

# 🆔 Parse each row for match info and match ID
match_data = []
for row in table_rows:
    cells = row.find_all('td')
    if len(cells) >= 11:
        round_cell = cells[0].get_text(strip=True)
        if round_cell != 'Round':
            match_report_cell = cells[10]  # Match Report link cell
            link = match_report_cell.find('a', href=True)

            match_id = None
            if link:
                href = link.get('href', '')
                match_id_pattern = r'/matches/([a-f0-9]{8})/'
                match = re.search(match_id_pattern, href)
                if match:
                    match_id = match.group(1)

            match_info = {
                'round': round_cell,
                'date': cells[2].get_text(strip=True),
                'home': cells[4].get_text(strip=True),
                'score': cells[5].get_text(strip=True),
                'away': cells[6].get_text(strip=True),
                'match_id': match_id
            }
            match_data.append(match_info)

# 📄 Convert to DataFrame
matches_df = pd.DataFrame(match_data)
print(f"📊 Extracted {len(matches_df)} matches with IDs")
print(f"📊 Matches with valid IDs: {matches_df['match_id'].notna().sum()}")

# 🖼️ Preview sample matches
print(f"\n📄 Sample matches with IDs:")
print(matches_df[['date', 'home', 'away', 'round', 'match_id']].head(10))


🎯 EXTRACTING MATCH IDs FROM 2013 DATA (REVISED)
📊 Found 94 table rows
📊 Extracted 91 matches with IDs
📊 Matches with valid IDs: 0

📄 Sample matches with IDs:
    date home    away round match_id
0  17:00  1–1   1,255   Sun     None
1  18:00  1–0   2,611   Sun     None
2  18:30  1–1   2,634   Sun     None
3  19:35  1–1   6,784   Sat     None
4  19:00  1–1   4,569   Sat     None
5  14:00  2–1  16,479   Sun     None
6  19:00  1–2   3,102   Sat     None
7  19:35  2–0   4,064   Fri     None
8  19:35  1–2   4,065   Sat     None
9  19:00  0–2   2,855   Sat     None


In [92]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 season HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("🔍 DEBUGGING HTML STRUCTURE")
print("=" * 35)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 🔍 Locate the match schedule table
fixtures_table = soup.find('table', {'id': 'sched_all'})

# 🧪 Extract and inspect the first data row
first_row = fixtures_table.find('tbody').find('tr') if fixtures_table.find('tbody') else fixtures_table.find_all('tr')[1]

print("📊 First row cell contents:")
cells = first_row.find_all(['td', 'th'])
for i, cell in enumerate(cells):
    cell_text = cell.get_text(strip=True)
    link = cell.find('a', href=True)
    link_href = link.get('href') if link else 'No link'
    print(f"  Cell {i}: '{cell_text}' | Link: {link_href}")

print(f"\n📊 Total cells in first row: {len(cells)}")

# 🔗 Check for match report links in the full table
all_links = fixtures_table.find_all('a', href=True)
match_links = [link for link in all_links if '/matches/' in link.get('href', '')]
print(f"\n🔗 Found {len(match_links)} links with '/matches/' in href")

if match_links:
    print("📄 First few match links:")
    for i, link in enumerate(match_links[:5]):
        print(f"  {i}: {link.get('href')}")


🔍 DEBUGGING HTML STRUCTURE
📊 First row cell contents:
  Cell 0: 'Regular Season' | Link: /en/comps/182/2013/2013-NWSL-Stats
  Cell 1: 'Sun' | Link: No link
  Cell 2: '2013-04-14' | Link: /en/matches/2013-04-14
  Cell 3: '17:00' | Link: No link
  Cell 4: 'Red Stars' | Link: /en/squads/d976a235/2013/Chicago-Red-Stars-Stats
  Cell 5: '1–1' | Link: /en/matches/6aee226c/Chicago-Red-Stars-Seattle-Reign-FC-April-14-2013-NWSL
  Cell 6: 'Reign' | Link: /en/squads/257fad2b/2013/Seattle-Reign-FC-Stats
  Cell 7: '1,255' | Link: No link
  Cell 8: 'Village of Lisle-Benedictine University ...' | Link: No link
  Cell 9: 'Josh Wilkens' | Link: No link
  Cell 10: 'Match Report' | Link: /en/matches/6aee226c/Chicago-Red-Stars-Seattle-Reign-FC-April-14-2013-NWSL
  Cell 11: '' | Link: No link

📊 Total cells in first row: 12

🔗 Found 273 links with '/matches/' in href
📄 First few match links:
  0: /en/matches/2013-04-14
  1: /en/matches/6aee226c/Chicago-Red-Stars-Seattle-Reign-FC-April-14-2013-NWSL
  2: /en/

In [93]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re

# 🚨 Reconnect to the database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

# 📄 Path to the 2013 season HTML file
season_file = "../data/raw_season_pages/2013 NWSL Scores & Fixtures _ FBref.com.html"

print("✅ EXTRACTING MATCH IDs CORRECTLY")
print("=" * 40)

# ✅ Load and parse the HTML file
with open(season_file, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# 🔍 Locate the fixtures table
fixtures_table = soup.find('table', {'id': 'sched_all'})

# 🧾 Extract table rows from <tbody> if available
table_rows = fixtures_table.find('tbody').find_all('tr') if fixtures_table.find('tbody') else fixtures_table.find_all('tr')[1:]

match_data = []
for row in table_rows:
    cells = row.find_all(['td', 'th'])
    if len(cells) >= 11:
        round_text = cells[0].get_text(strip=True)
        if round_text != 'Round':  # Skip header rows

            # 🆔 Extract match ID from Match Report column (cell index 10)
            match_id = None
            match_report_cell = cells[10]
            link = match_report_cell.find('a', href=True)
            if link:
                href = link.get('href', '')
                match_id_pattern = r'/matches/([a-f0-9]{8})/'
                match = re.search(match_id_pattern, href)
                if match:
                    match_id = match.group(1)

            # 📋 Extract full match details
            match_info = {
                'round': cells[0].get_text(strip=True),
                'day': cells[1].get_text(strip=True),
                'date': cells[2].get_text(strip=True),
                'time': cells[3].get_text(strip=True),
                'home': cells[4].get_text(strip=True),
                'score': cells[5].get_text(strip=True),
                'away': cells[6].get_text(strip=True),
                'attendance': cells[7].get_text(strip=True),
                'venue': cells[8].get_text(strip=True),
                'referee': cells[9].get_text(strip=True),
                'match_id': match_id
            }
            match_data.append(match_info)

# 📄 Create DataFrame
matches_df = pd.DataFrame(match_data)
print(f"📊 Extracted {len(matches_df)} matches")
print(f"📊 Matches with valid IDs: {matches_df['match_id'].notna().sum()}")

# 🖼️ Show sample data
print(f"\n📄 Sample matches with correct data:")
print(matches_df[['date', 'home', 'score', 'away', 'round', 'match_id']].head())


✅ EXTRACTING MATCH IDs CORRECTLY
📊 Extracted 91 matches
📊 Matches with valid IDs: 91

📄 Sample matches with correct data:
         date             home score       away           round  match_id
0  2013-04-14        Red Stars   1–1      Reign  Regular Season  6aee226c
1  2013-04-14      Sky Blue FC   1–0  WNY Flash  Regular Season  5c187984
2  2013-04-14  Boston Breakers   1–1     Spirit  Regular Season  eb172ca3
3  2013-04-13      Kansas City   1–1     Thorns  Regular Season  d0426a07
4  2013-04-20           Spirit   1–1  WNY Flash  Regular Season  83edc9ff


In [94]:
print("📊 2013 MATCH DATA - 10 ROWS")
print("=" * 35)

# 🖼️ Show first 10 rows
print(matches_df.head(10).to_string(index=False))

# 🧾 Show column data types
print(f"\n📋 Column Info:")
for col in matches_df.columns:
    print(f"  {col}: {matches_df[col].dtype}")

# 📈 Summary statistics
print(f"\n📊 Total matches: {len(matches_df)}")
print(f"📊 Date range: {matches_df['date'].min()} to {matches_df['date'].max()}")


📊 2013 MATCH DATA - 10 ROWS
         round day       date  time            home score            away attendance                                       venue           referee match_id
Regular Season Sun 2013-04-14 17:00       Red Stars   1–1           Reign      1,255 Village of Lisle-Benedictine University ...      Josh Wilkens 6aee226c
Regular Season Sun 2013-04-14 18:00     Sky Blue FC   1–0       WNY Flash      2,611                                Yurcak Field    John McCloskey 5c187984
Regular Season Sun 2013-04-14 18:30 Boston Breakers   1–1          Spirit      2,634                              Dilboy Stadium    Hernan Aguilar eb172ca3
Regular Season Sat 2013-04-13 19:35     Kansas City   1–1          Thorns      6,784            Shawnee Mission District Stadium        Kari Seitz d0426a07
Regular Season Sat 2013-04-20 19:00          Spirit   1–1       WNY Flash      4,569 Maureen Hendricks Field at Maryland Socc...        Kari Seitz 83edc9ff
Regular Season Sun 2013-04-21 14:00 

In [95]:
import sqlite3
import pandas as pd

# Reconnect to database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🏗️ CREATING MATCH REPORT DATABASE SCHEMA")
print("=" * 45)

# Enable foreign key constraints
conn.execute("PRAGMA foreign_keys = ON")

# Define SQL schema
schema_sql = """
-- 1. Extend Match table with additional fields
ALTER TABLE Match ADD COLUMN home_xg DECIMAL(3,1);
ALTER TABLE Match ADD COLUMN away_xg DECIMAL(3,1);
ALTER TABLE Match ADD COLUMN home_formation VARCHAR(10);
ALTER TABLE Match ADD COLUMN away_formation VARCHAR(10);
ALTER TABLE Match ADD COLUMN temperature DECIMAL(4,1);

-- 2. Match Events
CREATE TABLE IF NOT EXISTS match_events (
    event_id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id VARCHAR(8) NOT NULL,
    minute INTEGER NOT NULL,
    event_type VARCHAR(20) NOT NULL,
    team_id VARCHAR(36),
    player_id VARCHAR(36),
    player_name VARCHAR(255),
    assist_player VARCHAR(255),
    additional_info VARCHAR(255),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (match_id) REFERENCES Match(match_id)
);

-- 3. Team Match Stats
CREATE TABLE IF NOT EXISTS team_match_stats (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id VARCHAR(8) NOT NULL,
    team_id VARCHAR(36) NOT NULL,
    is_home BOOLEAN NOT NULL,
    possession_pct INTEGER,
    passing_accuracy_pct DECIMAL(4,1),
    passes_completed INTEGER,
    passes_attempted INTEGER,
    shots_total INTEGER,
    shots_on_target INTEGER,
    shots_on_target_pct DECIMAL(4,1),
    saves_made INTEGER,
    saves_faced INTEGER,
    saves_pct DECIMAL(4,1),
    fouls INTEGER,
    corners INTEGER,
    crosses INTEGER,
    touches INTEGER,
    tackles INTEGER,
    interceptions INTEGER,
    aerials_won INTEGER,
    clearances INTEGER,
    offsides INTEGER,
    goal_kicks INTEGER,
    throw_ins INTEGER,
    long_balls INTEGER,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (match_id) REFERENCES Match(match_id),
    FOREIGN KEY (team_id) REFERENCES teams(team_id)
);

-- 4. Player Match Stats
CREATE TABLE IF NOT EXISTS player_match_stats (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id VARCHAR(8) NOT NULL,
    team_id VARCHAR(36) NOT NULL,
    player_id VARCHAR(36),
    player_name VARCHAR(255) NOT NULL,
    shirt_number INTEGER,
    nation VARCHAR(3),
    position VARCHAR(10),
    age VARCHAR(10),
    minutes INTEGER DEFAULT 0,
    goals INTEGER DEFAULT 0,
    assists INTEGER DEFAULT 0,
    penalty_kicks_made INTEGER DEFAULT 0,
    penalty_kicks_attempted INTEGER DEFAULT 0,
    shots INTEGER DEFAULT 0,
    shots_on_target INTEGER DEFAULT 0,
    yellow_cards INTEGER DEFAULT 0,
    red_cards INTEGER DEFAULT 0,
    touches INTEG


_IncompleteInputError: incomplete input (135364816.py, line 15)

In [97]:
import sqlite3
import pandas as pd

# Reconnect to database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🏗️ CREATING MATCH REPORT DATABASE SCHEMA")
print("=" * 45)

# Enable foreign key constraints
conn.execute("PRAGMA foreign_keys = ON")

try:
    # 1. Add columns to existing Match table
    print("📝 Adding columns to Match table...")

    # Check existing columns to avoid duplication
    columns = conn.execute("PRAGMA table_info(Match)").fetchall()
    existing_cols = [col[1] for col in columns]

    if 'home_xg' not in existing_cols:
        conn.execute("ALTER TABLE Match ADD COLUMN home_xg DECIMAL(3,1)")
    if 'away_xg' not in existing_cols:
        conn.execute("ALTER TABLE Match ADD COLUMN away_xg DECIMAL(3,1)")
    if 'home_formation' not in existing_cols:
        conn.execute("ALTER TABLE Match ADD COLUMN home_formation VARCHAR(10)")
    if 'away_formation' not in existing_cols:
        conn.execute("ALTER TABLE Match ADD COLUMN away_formation VARCHAR(10)")
    if 'temperature' not in existing_cols:
        conn.execute("ALTER TABLE Match ADD COLUMN temperature DECIMAL(4,1)")

    # 2. Create match_events table
    print("📝 Creating match_events table...")
    conn.execute("""
        CREATE TABLE IF NOT EXISTS match_events (
            event_id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            minute INTEGER NOT NULL,
            event_type VARCHAR(20) NOT NULL,
            team_id VARCHAR(36),
            player_id VARCHAR(36),
            player_name VARCHAR(255),
            assist_player VARCHAR(255),
            additional_info VARCHAR(255),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # 3. Create team_match_stats table
    print("📝 Creating team_match_stats table...")
    conn.execute("""
        CREATE TABLE IF NOT EXISTS team_match_stats (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            team_id VARCHAR(36) NOT NULL,
            is_home BOOLEAN NOT NULL,
            possession_pct INTEGER,
            passing_accuracy_pct DECIMAL(4,1),
            passes_completed INTEGER,
            passes_attempted INTEGER,
            shots_total INTEGER,
            shots_on_target INTEGER,
            shots_on_target_pct DECIMAL(4,1),
            saves_made INTEGER,
            saves_faced INTEGER,
            saves_pct DECIMAL(4,1),
            fouls INTEGER,
            corners INTEGER,
            crosses INTEGER,
            touches INTEGER,
            tackles INTEGER,
            interceptions INTEGER,
            aerials_won INTEGER,
            clearances INTEGER,
            offsides INTEGER,
            goal_kicks INTEGER,
            throw_ins INTEGER,
            long_balls INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # 4. Create player_match_stats table
    print("📝 Creating player_match_stats table...")
    conn.execute("""
        CREATE TABLE IF NOT EXISTS player_match_stats (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            team_id VARCHAR(36) NOT NULL,
            player_id VARCHAR(36),
            player_name VARCHAR(255) NOT NULL,
            shirt_number INTEGER,
            nation VARCHAR(3),
            position VARCHAR(10),
            age VARCHAR(10),
            minutes INTEGER DEFAULT 0,
            goals INTEGER DEFAULT 0,
            assists INTEGER DEFAULT 0,
            penalty_kicks_made INTEGER DEFAULT 0,
            penalty_kicks_attempted INTEGER DEFAULT 0,
            shots INTEGER DEFAULT 0,
            shots_on_target INTEGER DEFAULT 0,
            yellow_cards INTEGER DEFAULT 0,
            red_cards INTEGER DEFAULT 0,
            touches INTEGER DEFAULT 0,
            tackles INTEGER DEFAULT 0,
            interceptions INTEGER DEFAULT 0,
            blocks INTEGER DEFAULT 0,
            xg DECIMAL(3,1) DEFAULT 0.0,
            npxg DECIMAL(3,1) DEFAULT 0.0,
            xag DECIMAL(3,1) DEFAULT 0.0,
            sca INTEGER DEFAULT 0,
            gca INTEGER DEFAULT 0,
            passes_completed INTEGER DEFAULT 0,
            passes_attempted INTEGER DEFAULT 0,
            pass_completion_pct DECIMAL(4,1),
            progressive_passes INTEGER DEFAULT 0,
            carries INTEGER DEFAULT 0,
            progressive_carries INTEGER DEFAULT 0,
            take_ons_attempted INTEGER DEFAULT 0,
            take_ons_successful INTEGER DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # 5. Create match_shots table
    print("📝 Creating match_shots table...")
    conn.execute("""
        CREATE TABLE IF NOT EXISTS match_shots (
            shot_id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            minute INTEGER NOT NULL,
            player_name VARCHAR(255) NOT NULL,
            team_id VARCHAR(36) NOT NULL,
            xg DECIMAL(4,2),
            psxg DECIMAL(4,2),
            outcome VARCHAR(20) NOT NULL,
            distance INTEGER,
            body_part VARCHAR(20),
            notes VARCHAR(255),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    conn.commit()
    print("✅ Successfully created all match report tables!")

    # Show updated database structure
    print("\n📊 Updated database structure:")
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name").fetchall()
    for table in tables:
        table_name = table[0]
        columns = conn.execute(f"PRAGMA table_info({table_name})").fetchall()
        print(f"  📋 {table_name} ({len(columns)} columns)")

except Exception as e:
    print(f"❌ Error creating schema: {e}")

conn.close()


🏗️ CREATING MATCH REPORT DATABASE SCHEMA
📝 Adding columns to Match table...
📝 Creating match_events table...
📝 Creating team_match_stats table...
📝 Creating player_match_stats table...
📝 Creating match_shots table...
✅ Successfully created all match report tables!

📊 Updated database structure:
  📋 Match (18 columns)
  📋 Player (5 columns)
  📋 Season (3 columns)
  📋 match_events (10 columns)
  📋 match_shots (12 columns)
  📋 player_match_stats (36 columns)
  📋 sqlite_sequence (2 columns)
  📋 team_match_stats (27 columns)
  📋 team_seasons (3 columns)
  📋 teams (5 columns)


In [98]:
import sqlite3
import pandas as pd

# Reconnect to database
db_path = "../data/processed/nwsldata.db"
conn = sqlite3.connect(db_path)

print("🔄 RENAMING TABLES TO LOWERCASE CONVENTION")
print("=" * 45)

try:
    # ---------------------------------------------
    # Step 1: Rename core tables using ALTER TABLE
    # ---------------------------------------------
    print("📝 Renaming core tables to lowercase...")

    conn.execute("ALTER TABLE Match RENAME TO match")
    conn.execute("ALTER TABLE Season RENAME TO season")
    conn.execute("ALTER TABLE Player RENAME TO player")
    # 'teams' is already lowercase — skipping

    # ---------------------------------------------
    # Step 2: Drop existing incorrectly-named tables
    # ---------------------------------------------
    print("📝 Dropping and recreating match analysis tables with correct names...")

    conn.execute("DROP TABLE IF EXISTS match_events")
    conn.execute("DROP TABLE IF EXISTS team_match_stats")
    conn.execute("DROP TABLE IF EXISTS player_match_stats")
    conn.execute("DROP TABLE IF EXISTS match_shots")

    # ---------------------------------------------
    # Step 3: Create lowercase-named analysis tables
    # ---------------------------------------------

    # match_event
    conn.execute("""
        CREATE TABLE match_event (
            event_id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            minute INTEGER NOT NULL,
            event_type VARCHAR(20) NOT NULL,
            team_id VARCHAR(36),
            player_id VARCHAR(36),
            player_name VARCHAR(255),
            assist_player VARCHAR(255),
            additional_info VARCHAR(255),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # match_team
    conn.execute("""
        CREATE TABLE match_team (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            team_id VARCHAR(36) NOT NULL,
            is_home BOOLEAN NOT NULL,
            possession_pct INTEGER,
            passing_accuracy_pct DECIMAL(4,1),
            passes_completed INTEGER,
            passes_attempted INTEGER,
            shots_total INTEGER,
            shots_on_target INTEGER,
            shots_on_target_pct DECIMAL(4,1),
            saves_made INTEGER,
            saves_faced INTEGER,
            saves_pct DECIMAL(4,1),
            fouls INTEGER,
            corners INTEGER,
            crosses INTEGER,
            touches INTEGER,
            tackles INTEGER,
            interceptions INTEGER,
            aerials_won INTEGER,
            clearances INTEGER,
            offsides INTEGER,
            goal_kicks INTEGER,
            throw_ins INTEGER,
            long_balls INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # match_player
    conn.execute("""
        CREATE TABLE match_player (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            team_id VARCHAR(36) NOT NULL,
            player_id VARCHAR(36),
            player_name VARCHAR(255) NOT NULL,
            shirt_number INTEGER,
            nation VARCHAR(3),
            position VARCHAR(10),
            age VARCHAR(10),
            minutes INTEGER DEFAULT 0,
            goals INTEGER DEFAULT 0,
            assists INTEGER DEFAULT 0,
            penalty_kicks_made INTEGER DEFAULT 0,
            penalty_kicks_attempted INTEGER DEFAULT 0,
            shots INTEGER DEFAULT 0,
            shots_on_target INTEGER DEFAULT 0,
            yellow_cards INTEGER DEFAULT 0,
            red_cards INTEGER DEFAULT 0,
            touches INTEGER DEFAULT 0,
            tackles INTEGER DEFAULT 0,
            interceptions INTEGER DEFAULT 0,
            blocks INTEGER DEFAULT 0,
            xg DECIMAL(3,1) DEFAULT 0.0,
            npxg DECIMAL(3,1) DEFAULT 0.0,
            xag DECIMAL(3,1) DEFAULT 0.0,
            sca INTEGER DEFAULT 0,
            gca INTEGER DEFAULT 0,
            passes_completed INTEGER DEFAULT 0,
            passes_attempted INTEGER DEFAULT 0,
            pass_completion_pct DECIMAL(4,1),
            progressive_passes INTEGER DEFAULT 0,
            carries INTEGER DEFAULT 0,
            progressive_carries INTEGER DEFAULT 0,
            take_ons_attempted INTEGER DEFAULT 0,
            take_ons_successful INTEGER DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # match_shot
    conn.execute("""
        CREATE TABLE match_shot (
            shot_id INTEGER PRIMARY KEY AUTOINCREMENT,
            match_id VARCHAR(8) NOT NULL,
            minute INTEGER NOT NULL,
            player_name VARCHAR(255) NOT NULL,
            team_id VARCHAR(36) NOT NULL,
            xg DECIMAL(4,2),
            psxg DECIMAL(4,2),
            outcome VARCHAR(20) NOT NULL,
            distance INTEGER,
            body_part VARCHAR(20),
            notes VARCHAR(255),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Commit changes
    conn.commit()
    print("✅ Successfully renamed all tables to lowercase convention!")

    # ---------------------------------------------
    # Step 4: Display updated schema structure
    # ---------------------------------------------
    print("\n📊 Final database structure:")
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name").fetchall()
    for table in tables:
        print(f"  📋 {table[0]}")

except Exception as e:
    print(f"❌ Error renaming tables: {e}")
    conn.rollback()

conn.close()


🔄 RENAMING TABLES TO LOWERCASE CONVENTION
📝 Renaming core tables to lowercase...
❌ Error renaming tables: there is already another table or index with this name: match


In [None]:
import sqlite3
import pandas as pd

# Connect and check what tables actually exist
conn = sqlite3.connect('../data/processed/nwsldata.db')

print("🔍 DEBUGGING: WHAT TABLES ACTUALLY EXIST?")
print("=" * 50)

# Check all tables in sqlite_master
all_tables = pd.read_sql_query("""
    SELECT name, type, sql 
    FROM sqlite_master 
    WHERE type='table' 
    ORDER BY name
""", conn)

print(f"📊 Found {len(all_tables)} tables in sqlite_master:")
for _, row in all_tables.iterrows():
    print(f"  📋 {row['name']}")

# Check if our match report tables were created
missing_tables = []
expected_tables = ['match_event', 'match_team', 'match_player', 'match_shot']

for table in expected_tables:
    try:
        conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
        print(f"✅ {table} exists")
    except:
        missing_tables.append(table)
        print(f"❌ {table} missing")

if missing_tables:
    print(f"\n🔧 CREATING MISSING TABLES: {missing_tables}")
    
    # Create match_event table
    if 'match_event' in missing_tables:
        conn.execute("""
            CREATE TABLE match_event (
                event_id INTEGER PRIMARY KEY AUTOINCREMENT,
                match_id VARCHAR(8) NOT NULL,
                minute INTEGER NOT NULL,
                event_type VARCHAR(20) NOT NULL,
                team_id VARCHAR(36),
                player_id VARCHAR(36),
                player_name VARCHAR(255),
                FOREIGN KEY (match_id) REFERENCES match(match_id)
            )
        """)
        print("✅ Created match_event")

    # Create match_team table  
    if 'match_team' in missing_tables:
        conn.execute("""
            CREATE TABLE match_team (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                match_id VARCHAR(8) NOT NULL,
                team_id VARCHAR(36) NOT NULL,
                is_home BOOLEAN NOT NULL,
                possession_pct INTEGER,
                shots_total INTEGER,
                shots_on_target INTEGER,
                FOREIGN KEY (match_id) REFERENCES match(match_id)
            )
        """)
        print("✅ Created match_team")

    # Create match_player table
    if 'match_player' in missing_tables:
        conn.execute("""
            CREATE TABLE match_player (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                match_id VARCHAR(8) NOT NULL,
                player_id VARCHAR(36),
                player_name VARCHAR(255) NOT NULL,
                minutes INTEGER DEFAULT 0,
                goals INTEGER DEFAULT 0,
                assists INTEGER DEFAULT 0,
                yellow_cards INTEGER DEFAULT 0,
                red_cards INTEGER DEFAULT 0,
                FOREIGN KEY (match_id) REFERENCES match(match_id)
            )
        """)
        print("✅ Created match_player")

    # Create match_shot table
    if 'match_shot' in missing_tables:
        conn.execute("""
            CREATE TABLE match_shot (
                shot_id INTEGER PRIMARY KEY AUTOINCREMENT,
                match_id VARCHAR(8) NOT NULL,
                minute INTEGER NOT NULL,
                player_name VARCHAR(255) NOT NULL,
                xg DECIMAL(4,2),
                outcome VARCHAR(20) NOT NULL,
                FOREIGN KEY (match_id) REFERENCES match(match_id)
            )
        """)
        print("✅ Created match_shot")

    conn.commit()

# Force WAL checkpoint for DBeaver
print("\n🔄 Forcing WAL checkpoint for DBeaver...")
conn.execute("PRAGMA wal_checkpoint")
conn.commit()
conn.close()

print("\n✅ Done! Now in DBeaver:")
print("   1. Close the Tables folder if it's open")
print("   2. Click the arrow to expand Tables again")
print("   3. You should now see all 10 tables!")
print(f"   4. Expected tables: match, player, season, teams, team_seasons")
print(f"   5. Plus new: match_event, match_team, match_player, match_shot")