In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc

In [10]:
DATA_DIR = '../data/train'
SUPP_FILE = '../data/supplementary_data.csv'
REPORT_FILE = '../data/dataset_inspection_results.txt'

In [11]:


def inspect_dataset():
    print(f"üöÄ Starting Deep Inspection of {DATA_DIR}...")
    
    with open(REPORT_FILE, 'w') as f:
        f.write("üèà BIG DATA BOWL 2026: RAW DATASET INSPECTION REPORT\n")
        f.write("====================================================\n\n")

        # ---------------------------------------------------------
        # PART 1 & 2: (Kept brief as per previous logic)
        # ---------------------------------------------------------
        input_files = sorted(glob.glob(os.path.join(DATA_DIR, 'input_*.csv')))
        output_files = sorted(glob.glob(os.path.join(DATA_DIR, 'output_*.csv')))
        
        f.write(f"Files Found: {len(input_files)} Input pairs, {len(output_files)} Output pairs.\n\n")

        # ---------------------------------------------------------
        # PART 3: DEEP DIVE SAMPLE (WEEK 1) - WITH DROPOUT ANALYSIS
        # ---------------------------------------------------------
        f.write("PART 3: WEEK 1 DEEP DIVE & SPARSITY PROOF\n")
        f.write("-----------------------------------------\n")
        
        if input_files and output_files:
            # Load Week 1 Pair
            w1_input_path = input_files[0]
            w1_output_path = output_files[0]
            
            print("   Loading Week 1 data for analysis...")
            df_in = pd.read_csv(w1_input_path, low_memory=False)
            df_out = pd.read_csv(w1_output_path, low_memory=False)
            
            # --- 3.7 ATTRITION & SPARSITY ANALYSIS (NEW) ---
            f.write("3.7 PLAYER DROPOUT & TEMPORAL SPARSITY ANALYSIS\n")
            f.write("   (Proving that pre_throw frames > post_throw and identifying missing players)\n\n")

            # A. Temporal Sparsity (Frame Counts)
            # Group by play and count unique frames
            in_frames = df_in.groupby(['game_id', 'play_id'])['frame_id'].nunique()
            out_frames = df_out.groupby(['game_id', 'play_id'])['frame_id'].nunique()
            
            # Merge series to compare
            frame_comp = pd.concat([in_frames, out_frames], axis=1, keys=['in_frames', 'out_frames']).dropna()
            frame_comp['ratio_in_to_out'] = frame_comp['in_frames'] / frame_comp['out_frames']
            
            f.write("   A. Temporal Differences (Frame Counts):\n")
            f.write(f"      - Avg Pre-Throw Frames:  {frame_comp['in_frames'].mean():.2f}\n")
            f.write(f"      - Avg Post-Throw Frames: {frame_comp['out_frames'].mean():.2f}\n")
            f.write(f"      - Conclusion: On average, input files are {frame_comp['ratio_in_to_out'].mean():.2f}x longer than output files.\n\n")

            # B. Entity Dropout (Players Disappearing)
            f.write("   B. Entity Dropout (Players vanishing in Output):\n")
            
            # Get unique NFL IDs per play for Input and Output
            # We filter out NaN nfl_ids (usually the football)
            in_players = df_in.dropna(subset=['nfl_id']).groupby(['game_id', 'play_id'])['nfl_id'].apply(set)
            out_players = df_out.dropna(subset=['nfl_id']).groupby(['game_id', 'play_id'])['nfl_id'].apply(set)
            
            # Merge to compare sets
            dropout_df = pd.concat([in_players, out_players], axis=1, keys=['in_set', 'out_set']).dropna()
            
            # Calculate dropout
            # "Dropout" = IDs in Input that are NOT in Output
            dropout_df['missing_ids'] = dropout_df.apply(lambda x: x['in_set'] - x['out_set'], axis=1)
            dropout_df['missing_count'] = dropout_df['missing_ids'].apply(len)
            
            total_plays_w1 = len(dropout_df)
            plays_with_dropout = len(dropout_df[dropout_df['missing_count'] > 0])
            avg_missing = dropout_df['missing_count'].mean()
            max_missing = dropout_df['missing_count'].max()
            
            f.write(f"      - Total Plays Analyzed: {total_plays_w1}\n")
            f.write(f"      - Plays with AT LEAST ONE missing player: {plays_with_dropout} ({plays_with_dropout/total_plays_w1*100:.2f}%)\n")
            f.write(f"      - Avg Missing Players per Play: {avg_missing:.2f}\n")
            f.write(f"      - Max Missing Players in a single play: {max_missing}\n\n")
            
            # C. Identify WHO is disappearing (Roles)
            if plays_with_dropout > 0:
                f.write("   C. Profile of Vanishing Players (Sample):\n")
                # Get a sample play with high dropout
                bad_play = dropout_df.sort_values('missing_count', ascending=False).index[0]
                missing_ids_list = list(dropout_df.loc[bad_play, 'missing_ids'])
                
                f.write(f"      Sample Play (Game {bad_play[0]}, Play {bad_play[1]}):\n")
                f.write(f"      Missing {len(missing_ids_list)} players in Output.\n")
                
                # Look up these IDs in the input file to see their roles
                missing_details = df_in[
                    (df_in.game_id == bad_play[0]) & 
                    (df_in.play_id == bad_play[1]) & 
                    (df_in.nfl_id.isin(missing_ids_list))
                ][['nfl_id', 'player_name', 'player_role', 'player_position']].drop_duplicates()
                
                f.write(missing_details.to_string(index=False))
                f.write("\n\n")
                
                # Check specific hypothesis: Do specific positions disappear more?
                # (Simple check: are they mostly linemen?)
                linemen = missing_details['player_position'].isin(['T', 'G', 'C', 'DT', 'NT', 'DE']).mean()
                f.write(f"      Observation: {linemen*100:.1f}% of missing players in this sample are Linemen.\n")
                f.write("      (Hypothesis: Tracking data often drops interior linemen post-throw if they are not near the play.)\n\n")

            # Clean up
            del df_in, df_out, dropout_df, in_frames, out_frames
            gc.collect()

        f.write("Inspection complete. See 'data_inspection_results.txt' for details.")

    print(f"‚úÖ Inspection Complete. Results saved to {REPORT_FILE}")

if __name__ == "__main__":
    inspect_dataset()

üöÄ Starting Deep Inspection of ../data/train...
   Loading Week 1 data for analysis...
‚úÖ Inspection Complete. Results saved to ../data/dataset_inspection_results.txt


In [20]:
REPORT_FILE = '../data/dataset_inspection_report_v2.txt'


def inspect_dataset():
    print(f"üöÄ Starting Deep Inspection of {DATA_DIR}...")
    
    with open(REPORT_FILE, 'w') as f:
        f.write("üèà BIG DATA BOWL 2026: RAW DATASET INSPECTION REPORT\n")
        f.write(f"Generated on: {pd.Timestamp.now()}\n")
        f.write("====================================================\n\n")

        # ---------------------------------------------------------
        # PART 1: SUPPLEMENTARY DATA INSPECTION
        # ---------------------------------------------------------
        f.write("PART 1: SUPPLEMENTARY DATA (METADATA)\n")
        f.write("-------------------------------------\n")
        
        if os.path.exists(SUPP_FILE):
            supp_df = pd.read_csv(SUPP_FILE)
            f.write(f"File: {SUPP_FILE}\n")
            f.write(f"Shape: {supp_df.shape}\n")
            f.write(f"Columns: {list(supp_df.columns)}\n\n")
            
            # Critical Field Analysis
            f.write(">>> Coverage Types Distribution:\n")
            f.write(supp_df['team_coverage_man_zone'].value_counts(dropna=False).to_string())
            f.write("\n\n")
            
            f.write(">>> Pass Results:\n")
            f.write(supp_df['pass_result'].value_counts(dropna=False).to_string())
            f.write("\n\n")
            
            f.write(">>> Null Values in Critical Columns:\n")
            critical_cols = ['game_id', 'play_id', 'team_coverage_type', 'pass_length']
            print(supp_df.columns)
            f.write(supp_df[critical_cols].isnull().sum().to_string())
            print("sdfwefwE")
            f.write("\n\n")
        else:
            f.write(f"‚ùå ERROR: Supplementary file not found at {SUPP_FILE}\n\n")

        # ---------------------------------------------------------
        # PART 2: TRACKING DATA FILE TOPOLOGY
        # ---------------------------------------------------------
        f.write("PART 2: TRACKING DATA TOPOLOGY\n")
        f.write("------------------------------\n")
        input_files = sorted(glob.glob(os.path.join(DATA_DIR, 'input_*.csv')))
        output_files = sorted(glob.glob(os.path.join(DATA_DIR, 'output_*.csv')))
        
        f.write(f"Total Input Files (Pre-Throw): {len(input_files)}\n")
        f.write(f"Total Output Files (Post-Throw): {len(output_files)}\n\n")
        
        # ---------------------------------------------------------
        # PART 3: DEEP DIVE SAMPLE (WEEK 1)
        # ---------------------------------------------------------
        f.write("PART 3: DEEP SAMPLE ANALYSIS (WEEK 1)\n")
        f.write("-------------------------------------\n")
        
        if input_files and output_files:
            # Load Week 1 Pair
            w1_input_path = input_files[0]
            w1_output_path = output_files[0]
            
            f.write(f"Loading Sample Pair:\n  - {os.path.basename(w1_input_path)}\n  - {os.path.basename(w1_output_path)}\n\n")
            
            df_in = pd.read_csv(w1_input_path, low_memory=False)
            df_out = pd.read_csv(w1_output_path, low_memory=False)
            
            # 3.1 Column Integrity
            f.write("3.1 Column Consistency Check:\n")
            in_cols = set(df_in.columns)
            out_cols = set(df_out.columns)
            
            if in_cols == out_cols:
                f.write("‚úÖ Input and Output files have identical schemas.\n")
                f.write(f"Columns: {list(df_in.columns)}\n\n")
            else:
                f.write("‚ö†Ô∏è Schema Mismatch detected!\n")
                f.write(f"Only in Input: {in_cols - out_cols}\n")
                f.write(f"Only in Output: {out_cols - in_cols}\n\n")

            # 3.2 Basic Stats
            f.write("3.2 Sample Statistics (Week 1):\n")
            f.write(f"Input Rows (Pre-Throw): {len(df_in):,}\n")
            f.write(f"Output Rows (Post-Throw): {len(df_out):,}\n")
            
            # 3.3 Stitching Logic Check
            f.write("\n3.3 Stitching Logic Inspection:\n")
            sample_play = df_in[['game_id', 'play_id']].iloc[0]
            g_id, p_id = sample_play['game_id'], sample_play['play_id']
            
            sample_in = df_in[(df_in.game_id == g_id) & (df_in.play_id == p_id)]
            sample_out = df_out[(df_out.game_id == g_id) & (df_out.play_id == p_id)]
            
            max_frame_in = sample_in['frame_id'].max()
            min_frame_out = sample_out['frame_id'].min()
            
            f.write(f"Sample Play ({g_id} - {p_id}):\n")
            f.write(f"  - Max Input Frame: {max_frame_in}\n")
            f.write(f"  - Min Output Frame: {min_frame_out} (Should be 1)\n")
            f.write(f"  - Gap Analysis: Input ends at {max_frame_in}, Output starts at {min_frame_out}. Offset required.\n\n")

            # 3.4 Player Roles
            f.write("3.4 Player Roles in Tracking Data:\n")
            roles = df_in['player_role'].dropna().unique()
            f.write(f"{', '.join(roles)}\n\n")
            
            # 3.5 Coordinate Range (Normalization Check)
            f.write("3.5 Coordinate Bounds (Raw Data):\n")
            f.write(f"  - X Range: {df_in['x'].min()} to {df_in['x'].max()}\n")
            f.write(f"  - Y Range: {df_in['y'].min()} to {df_in['y'].max()}\n")
            f.write("  *Note: If X goes > 100, standard NFL coords (0-120) are likely used.*\n\n")

            # 3.6 Ball Landing Spot availability
            f.write("3.6 Ball Landing Spot Availability:\n")
            print(df_in.columns, "Sdfwe")
            null_ball = df_in['ball_land_x'].isnull().mean() * 100
            f.write(f"  - Percentage of rows with Missing 'ball_land_x': {null_ball:.2f}%\n")
            if null_ball > 0:
                f.write("  - (This is expected if ball_land is only populated on specific frames or plays)\n\n")

            # Clean up memory
            del df_in, df_out, sample_in, sample_out
            gc.collect()

        # ---------------------------------------------------------
        # PART 4: AGGREGATE SCAN (ALL FILES)
        # ---------------------------------------------------------
        f.write("PART 4: AGGREGATE SCAN (ALL WEEKS)\n")
        f.write("----------------------------------\n")
        
        total_rows = 0
        total_games = set()
        total_plays = set() # Store tuples (game_id, play_id)
        
        # Iterate through pairs to save memory
        for i_path, o_path in zip(input_files, output_files):
            week_num = i_path.split('_')[-1].split('.')[0] # e.g., w01
            print(f"Scanning {week_num}...")
            
            # Just read minimal columns to get counts
            cols_to_load = ['game_id', 'play_id']
            
            # Read chunks to avoid memory spike
            df_i = pd.read_csv(i_path, usecols=cols_to_load, low_memory=False)
            df_o = pd.read_csv(o_path, usecols=cols_to_load, low_memory=False)
            
            current_rows = len(df_i) + len(df_o)
            total_rows += current_rows
            
            # Update Unique Sets
            week_games = set(df_i['game_id'].unique())
            week_plays = set(zip(df_i['game_id'], df_i['play_id']))
            
            total_games.update(week_games)
            total_plays.update(week_plays)
            
            f.write(f"  - {week_num}: {current_rows:,} rows | {len(week_games)} games | {len(week_plays)} plays\n")
            
            del df_i, df_o
            gc.collect()

        f.write(f"\nTOTAL DATASET STATS:\n")
        f.write(f"  - Total Rows: {total_rows:,}\n")
        f.write(f"  - Unique Games: {len(total_games)}\n")
        f.write(f"  - Unique Plays: {len(total_plays)}\n")

    print(f"‚úÖ Inspection Complete. Report saved to {REPORT_FILE}")

if __name__ == "__main__":
    try:
        inspect_dataset()
    except Exception as e:
        print(f"‚ùå Failed to run inspection: {e}")

üöÄ Starting Deep Inspection of ../data/train...
Index(['game_id', 'season', 'week', 'game_date', 'game_time_eastern',
       'home_team_abbr', 'visitor_team_abbr', 'play_id', 'play_description',
       'quarter', 'game_clock', 'down', 'yards_to_go', 'possession_team',
       'defensive_team', 'yardline_side', 'yardline_number',
       'pre_snap_home_score', 'pre_snap_visitor_score',
       'play_nullified_by_penalty', 'pass_result', 'pass_length',
       'offense_formation', 'receiver_alignment', 'route_of_targeted_receiver',
       'play_action', 'dropback_type', 'dropback_distance',
       'pass_location_type', 'defenders_in_the_box', 'team_coverage_man_zone',
       'team_coverage_type', 'penalty_yards', 'pre_penalty_yards_gained',
       'yards_gained', 'expected_points', 'expected_points_added',
       'pre_snap_home_team_win_probability',
       'pre_snap_visitor_team_win_probability',
       'home_team_win_probability_added', 'visitor_team_win_probility_added'],
      dtype='o

  supp_df = pd.read_csv(SUPP_FILE)


Index(['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id',
       'play_direction', 'absolute_yardline_number', 'player_name',
       'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a',
       'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y'],
      dtype='object') Sdfwe
Scanning w01...
Scanning w02...
Scanning w03...
Scanning w04...
Scanning w05...
Scanning w06...
Scanning w07...
Scanning w08...
Scanning w09...
Scanning w10...
Scanning w11...
Scanning w12...
Scanning w13...
Scanning w14...
Scanning w15...
Scanning w16...
Scanning w17...
Scanning w18...
‚úÖ Inspection Complete. Report saved to ../data/dataset_inspection_report_v2.txt


In [24]:
OUTPUT_FILE = '../data/dataset_dropout_logic_inspection.txt'

def inspect_dropout_logic():
    print("üïµÔ∏è Analyzing Player Dropout based on Ball Landing Spot...")
    
    # Load Week 1 data
    input_files = sorted(glob.glob(os.path.join(DATA_DIR, 'input_*.csv')))
    output_files = sorted(glob.glob(os.path.join(DATA_DIR, 'output_*.csv')))
    
    # We use low_memory=False to ensure columns don't get mixed types
    df_in = pd.read_csv(input_files[0], low_memory=False)
    df_out = pd.read_csv(output_files[0], low_memory=False)
    
    results = []
    
    # Analyze a sample of 100 plays
    unique_plays = df_in[['game_id', 'play_id']].drop_duplicates().head(100).values
    
    for g_id, p_id in unique_plays:
        # 1. Get the "Before" Snapshot: The LAST frame of the Input file
        # This is the split-second BEFORE the throw (or the instant of the throw in input data).
        play_in = df_in[(df_in.game_id == g_id) & (df_in.play_id == p_id)]
        if play_in.empty: continue
            
        last_frame_id = play_in['frame_id'].max()
        snapshot = play_in[play_in['frame_id'] == last_frame_id].copy()
        
        # 2. Get the Ball Landing Spot (From the Input columns)
        # Note: ball_land_x/y only exist in the Input file.
        if 'ball_land_x' not in snapshot.columns:
            continue
            
        ball_land_x = snapshot['ball_land_x'].iloc[0]
        ball_land_y = snapshot['ball_land_y'].iloc[0]
        
        if pd.isna(ball_land_x) or pd.isna(ball_land_y):
            continue

        # 3. Who Survived? (Check Output Frame 1)
        # We verify who made it to the "Moment of the Throw" data.
        play_out = df_out[(df_out.game_id == g_id) & (df_out.play_id == p_id)]
        survivor_ids = set(play_out['nfl_id'].unique())
        
        # 4. Calculate Distance to the LANDING SPOT
        # We calculate this on the Snapshot because the "Ghosts" are still present here.
        snapshot['dist_to_landing'] = np.sqrt(
            (snapshot['x'] - ball_land_x)**2 + 
            (snapshot['y'] - ball_land_y)**2
        )
        
        # 5. Tag Survivors vs Ghosts
        snapshot = snapshot.dropna(subset=['nfl_id'])
        snapshot['status'] = snapshot['nfl_id'].apply(lambda x: 'Survivor' if x in survivor_ids else 'Ghost')
        
        results.append(snapshot[['game_id', 'play_id', 'nfl_id', 'player_role', 'dist_to_landing', 'status']])

    # --- AGGREGATE RESULTS ---
    if not results:
        print("No valid plays found.")
        return

    all_data = pd.concat(results)
    
    with open(OUTPUT_FILE, 'w') as f:
        f.write("THE IRRELEVANCE HYPOTHESIS TEST\n")
        f.write("===============================\n")
        f.write("Hypothesis: Players are dropped in the Output file because they are far from the landing spot.\n\n")
        
        # 1. Compare Distances
        stats = all_data.groupby('status')['dist_to_landing'].describe()
        f.write("1. Distance to Landing Spot (Stats):\n")
        f.write(stats.to_string())
        f.write("\n\n")
        
        # 2. The Danger Check
        # Are there Ghosts within 15 yards of where the ball landed?
        THRESHOLD = 15.0
        danger_ghosts = all_data[(all_data['status'] == 'Ghost') & (all_data['dist_to_landing'] < THRESHOLD)]
        
        f.write(f"2. Danger Ghosts (< {THRESHOLD} yards from landing):\n")
        f.write(f"   Count: {len(danger_ghosts)}\n")
        
        if len(danger_ghosts) > 0:
            f.write("   ‚ö†Ô∏è WARNING: Some players close to the catch point are disappearing!\n")
            f.write("   Sample of these missing players:\n")
            f.write(danger_ghosts[['game_id', 'play_id', 'player_role', 'dist_to_landing']].head(15).to_string())
        else:
            f.write("   ‚úÖ SUCCESS: All missing players were > 15 yards from the landing spot.\n")
            f.write("   (This confirms the dataset filters out irrelevant players.)\n")
            
        f.write("\n\n")

        # 3. Role Analysis
        if 'player_role' in all_data.columns:
            targets = all_data[all_data['player_role'] == 'Targeted Receiver']
            missing_targets = targets[targets['status'] == 'Ghost']
            
            f.write(f"3. Missing Targeted Receivers: {len(missing_targets)}\n")
            if len(missing_targets) > 0:
                f.write("   ‚ùå CRITICAL: Targeted Receivers are dropping out!\n")
            else:
                f.write("   ‚úÖ PASS: Targeted Receiver always survives.\n")

    print(f"Results saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    inspect_dropout_logic()

üïµÔ∏è Analyzing Player Dropout based on Ball Landing Spot...
Results saved to ../data/dataset_dropout_logic_inspection.txt


In [25]:
OUTPUT_FILE = '../data/physics_vars_inspection.txt'

def inspect_physics_vars():
    print("üî¨ Inspecting Physics Variables (s, a, dir, o)...")
    
    input_files = sorted(glob.glob(os.path.join(DATA_DIR, 'input_*.csv')))
    output_files = sorted(glob.glob(os.path.join(DATA_DIR, 'output_*.csv')))
    
    # Load Week 1 pair
    df_in = pd.read_csv(input_files[0], nrows=1000) # Just need headers and a few rows
    df_out = pd.read_csv(output_files[0], nrows=1000)
    
    PHYSICS_COLS = ['s', 'a', 'dir', 'o']
    
    with open(OUTPUT_FILE, 'w') as f:
        f.write("PHYSICS VARIABLES AVAILABILITY REPORT\n")
        f.write("=======================================\n\n")
        
        # 1. Check Column Existence
        f.write("1. Column Existence Check:\n")
        
        in_missing = [c for c in PHYSICS_COLS if c not in df_in.columns]
        out_missing = [c for c in PHYSICS_COLS if c not in df_out.columns]
        
        f.write(f"   Input Files Missing: {in_missing if in_missing else 'None (All present)'}\n")
        f.write(f"   Output Files Missing: {out_missing if out_missing else 'None (All present)'}\n\n")
        
        # 2. Check for Nulls (in case columns exist but are empty)
        f.write("2. Data Content Check (Are they full of NaNs?):\n")
        
        f.write("   [INPUT FILES]\n")
        for col in PHYSICS_COLS:
            if col in df_in.columns:
                null_pct = df_in[col].isnull().mean() * 100
                f.write(f"     - {col}: {null_pct:.1f}% Nulls\n")
            else:
                f.write(f"     - {col}: COLUMN MISSING\n")
                
        f.write("\n   [OUTPUT FILES]\n")
        for col in PHYSICS_COLS:
            if col in df_out.columns:
                null_pct = df_out[col].isnull().mean() * 100
                f.write(f"     - {col}: {null_pct:.1f}% Nulls\n")
            else:
                f.write(f"     - {col}: COLUMN MISSING\n")
        
        f.write("\n")

        # 3. Implication Analysis
        if out_missing or any(df_out[c].isnull().all() for c in PHYSICS_COLS if c in df_out.columns):
            f.write("3. CRITICAL IMPLICATIONS:\n")
            f.write("   The Output files lack motion vectors.\n")
            f.write("   IMPACT 1 (Animation): You cannot use 'dir' arrows in visualization.\n")
            f.write("   IMPACT 2 (Normalization): 'data_preprocessor.py' tries to flip 'dir'/'o' for left-moving plays.\n")
            f.write("             If these columns are missing, that code block will crash.\n")
            f.write("   IMPACT 3 (Physics): You must DERIVE speed/direction from x,y changes if needed.\n")
        else:
            f.write("3. Status: Green. All physics variables are available.\n")

    print(f"Inspection complete. Check {OUTPUT_FILE}")

if __name__ == "__main__":
    inspect_physics_vars()

üî¨ Inspecting Physics Variables (s, a, dir, o)...
Inspection complete. Check ../data/physics_vars_inspection.txt
