# Analysing the Data:

Importing the ODI ball by ball data from 2015 onwards

In [12]:
from pathlib import Path
import pandas as pd

# Get the project root dynamically (the parent of the notebook folder)
project_root = Path(__file__).resolve().parents[1] if "__file__" in locals() else Path().resolve().parents[0]

# Build the path to the CSV
data_path = project_root / "data" / "odi_bbb_recent.csv"

# Read the CSV
data = pd.read_csv(data_path)

print(f"Loaded file from: {data_path}")

data.head()

  data = pd.read_csv(data_path)


Loaded file from: C:\Github\dst_assessment1\YoussefO\data\odi_bbb_recent.csv


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,player_dismissed,other_wicket_type,other_player_dismissed,winner,toss_winner,toss_decision,team1,team2,bowling_style,batting_style
0,749781,2014/15,2015-01-11,Hagley Oval,1,0.1,Sri Lanka,New Zealand,FDM Karunaratne,TM Dilshan,...,,,,New Zealand,Sri Lanka,bat,New Zealand,Sri Lanka,Left arm Fast medium,Left hand Bat
1,749781,2014/15,2015-01-11,Hagley Oval,1,0.2,Sri Lanka,New Zealand,TM Dilshan,FDM Karunaratne,...,,,,New Zealand,Sri Lanka,bat,New Zealand,Sri Lanka,Left arm Fast medium,Right hand Bat
2,749781,2014/15,2015-01-11,Hagley Oval,1,0.3,Sri Lanka,New Zealand,TM Dilshan,FDM Karunaratne,...,,,,New Zealand,Sri Lanka,bat,New Zealand,Sri Lanka,Left arm Fast medium,Right hand Bat
3,749781,2014/15,2015-01-11,Hagley Oval,1,0.4,Sri Lanka,New Zealand,TM Dilshan,FDM Karunaratne,...,,,,New Zealand,Sri Lanka,bat,New Zealand,Sri Lanka,Left arm Fast medium,Right hand Bat
4,749781,2014/15,2015-01-11,Hagley Oval,1,0.5,Sri Lanka,New Zealand,TM Dilshan,FDM Karunaratne,...,,,,New Zealand,Sri Lanka,bat,New Zealand,Sri Lanka,Left arm Fast medium,Right hand Bat


In [6]:
data.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'winner', 'toss_winner', 'toss_decision',
       'team1', 'team2', 'bowling_style', 'batting_style'],
      dtype='object')

In [7]:
data.shape

(620647, 29)

## Feature Generation:

In [None]:

# Convert ball column to numeric over (e.g., 1.1 -> over 1)
data['over'] = data['ball'].astype(float).apply(lambda x: int(x))

# Filter for first 10 overs only
data_10_overs = data[data['over'] < 10].copy()

# Calculate total runs per ball (runs_off_bat + extras)
data_10_overs['total_runs'] = data_10_overs['runs_off_bat'] + data_10_overs['extras']

# Identify boundaries
data_10_overs['is_four'] = (data_10_overs['runs_off_bat'] == 4).astype(int)
data_10_overs['is_six'] = (data_10_overs['runs_off_bat'] == 6).astype(int)
data_10_overs['is_boundary'] = ((data_10_overs['runs_off_bat'] == 4) | 
                                    (data_10_overs['runs_off_bat'] == 6)).astype(int)

# Identify dot balls (no runs scored)
data_10_overs['is_dot'] = (data_10_overs['total_runs'] == 0).astype(int)

# Identify wickets
data_10_overs['is_wicket'] = data_10_overs['wicket_type'].notna().astype(int)

# Aggregate statistics by match and innings
match_innings_stats = data_10_overs.groupby(['match_id', 'innings']).agg({
    'total_runs': 'sum',           # Total runs in first 10 overs
    'runs_off_bat': 'sum',         # Runs off bat (excluding extras)
    'extras': 'sum',               # Total extras
    'wides': 'sum',                # Wide balls
    'noballs': 'sum',              # No balls
    'is_wicket': 'sum',            # Wickets lost
    'is_dot': 'sum',               # Dot balls
    'is_four': 'sum',              # Number of fours
    'is_six': 'sum',               # Number of sixes
    'is_boundary': 'sum',          # Total boundaries
    'ball': 'count'                # Total balls bowled
}).reset_index()

# Rename columns for clarity
match_innings_stats.columns = [
    'match_id', 'innings', 'total_runs', 'runs_off_bat', 'total_extras',
    'wides', 'noballs', 'wickets', 'dot_balls', 'fours', 'sixes', 
    'boundaries', 'balls_bowled'
]



Now that we have sucessfully preprocessed the data.
We now engineer some metrics to help classify which team is most likely to win.

In [None]:
# Calculate derived metrics
match_innings_stats['run_rate'] = (match_innings_stats['total_runs'] / 
                                    match_innings_stats['balls_bowled'] * 6)
match_innings_stats['dot_ball_percentage'] = (match_innings_stats['dot_balls'] / 
                                                match_innings_stats['balls_bowled'] * 100)
match_innings_stats['boundary_percentage'] = (match_innings_stats['boundaries'] / 
                                                match_innings_stats['balls_bowled'] * 100)
match_innings_stats['extras_per_over'] = (match_innings_stats['total_extras'] / 10)



We then make sure we have seperate data for the first 10 balls of each innings

In [None]:


# Separate innings 1 and innings 2
innings1 = match_innings_stats[match_innings_stats['innings'] == 1].copy()
innings2 = match_innings_stats[match_innings_stats['innings'] == 2].copy()

# Rename columns to distinguish between innings
innings1_cols = {col: f'team1_{col}' for col in innings1.columns 
                    if col not in ['match_id', 'innings']}
innings2_cols = {col: f'team2_{col}' for col in innings2.columns 
                    if col not in ['match_id', 'innings']}

innings1 = innings1.rename(columns=innings1_cols).drop('innings', axis=1)
innings2 = innings2.rename(columns=innings2_cols).drop('innings', axis=1)

# Merge innings data
match_features = pd.merge(innings1, innings2, on='match_id', how='inner')

We get the match metadata (season, start_date, ...) and merge it with our match features for a complete dataset with features and metadata.

In [67]:

# Get match metadata (take first row per match from original data)
match_metadata = data.groupby('match_id').first()[
    ['season', 'start_date', 'venue', 'winner', 'toss_winner', 
        'toss_decision', 'team1', 'team2']
].reset_index()

# Merge with features
final_data = pd.merge(match_features, match_metadata, on='match_id', how='left')

final_data.head(1)

Unnamed: 0,match_id,team1_total_runs,team1_runs_off_bat,team1_total_extras,team1_wides,team1_noballs,team1_wickets,team1_dot_balls,team1_fours,team1_sixes,...,team2_boundary_percentage,team2_extras_per_over,season,start_date,venue,winner,toss_winner,toss_decision,team1,team2
0,656399,77,71,6,2.0,2.0,0,31,10,1,...,11.666667,0.0,2014/15,2015-02-14,Hagley Oval,New Zealand,Sri Lanka,field,New Zealand,Sri Lanka


Now we convert the main response variable 'The Winner' into a binary 0 or 1 variable. Similarly, we categorise the toss winner (0/1) and toss decision (0/1). Calculate some more metrics to gauge how the two teams are doing after their first 10 balls.

Crucially we also categorise the venues, teams and the season in which they played the match in.

In [None]:


# Create binary winner variable (1 if team1 wins, 0 if team2 wins)
final_data['winner_binary'] = (final_data['winner'] == final_data['team1']).astype(int)

# Create toss winner binary (1 if team1 won toss, 0 otherwise)
final_data['toss_won_by_team1'] = (final_data['toss_winner'] == final_data['team1']).astype(int)

# Create toss decision binary (1 if bat first, 0 if field first)
final_data['toss_decision_bat'] = (final_data['toss_decision'] == 'bat').astype(int)

# Calculate relative performance metrics
final_data['runs_difference'] = final_data['team1_total_runs'] - final_data['team2_total_runs']
final_data['wickets_difference'] = final_data['team1_wickets'] - final_data['team2_wickets']
final_data['run_rate_difference'] = final_data['team1_run_rate'] - final_data['team2_run_rate']
final_data['boundary_difference'] = final_data['team1_boundaries'] - final_data['team2_boundaries']

# Create venue and team categorical encodings (for mixed effects)
final_data['venue_encoded'] = pd.Categorical(final_data['venue']).codes
final_data['team1_encoded'] = pd.Categorical(final_data['team1']).codes
final_data['team2_encoded'] = pd.Categorical(final_data['team2']).codes
final_data['season_encoded'] = pd.Categorical(final_data['season']).codes

# Sort by match_id for consistency
final_data = final_data.sort_values('match_id').reset_index(drop=True)

In [69]:
final_data.head(1)

Unnamed: 0,match_id,team1_total_runs,team1_runs_off_bat,team1_total_extras,team1_wides,team1_noballs,team1_wickets,team1_dot_balls,team1_fours,team1_sixes,...,team2_boundary_percentage,team2_extras_per_over,season,start_date,venue,winner,toss_winner,toss_decision,team1,team2
0,656399,77,71,6,2.0,2.0,0,31,10,1,...,11.666667,0.0,2014/15,2015-02-14,Hagley Oval,New Zealand,Sri Lanka,field,New Zealand,Sri Lanka


In [40]:
import pandas as pd
import numpy as np

def preprocess_cricket_data(df):
    """
    Preprocess cricket ball-by-ball data for match outcome prediction.
    Focuses on first 10 overs performance of both teams.
    
    Parameters:
    -----------
    df : pandas DataFrame
        Ball-by-ball cricket data with required columns
        
    Returns:
    --------
    processed_df : pandas DataFrame
        Aggregated match-level features with binary winner variable
    """
    
    # Create a copy to avoid modifying original
    data = df.copy()
    
    # ===== DATA CLEANING =====
    
    # 1. Fill NAs in extras columns with 0 (if a ball didn't have that extra, it's 0)
    extras_cols = ['wides', 'noballs', 'byes', 'legbyes', 'penalty', 'extras']
    for col in extras_cols:
        if col in data.columns:
            data[col] = data[col].fillna(0)
    
    # 2. Fill NAs in runs_off_bat with 0
    data['runs_off_bat'] = data['runs_off_bat'].fillna(0)
    
    # 3. Remove matches with missing critical information
    # Keep only matches where we have both team1, team2, and winner
    data = data.dropna(subset=['match_id', 'innings', 'ball', 'team1', 'team2', 'winner'])
    
    # 4. Remove matches where winner is neither team1 nor team2 
    # (ties, no results, abandoned matches)
    valid_winners = (data['winner'] == data['team1']) | (data['winner'] == data['team2'])
    data = data[valid_winners].copy()
    
    # 5. Keep only matches with both innings present
    innings_count = data.groupby('match_id')['innings'].nunique()
    valid_matches = innings_count[innings_count == 2].index
    data = data[data['match_id'].isin(valid_matches)].copy()
    
    # Convert ball column to numeric over (e.g., 1.1 -> over 1)
    data['over'] = data['ball'].astype(float).apply(lambda x: int(x))
    
    # Filter for first 10 overs only (overs 0-9 or 1-10 depending on notation)
    # Checking the unique values to understand the notation
    data_10_overs = data[data['over'] < 10].copy()
    
    # Calculate total runs per ball (runs_off_bat + extras)
    data_10_overs['total_runs'] = data_10_overs['runs_off_bat'] + data_10_overs['extras']
    
    # Identify boundaries
    data_10_overs['is_four'] = (data_10_overs['runs_off_bat'] == 4).astype(int)
    data_10_overs['is_six'] = (data_10_overs['runs_off_bat'] == 6).astype(int)
    data_10_overs['is_boundary'] = ((data_10_overs['runs_off_bat'] == 4) | 
                                     (data_10_overs['runs_off_bat'] == 6)).astype(int)
    
    # Identify dot balls (no runs scored)
    data_10_overs['is_dot'] = (data_10_overs['total_runs'] == 0).astype(int)
    
    # Identify wickets
    data_10_overs['is_wicket'] = data_10_overs['wicket_type'].notna().astype(int)
    
    # Aggregate statistics by match and innings
    match_innings_stats = data_10_overs.groupby(['match_id', 'innings']).agg({
        'total_runs': 'sum',           # Total runs in first 10 overs
        'runs_off_bat': 'sum',         # Runs off bat (excluding extras)
        'extras': 'sum',               # Total extras
        'wides': 'sum',                # Wide balls
        'noballs': 'sum',              # No balls
        'is_wicket': 'sum',            # Wickets lost
        'is_dot': 'sum',               # Dot balls
        'is_four': 'sum',              # Number of fours
        'is_six': 'sum',               # Number of sixes
        'is_boundary': 'sum',          # Total boundaries
        'ball': 'count'                # Total balls bowled
    }).reset_index()
    
    # Rename columns for clarity
    match_innings_stats.columns = [
        'match_id', 'innings', 'total_runs', 'runs_off_bat', 'total_extras',
        'wides', 'noballs', 'wickets', 'dot_balls', 'fours', 'sixes', 
        'boundaries', 'balls_bowled'
    ]
    
    # Calculate derived metrics
    match_innings_stats['run_rate'] = (match_innings_stats['total_runs'] / 
                                        match_innings_stats['balls_bowled'] * 6)
    match_innings_stats['dot_ball_percentage'] = (match_innings_stats['dot_balls'] / 
                                                   match_innings_stats['balls_bowled'] * 100)
    match_innings_stats['boundary_percentage'] = (match_innings_stats['boundaries'] / 
                                                   match_innings_stats['balls_bowled'] * 100)
    match_innings_stats['extras_per_over'] = (match_innings_stats['total_extras'] / 10)
    
    # Separate innings 1 and innings 2
    innings1 = match_innings_stats[match_innings_stats['innings'] == 1].copy()
    innings2 = match_innings_stats[match_innings_stats['innings'] == 2].copy()
    
    # Rename columns to distinguish between innings
    innings1_cols = {col: f'team1_{col}' for col in innings1.columns 
                     if col not in ['match_id', 'innings']}
    innings2_cols = {col: f'team2_{col}' for col in innings2.columns 
                     if col not in ['match_id', 'innings']}
    
    innings1 = innings1.rename(columns=innings1_cols).drop('innings', axis=1)
    innings2 = innings2.rename(columns=innings2_cols).drop('innings', axis=1)
    
    # Merge innings data - use inner join to only keep complete matches
    match_features = pd.merge(innings1, innings2, on='match_id', how='inner')
    
    # Get match metadata (take first row per match from original data)
    match_metadata = data.groupby('match_id').first()[
        ['season', 'start_date', 'venue', 'winner', 'toss_winner', 
         'toss_decision', 'team1', 'team2']
    ].reset_index()
    
    # Merge with features
    final_data = pd.merge(match_features, match_metadata, on='match_id', how='left')
    
    # Drop any remaining rows with NAs in critical columns
    final_data = final_data.dropna(subset=['winner', 'team1', 'team2'])
    
    # Create binary winner variable (1 if team1 wins, 0 if team2 wins)
    final_data['winner_binary'] = (final_data['winner'] == final_data['team1']).astype(int)
    
    # Create toss winner binary (1 if team1 won toss, 0 otherwise)
    final_data['toss_won_by_team1'] = (final_data['toss_winner'] == final_data['team1']).astype(int)
    
    # Create toss decision binary (1 if bat first, 0 if field first)
    final_data['toss_decision_bat'] = (final_data['toss_decision'] == 'bat').astype(int)
    
    # Calculate relative performance metrics
    final_data['runs_difference'] = final_data['team1_total_runs'] - final_data['team2_total_runs']
    final_data['wickets_difference'] = final_data['team1_wickets'] - final_data['team2_wickets']
    final_data['run_rate_difference'] = final_data['team1_run_rate'] - final_data['team2_run_rate']
    final_data['boundary_difference'] = final_data['team1_boundaries'] - final_data['team2_boundaries']
    
    # Create venue and team categorical encodings (for mixed effects)
    final_data['venue_encoded'] = pd.Categorical(final_data['venue']).codes
    final_data['team1_encoded'] = pd.Categorical(final_data['team1']).codes
    final_data['team2_encoded'] = pd.Categorical(final_data['team2']).codes
    final_data['season_encoded'] = pd.Categorical(final_data['season']).codes
    
    # Sort by match_id for consistency
    final_data = final_data.sort_values('match_id').reset_index(drop=True)
    
    # Final check: ensure no NAs in key modeling columns
    modeling_cols = ['winner_binary', 'team1_total_runs', 'team2_total_runs',
                     'team1_wickets', 'team2_wickets', 'team1_run_rate', 'team2_run_rate']
    final_data = final_data.dropna(subset=modeling_cols)
    
    return final_data


def check_data_quality(df):
    """
    Check for missing values and data quality issues before preprocessing.
    """
    print("=" * 80)
    print("DATA QUALITY CHECK")
    print("=" * 80)
    
    # Check for missing values in critical columns
    critical_cols = ['match_id', 'innings', 'ball', 'batting_team', 'bowling_team',
                     'runs_off_bat', 'extras', 'winner', 'team1', 'team2']
    
    print("\nMissing values in critical columns:")
    missing_critical = df[critical_cols].isna().sum()
    if missing_critical.sum() == 0:
        print("  ✓ No missing values in critical columns")
    else:
        print(missing_critical[missing_critical > 0])
    
    # Check wicket-related columns (expected to have NAs when no wicket)
    wicket_cols = ['wicket_type', 'player_dismissed', 'other_wicket_type', 
                   'other_player_dismissed']
    print(f"\nWicket-related columns (NAs expected when no wicket):")
    for col in wicket_cols:
        if col in df.columns:
            na_count = df[col].isna().sum()
            na_pct = (na_count / len(df)) * 100
            print(f"  {col}: {na_count} NAs ({na_pct:.1f}%)")
    
    # Check extras columns (expected to have NAs or 0s)
    extras_cols = ['wides', 'noballs', 'byes', 'legbyes', 'penalty']
    print(f"\nExtras columns (checking for NAs):")
    for col in extras_cols:
        if col in df.columns:
            na_count = df[col].isna().sum()
            if na_count > 0:
                print(f"  {col}: {na_count} NAs - will be filled with 0")
    
    # Check for matches with incomplete innings data
    print(f"\nInnings completeness check:")
    innings_per_match = df.groupby('match_id')['innings'].nunique()
    incomplete_matches = (innings_per_match < 2).sum()
    print(f"  Total matches: {len(innings_per_match)}")
    print(f"  Matches with both innings: {(innings_per_match == 2).sum()}")
    print(f"  Matches with only 1 innings: {incomplete_matches}")
    
    # Check for matches with missing winner
    print(f"\nWinner information:")
    matches_with_winner = df.groupby('match_id')['winner'].first().notna().sum()
    total_matches = df['match_id'].nunique()
    print(f"  Matches with winner: {matches_with_winner}/{total_matches}")
    
    # Check for matches where winner is not team1 or team2
    match_teams = df.groupby('match_id')[['team1', 'team2', 'winner']].first()
    winner_mismatch = ~match_teams['winner'].isin([match_teams['team1'], match_teams['team2']])
    if winner_mismatch.sum() > 0:
        print(f"  ⚠ Warning: {winner_mismatch.sum()} matches where winner is neither team1 nor team2")
        print(f"    (These may be ties, no results, or data errors)")
    
    print("\n" + "=" * 80)
    return df


def summarize_preprocessing(processed_df):
    """
    Print summary statistics of the preprocessed data.
    """
    print("=" * 80)
    print("PREPROCESSING SUMMARY")
    print("=" * 80)
    print(f"\nTotal matches processed: {len(processed_df)}")
    print(f"\nTarget variable distribution:")
    print(f"  Team1 wins: {processed_df['winner_binary'].sum()} ({processed_df['winner_binary'].mean()*100:.1f}%)")
    print(f"  Team2 wins: {(1-processed_df['winner_binary']).sum()} ({(1-processed_df['winner_binary']).mean()*100:.1f}%)")
    
    print(f"\nFeature summary:")
    print(f"  Unique venues: {processed_df['venue'].nunique()}")
    print(f"  Unique teams: {pd.concat([processed_df['team1'], processed_df['team2']]).nunique()}")
    print(f"  Seasons covered: {processed_df['season'].nunique()}")
    
    print(f"\nFirst 10 overs performance (mean values):")
    print(f"  Team1 runs: {processed_df['team1_total_runs'].mean():.1f} (SD: {processed_df['team1_total_runs'].std():.1f})")
    print(f"  Team2 runs: {processed_df['team2_total_runs'].mean():.1f} (SD: {processed_df['team2_total_runs'].std():.1f})")
    print(f"  Team1 wickets: {processed_df['team1_wickets'].mean():.2f} (SD: {processed_df['team1_wickets'].std():.2f})")
    print(f"  Team2 wickets: {processed_df['team2_wickets'].mean():.2f} (SD: {processed_df['team2_wickets'].std():.2f})")
    print(f"  Team1 run rate: {processed_df['team1_run_rate'].mean():.2f}")
    print(f"  Team2 run rate: {processed_df['team2_run_rate'].mean():.2f}")
    
    print(f"\nRelative performance metrics (mean):")
    print(f"  Runs difference: {processed_df['runs_difference'].mean():.2f}")
    print(f"  Run rate difference: {processed_df['run_rate_difference'].mean():.2f}")
    print(f"  Wickets difference: {processed_df['wickets_difference'].mean():.2f}")
    
    print("\n" + "=" * 80)
    print("KEY FEATURES FOR GLMM:")
    print("=" * 80)
    print("\nFixed Effects (potential predictors):")
    fixed_effects = [
        'team1_total_runs', 'team2_total_runs', 'team1_wickets', 'team2_wickets',
        'team1_run_rate', 'team2_run_rate', 'runs_difference', 'wickets_difference',
        'run_rate_difference', 'team1_boundaries', 'team2_boundaries',
        'team1_dot_ball_percentage', 'team2_dot_ball_percentage',
        'toss_won_by_team1', 'toss_decision_bat'
    ]
    print("  " + "\n  ".join(fixed_effects))
    
    print("\nRandom Effects (grouping variables):")
    random_effects = ['venue', 'team1', 'team2', 'season']
    print("  " + "\n  ".join(random_effects))
    
    print("\nTarget Variable:")
    print("  winner_binary (1 = team1 wins, 0 = team2 wins)")
    print("=" * 80)


In [41]:

# Example usage:
    
# Step 1: Check data quality
check_data_quality(data)
    
# Step 2: Preprocess
processed_data = preprocess_cricket_data(data)
    
    # Step 3: Save processed data
#processed_data.to_csv('cricket_processed_10overs.csv', index=False)
    
# Step 4: Print summary
summarize_preprocessing(processed_data)

DATA QUALITY CHECK

Missing values in critical columns:
winner    16505
dtype: int64

Wicket-related columns (NAs expected when no wicket):
  wicket_type: 603430 NAs (97.2%)
  player_dismissed: 603430 NAs (97.2%)
  other_wicket_type: 620646 NAs (100.0%)
  other_player_dismissed: 620646 NAs (100.0%)

Extras columns (checking for NAs):
  wides: 606354 NAs - will be filled with 0
  noballs: 619247 NAs - will be filled with 0
  byes: 619645 NAs - will be filled with 0
  legbyes: 615281 NAs - will be filled with 0
  penalty: 620633 NAs - will be filled with 0

Innings completeness check:
  Total matches: 1173
  Matches with both innings: 1138
  Matches with only 1 innings: 30

Winner information:
  Matches with winner: 1118/1173
    (These may be ties, no results, or data errors)

PREPROCESSING SUMMARY

Total matches processed: 1118

Target variable distribution:
  Team1 wins: 622 (55.6%)
  Team2 wins: 496 (44.4%)

Feature summary:
  Unique venues: 207
  Unique teams: 22
  Seasons covered: 

In [42]:
# After preprocessing
processed = preprocess_cricket_data(data)
print(f"Matches after preprocessing: {len(processed)}")
print(f"Matches lost due to cleaning: {data['match_id'].nunique() - len(processed)}")

Matches after preprocessing: 1118
Matches lost due to cleaning: 55


In [43]:
processed.shape

(1118, 50)

# Investigating the Data to determine fixed effects and random effects:

In [44]:
# Number of levels
print(f"Venues: {processed['venue'].nunique()}")
print(f"Teams: {pd.concat([processed['team1'], processed['team2']]).nunique()}")
print(f"Seasons: {processed['season'].nunique()}")

# Distribution of matches per level
print(processed['venue'].value_counts().describe())

Venues: 207
Teams: 22
Seasons: 24
count    207.000000
mean       5.400966
std        6.523192
min        1.000000
25%        2.000000
50%        4.000000
75%        6.000000
max       59.000000
Name: count, dtype: float64


In [45]:
# Check correlation among performance metrics
performance_cols = ['team1_total_runs', 'team2_total_runs', 'runs_difference',
                    'team1_wickets', 'team2_wickets', 'wickets_difference',
                    'team1_run_rate', 'team2_run_rate', 'run_rate_difference',
                    'team1_boundaries', 'team2_boundaries']

correlation_matrix = processed[performance_cols].corr()
# Remove highly correlated pairs (|r| > 0.8)

In [46]:
print(correlation_matrix)

                     team1_total_runs  team2_total_runs  runs_difference  \
team1_total_runs             1.000000          0.149725         0.607523   
team2_total_runs             0.149725          1.000000        -0.694387   
runs_difference              0.607523         -0.694387         1.000000   
team1_wickets               -0.368536         -0.002150        -0.266495   
team2_wickets                0.061135         -0.284996         0.273449   
wickets_difference          -0.294450          0.196016        -0.371774   
team1_run_rate               0.995965          0.147362         0.606484   
team2_run_rate               0.141513          0.987768        -0.690537   
run_rate_difference          0.601973         -0.687447         0.990385   
team1_boundaries             0.898018          0.132282         0.547313   
team2_boundaries             0.118189          0.911348        -0.646120   

                     team1_wickets  team2_wickets  wickets_difference  \
team1_total_ru

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_random_effects(df):
    """
    Analyze potential random effect variables to determine suitability.
    """
    
    print("\nRANDOM EFFECTS ANALYSIS\n")
    
    random_candidates = {}
    
    # Venue analysis
    print("Venue as random effect:")
    n_venues = df['venue'].nunique()
    venue_counts = df['venue'].value_counts()
    
    print(f"Unique venues: {n_venues}")
    print(f"Matches per venue - mean: {venue_counts.mean():.1f}, median: {venue_counts.median():.1f}, min: {venue_counts.min()}, max: {venue_counts.max()}")
    
    top_5_pct = (venue_counts.head(5).sum() / len(df)) * 100
    print(f"Top 5 venues = {top_5_pct:.1f}% of all matches")
    
    if n_venues >= 10 and venue_counts.min() >= 2:
        print("Recommendation: Good candidate for random effect")
        recommended = True
    elif n_venues >= 5:
        print("Recommendation: Possible but limited")
        recommended = False
    else:
        print("Recommendation: Not enough levels")
        recommended = False
    
    random_candidates['venue'] = {
        'n_levels': n_venues,
        'mean_obs': venue_counts.mean(),
        'min_obs': venue_counts.min(),
        'recommended': recommended
    }
    
    # Season analysis
    print("\nSeason as random effect:")
    n_seasons = df['season'].nunique()
    season_counts = df['season'].value_counts().sort_index()
    
    print(f"Unique seasons: {n_seasons}")
    print(f"Matches per season - mean: {season_counts.mean():.1f}, min: {season_counts.min()}, max: {season_counts.max()}")
    
    print("Distribution by season:")
    for season, count in season_counts.items():
        print(f"  {season}: {count} matches")
    
    if n_seasons >= 6:
        print("Recommendation: Good for random effect")
        recommended = True
    elif n_seasons >= 4:
        print("Recommendation: Consider as fixed effect instead")
        recommended = False
    else:
        print("Recommendation: Too few levels, use fixed")
        recommended = False
    
    random_candidates['season'] = {
        'n_levels': n_seasons,
        'mean_obs': season_counts.mean(),
        'min_obs': season_counts.min(),
        'recommended': recommended
    }
    
    # Team analysis
    print("\nTeam as random effect:")
    
    all_teams = pd.concat([df['team1'], df['team2']])
    n_teams = all_teams.nunique()
    team_counts = all_teams.value_counts()
    
    print(f"Unique teams: {n_teams}")
    print(f"Matches per team - mean: {team_counts.mean():.1f}, min: {team_counts.min()}, max: {team_counts.max()}")
    
    print("\nTop 10 teams:")
    for i, (team, count) in enumerate(team_counts.head(10).items(), 1):
        print(f"  {i}. {team}: {count} matches")
    
    team1_wins = df['winner_binary'].mean()
    print(f"\nTeam1 (batting first) wins {team1_wins*100:.1f}% of the time")
    
    if n_teams >= 8:
        print("Recommendation: Use as random effect - can do (1|team1) + (1|team2)")
        recommended = True
    else:
        print("Recommendation: Maybe use as fixed effect")
        recommended = False
    
    random_candidates['team'] = {
        'n_levels': n_teams,
        'mean_obs': team_counts.mean(),
        'min_obs': team_counts.min(),
        'recommended': recommended
    }
    
    return random_candidates


In [48]:


def analyze_fixed_effects(df, threshold=0.8):
    """
    Analyze fixed effect candidates and check for multicollinearity.
    """
    
    print("\n\nFIXED EFFECTS ANALYSIS\n")
    
    # Get performance metrics
    performance_metrics = [
        'team1_total_runs', 'team2_total_runs', 'runs_difference',
        'team1_wickets', 'team2_wickets', 'wickets_difference',
        'team1_run_rate', 'team2_run_rate', 'run_rate_difference',
        'team1_boundaries', 'team2_boundaries', 'boundary_difference',
        'team1_dot_ball_percentage', 'team2_dot_ball_percentage',
        'team1_fours', 'team2_fours', 'team1_sixes', 'team2_sixes',
        'team1_total_extras', 'team2_total_extras'
    ]
    
    available_metrics = [col for col in performance_metrics if col in df.columns]
    
    toss_variables = ['toss_won_by_team1', 'toss_decision_bat']
    available_metrics.extend([col for col in toss_variables if col in df.columns])
    
    print(f"Available metrics: {len(available_metrics)}")
    
    # Correlation matrix
    correlation_matrix = df[available_metrics].corr()
    
    print(f"\nHighly correlated pairs (|r| > {threshold}):")
    
    high_corr_pairs = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            corr_value = correlation_matrix.iloc[i, j]
            if abs(corr_value) > threshold:
                var1 = correlation_matrix.columns[i]
                var2 = correlation_matrix.columns[j]
                high_corr_pairs.append({
                    'var1': var1,
                    'var2': var2,
                    'correlation': corr_value
                })
                print(f"  {var1} <-> {var2}: r = {corr_value:.3f}")
    
    if not high_corr_pairs:
        print("  None found")
    
    # Correlation with outcome
    print("\nCorrelation with winner (top 15):")
    
    target_correlations = df[available_metrics].corrwith(df['winner_binary']).abs().sort_values(ascending=False)
    
    for i, (var, corr) in enumerate(target_correlations.head(15).items(), 1):
        print(f"  {i}. {var}: |r| = {corr:.3f}")
    
    # Recommendations
    print("\n\nRECOMMENDATIONS:\n")
    
    print("Strategy 1 - Use difference metrics (simplest):")
    strategy1_vars = [
        'runs_difference',
        'wickets_difference', 
        'run_rate_difference',
        'boundary_difference',
        'team1_dot_ball_percentage',
        'team2_dot_ball_percentage',
        'toss_won_by_team1',
        'toss_decision_bat'
    ]
    strategy1_available = [v for v in strategy1_vars if v in df.columns]
    print(f"  Variables: {', '.join(strategy1_available)}")
    print(f"  Total: {len(strategy1_available)} predictors")
    
    print("\nStrategy 2 - Use individual team metrics:")
    strategy2_vars = [
        'team1_total_runs',
        'team2_total_runs',
        'team1_wickets',
        'team2_wickets',
        'team1_boundaries',
        'team2_boundaries',
        'team1_dot_ball_percentage',
        'team2_dot_ball_percentage',
        'toss_won_by_team1',
        'toss_decision_bat'
    ]
    strategy2_available = [v for v in strategy2_vars if v in df.columns]
    print(f"  Variables: {', '.join(strategy2_available)}")
    print(f"  Total: {len(strategy2_available)} predictors")
    
    print("\nStrategy 3 - Hybrid (recommended to start):")
    strategy3_vars = [
        'runs_difference',
        'team1_wickets',
        'team2_wickets',
        'team1_run_rate',
        'team2_run_rate',
        'boundary_difference',
        'team1_dot_ball_percentage',
        'toss_won_by_team1',
        'toss_decision_bat'
    ]
    strategy3_available = [v for v in strategy3_vars if v in df.columns]
    print(f"  Variables: {', '.join(strategy3_available)}")
    print(f"  Total: {len(strategy3_available)} predictors")
    
    print("\nMaking correlation heatmap...")
    
    return {
        'correlation_matrix': correlation_matrix,
        'high_corr_pairs': high_corr_pairs,
        'target_correlations': target_correlations,
        'strategy1': strategy1_available,
        'strategy2': strategy2_available,
        'strategy3': strategy3_available
    }



In [49]:

def plot_correlation_heatmap(correlation_matrix, figsize=(14, 12)):
    """
    Create correlation heatmap.
    """
    plt.figure(figsize=figsize)
    
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    
    sns.heatmap(correlation_matrix, 
                mask=mask,
                annot=True, 
                fmt='.2f', 
                cmap='coolwarm', 
                center=0,
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8})
    
    plt.title('Correlation Matrix', fontsize=16, pad=20)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
    print("Saved as correlation_heatmap.png")
    plt.close()


In [50]:


def examine_random_effect_variance(df, random_effect_var, outcome='winner_binary'):
    """
    Check if random effect shows meaningful variance.
    """
    
    print(f"\n\nChecking variance for {random_effect_var}:\n")
    
    group_stats = df.groupby(random_effect_var).agg({
        outcome: ['mean', 'count']
    }).round(3)
    
    group_stats.columns = ['win_rate', 'n_matches']
    group_stats = group_stats.sort_values('win_rate', ascending=False)
    
    overall_mean = df[outcome].mean()
    group_variance = group_stats['win_rate'].var()
    
    print(f"Overall team1 win rate: {overall_mean:.3f}")
    print(f"Variance across {random_effect_var}: {group_variance:.6f}")
    print(f"SD: {np.sqrt(group_variance):.3f}")
    
    print(f"\nTop 10 {random_effect_var}:")
    print(group_stats.head(10))
    
    print(f"\nBottom 10 {random_effect_var}:")
    print(group_stats.tail(10))
    
    if group_variance > 0.01:
        print(f"\nLooks like {random_effect_var} has substantial variance, should use as random effect")
    else:
        print(f"\nNot much variance in {random_effect_var}, might not help much")
    
    return group_stats



In [51]:

def generate_model_recommendations(random_candidates, fixed_analysis):
    """
    Generate final recommendations.
    """
    print("\n\nFINAL RECOMMENDATIONS\n")
    
    print("Random effects:")
    random_effects_formula = []
    if random_candidates['venue']['recommended']:
        random_effects_formula.append("(1 | venue)")
    if random_candidates['season']['recommended']:
        random_effects_formula.append("(1 | season)")
    if random_candidates['team']['recommended']:
        random_effects_formula.append("(1 | team1) + (1 | team2)")
    
    if random_effects_formula:
        print("  " + " + ".join(random_effects_formula))
    else:
        print("  None recommended - might just use regular logistic regression")
    
    print("\nSuggested models to try:")
    print("\nModel 1 (simple baseline):")
    print("  winner_binary ~ runs_difference + wickets_difference + toss_won_by_team1")
    
    print("\nModel 2 (extended):")
    print("  winner_binary ~ runs_difference + wickets_difference + boundary_difference +")
    print("                  team1_dot_ball_percentage + toss_won_by_team1 + toss_decision_bat")
    
    print("\nModel 3 (hybrid approach):")
    fixed_vars = " + ".join(fixed_analysis['strategy3'][:6])
    print(f"  winner_binary ~ {fixed_vars} + ...")
    
    print("\n\nNext steps:")
    print("- Start with model 1")
    print("- Fit with statsmodels or R")
    print("- Compare AIC/BIC")
    print("- Check random effect variance components")
    print("- Do some cross-validation")



In [52]:

def analyze_glmm_structure(df):
    """
    Run full GLMM structure analysis.
    """
    
    print("\nGLMM STRUCTURE ANALYSIS")
    print("Analyzing random and fixed effects for cricket match prediction\n")
    
    # Analyze random effects
    random_candidates = analyze_random_effects(df)
    
    # Analyze fixed effects
    fixed_analysis = analyze_fixed_effects(df)
    
    # Make heatmap
    plot_correlation_heatmap(fixed_analysis['correlation_matrix'])
    
    # Check variance in random effects
    for effect in ['venue', 'season']:
        if effect in df.columns and random_candidates[effect]['recommended']:
            examine_random_effect_variance(df, effect)
    
    # Final recommendations
    generate_model_recommendations(random_candidates, fixed_analysis)
    
    return {
        'random_candidates': random_candidates,
        'fixed_analysis': fixed_analysis
    }

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_random_effects(df):
    """
    Analyze potential random effect variables to determine suitability.
    """
    
    print("\nRANDOM EFFECTS ANALYSIS\n")
    
    random_candidates = {}
    
    # Venue analysis
    print("Venue as random effect:")
    n_venues = df['venue'].nunique()
    venue_counts = df['venue'].value_counts()
    
    print(f"Unique venues: {n_venues}")
    print(f"Matches per venue - mean: {venue_counts.mean():.1f}, median: {venue_counts.median():.1f}, min: {venue_counts.min()}, max: {venue_counts.max()}")
    
    top_5_pct = (venue_counts.head(5).sum() / len(df)) * 100
    print(f"Top 5 venues = {top_5_pct:.1f}% of all matches")
    
    if n_venues >= 10 and venue_counts.min() >= 2:
        print("Recommendation: Good candidate for random effect")
        recommended = True
    elif n_venues >= 5:
        print("Recommendation: Possible but limited")
        recommended = False
    else:
        print("Recommendation: Not enough levels")
        recommended = False
    
    random_candidates['venue'] = {
        'n_levels': n_venues,
        'mean_obs': venue_counts.mean(),
        'min_obs': venue_counts.min(),
        'recommended': recommended
    }
    
    # Season analysis
    print("\nSeason as random effect:")
    n_seasons = df['season'].nunique()
    
    # Convert season to string to avoid mixed type issues, then sort
    season_counts = df['season'].astype(str).value_counts()
    try:
        season_counts = season_counts.sort_index()
    except:
        # If sorting fails, just use unsorted
        pass
    
    print(f"Unique seasons: {n_seasons}")
    print(f"Matches per season - mean: {season_counts.mean():.1f}, min: {season_counts.min()}, max: {season_counts.max()}")
    
    print("Distribution by season:")
    for season, count in season_counts.items():
        print(f"  {season}: {count} matches")
    
    if n_seasons >= 6:
        print("Recommendation: Good for random effect")
        recommended = True
    elif n_seasons >= 4:
        print("Recommendation: Consider as fixed effect instead")
        recommended = False
    else:
        print("Recommendation: Too few levels, use fixed")
        recommended = False
    
    random_candidates['season'] = {
        'n_levels': n_seasons,
        'mean_obs': season_counts.mean(),
        'min_obs': season_counts.min(),
        'recommended': recommended
    }
    
    # Team analysis
    print("\nTeam as random effect:")
    
    all_teams = pd.concat([df['team1'], df['team2']])
    n_teams = all_teams.nunique()
    team_counts = all_teams.value_counts()
    
    print(f"Unique teams: {n_teams}")
    print(f"Matches per team - mean: {team_counts.mean():.1f}, min: {team_counts.min()}, max: {team_counts.max()}")
    
    print("\nTop 10 teams:")
    for i, (team, count) in enumerate(team_counts.head(10).items(), 1):
        print(f"  {i}. {team}: {count} matches")
    
    team1_wins = df['winner_binary'].mean()
    print(f"\nTeam1 (batting first) wins {team1_wins*100:.1f}% of the time")
    
    if n_teams >= 8:
        print("Recommendation: Use as random effect - can do (1|team1) + (1|team2)")
        recommended = True
    else:
        print("Recommendation: Maybe use as fixed effect")
        recommended = False
    
    random_candidates['team'] = {
        'n_levels': n_teams,
        'mean_obs': team_counts.mean(),
        'min_obs': team_counts.min(),
        'recommended': recommended
    }
    
    return random_candidates


def analyze_fixed_effects(df, threshold=0.8):
    """
    Analyze fixed effect candidates and check for multicollinearity.
    """
    
    print("\n\nFIXED EFFECTS ANALYSIS\n")
    
    # Get performance metrics
    performance_metrics = [
        'team1_total_runs', 'team2_total_runs', 'runs_difference',
        'team1_wickets', 'team2_wickets', 'wickets_difference',
        'team1_run_rate', 'team2_run_rate', 'run_rate_difference',
        'team1_boundaries', 'team2_boundaries', 'boundary_difference',
        'team1_dot_ball_percentage', 'team2_dot_ball_percentage',
        'team1_fours', 'team2_fours', 'team1_sixes', 'team2_sixes',
        'team1_total_extras', 'team2_total_extras'
    ]
    
    available_metrics = [col for col in performance_metrics if col in df.columns]
    
    toss_variables = ['toss_won_by_team1', 'toss_decision_bat']
    available_metrics.extend([col for col in toss_variables if col in df.columns])
    
    print(f"Available metrics: {len(available_metrics)}")
    
    # Correlation matrix
    correlation_matrix = df[available_metrics].corr()
    
    print(f"\nHighly correlated pairs (|r| > {threshold}):")
    
    high_corr_pairs = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            corr_value = correlation_matrix.iloc[i, j]
            if abs(corr_value) > threshold:
                var1 = correlation_matrix.columns[i]
                var2 = correlation_matrix.columns[j]
                high_corr_pairs.append({
                    'var1': var1,
                    'var2': var2,
                    'correlation': corr_value
                })
                print(f"  {var1} <-> {var2}: r = {corr_value:.3f}")
    
    if not high_corr_pairs:
        print("  None found")
    
    # Correlation with outcome
    print("\nCorrelation with winner (top 15):")
    
    target_correlations = df[available_metrics].corrwith(df['winner_binary']).abs().sort_values(ascending=False)
    
    for i, (var, corr) in enumerate(target_correlations.head(15).items(), 1):
        print(f"  {i}. {var}: |r| = {corr:.3f}")
    
    # Recommendations
    print("\n\nRECOMMENDATIONS:\n")
    
    print("Strategy 1 - Use difference metrics (simplest):")
    strategy1_vars = [
        'runs_difference',
        'wickets_difference', 
        'run_rate_difference',
        'boundary_difference',
        'team1_dot_ball_percentage',
        'team2_dot_ball_percentage',
        'toss_won_by_team1',
        'toss_decision_bat'
    ]
    strategy1_available = [v for v in strategy1_vars if v in df.columns]
    print(f"  Variables: {', '.join(strategy1_available)}")
    print(f"  Total: {len(strategy1_available)} predictors")
    
    print("\nStrategy 2 - Use individual team metrics:")
    strategy2_vars = [
        'team1_total_runs',
        'team2_total_runs',
        'team1_wickets',
        'team2_wickets',
        'team1_boundaries',
        'team2_boundaries',
        'team1_dot_ball_percentage',
        'team2_dot_ball_percentage',
        'toss_won_by_team1',
        'toss_decision_bat'
    ]
    strategy2_available = [v for v in strategy2_vars if v in df.columns]
    print(f"  Variables: {', '.join(strategy2_available)}")
    print(f"  Total: {len(strategy2_available)} predictors")
    
    print("\nStrategy 3 - Hybrid (recommended to start):")
    strategy3_vars = [
        'runs_difference',
        'team1_wickets',
        'team2_wickets',
        'team1_run_rate',
        'team2_run_rate',
        'boundary_difference',
        'team1_dot_ball_percentage',
        'toss_won_by_team1',
        'toss_decision_bat'
    ]
    strategy3_available = [v for v in strategy3_vars if v in df.columns]
    print(f"  Variables: {', '.join(strategy3_available)}")
    print(f"  Total: {len(strategy3_available)} predictors")
    
    print("\nMaking correlation heatmap...")
    
    return {
        'correlation_matrix': correlation_matrix,
        'high_corr_pairs': high_corr_pairs,
        'target_correlations': target_correlations,
        'strategy1': strategy1_available,
        'strategy2': strategy2_available,
        'strategy3': strategy3_available
    }


def plot_correlation_heatmap(correlation_matrix, figsize=(14, 12)):
    """
    Create correlation heatmap.
    """
    plt.figure(figsize=figsize)
    
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    
    sns.heatmap(correlation_matrix, 
                mask=mask,
                annot=True, 
                fmt='.2f', 
                cmap='coolwarm', 
                center=0,
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8})
    
    plt.title('Correlation Matrix', fontsize=16, pad=20)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
    print("Saved as correlation_heatmap.png")
    plt.close()


def examine_random_effect_variance(df, random_effect_var, outcome='winner_binary'):
    """
    Check if random effect shows meaningful variance.
    """
    
    print(f"\n\nChecking variance for {random_effect_var}:\n")
    
    group_stats = df.groupby(random_effect_var).agg({
        outcome: ['mean', 'count']
    }).round(3)
    
    group_stats.columns = ['win_rate', 'n_matches']
    group_stats = group_stats.sort_values('win_rate', ascending=False)
    
    overall_mean = df[outcome].mean()
    group_variance = group_stats['win_rate'].var()
    
    print(f"Overall team1 win rate: {overall_mean:.3f}")
    print(f"Variance across {random_effect_var}: {group_variance:.6f}")
    print(f"SD: {np.sqrt(group_variance):.3f}")
    
    print(f"\nTop 10 {random_effect_var}:")
    print(group_stats.head(10))
    
    print(f"\nBottom 10 {random_effect_var}:")
    print(group_stats.tail(10))
    
    if group_variance > 0.01:
        print(f"\nLooks like {random_effect_var} has substantial variance, should use as random effect")
    else:
        print(f"\nNot much variance in {random_effect_var}, might not help much")
    
    return group_stats


def generate_model_recommendations(random_candidates, fixed_analysis):
    """
    Generate final recommendations.
    """
    print("\n\nFINAL RECOMMENDATIONS\n")
    
    print("Random effects:")
    random_effects_formula = []
    if random_candidates['venue']['recommended']:
        random_effects_formula.append("(1 | venue)")
    if random_candidates['season']['recommended']:
        random_effects_formula.append("(1 | season)")
    if random_candidates['team']['recommended']:
        random_effects_formula.append("(1 | team1) + (1 | team2)")
    
    if random_effects_formula:
        print("  " + " + ".join(random_effects_formula))
    else:
        print("  None recommended - might just use regular logistic regression")
    
    print("\nSuggested models to try:")
    print("\nModel 1 (simple baseline):")
    print("  winner_binary ~ runs_difference + wickets_difference + toss_won_by_team1")
    
    print("\nModel 2 (extended):")
    print("  winner_binary ~ runs_difference + wickets_difference + boundary_difference +")
    print("                  team1_dot_ball_percentage + toss_won_by_team1 + toss_decision_bat")
    
    print("\nModel 3 (hybrid approach):")
    fixed_vars = " + ".join(fixed_analysis['strategy3'][:6])
    print(f"  winner_binary ~ {fixed_vars} + ...")
    
    print("\n\nNext steps:")
    print("- Start with model 1")
    print("- Fit with statsmodels or R")
    print("- Compare AIC/BIC")
    print("- Check random effect variance components")
    print("- Do some cross-validation")


def analyze_glmm_structure(df):
    """
    Run full GLMM structure analysis.
    """
    
    print("\nGLMM STRUCTURE ANALYSIS")
    print("Analyzing random and fixed effects for cricket match prediction\n")
    
    # Analyze random effects
    random_candidates = analyze_random_effects(df)
    
    # Analyze fixed effects
    fixed_analysis = analyze_fixed_effects(df)
    
    # Make heatmap
    plot_correlation_heatmap(fixed_analysis['correlation_matrix'])
    
    # Check variance in random effects
    for effect in ['venue', 'season']:
        if effect in df.columns and random_candidates[effect]['recommended']:
            examine_random_effect_variance(df, effect)
    
    # Final recommendations
    generate_model_recommendations(random_candidates, fixed_analysis)
    
    return {
        'random_candidates': random_candidates,
        'fixed_analysis': fixed_analysis
    }


# Usage
if __name__ == "__main__":
    # After preprocessing:
    # results = analyze_glmm_structure(processed_data)
    
    print("Ready to analyze GLMM structure")
    print("Run: results = analyze_glmm_structure(processed_data)")

Ready to analyze GLMM structure
Run: results = analyze_glmm_structure(processed_data)


In [54]:
results = analyze_glmm_structure(processed_data)


GLMM STRUCTURE ANALYSIS
Analyzing random and fixed effects for cricket match prediction


RANDOM EFFECTS ANALYSIS

Venue as random effect:
Unique venues: 207
Matches per venue - mean: 5.4, median: 4.0, min: 1, max: 59
Top 5 venues = 15.9% of all matches
Recommendation: Possible but limited

Season as random effect:
Unique seasons: 24
Matches per season - mean: 50.8, min: 6, max: 80
Distribution by season:
  2014/15: 62 matches
  2015: 36 matches
  2015/16: 41 matches
  2016: 30 matches
  2016/17: 62 matches
  2017: 48 matches
  2017/18: 72 matches
  2018: 30 matches
  2018/19: 63 matches
  2019: 72 matches
  2019/20: 41 matches
  2020: 6 matches
  2020/21: 19 matches
  2021: 45 matches
  2021/22: 39 matches
  2022: 79 matches
  2022/23: 74 matches
  2023: 79 matches
  2023/24: 80 matches
  2024: 29 matches
  2024/25: 70 matches
  2025: 41 matches
Recommendation: Good for random effect

Team as random effect:
Unique teams: 22
Matches per team - mean: 101.6, min: 4, max: 187

Top 10 tea

## Selected Variables:

In [55]:
model_columns = [
        # Outcome
        'winner_binary',
        
        # Random effects
        'season',
        'team1',
        'team2',
        
        # Fixed effects - difference metrics (avoiding multicollinearity)
        'runs_difference',
        'wickets_difference',
        'boundary_difference',
        
        # Fixed effects - individual team metrics
        'team1_wickets',
        'team2_wickets',
        'team1_dot_ball_percentage',
        'team2_dot_ball_percentage',
        
        # Toss variables
        'toss_won_by_team1',
        'toss_decision_bat',
        
        # Keep match_id for reference
        'match_id'
    ]

# Create modeling dataset
modeling_data = processed_data[model_columns].copy()


In [56]:
# Suppose your DataFrame is called 'data'
print(modeling_data.head())            # Show first 5 rows
print(modeling_data.shape)             # Show number of rows and columns
print(modeling_data.columns)           # List all column names
print(modeling_data.dtypes)            # Show each column’s data type

   winner_binary   season         team1        team2  runs_difference  \
0              1  2014/15   New Zealand    Sri Lanka               31   
1              1  2014/15     Australia      England               16   
2              1  2014/15  South Africa     Zimbabwe              -28   
3              1  2014/15         India     Pakistan               -4   
4              1  2014/15       Ireland  West Indies              -21   

   wickets_difference  boundary_difference  team1_wickets  team2_wickets  \
0                   0                    4              0              0   
1                   0                    0              2              2   
2                   1                   -5              2              1   
3                   0                    0              1              1   
4                   2                   -1              2              0   

   team1_dot_ball_percentage  team2_dot_ball_percentage  toss_won_by_team1  \
0                  48.4375

## Saving our Dataframe to a CSV file:

In [57]:
modeling_data['season'] = modeling_data['season'].astype(str)
modeling_data['team1'] = modeling_data['team1'].astype(str)
modeling_data['team2'] = modeling_data['team2'].astype(str)
modeling_data.to_csv("cricket_model_data.csv", index=False)



# Random Forest

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Aggregate to match level (simplified)
match_df = data.groupby("match_id").agg({
    "runs_off_bat": "sum",
    "extras": "sum",
    "batting_team": "first",
    "bowling_team": "first",
    "venue": "first",
    "toss_winner": "first",
    "toss_decision": "first",
    "winner": "first"
}).reset_index()

# Binary target: did team1 win?
match_df["team1_win"] = (match_df["winner"] == match_df["batting_team"]).astype(int)

# Encode categorical features
X = pd.get_dummies(match_df.drop(columns=["winner", "team1_win", "match_id"]))
y = match_df["team1_win"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6042553191489362
              precision    recall  f1-score   support

           0       0.61      0.61      0.61       120
           1       0.59      0.60      0.60       115

    accuracy                           0.60       235
   macro avg       0.60      0.60      0.60       235
weighted avg       0.60      0.60      0.60       235



In [59]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class CricketMatchPredictor:
    """
    Comprehensive cricket match prediction system with feature engineering,
    hyperparameter tuning, and proper evaluation.
    """
    
    def __init__(self, data):
        """
        Initialize predictor with ball-by-ball data.
        
        Parameters:
        -----------
        data : pd.DataFrame
            Ball-by-ball cricket match data
        """
        self.data = data.copy()
        self.label_encoders = {}
        self.model = None
        self.feature_names = None
        
    def clean_data(self):
        """Step 1: Data cleaning and preprocessing"""
        print("=" * 70)
        print("STEP 1: DATA CLEANING")
        print("=" * 70)
        
        # Convert date to datetime
        self.data['start_date'] = pd.to_datetime(self.data['start_date'])
        
        # Handle missing values
        print(f"\nMissing values before cleaning:")
        print(self.data.isnull().sum()[self.data.isnull().sum() > 0])
        
        # Fill missing categorical values
        cat_cols = ['wicket_type', 'player_dismissed', 'other_wicket_type', 
                    'other_player_dismissed', 'bowling_style', 'batting_style']
        for col in cat_cols:
            if col in self.data.columns:
                self.data[col] = self.data[col].fillna('None')
        
        # Fill missing numerical values with 0
        num_cols = ['wides', 'noballs', 'byes', 'legbyes', 'penalty']
        for col in num_cols:
            if col in self.data.columns:
                self.data[col] = self.data[col].fillna(0)
        
        # Remove matches with no winner (tie/no result)
        initial_matches = self.data['match_id'].nunique()
        self.data = self.data[self.data['winner'].notna()]
        final_matches = self.data['match_id'].nunique()
        print(f"\nMatches: {initial_matches} -> {final_matches} (removed {initial_matches - final_matches} ties/no results)")
        
        print("\n✓ Data cleaning completed")
        
    def create_match_level_features(self):
        """Step 2: Aggregate ball-by-ball to match-level with engineered features"""
        print("\n" + "=" * 70)
        print("STEP 2: FEATURE ENGINEERING")
        print("=" * 70)
        
        match_features = []
        
        for match_id in self.data['match_id'].unique():
            match_data = self.data[self.data['match_id'] == match_id]
            
            # Basic match info
            feature_dict = {
                'match_id': match_id,
                'season': match_data['season'].iloc[0],
                'start_date': match_data['start_date'].iloc[0],
                'venue': match_data['venue'].iloc[0],
                'winner': match_data['winner'].iloc[0],
                'toss_winner': match_data['toss_winner'].iloc[0],
                'toss_decision': match_data['toss_decision'].iloc[0],
                'team1': match_data['team1'].iloc[0],
                'team2': match_data['team2'].iloc[0]
            }
            
            # Calculate match statistics for each team
            for team in [feature_dict['team1'], feature_dict['team2']]:
                team_batting = match_data[match_data['batting_team'] == team]
                team_bowling = match_data[match_data['bowling_team'] == team]
                
                prefix = 'team1' if team == feature_dict['team1'] else 'team2'
                
                # Batting stats
                feature_dict[f'{prefix}_total_runs'] = (
                    team_batting['runs_off_bat'].sum() + team_batting['extras'].sum()
                )
                feature_dict[f'{prefix}_boundaries'] = len(team_batting[team_batting['runs_off_bat'].isin([4, 6])])
                feature_dict[f'{prefix}_sixes'] = len(team_batting[team_batting['runs_off_bat'] == 6])
                feature_dict[f'{prefix}_dots'] = len(team_batting[team_batting['runs_off_bat'] == 0])
                
                # Bowling stats (wickets taken by opposition)
                feature_dict[f'{prefix}_wickets_lost'] = team_batting['wicket_type'].notna().sum()
                feature_dict[f'{prefix}_extras_given'] = team_bowling['extras'].sum()
                feature_dict[f'{prefix}_wides'] = team_bowling['wides'].sum()
                
                # Calculate run rate (runs per over)
                balls_faced = len(team_batting)
                if balls_faced > 0:
                    overs = balls_faced / 6
                    feature_dict[f'{prefix}_run_rate'] = feature_dict[f'{prefix}_total_runs'] / overs if overs > 0 else 0
                else:
                    feature_dict[f'{prefix}_run_rate'] = 0
                
                # Strike rate indicators
                if balls_faced > 0:
                    feature_dict[f'{prefix}_boundary_pct'] = feature_dict[f'{prefix}_boundaries'] / balls_faced * 100
                    feature_dict[f'{prefix}_dot_pct'] = feature_dict[f'{prefix}_dots'] / balls_faced * 100
                else:
                    feature_dict[f'{prefix}_boundary_pct'] = 0
                    feature_dict[f'{prefix}_dot_pct'] = 0
            
            # Toss advantage
            feature_dict['toss_winner_is_team1'] = 1 if feature_dict['toss_winner'] == feature_dict['team1'] else 0
            feature_dict['toss_and_bat'] = 1 if feature_dict['toss_decision'] == 'bat' else 0
            
            # Winner encoding
            feature_dict['team1_won'] = 1 if feature_dict['winner'] == feature_dict['team1'] else 0
            
            match_features.append(feature_dict)
        
        self.match_df = pd.DataFrame(match_features)
        
        # Sort by date for time-series features
        self.match_df = self.match_df.sort_values('start_date').reset_index(drop=True)
        
        print(f"\n✓ Created {len(self.match_df)} match-level records")
        print(f"✓ Generated {len(self.match_df.columns)} initial features")
        
    def create_historical_features(self, lookback=5):
        """Create rolling statistics for team performance"""
        print("\n" + "=" * 70)
        print(f"STEP 3: HISTORICAL FEATURES (Last {lookback} matches)")
        print("=" * 70)
        
        teams = pd.concat([self.match_df['team1'], self.match_df['team2']]).unique()
        
        # Initialize historical feature columns
        hist_features = ['wins', 'matches', 'avg_runs', 'avg_wickets', 'avg_run_rate']
        for prefix in ['team1', 'team2']:
            for feat in hist_features:
                self.match_df[f'{prefix}_last{lookback}_{feat}'] = 0.0
        
        # Calculate rolling statistics
        for team in teams:
            # Get all matches for this team
            team_matches = self.match_df[
                (self.match_df['team1'] == team) | (self.match_df['team2'] == team)
            ].copy()
            
            for idx in team_matches.index:
                # Get previous matches (before current match)
                prev_matches = team_matches[team_matches.index < idx].tail(lookback)
                
                if len(prev_matches) > 0:
                    # Calculate statistics
                    wins = 0
                    total_runs = 0
                    total_wickets = 0
                    total_run_rate = 0
                    
                    for _, match in prev_matches.iterrows():
                        if match['team1'] == team:
                            wins += match['team1_won']
                            total_runs += match['team1_total_runs']
                            total_wickets += match['team1_wickets_lost']
                            total_run_rate += match['team1_run_rate']
                        else:
                            wins += (1 - match['team1_won'])
                            total_runs += match['team2_total_runs']
                            total_wickets += match['team2_wickets_lost']
                            total_run_rate += match['team2_run_rate']
                    
                    n = len(prev_matches)
                    win_rate = wins / n
                    avg_runs = total_runs / n
                    avg_wickets = total_wickets / n
                    avg_run_rate = total_run_rate / n
                    
                    # Assign to correct team column
                    if self.match_df.loc[idx, 'team1'] == team:
                        prefix = 'team1'
                    else:
                        prefix = 'team2'
                    
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_wins'] = wins
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_matches'] = n
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_avg_runs'] = avg_runs
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_avg_wickets'] = avg_wickets
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_avg_run_rate'] = avg_run_rate
        
        # Calculate win rate
        for prefix in ['team1', 'team2']:
            self.match_df[f'{prefix}_last{lookback}_win_rate'] = (
                self.match_df[f'{prefix}_last{lookback}_wins'] / 
                self.match_df[f'{prefix}_last{lookback}_matches'].replace(0, 1)
            )
        
        print(f"\n✓ Added historical performance features")
        
    def create_head_to_head_features(self):
        """Create head-to-head statistics between teams"""
        print("\n" + "=" * 70)
        print("STEP 4: HEAD-TO-HEAD FEATURES")
        print("=" * 70)
        
        self.match_df['h2h_team1_wins'] = 0
        self.match_df['h2h_team2_wins'] = 0
        self.match_df['h2h_matches'] = 0
        
        for idx in self.match_df.index:
            team1 = self.match_df.loc[idx, 'team1']
            team2 = self.match_df.loc[idx, 'team2']
            
            # Get previous h2h matches
            prev_h2h = self.match_df[
                (self.match_df.index < idx) &
                (
                    ((self.match_df['team1'] == team1) & (self.match_df['team2'] == team2)) |
                    ((self.match_df['team1'] == team2) & (self.match_df['team2'] == team1))
                )
            ]
            
            if len(prev_h2h) > 0:
                team1_wins = 0
                for _, match in prev_h2h.iterrows():
                    if match['winner'] == team1:
                        team1_wins += 1
                
                self.match_df.loc[idx, 'h2h_team1_wins'] = team1_wins
                self.match_df.loc[idx, 'h2h_team2_wins'] = len(prev_h2h) - team1_wins
                self.match_df.loc[idx, 'h2h_matches'] = len(prev_h2h)
        
        # H2H win rate
        self.match_df['h2h_team1_win_rate'] = (
            self.match_df['h2h_team1_wins'] / 
            self.match_df['h2h_matches'].replace(0, 1)
        )
        
        print(f"\n✓ Added head-to-head features")
        
    def prepare_features(self):
        """Prepare final feature set for modeling"""
        print("\n" + "=" * 70)
        print("STEP 5: FEATURE PREPARATION")
        print("=" * 70)
        
        # Encode categorical variables
        categorical_cols = ['venue', 'toss_decision']
        
        for col in categorical_cols:
            le = LabelEncoder()
            self.match_df[f'{col}_encoded'] = le.fit_transform(self.match_df[col])
            self.label_encoders[col] = le
        
        # Select features for modeling
        feature_cols = [
            # Toss features
            'toss_winner_is_team1', 'toss_and_bat',
            'venue_encoded', 'toss_decision_encoded',
            
            # Match statistics
            'team1_total_runs', 'team1_wickets_lost', 'team1_run_rate',
            'team1_boundaries', 'team1_sixes', 'team1_boundary_pct', 'team1_dot_pct',
            'team2_total_runs', 'team2_wickets_lost', 'team2_run_rate',
            'team2_boundaries', 'team2_sixes', 'team2_boundary_pct', 'team2_dot_pct',
            
            # Historical features (last 5 matches)
            'team1_last5_win_rate', 'team1_last5_avg_runs', 'team1_last5_avg_wickets', 'team1_last5_avg_run_rate',
            'team2_last5_win_rate', 'team2_last5_avg_runs', 'team2_last5_avg_wickets', 'team2_last5_avg_run_rate',
            
            # Head-to-head
            'h2h_team1_win_rate', 'h2h_matches'
        ]
        
        # Remove rows with insufficient historical data
        initial_rows = len(self.match_df)
        self.match_df = self.match_df[self.match_df['h2h_matches'] > 0]
        print(f"\n✓ Removed {initial_rows - len(self.match_df)} matches with no historical data")
        
        self.X = self.match_df[feature_cols].fillna(0)
        self.y = self.match_df['team1_won']
        self.feature_names = feature_cols
        
        print(f"✓ Final feature set: {len(feature_cols)} features")
        print(f"✓ Training samples: {len(self.X)}")
        print(f"✓ Class distribution: {self.y.value_counts().to_dict()}")
        
        return self.X, self.y
    
    def split_data(self, test_size=0.2, time_series_split=True):
        """Split data with proper time-series consideration"""
        print("\n" + "=" * 70)
        print("STEP 6: TRAIN-TEST SPLIT")
        print("=" * 70)
        
        if time_series_split:
            # Split by time to avoid data leakage
            split_idx = int(len(self.X) * (1 - test_size))
            self.X_train = self.X.iloc[:split_idx]
            self.X_test = self.X.iloc[split_idx:]
            self.y_train = self.y.iloc[:split_idx]
            self.y_test = self.y.iloc[split_idx:]
            print(f"\n✓ Time-series split (train on past, test on recent matches)")
        else:
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                self.X, self.y, test_size=test_size, random_state=42, stratify=self.y
            )
            print(f"\n✓ Random stratified split")
        
        print(f"✓ Train set: {len(self.X_train)} samples")
        print(f"✓ Test set: {len(self.X_test)} samples")
        
    def tune_hyperparameters(self, quick_mode=False):
        """Hyperparameter tuning with GridSearchCV"""
        print("\n" + "=" * 70)
        print("STEP 7: HYPERPARAMETER TUNING")
        print("=" * 70)
        
        if quick_mode:
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [10, 20],
                'min_samples_split': [2, 5],
                'max_features': ['sqrt']
            }
            print("\n⚡ Quick mode enabled (reduced parameter space)")
        else:
            param_grid = {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [5, 10, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', None]
            }
        
        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        print(f"\nSearching {np.prod([len(v) for v in param_grid.values()])} combinations...")
        
        grid = GridSearchCV(
            rf, param_grid, cv=cv, scoring='accuracy', 
            n_jobs=-1, verbose=1
        )
        
        grid.fit(self.X_train, self.y_train)
        
        print(f"\n✓ Best parameters found:")
        for param, value in grid.best_params_.items():
            print(f"  • {param}: {value}")
        
        print(f"\n✓ Best CV accuracy: {grid.best_score_:.4f}")
        
        self.model = grid.best_estimator_
        return grid.best_params_
    
    def train_model(self, use_tuned=True, **kwargs):
        """Train the Random Forest model"""
        if not use_tuned or self.model is None:
            print("\n" + "=" * 70)
            print("STEP 7: TRAINING MODEL (Default Parameters)")
            print("=" * 70)
            self.model = RandomForestClassifier(
                n_estimators=kwargs.get('n_estimators', 200),
                max_depth=kwargs.get('max_depth', 20),
                min_samples_split=kwargs.get('min_samples_split', 5),
                random_state=42,
                n_jobs=-1
            )
            self.model.fit(self.X_train, self.y_train)
            print("\n✓ Model trained successfully")
    
    def evaluate_model(self):
        """Comprehensive model evaluation"""
        print("\n" + "=" * 70)
        print("STEP 8: MODEL EVALUATION")
        print("=" * 70)
        
        # Predictions
        y_train_pred = self.model.predict(self.X_train)
        y_test_pred = self.model.predict(self.X_test)
        
        y_test_proba = self.model.predict_proba(self.X_test)[:, 1]
        
        # Accuracy
        train_acc = accuracy_score(self.y_train, y_train_pred)
        test_acc = accuracy_score(self.y_test, y_test_pred)
        
        print(f"\n📊 ACCURACY")
        print(f"  • Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
        print(f"  • Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
        print(f"  • Overfit Margin: {(train_acc - test_acc)*100:.2f}%")
        
        # Classification Report
        print(f"\n📋 CLASSIFICATION REPORT (Test Set)")
        print(classification_report(self.y_test, y_test_pred, 
                                     target_names=['Team 2 Wins', 'Team 1 Wins']))
        
        # Confusion Matrix
        cm = confusion_matrix(self.y_test, y_test_pred)
        print(f"\n📉 CONFUSION MATRIX")
        print(f"                Predicted")
        print(f"                0      1")
        print(f"Actual   0   {cm[0,0]:4d}  {cm[0,1]:4d}")
        print(f"         1   {cm[1,0]:4d}  {cm[1,1]:4d}")
        
        # ROC-AUC
        try:
            roc_auc = roc_auc_score(self.y_test, y_test_proba)
            print(f"\n🎯 ROC-AUC Score: {roc_auc:.4f}")
        except:
            print(f"\n⚠ ROC-AUC not available")
        
        # Cross-validation score
        cv_scores = cross_val_score(self.model, self.X_train, self.y_train, 
                                     cv=5, scoring='accuracy')
        print(f"\n✅ Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
        
        return {
            'train_accuracy': train_acc,
            'test_accuracy': test_acc,
            'cv_scores': cv_scores,
            'confusion_matrix': cm
        }
    
    def plot_feature_importance(self, top_n=15):
        """Visualize feature importance"""
        print("\n" + "=" * 70)
        print("STEP 9: FEATURE IMPORTANCE ANALYSIS")
        print("=" * 70)
        
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        
        print(f"\n🔝 Top {top_n} Most Important Features:")
        for i in range(min(top_n, len(self.feature_names))):
            idx = indices[i]
            print(f"  {i+1:2d}. {self.feature_names[idx]:40s} {importances[idx]:.4f}")
        
        # Plot
        plt.figure(figsize=(10, 8))
        top_indices = indices[:top_n]
        plt.barh(range(top_n), importances[top_indices])
        plt.yticks(range(top_n), [self.feature_names[i] for i in top_indices])
        plt.xlabel('Feature Importance')
        plt.title(f'Top {top_n} Feature Importances')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
        print(f"\n✓ Feature importance plot saved as 'feature_importance.png'")
        plt.close()
        
    def predict_match(self, team1_name, team2_name, venue_name, toss_winner_name, toss_decision):
        """Predict outcome of a new match"""
        print("\n" + "=" * 70)
        print("MATCH PREDICTION")
        print("=" * 70)
        print(f"\nTeam 1: {team1_name}")
        print(f"Team 2: {team2_name}")
        print(f"Venue: {venue_name}")
        print(f"Toss Winner: {toss_winner_name}")
        print(f"Toss Decision: {toss_decision}")
        
        # This would require looking up recent stats for these teams
        # For demonstration, returning prediction format
        print("\n⚠ Note: Real-time prediction requires current team statistics")
        print("Use the trained model with actual feature values for predictions")
    
    def run_full_pipeline(self, quick_tune=False, time_series_split=True):
        """Execute complete pipeline"""
        print("\n" + "🏏" * 35)
        print(" " * 20 + "CRICKET MATCH PREDICTION SYSTEM")
        print(" " * 20 + "Random Forest with Feature Engineering")
        print("🏏" * 35 + "\n")
        
        # Execute all steps
        self.clean_data()
        self.create_match_level_features()
        self.create_historical_features(lookback=5)
        self.create_head_to_head_features()
        self.prepare_features()
        self.split_data(test_size=0.2, time_series_split=time_series_split)
        self.tune_hyperparameters(quick_mode=quick_tune)
        results = self.evaluate_model()
        self.plot_feature_importance(top_n=15)
        
        print("\n" + "=" * 70)
        print("🎉 PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 70)
        print(f"\n📈 Final Test Accuracy: {results['test_accuracy']*100:.2f}%")
        print(f"✨ Model ready for predictions!\n")
        
        return results


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Load your data
    # data = pd.read_csv('your_cricket_data.csv')
    
    # Example with placeholder:
    print("Load your cricket ball-by-ball data as 'data' DataFrame")
    print("\nThen run:")
    print("  predictor = CricketMatchPredictor(data)")
    print("  results = predictor.run_full_pipeline(quick_tune=True)")
    print("\nQuick tune=True for faster testing, False for thorough optimization")
    
    # Uncomment below when you have data loaded:
    """
    predictor = CricketMatchPredictor(data)
    results = predictor.run_full_pipeline(
        quick_tune=False,  # Set True for quick testing
        time_series_split=True  # Recommended for cricket predictions
    )
    
    # Access the trained model
    model = predictor.model
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': predictor.feature_names,
        'importance': predictor.model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance.head(10))
    """

Load your cricket ball-by-ball data as 'data' DataFrame

Then run:
  predictor = CricketMatchPredictor(data)
  results = predictor.run_full_pipeline(quick_tune=True)

Quick tune=True for faster testing, False for thorough optimization


In [60]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class CricketMatchPredictor:
    """
    Comprehensive cricket match prediction system with feature engineering,
    hyperparameter tuning, and proper evaluation.
    """
    
    def __init__(self, data):
        """
        Initialize predictor with ball-by-ball data.
        
        Parameters:
        -----------
        data : pd.DataFrame
            Ball-by-ball cricket match data
        """
        self.data = data.copy()
        self.label_encoders = {}
        self.model = None
        self.feature_names = None
        
    def clean_data(self):
        """Step 1: Data cleaning and preprocessing"""
        print("=" * 70)
        print("STEP 1: DATA CLEANING")
        print("=" * 70)
        
        # Convert date to datetime
        self.data['start_date'] = pd.to_datetime(self.data['start_date'])
        
        # Handle missing values
        print(f"\nMissing values before cleaning:")
        print(self.data.isnull().sum()[self.data.isnull().sum() > 0])
        
        # Fill missing categorical values
        cat_cols = ['wicket_type', 'player_dismissed', 'other_wicket_type', 
                    'other_player_dismissed', 'bowling_style', 'batting_style']
        for col in cat_cols:
            if col in self.data.columns:
                self.data[col] = self.data[col].fillna('None')
        
        # Fill missing numerical values with 0
        num_cols = ['wides', 'noballs', 'byes', 'legbyes', 'penalty']
        for col in num_cols:
            if col in self.data.columns:
                self.data[col] = self.data[col].fillna(0)
        
        # Remove matches with no winner (tie/no result)
        initial_matches = self.data['match_id'].nunique()
        self.data = self.data[self.data['winner'].notna()]
        final_matches = self.data['match_id'].nunique()
        print(f"\nMatches: {initial_matches} -> {final_matches} (removed {initial_matches - final_matches} ties/no results)")
        
        print("\n✓ Data cleaning completed")
        
    def create_match_level_features(self):
        """Step 2: Aggregate ball-by-ball to match-level with engineered features"""
        print("\n" + "=" * 70)
        print("STEP 2: FEATURE ENGINEERING")
        print("=" * 70)
        
        match_features = []
        
        for match_id in self.data['match_id'].unique():
            match_data = self.data[self.data['match_id'] == match_id]
            
            # Basic match info
            feature_dict = {
                'match_id': match_id,
                'season': match_data['season'].iloc[0],
                'start_date': match_data['start_date'].iloc[0],
                'venue': match_data['venue'].iloc[0],
                'winner': match_data['winner'].iloc[0],
                'toss_winner': match_data['toss_winner'].iloc[0],
                'toss_decision': match_data['toss_decision'].iloc[0],
                'team1': match_data['team1'].iloc[0],
                'team2': match_data['team2'].iloc[0]
            }
            
            # Calculate match statistics for each team
            for team in [feature_dict['team1'], feature_dict['team2']]:
                team_batting = match_data[match_data['batting_team'] == team]
                team_bowling = match_data[match_data['bowling_team'] == team]
                
                prefix = 'team1' if team == feature_dict['team1'] else 'team2'
                
                # Batting stats
                feature_dict[f'{prefix}_total_runs'] = (
                    team_batting['runs_off_bat'].sum() + team_batting['extras'].sum()
                )
                feature_dict[f'{prefix}_boundaries'] = len(team_batting[team_batting['runs_off_bat'].isin([4, 6])])
                feature_dict[f'{prefix}_sixes'] = len(team_batting[team_batting['runs_off_bat'] == 6])
                feature_dict[f'{prefix}_dots'] = len(team_batting[team_batting['runs_off_bat'] == 0])
                
                # Bowling stats (wickets taken by opposition)
                feature_dict[f'{prefix}_wickets_lost'] = team_batting['wicket_type'].notna().sum()
                feature_dict[f'{prefix}_extras_given'] = team_bowling['extras'].sum()
                feature_dict[f'{prefix}_wides'] = team_bowling['wides'].sum()
                
                # Calculate run rate (runs per over)
                balls_faced = len(team_batting)
                if balls_faced > 0:
                    overs = balls_faced / 6
                    feature_dict[f'{prefix}_run_rate'] = feature_dict[f'{prefix}_total_runs'] / overs if overs > 0 else 0
                else:
                    feature_dict[f'{prefix}_run_rate'] = 0
                
                # Strike rate indicators
                if balls_faced > 0:
                    feature_dict[f'{prefix}_boundary_pct'] = feature_dict[f'{prefix}_boundaries'] / balls_faced * 100
                    feature_dict[f'{prefix}_dot_pct'] = feature_dict[f'{prefix}_dots'] / balls_faced * 100
                else:
                    feature_dict[f'{prefix}_boundary_pct'] = 0
                    feature_dict[f'{prefix}_dot_pct'] = 0
            
            # Toss advantage
            feature_dict['toss_winner_is_team1'] = 1 if feature_dict['toss_winner'] == feature_dict['team1'] else 0
            feature_dict['toss_and_bat'] = 1 if feature_dict['toss_decision'] == 'bat' else 0
            
            # Winner encoding
            feature_dict['team1_won'] = 1 if feature_dict['winner'] == feature_dict['team1'] else 0
            
            match_features.append(feature_dict)
        
        self.match_df = pd.DataFrame(match_features)
        
        # Sort by date for time-series features
        self.match_df = self.match_df.sort_values('start_date').reset_index(drop=True)
        
        print(f"\n✓ Created {len(self.match_df)} match-level records")
        print(f"✓ Generated {len(self.match_df.columns)} initial features")
        
    def create_historical_features(self, lookback=5):
        """Create rolling statistics for team performance"""
        print("\n" + "=" * 70)
        print(f"STEP 3: HISTORICAL FEATURES (Last {lookback} matches)")
        print("=" * 70)
        
        teams = pd.concat([self.match_df['team1'], self.match_df['team2']]).unique()
        
        # Initialize historical feature columns
        hist_features = ['wins', 'matches', 'avg_runs', 'avg_wickets', 'avg_run_rate']
        for prefix in ['team1', 'team2']:
            for feat in hist_features:
                self.match_df[f'{prefix}_last{lookback}_{feat}'] = 0.0
        
        # Calculate rolling statistics
        for team in teams:
            # Get all matches for this team
            team_matches = self.match_df[
                (self.match_df['team1'] == team) | (self.match_df['team2'] == team)
            ].copy()
            
            for idx in team_matches.index:
                # Get previous matches (before current match)
                prev_matches = team_matches[team_matches.index < idx].tail(lookback)
                
                if len(prev_matches) > 0:
                    # Calculate statistics
                    wins = 0
                    total_runs = 0
                    total_wickets = 0
                    total_run_rate = 0
                    
                    for _, match in prev_matches.iterrows():
                        if match['team1'] == team:
                            wins += match['team1_won']
                            total_runs += match['team1_total_runs']
                            total_wickets += match['team1_wickets_lost']
                            total_run_rate += match['team1_run_rate']
                        else:
                            wins += (1 - match['team1_won'])
                            total_runs += match['team2_total_runs']
                            total_wickets += match['team2_wickets_lost']
                            total_run_rate += match['team2_run_rate']
                    
                    n = len(prev_matches)
                    win_rate = wins / n
                    avg_runs = total_runs / n
                    avg_wickets = total_wickets / n
                    avg_run_rate = total_run_rate / n
                    
                    # Assign to correct team column
                    if self.match_df.loc[idx, 'team1'] == team:
                        prefix = 'team1'
                    else:
                        prefix = 'team2'
                    
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_wins'] = wins
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_matches'] = n
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_avg_runs'] = avg_runs
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_avg_wickets'] = avg_wickets
                    self.match_df.loc[idx, f'{prefix}_last{lookback}_avg_run_rate'] = avg_run_rate
        
        # Calculate win rate
        for prefix in ['team1', 'team2']:
            self.match_df[f'{prefix}_last{lookback}_win_rate'] = (
                self.match_df[f'{prefix}_last{lookback}_wins'] / 
                self.match_df[f'{prefix}_last{lookback}_matches'].replace(0, 1)
            )
        
        print(f"\n✓ Added historical performance features")
        
    def create_head_to_head_features(self):
        """Create head-to-head statistics between teams"""
        print("\n" + "=" * 70)
        print("STEP 4: HEAD-TO-HEAD FEATURES")
        print("=" * 70)
        
        self.match_df['h2h_team1_wins'] = 0
        self.match_df['h2h_team2_wins'] = 0
        self.match_df['h2h_matches'] = 0
        
        for idx in self.match_df.index:
            team1 = self.match_df.loc[idx, 'team1']
            team2 = self.match_df.loc[idx, 'team2']
            
            # Get previous h2h matches
            prev_h2h = self.match_df[
                (self.match_df.index < idx) &
                (
                    ((self.match_df['team1'] == team1) & (self.match_df['team2'] == team2)) |
                    ((self.match_df['team1'] == team2) & (self.match_df['team2'] == team1))
                )
            ]
            
            if len(prev_h2h) > 0:
                team1_wins = 0
                for _, match in prev_h2h.iterrows():
                    if match['winner'] == team1:
                        team1_wins += 1
                
                self.match_df.loc[idx, 'h2h_team1_wins'] = team1_wins
                self.match_df.loc[idx, 'h2h_team2_wins'] = len(prev_h2h) - team1_wins
                self.match_df.loc[idx, 'h2h_matches'] = len(prev_h2h)
        
        # H2H win rate
        self.match_df['h2h_team1_win_rate'] = (
            self.match_df['h2h_team1_wins'] / 
            self.match_df['h2h_matches'].replace(0, 1)
        )
        
        print(f"\n✓ Added head-to-head features")
        
    def prepare_features(self):
        """Prepare final feature set for modeling"""
        print("\n" + "=" * 70)
        print("STEP 5: FEATURE PREPARATION")
        print("=" * 70)
        
        # Encode categorical variables
        categorical_cols = ['venue', 'toss_decision']
        
        for col in categorical_cols:
            le = LabelEncoder()
            self.match_df[f'{col}_encoded'] = le.fit_transform(self.match_df[col])
            self.label_encoders[col] = le
        
        # Select features for modeling
        feature_cols = [
            # Toss features
            'toss_winner_is_team1', 'toss_and_bat',
            'venue_encoded', 'toss_decision_encoded',
            
            # Match statistics
            'team1_total_runs', 'team1_wickets_lost', 'team1_run_rate',
            'team1_boundaries', 'team1_sixes', 'team1_boundary_pct', 'team1_dot_pct',
            'team2_total_runs', 'team2_wickets_lost', 'team2_run_rate',
            'team2_boundaries', 'team2_sixes', 'team2_boundary_pct', 'team2_dot_pct',
            
            # Historical features (last 5 matches)
            'team1_last5_win_rate', 'team1_last5_avg_runs', 'team1_last5_avg_wickets', 'team1_last5_avg_run_rate',
            'team2_last5_win_rate', 'team2_last5_avg_runs', 'team2_last5_avg_wickets', 'team2_last5_avg_run_rate',
            
            # Head-to-head
            'h2h_team1_win_rate', 'h2h_matches'
        ]
        
        # Remove rows with insufficient historical data
        initial_rows = len(self.match_df)
        self.match_df = self.match_df[self.match_df['h2h_matches'] > 0]
        print(f"\n✓ Removed {initial_rows - len(self.match_df)} matches with no historical data")
        
        self.X = self.match_df[feature_cols].fillna(0)
        self.y = self.match_df['team1_won']
        self.feature_names = feature_cols
        
        print(f"✓ Final feature set: {len(feature_cols)} features")
        print(f"✓ Training samples: {len(self.X)}")
        print(f"✓ Class distribution: {self.y.value_counts().to_dict()}")
        
        return self.X, self.y
    
    def split_data(self, test_size=0.2, time_series_split=True):
        """Split data with proper time-series consideration"""
        print("\n" + "=" * 70)
        print("STEP 6: TRAIN-TEST SPLIT")
        print("=" * 70)
        
        if time_series_split:
            # Split by time to avoid data leakage
            split_idx = int(len(self.X) * (1 - test_size))
            self.X_train = self.X.iloc[:split_idx]
            self.X_test = self.X.iloc[split_idx:]
            self.y_train = self.y.iloc[:split_idx]
            self.y_test = self.y.iloc[split_idx:]
            print(f"\n✓ Time-series split (train on past, test on recent matches)")
        else:
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                self.X, self.y, test_size=test_size, random_state=42, stratify=self.y
            )
            print(f"\n✓ Random stratified split")
        
        print(f"✓ Train set: {len(self.X_train)} samples")
        print(f"✓ Test set: {len(self.X_test)} samples")
        
    def tune_hyperparameters(self, quick_mode=False):
        """Hyperparameter tuning with GridSearchCV"""
        print("\n" + "=" * 70)
        print("STEP 7: HYPERPARAMETER TUNING")
        print("=" * 70)
        
        if quick_mode:
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [10, 20],
                'min_samples_split': [2, 5],
                'max_features': ['sqrt']
            }
            print("\n⚡ Quick mode enabled (reduced parameter space)")
        else:
            param_grid = {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [5, 10, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', None]
            }
        
        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        print(f"\nSearching {np.prod([len(v) for v in param_grid.values()])} combinations...")
        
        grid = GridSearchCV(
            rf, param_grid, cv=cv, scoring='accuracy', 
            n_jobs=-1, verbose=1
        )
        
        grid.fit(self.X_train, self.y_train)
        
        print(f"\n✓ Best parameters found:")
        for param, value in grid.best_params_.items():
            print(f"  • {param}: {value}")
        
        print(f"\n✓ Best CV accuracy: {grid.best_score_:.4f}")
        
        self.model = grid.best_estimator_
        return grid.best_params_
    
    def train_model(self, use_tuned=True, **kwargs):
        """Train the Random Forest model"""
        if not use_tuned or self.model is None:
            print("\n" + "=" * 70)
            print("STEP 7: TRAINING MODEL (Default Parameters)")
            print("=" * 70)
            self.model = RandomForestClassifier(
                n_estimators=kwargs.get('n_estimators', 200),
                max_depth=kwargs.get('max_depth', 20),
                min_samples_split=kwargs.get('min_samples_split', 5),
                random_state=42,
                n_jobs=-1
            )
            self.model.fit(self.X_train, self.y_train)
            print("\n✓ Model trained successfully")
    
    def evaluate_model(self):
        """Comprehensive model evaluation"""
        print("\n" + "=" * 70)
        print("STEP 8: MODEL EVALUATION")
        print("=" * 70)
        
        # Predictions
        y_train_pred = self.model.predict(self.X_train)
        y_test_pred = self.model.predict(self.X_test)
        
        y_test_proba = self.model.predict_proba(self.X_test)[:, 1]
        
        # Accuracy
        train_acc = accuracy_score(self.y_train, y_train_pred)
        test_acc = accuracy_score(self.y_test, y_test_pred)
        
        print(f"\n📊 ACCURACY")
        print(f"  • Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
        print(f"  • Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
        print(f"  • Overfit Margin: {(train_acc - test_acc)*100:.2f}%")
        
        # Classification Report
        print(f"\n📋 CLASSIFICATION REPORT (Test Set)")
        print(classification_report(self.y_test, y_test_pred, 
                                     target_names=['Team 2 Wins', 'Team 1 Wins']))
        
        # Confusion Matrix
        cm = confusion_matrix(self.y_test, y_test_pred)
        print(f"\n📉 CONFUSION MATRIX")
        print(f"                Predicted")
        print(f"                0      1")
        print(f"Actual   0   {cm[0,0]:4d}  {cm[0,1]:4d}")
        print(f"         1   {cm[1,0]:4d}  {cm[1,1]:4d}")
        
        # ROC-AUC
        try:
            roc_auc = roc_auc_score(self.y_test, y_test_proba)
            print(f"\n🎯 ROC-AUC Score: {roc_auc:.4f}")
        except:
            print(f"\n⚠ ROC-AUC not available")
        
        # Cross-validation score
        cv_scores = cross_val_score(self.model, self.X_train, self.y_train, 
                                     cv=5, scoring='accuracy')
        print(f"\n✅ Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
        
        return {
            'train_accuracy': train_acc,
            'test_accuracy': test_acc,
            'cv_scores': cv_scores,
            'confusion_matrix': cm
        }
    
    def plot_feature_importance(self, top_n=15):
        """Visualize feature importance"""
        print("\n" + "=" * 70)
        print("STEP 9: FEATURE IMPORTANCE ANALYSIS")
        print("=" * 70)
        
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        
        print(f"\n🔝 Top {top_n} Most Important Features:")
        for i in range(min(top_n, len(self.feature_names))):
            idx = indices[i]
            print(f"  {i+1:2d}. {self.feature_names[idx]:40s} {importances[idx]:.4f}")
        
        # Plot
        plt.figure(figsize=(10, 8))
        top_indices = indices[:top_n]
        plt.barh(range(top_n), importances[top_indices])
        plt.yticks(range(top_n), [self.feature_names[i] for i in top_indices])
        plt.xlabel('Feature Importance')
        plt.title(f'Top {top_n} Feature Importances')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
        print(f"\n✓ Feature importance plot saved as 'feature_importance.png'")
        plt.close()
        
    def predict_match(self, team1_name, team2_name, venue_name, toss_winner_name, toss_decision):
        """Predict outcome of a new match"""
        print("\n" + "=" * 70)
        print("MATCH PREDICTION")
        print("=" * 70)
        print(f"\nTeam 1: {team1_name}")
        print(f"Team 2: {team2_name}")
        print(f"Venue: {venue_name}")
        print(f"Toss Winner: {toss_winner_name}")
        print(f"Toss Decision: {toss_decision}")
        
        # This would require looking up recent stats for these teams
        # For demonstration, returning prediction format
        print("\n⚠ Note: Real-time prediction requires current team statistics")
        print("Use the trained model with actual feature values for predictions")
    
    def run_full_pipeline(self, quick_tune=False, time_series_split=True):
        """Execute complete pipeline"""
        print("\n" + "🏏" * 35)
        print(" " * 20 + "CRICKET MATCH PREDICTION SYSTEM")
        print(" " * 20 + "Random Forest with Feature Engineering")
        print("🏏" * 35 + "\n")
        
        # Execute all steps
        self.clean_data()
        self.create_match_level_features()
        self.create_historical_features(lookback=5)
        self.create_head_to_head_features()
        self.prepare_features()
        self.split_data(test_size=0.2, time_series_split=time_series_split)
        self.tune_hyperparameters(quick_mode=quick_tune)
        results = self.evaluate_model()
        self.plot_feature_importance(top_n=15)
        
        print("\n" + "=" * 70)
        print("🎉 PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 70)
        print(f"\n📈 Final Test Accuracy: {results['test_accuracy']*100:.2f}%")
        print(f"✨ Model ready for predictions!\n")
        
        return results


# ============================================================================
# USAGE INSTRUCTIONS
# ============================================================================
"""
To use this predictor with your cricket data:

1. Load your data (assuming it's in a DataFrame called 'data'):
   
   predictor = CricketMatchPredictor(data)
   
2. Run the full pipeline:
   
   results = predictor.run_full_pipeline(
       quick_tune=True,  # Set False for thorough hyperparameter tuning
       time_series_split=True  # Train on past matches, test on recent ones
   )
   
3. Access the trained model and results:
   
   model = predictor.model
   X_test = predictor.X_test
   y_test = predictor.y_test
   predictions = model.predict(X_test)

4. View feature importance:
   
   feature_importance = pd.DataFrame({
       'feature': predictor.feature_names,
       'importance': predictor.model.feature_importances_
   }).sort_values('importance', ascending=False)
   
   print(feature_importance.head(10))

Example complete workflow:
   
   # Assuming 'data' is your DataFrame with ball-by-ball cricket data
   predictor = CricketMatchPredictor(data)
   results = predictor.run_full_pipeline(quick_tune=True)
   
   # Now you can use predictor.model for predictions
"""

"\nTo use this predictor with your cricket data:\n\n1. Load your data (assuming it's in a DataFrame called 'data'):\n\n   predictor = CricketMatchPredictor(data)\n\n2. Run the full pipeline:\n\n   results = predictor.run_full_pipeline(\n       quick_tune=True,  # Set False for thorough hyperparameter tuning\n       time_series_split=True  # Train on past matches, test on recent ones\n   )\n\n3. Access the trained model and results:\n\n   model = predictor.model\n   X_test = predictor.X_test\n   y_test = predictor.y_test\n   predictions = model.predict(X_test)\n\n4. View feature importance:\n\n   feature_importance = pd.DataFrame({\n       'feature': predictor.feature_names,\n       'importance': predictor.model.feature_importances_\n   }).sort_values('importance', ascending=False)\n\n   print(feature_importance.head(10))\n\nExample complete workflow:\n\n   # Assuming 'data' is your DataFrame with ball-by-ball cricket data\n   predictor = CricketMatchPredictor(data)\n   results = predic

In [61]:
predictor = CricketMatchPredictor(data)

In [62]:
results = predictor.run_full_pipeline(
       quick_tune=True,  # Set False for thorough hyperparameter tuning
       time_series_split=True  # Train on past matches, test on recent ones
   )


🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏
                    CRICKET MATCH PREDICTION SYSTEM
                    Random Forest with Feature Engineering
🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏🏏

STEP 1: DATA CLEANING

Missing values before cleaning:
wides                     606354
noballs                   619247
byes                      619645
legbyes                   615281
penalty                   620633
wicket_type               603430
player_dismissed          603430
other_wicket_type         620646
other_player_dismissed    620646
winner                     16505
bowling_style               3323
batting_style               1522
dtype: int64

Matches: 1173 -> 1118 (removed 55 ties/no results)

✓ Data cleaning completed

STEP 2: FEATURE ENGINEERING

✓ Created 1118 match-level records
✓ Generated 32 initial features

STEP 3: HISTORICAL FEATURES (Last 5 matches)

✓ Added historical performance features

STEP 4: HEAD-TO-HEAD FEATURES

✓ Added head-to-head features

STEP 5: FEATURE PREPARATI

In [63]:
model = predictor.model
   X_test = predictor.X_test
   y_test = predictor.y_test
   predictions = model.predict(X_test)

IndentationError: unexpected indent (2048302953.py, line 2)

In [None]:
feature_importance = pd.DataFrame({
       'feature': predictor.feature_names,
       'importance': predictor.model.feature_importances_
   }).sort_values('importance', ascending=False)
   
   print(feature_importance.head(10))

Our task is to predict the match outcome from the first 10 Overs.