In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import joblib


# Step 2: Load the Full Dataset
df = pd.read_csv('../data/processed/t20s_combined.csv')

# Basic shape confirmation
print("Total rows:", len(df))
df.head()


  df = pd.read_csv('../data/processed/t20s_combined.csv')


Total rows: 929433


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,


In [3]:
# Create 'match_date' column (if needed)
df['match_date'] = pd.to_datetime(df['start_date'])

# Aggregate runs per batter per match
match_runs = (
    df.groupby(['striker', 'match_id', 'match_date'], as_index=False)
    .agg({'runs_off_bat': 'sum'})
    .rename(columns={'runs_off_bat': 'runs_in_match'})
)

# Sort and apply rolling mean to get batter form
match_runs = match_runs.sort_values(['striker', 'match_date'])

# Rolling average of last 5 matches 
match_runs['batter_form'] = (
    match_runs.groupby('striker')['runs_in_match']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

# Preview the result
match_runs.head(10)


Unnamed: 0,striker,match_id,match_date,runs_in_match,batter_form
0,A Adekunle,1411272,2023-12-14,1,
1,A Adekunle,1423472,2024-03-11,1,1.0
2,A Adekunle,1423473,2024-03-13,1,1.0
4,A Adekunle,1435623,2024-05-31,4,1.0
3,A Adekunle,1435621,2024-06-02,0,1.75
5,A Adekunle,1435648,2024-06-08,0,1.4
6,A Ahmadhel,1235832,2020-10-18,2,
7,A Ahmadhel,1275269,2021-09-02,4,2.0
8,A Ahmadhel,1275271,2021-09-03,0,3.0
9,A Ahmadhel,1443778,2024-08-25,2,2.0


In [5]:
# Merge batter_form back into the main dataset
df = df.merge(
    match_runs[['striker', 'match_id', 'batter_form']],
    on=['striker', 'match_id'],
    how='left'
)

# Confirm merge worked
df[['striker', 'match_id', 'runs_off_bat', 'batter_form']].head(10)


Unnamed: 0,striker,match_id,runs_off_bat,batter_form
0,AJ Finch,1001349,0,34.8
1,AJ Finch,1001349,0,34.8
2,AJ Finch,1001349,1,34.8
3,M Klinger,1001349,2,
4,M Klinger,1001349,0,
5,M Klinger,1001349,3,
6,M Klinger,1001349,0,
7,M Klinger,1001349,1,
8,AJ Finch,1001349,0,34.8
9,AJ Finch,1001349,0,34.8


In [7]:
# Prepare final training dataset
model_df = df[['batter_form', 'bowling_team', 'venue', 'runs_off_bat']].dropna()

# One-hot encode categorical features
model_df_encoded = pd.get_dummies(model_df, columns=['bowling_team', 'venue'])

# Split into features and target
X = model_df_encoded.drop('runs_off_bat', axis=1)
y = model_df_encoded['runs_off_bat']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
print(X_train.isnull().sum())


batter_form                                              0
bowling_team_Argentina                                   0
bowling_team_Australia                                   0
bowling_team_Austria                                     0
bowling_team_Bahamas                                     0
                                                        ..
venue_Yeonhui Cricket Ground, Incheon                    0
venue_Zahur Ahmed Chowdhury Stadium                      0
venue_Zahur Ahmed Chowdhury Stadium, Chattogram          0
venue_Zayed Cricket Stadium, Abu Dhabi                   0
venue_Zhejiang University of Technology Cricket Field    0
Length: 524, dtype: int64


In [11]:
print(X_train.shape)


(693893, 524)


In [13]:
X_train.head()


Unnamed: 0,batter_form,bowling_team_Argentina,bowling_team_Australia,bowling_team_Austria,bowling_team_Bahamas,bowling_team_Bahrain,bowling_team_Bangladesh,bowling_team_Barbados,bowling_team_Belgium,bowling_team_Belize,...,"venue_Windsor Park, Roseau, Dominica","venue_Woodley Cricket Field, Los Angeles",venue_YMCA Cricket Club,"venue_YSD-UKM Cricket Oval, Bangi",venue_Yeonhui Cricket Ground,"venue_Yeonhui Cricket Ground, Incheon",venue_Zahur Ahmed Chowdhury Stadium,"venue_Zahur Ahmed Chowdhury Stadium, Chattogram","venue_Zayed Cricket Stadium, Abu Dhabi",venue_Zhejiang University of Technology Cricket Field
828509,10.4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
411357,8.8,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
87746,9.4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
382035,10.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
787405,15.5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Step 4: Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)