In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


# Load processed T20 dataset
t20_df = pd.read_csv("../data/processed/t20s_combined.csv")
print("Shape of dataset:", t20_df.shape)
t20_df.head()

  t20_df = pd.read_csv("../data/processed/t20s_combined.csv")


Shape of dataset: (929433, 22)


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,


In [56]:
# Step 1: Aggregate total runs scored by each batter per match
player_runs_df = t20_df.groupby(['match_id', 'striker'], as_index=False)['runs_off_bat'].sum()
player_runs_df.rename(columns={'striker': 'batsman', 'runs_off_bat': 'total_runs'}, inplace=True)

# Step 2: Merge additional context from original dataframe
# Get the first occurrence of batter in that match to extract context
context_cols = ['match_id', 'striker', 'batting_team', 'bowling_team', 'venue', 'season']
context_df = t20_df.drop_duplicates(subset=['match_id', 'striker'])[context_cols]

# Rename 'striker' → 'batsman'
context_df = context_df.rename(columns={'striker': 'batsman'})


# Step 3: Merge context with runs
merged_df = pd.merge(player_runs_df, context_df, on=['match_id', 'batsman'], how='left')

print("Shape of player-level dataset:", merged_df.shape)
merged_df.head()

# Step 4: Compute batter form (last 5 match average runs)
# Sort for rolling calculation
merged_df = merged_df.sort_values(by=['batsman', 'season', 'match_id'])

# Calculate rolling mean and reset index to merge back
rolling_form = (
    merged_df
    .groupby('batsman')['total_runs']
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

# Add rolling average column to the main dataframe
merged_df['batter_form'] = rolling_form.values




Shape of player-level dataset: (64561, 7)


In [58]:
from sklearn.preprocessing import LabelEncoder

# Create a copy of the merged dataset
df_model = merged_df.copy()

# Columns to encode
cat_cols = ['batsman', 'batting_team', 'bowling_team', 'venue', 'season']
le_dict = {}

# Apply Label Encoding
for col in cat_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_dict[col] = le  # Save encoder in case needed later


In [60]:
# Define input features and target
features = ['batsman', 'batting_team', 'bowling_team', 'venue', 'batter_form']
target = 'total_runs'

X = df_model[features]
y = df_model[target]


In [62]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest RMSE: {rmse:.2f}")
print(f"Random Forest R²: {r2:.2f}")


Random Forest RMSE: 14.57
Random Forest R²: 0.34




In [64]:
import random
import pandas as pd

# Sample 5 random test indices
sample_indices = random.sample(range(len(X_test)), 5)

# Get the input features and true values
sample_inputs = X_test.iloc[sample_indices]
sample_true_runs = y_test.iloc[sample_indices].values

# Predict using the trained model
sample_preds = rf_model.predict(sample_inputs)

# Match batsman & match_id from original merged data (align indices)
original_sample = merged_df.iloc[X_test.index[sample_indices]].copy()
original_sample['Predicted Runs'] = sample_preds
original_sample['Actual Runs'] = sample_true_runs

# Display relevant columns
display_cols = ['batsman', 'match_id', 'venue', 'bowling_team', 'batting_team', 'batter_form', 'Predicted Runs', 'Actual Runs']
original_sample[display_cols]


Unnamed: 0,batsman,match_id,venue,bowling_team,batting_team,batter_form,Predicted Runs,Actual Runs
23605,Rahel Khan,1273133,"Svanholm Park, Brondby",Denmark,Sweden,18.0,30.8,35
7215,GWHM Perera,757515,Mercantile Cricket Association Ground,South Africa,Sri Lanka,8.4,45.09,4
26989,C Brown,1286683,"Sir Vivian Richards Stadium, North Sound, Antigua",Bermuda,Belize,11.5,4.76,8
62389,Ahmer Bin Nisar,1463661,"ICC Academy, Dubai",Oman,Bahrain,27.4,11.07,15
35200,S Mostary,1335789,"Sylhet International Cricket Stadium, Academy ...",Pakistan,Bangladesh,5.2,21.4,21
