In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Load the ball-by-ball T20 dataset.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. With the default chunked parsing this call emits a
# warning (apparently a mixed-type DtypeWarning) and a column can end up
# holding a mixture of Python types.
df = pd.read_csv('../data/processed/t20s_combined.csv', low_memory=False)
print(df.shape)
df.head()


  df = pd.read_csv('../data/processed/t20s_combined.csv')


(929433, 22)


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,


In [5]:
# Collapse deliveries into one row per (batter, match) holding the runs the
# striker scored off the bat. Team, opponent and venue are part of the
# grouping key so they are carried through as ordinary columns.
match_keys = ['striker', 'batting_team', 'bowling_team', 'venue', 'match_id']
batter_match = (
    df.groupby(match_keys)['runs_off_bat']
      .sum()
      .reset_index(name='total_runs')
)

print(batter_match.shape)
batter_match.head()



(64561, 6)


Unnamed: 0,striker,batting_team,bowling_team,venue,match_id,total_runs
0,A Adekunle,Nigeria,Kenya,"Gahanga B Ground, Rwanda",1435621,0
1,A Adekunle,Nigeria,Rwanda,"Gahanga B Ground, Rwanda",1435648,0
2,A Adekunle,Nigeria,Uganda,"Achimota Senior Secondary School A Field, Accra",1423473,1
3,A Adekunle,Nigeria,Uganda,Entebbe Cricket Oval,1411272,1
4,A Adekunle,Nigeria,Uganda,"Gahanga B Ground, Rwanda",1435623,4


In [7]:
# Attach each match's start date and derive a per-batter "form" feature.
#
# Drop the columns this cell adds before adding them again, so the cell can be
# re-run without error. Without this, a second run would merge `start_date`
# onto a frame that already has it (yielding start_date_x / start_date_y) and
# the to_datetime line below would raise KeyError.
batter_match = batter_match.drop(columns=['start_date', 'batter_form'], errors='ignore')

# Exactly one date per match. De-duplicating on match_id alone guarantees the
# left merge cannot multiply batter rows if a match ever carries two dates.
match_dates = df[['match_id', 'start_date']].drop_duplicates(subset='match_id')
batter_match = batter_match.merge(match_dates, on='match_id', how='left')
batter_match['start_date'] = pd.to_datetime(batter_match['start_date'])

# Sort for rolling mean calculation
batter_match = batter_match.sort_values(['striker', 'start_date'])

# Rolling average over the previous 5 matches. shift(1) moves the window back
# by one row so the current match's runs (the target) are never included.
batter_match['batter_form'] = (
    batter_match.groupby('striker')['total_runs']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

# The first row of every batter has no earlier match, so its form is NaN; use 0.
batter_match['batter_form'] = batter_match['batter_form'].fillna(0)

batter_match.head()


Unnamed: 0,striker,batting_team,bowling_team,venue,match_id,total_runs,start_date,batter_form
3,A Adekunle,Nigeria,Uganda,Entebbe Cricket Oval,1411272,1,2023-12-14,0.0
5,A Adekunle,Nigeria,Zimbabwe,"Achimota Senior Secondary School A Field, Accra",1423472,1,2024-03-11,1.0
2,A Adekunle,Nigeria,Uganda,"Achimota Senior Secondary School A Field, Accra",1423473,1,2024-03-13,1.0
4,A Adekunle,Nigeria,Uganda,"Gahanga B Ground, Rwanda",1435623,4,2024-05-31,1.0
0,A Adekunle,Nigeria,Kenya,"Gahanga B Ground, Rwanda",1435621,0,2024-06-02,1.75


In [9]:
# Cap the training frame at SAMPLE_N rows to keep model fitting quick.
# Frames at or below the cap are copied unchanged.
SAMPLE_N = 100_000

batter_match_sample = (
    batter_match.sample(SAMPLE_N, random_state=42)
    if len(batter_match) > SAMPLE_N
    else batter_match.copy()
)


In [11]:
# Model inputs: the selected columns are passed through get_dummies (one-hot
# encoding for the string-valued ones); the target is runs in the match.
features = ['striker', 'batting_team', 'bowling_team', 'venue', 'batter_form']
y = batter_match_sample['total_runs']
X = pd.get_dummies(batter_match_sample.loc[:, features])


In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Fit a 100-tree random forest on the training split (n_jobs=-1 parallelises
# the fit; the seed fixes the forest's randomness).
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Score the model on the held-out split.
y_pred = model.predict(X_test)
for label, metric in (("MAE:", mean_absolute_error), ("R2:", r2_score)):
    print(label, metric(y_test, y_pred))


MAE: 12.208738577709925
R2: -0.03750290335352391


In [17]:
import joblib
# Persist the fitted model and, alongside it, the one-hot column names of X in
# training order (presumably so a prediction-time row can be aligned to them).
joblib.dump(model, 'score_model.pkl')
feature_names = list(X.columns)
joblib.dump(feature_names, 'feature_names.pkl')


['feature_names.pkl']

In [10]:
import os, time
import pandas as pd, numpy as np, joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

t0 = time.time()

# ---- Load: only the columns the model needs, with dates parsed up front ----
DATA = '../data/processed/t20s_combined.csv'
USECOLS = ['match_id','start_date','runs_off_bat','striker','batting_team','bowling_team','venue']
df = pd.read_csv(DATA, usecols=USECOLS, parse_dates=['start_date'])
print("Loaded:", df.shape, "rows,cols")

# Keep only deliveries from START_YEAR onwards to shrink the training set
START_YEAR = 2010
is_recent = df['start_date'].dt.year >= START_YEAR
df = df.loc[is_recent].copy()
print("After year filter (>= %d):" % START_YEAR, df.shape)

# ---- Features: one row per batter per match, in date order per batter ----
group_keys = ['striker','batting_team','bowling_team','venue','match_id','start_date']
per_match = df.groupby(group_keys)['runs_off_bat'].sum().reset_index(name='total_runs')
per_match = per_match.sort_values(['striker','start_date'])

WINDOW = 5


def _prior_form(runs):
    """Mean of up to WINDOW preceding values; shift(1) excludes the current row."""
    return runs.shift(1).rolling(window=WINDOW, min_periods=1).mean()


# Rows with no preceding match come out as NaN and are set to 0.0
per_match['batter_form'] = (
    per_match.groupby('striker')['total_runs'].transform(_prior_form).fillna(0.0)
)

# ---- Train model ----
features = ['striker','batting_team','bowling_team','venue','batter_form']
target   = 'total_runs'

X = pd.get_dummies(per_match[features])
y = per_match[target].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_params = dict(
    n_estimators=180,      # quicker than 300
    max_depth=16,          # caps tree size -> faster
    min_samples_leaf=2,    # a bit of regularization
    n_jobs=-1,
    random_state=42,
)
model = RandomForestRegressor(**rf_params)
print("Fitting RF on", X_train.shape, "…")
model.fit(X_train, y_train)

# ---- Evaluate on the held-out split and report size / timing ----
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)
print("MAE:", round(mae, 2))
print("R^2:", round(r2, 2))
print("Features:", X.shape[1], "Rows:", len(X))
print("Elapsed:", round(time.time()-t0,1), "sec")


Loaded: (929433, 7) rows,cols
After year filter (>= 2010): (902897, 7)
Fitting RF on (50170, 6205) …
MAE: 11.92
R^2: 0.11
Features: 6205 Rows: 62713
Elapsed: 1144.4 sec


In [12]:
import joblib
# Save the current `model` and the column names of the current `X`.
# These are the same file names used by the earlier save cell, so whatever
# was written there is replaced.
joblib.dump(model, 'score_model.pkl')
feature_names = list(X.columns)
joblib.dump(feature_names, 'feature_names.pkl')


['feature_names.pkl']

In [14]:
# Smoke test: predict runs for the last row of per_match.
sample_cols = ['striker','batting_team','bowling_team','venue','batter_form']
sample = per_match[sample_cols].tail(1)

# One-hot encode the single row, then align it to the training columns;
# dummy columns this row does not produce are filled with 0.
enc = pd.get_dummies(sample)
enc = enc.reindex(columns=X.columns, fill_value=0)

single_pred = model.predict(enc)
print("One sample pred (runs):", float(single_pred[0]))

One sample pred (runs): 9.186340594627014


In [20]:
import pandas as pd
# Re-save the web app's copy of the dataset as a gzip-compressed CSV.
# low_memory=False infers each column's dtype from the whole file rather than
# per chunk; the default emits a warning on this read (apparently a mixed-type
# DtypeWarning) and can leave a column with mixed types before it is written
# back out.
# NOTE: this rebinds `df`, replacing whatever frame earlier cells stored there.
df = pd.read_csv("../CricketAIWebApp/data/t20s_combined.csv", low_memory=False)
df.to_csv("../CricketAIWebApp/data/t20s_combined.csv.gz", index=False, compression="gzip")

  df = pd.read_csv("../CricketAIWebApp/data/t20s_combined.csv")
