In [1]:
import pandas as pd

# Load the IPL 2022 ball-by-ball data
df = pd.read_csv('../data/ipl_2022_deliveries.csv')

# Show the top 5 rows
df.head()


Unnamed: 0,match_id,season,match_no,date,venue,batting_team,bowling_team,innings,over,striker,bowler,runs_of_bat,extras,wide,legbyes,byes,noballs,wicket_type,player_dismissed,fielder
0,202201,2022,1,"Mar 26, 2022","Wankhede Stadium, Mumbai",CSK,KKR,1,0.1,Gaikwad,Umesh Yadav,0,1,0,0,0,1,,,
1,202201,2022,1,"Mar 26, 2022","Wankhede Stadium, Mumbai",CSK,KKR,1,0.1,Gaikwad,Umesh Yadav,0,0,0,0,0,0,,,
2,202201,2022,1,"Mar 26, 2022","Wankhede Stadium, Mumbai",CSK,KKR,1,0.2,Gaikwad,Umesh Yadav,0,1,1,0,0,0,,,
3,202201,2022,1,"Mar 26, 2022","Wankhede Stadium, Mumbai",CSK,KKR,1,0.2,Gaikwad,Umesh Yadav,0,0,0,0,0,0,,,
4,202201,2022,1,"Mar 26, 2022","Wankhede Stadium, Mumbai",CSK,KKR,1,0.3,Gaikwad,Umesh Yadav,0,0,0,0,0,0,caught,Gaikwad,Nitish Rana


In [2]:
# Drop columns we don't need for score prediction
df_cleaned = df.drop(columns=[
    'season', 'match_no', 'date', 'venue',
    'wicket_type', 'player_dismissed', 'fielder'
])

# Check the cleaned data
df_cleaned.head()


Unnamed: 0,match_id,batting_team,bowling_team,innings,over,striker,bowler,runs_of_bat,extras,wide,legbyes,byes,noballs
0,202201,CSK,KKR,1,0.1,Gaikwad,Umesh Yadav,0,1,0,0,0,1
1,202201,CSK,KKR,1,0.1,Gaikwad,Umesh Yadav,0,0,0,0,0,0
2,202201,CSK,KKR,1,0.2,Gaikwad,Umesh Yadav,0,1,1,0,0,0
3,202201,CSK,KKR,1,0.2,Gaikwad,Umesh Yadav,0,0,0,0,0,0
4,202201,CSK,KKR,1,0.3,Gaikwad,Umesh Yadav,0,0,0,0,0,0


In [3]:
# Step 1: Create a column for total runs in that ball
df_cleaned['total_runs'] = df_cleaned['runs_of_bat'] + df_cleaned['extras']

# Step 2: Group by match_id, innings, and over — sum total runs in each over
over_summary = df_cleaned.groupby(['match_id', 'innings', 'over']).agg({
    'total_runs': 'sum'
}).reset_index()

# Step 3: Add cumulative score column
over_summary['cumulative_score'] = over_summary.groupby(['match_id', 'innings'])['total_runs'].cumsum()

# Preview
over_summary.head(10)


Unnamed: 0,match_id,innings,over,total_runs,cumulative_score
0,202201,1,0.1,1,1
1,202201,1,0.2,1,2
2,202201,1,0.3,0,2
3,202201,1,0.4,1,3
4,202201,1,0.5,0,3
5,202201,1,0.6,0,3
6,202201,1,1.1,0,3
7,202201,1,1.2,0,3
8,202201,1,1.3,1,4
9,202201,1,1.4,4,8


In [4]:
# For each innings, get the last cumulative score (i.e., final score)
final_scores = over_summary.groupby(['match_id', 'innings'])['cumulative_score'].max().reset_index()
final_scores.rename(columns={'cumulative_score': 'final_score'}, inplace=True)

# Merge final score into over_summary (so we know the target)
data_for_model = pd.merge(over_summary, final_scores, on=['match_id', 'innings'])

# Let’s see how it looks
data_for_model.head(10)


Unnamed: 0,match_id,innings,over,total_runs,cumulative_score,final_score
0,202201,1,0.1,1,1,131
1,202201,1,0.2,1,2,131
2,202201,1,0.3,0,2,131
3,202201,1,0.4,1,3,131
4,202201,1,0.5,0,3,131
5,202201,1,0.6,0,3,131
6,202201,1,1.1,0,3,131
7,202201,1,1.2,0,3,131
8,202201,1,1.3,1,4,131
9,202201,1,1.4,4,8,131


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Features and label
X = data_for_model[['over', 'cumulative_score']]
y = data_for_model['final_score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Evaluation
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))


MAE: 17.053196487309936
RMSE: 23.05785929414146
R² Score: 0.3418316881851009


In [7]:
# Simulate live match scenario
current_over = 10
current_score = 78

# Make prediction
predicted_final_score = model.predict([[current_over, current_score]])

print(f"Predicted Final Score: {predicted_final_score[0]:.2f} runs")


Predicted Final Score: 162.53 runs


