In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

print("Step 1: Load and prepare the dataset...")
df = pd.read_csv("IPL.csv")
# Fill gaps in non-numeric columns only. A blanket fillna("NA") would also write the
# string "NA" into any numeric column that has a missing value, turning it into an
# object column and breaking the 'sum' aggregations in Step 2.
text_cols = df.select_dtypes(exclude="number").columns
df = df.fillna({col: "NA" for col in text_cols})

print("Step 2: Aggregate data to over level within each match...")


def _most_frequent(values):
    """Return the first value reported by `values.mode()`, or 'NA' if there is none."""
    modes = values.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else 'NA'


# One row per (match ID, innings, over, batting team)
agg_df = (
    df.groupby(['ID', 'innings', 'overs', 'BattingTeam'])
    .agg({
        'total_run': 'sum',
        'batsman_run': 'sum',
        'extras_run': 'sum',
        'ballnumber': 'count',
        'batter': _most_frequent,
        'bowler': _most_frequent,
    })
    .reset_index()
    .rename(columns={
        'total_run': 'runs_in_over',
        'ballnumber': 'balls_bowled',
        'batter': 'main_batter',
        'bowler': 'main_bowler',
    })
)

print("Step 3: Encode categorical features...")
# Replace each string column with integer codes; a fresh LabelEncoder is fitted per column.
# After the loop `le` is the encoder fitted on the last column in `label_cols`.
label_cols = ['BattingTeam', 'main_batter', 'main_bowler']
for col in label_cols:
    le = LabelEncoder()
    agg_df[col] = le.fit_transform(agg_df[col])

# Feature matrix and target
features = ['innings', 'overs', 'balls_bowled', 'BattingTeam', 'main_batter', 'main_bowler']
target = 'runs_in_over'

X = agg_df[features]
y = agg_df[target]

print("Step 4: Split into training and test sets...")
# 80/20 split with a fixed seed so the reported metrics are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Step 5: Train the Random Forest Regressor...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("Step 6: Make predictions...")
y_pred = model.predict(X_test)

print("Step 7: Evaluate the model...")
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Share of test overs predicted to within ±3 runs
within_margin = np.abs(y_pred - y_test) <= 3
accuracy_margin = np.mean(within_margin)

print("\nModel Evaluation Results (Runs per Over):")
print("-----------------------------------------")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Accuracy within ±3 runs: {accuracy_margin * 100:.2f}%")

# Show a few predictions for sanity check.
# .iloc makes the "first ten rows" selection explicitly positional; the Error column is a
# Series, so the printed row labels are the sampled rows' index values from agg_df.
print("\nSample Predictions:")
sample_actual = y_test.iloc[:10]
sample_pred = y_pred[:10]
comparison = pd.DataFrame({
    'Actual Runs': sample_actual.values,
    'Predicted Runs': sample_pred.round(2),
    'Error': (sample_pred - sample_actual).round(2)
})
print(comparison)

Step 1: Load and prepare the dataset...
Step 2: Aggregate data to over level within each match...
Step 3: Encode categorical features...
Step 4: Split into training and test sets...
Step 5: Train the Random Forest Regressor...
Step 6: Make predictions...
Step 7: Evaluate the model...

Model Evaluation Results (Runs per Over):
-----------------------------------------
Mean Absolute Error: 3.60
Root Mean Squared Error: 4.56
R² Score: 0.02
Accuracy within ±3 runs: 49.33%

Sample Predictions:
       Actual Runs  Predicted Runs  Error
6415            14           10.13  -3.87
3725             0            7.29   7.29
18859           11            8.64  -2.36
16381            3            8.36   5.36
6933             6            7.29   1.29
12079            9            7.71  -1.29
7822             5            9.27   4.27
30975            3            9.16   6.16
22352            8            8.93   0.93
20476           19            7.81 -11.19


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

print("Step 1: Load and prepare the dataset...")
df = pd.read_csv("IPL.csv")
# Fill gaps in non-numeric columns only. A blanket fillna("NA") would also write the
# string "NA" into any numeric column that has a missing value, turning it into an
# object column and breaking the 'sum' aggregations in Step 2.
text_cols = df.select_dtypes(exclude="number").columns
df = df.fillna({col: "NA" for col in text_cols})

print("Step 2: Aggregate data to over level within each match...")


def _most_frequent(values):
    """Return the first value reported by `values.mode()`, or 'NA' if there is none."""
    modes = values.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else 'NA'


# One row per (match ID, innings, over, batting team)
agg_df = (
    df.groupby(['ID', 'innings', 'overs', 'BattingTeam'])
    .agg({
        'total_run': 'sum',
        'batsman_run': 'sum',
        'extras_run': 'sum',
        'ballnumber': 'count',
        'batter': _most_frequent,
        'bowler': _most_frequent,
    })
    .reset_index()
    .rename(columns={
        'total_run': 'runs_in_over',
        'ballnumber': 'balls_bowled',
        'batter': 'main_batter',
        'bowler': 'main_bowler',
    })
)

print("Step 3: Encode categorical features...")
# Replace each string column with integer codes; a fresh LabelEncoder is fitted per column.
# After the loop `le` is the encoder fitted on the last column in `label_cols`.
label_cols = ['BattingTeam', 'main_batter', 'main_bowler']
for col in label_cols:
    le = LabelEncoder()
    agg_df[col] = le.fit_transform(agg_df[col])

# Features and target
features = ['innings', 'overs', 'balls_bowled', 'BattingTeam', 'main_batter', 'main_bowler']
target = 'runs_in_over'

X = agg_df[features]
y = agg_df[target]

print("Step 4: Split into training and test sets...")
# 80/20 split with a fixed seed so the reported metrics are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Step 5: Train the Random Forest Regressor...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("Step 6: Make predictions...")
y_pred = model.predict(X_test)

print("Step 7: Evaluate the model...")
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Share of test overs predicted to within ±m runs, for several margins.
# The absolute error does not depend on the margin, so it is computed once.
margins = [1, 2, 3, 4, 5]
abs_error = np.abs(y_pred - y_test)
print("\nAccuracy within different margins of error:")
for m in margins:
    within_margin = abs_error <= m
    accuracy_margin = np.mean(within_margin)
    print(f"±{m} runs: {accuracy_margin * 100:.2f}% accurate")

# Final metrics summary
print("\nModel Evaluation Results (Runs per Over):")
print("-----------------------------------------")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Show sample predictions.
# .iloc makes the "first ten rows" selection explicitly positional; the Error column is a
# Series, so the printed row labels are the sampled rows' index values from agg_df.
print("\nSample Predictions:")
sample_actual = y_test.iloc[:10]
sample_pred = y_pred[:10]
comparison = pd.DataFrame({
    'Actual Runs': sample_actual.values,
    'Predicted Runs': sample_pred.round(2),
    'Error': (sample_pred - sample_actual).round(2)
})
print(comparison)

Step 1: Load and prepare the dataset...
Step 2: Aggregate data to over level within each match...
Step 3: Encode categorical features...
Step 4: Split into training and test sets...
Step 5: Train the Random Forest Regressor...
Step 6: Make predictions...
Step 7: Evaluate the model...

Accuracy within different margins of error:
±1 runs: 17.49% accurate
±2 runs: 34.14% accurate
±3 runs: 49.33% accurate
±4 runs: 62.94% accurate
±5 runs: 74.17% accurate

Model Evaluation Results (Runs per Over):
-----------------------------------------
Mean Absolute Error: 3.60
Root Mean Squared Error: 4.56
R² Score: 0.02

Sample Predictions:
       Actual Runs  Predicted Runs  Error
6415            14           10.13  -3.87
3725             0            7.29   7.29
18859           11            8.64  -2.36
16381            3            8.36   5.36
6933             6            7.29   1.29
12079            9            7.71  -1.29
7822             5            9.27   4.27
30975            3            9.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

print("Step 1: Load the dataset...")
df = pd.read_csv("IPL.csv")
# Fill gaps in non-numeric columns only. A blanket fillna("NA") would also write the
# string "NA" into any numeric column that has a missing value, turning it into an
# object column and breaking the sum of 'total_run' in Step 2.
text_cols = df.select_dtypes(exclude="number").columns
df = df.fillna({col: "NA" for col in text_cols})

print("Step 2: Aggregate to innings-level score per team...")
# One row per (match ID, innings, batting team) holding the summed total_run
match_df = df.groupby(['ID', 'innings', 'BattingTeam'])['total_run'].sum().reset_index()

print("Step 3: Encode categorical team names...")
# `le` stays bound to the BattingTeam encoder; later cells call le.transform on team names
le = LabelEncoder()
match_df['BattingTeam'] = le.fit_transform(match_df['BattingTeam'])

# Features and target
X = match_df[['BattingTeam', 'innings']]
y = match_df['total_run']

print("Step 4: Split into train and test sets...")
# 80/20 split with a fixed seed so the reported metrics are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Step 5: Train the model...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("Step 6: Make predictions...")
y_pred = model.predict(X_test)

print("Step 7: Evaluate the model...")
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Share of test innings predicted to within ±m runs, for several margins.
# The absolute error does not depend on the margin, so it is computed once.
margins = [5, 10, 15, 20, 50]
abs_error = np.abs(y_pred - y_test)
print("\nAccuracy within margins of error:")
for m in margins:
    within_margin = abs_error <= m
    accuracy = np.mean(within_margin)
    print(f"±{m} runs: {accuracy * 100:.2f}%")

print("\nModel Evaluation Results (Using Only Batting Team & Innings):")
print("---------------------------------------------------------------")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Show sample predictions.
# .iloc makes the "first ten rows" selection explicitly positional; the Error column is a
# Series, so the printed row labels are the sampled rows' index values from match_df.
print("\nSample Predictions:")
sample_actual = y_test.iloc[:10]
sample_pred = y_pred[:10]
comparison = pd.DataFrame({
    'Actual Score': sample_actual.values,
    'Predicted Score': sample_pred.round(2),
    'Error': (sample_pred - sample_actual).round(2)
})
print(comparison)

Step 1: Load the dataset...
Step 2: Aggregate to innings-level score per team...
Step 3: Encode categorical team names...
Step 4: Split into train and test sets...
Step 5: Train the model...
Step 6: Make predictions...
Step 7: Evaluate the model...

Accuracy within margins of error:
±5 runs: 13.73%
±10 runs: 29.53%
±15 runs: 43.26%
±20 runs: 55.70%
±50 runs: 89.12%

Model Evaluation Results (Using Only Batting Team & Innings):
---------------------------------------------------------------
Mean Absolute Error: 23.39
Root Mean Squared Error: 30.65
R² Score: 0.12

Sample Predictions:
      Actual Score  Predicted Score  Error
1118           158           159.40   1.40
1652           131           165.43  34.43
1863           133           155.29  22.29
941            164           165.75   1.75
331            132           147.33  15.33
471            141           145.48   4.48
1344           167           165.43  -1.57
305            157           146.46 -10.54
1185           171      

In [5]:
# Margin of error (in runs) used for the "±MAE" and "Confidence" columns below.
# It has its own name so that it does not overwrite `mae`, the mean absolute error
# measured when the model was evaluated.
margin_of_error = 50

# Team names as they appear in the ball-by-ball dataframe
teams = df['BattingTeam'].unique()

# Integer codes for those names. This expects `le` to be the LabelEncoder that was
# fitted on 'BattingTeam', so the codes match the ones the model was trained on.
teams_encoded = le.transform(teams)

# Encoded team value -> original team name (also used by predict_team_score)
team_mapping = dict(zip(teams_encoded, teams))

print("\nEstimated Team Scores with Margin of Error and Confidence:")
print("-----------------------------------------------------------")
print(f"{'Team':35} {'Innings':>7} {'Predicted Score':>17} {'±MAE':>10} {'Confidence':>12}")

# "Confidence" is the share of test-set predictions whose absolute error is within the
# margin. It is a single number for the whole model, so every row shows the same value.
within_mae_margin = np.abs(y_pred - y_test) <= margin_of_error
confidence_mae = np.mean(within_mae_margin)

# Build every (team, innings) input up front and call the model once, rather than
# running one single-row prediction per table line.
innings_options = [1, 2]
rows = [
    (team_id, team_name, innings)
    for team_id, team_name in zip(teams_encoded, teams)
    for innings in innings_options
]
samples = pd.DataFrame(
    [(team_id, innings) for team_id, _, innings in rows],
    columns=['BattingTeam', 'innings'],
)
predicted_scores = model.predict(samples) if rows else []

for (_, team_name, innings), predicted in zip(rows, predicted_scores):
    predicted_score = int(predicted)  # int() drops the fractional part

    # Print the estimated result
    print(f"{team_name:35} {innings:>7} {predicted_score:17} {margin_of_error:10.2f} {confidence_mae*100:11.2f}%")


Estimated Team Scores with Margin of Error and Confidence:
-----------------------------------------------------------
Team                                Innings   Predicted Score       ±MAE   Confidence
Rajasthan Royals                          1               160      50.00       89.12%
Rajasthan Royals                          2               150      50.00       89.12%
Gujarat Titans                            1               162      50.00       89.12%
Gujarat Titans                            2               171      50.00       89.12%
Royal Challengers Bangalore               1               165      50.00       89.12%
Royal Challengers Bangalore               2               147      50.00       89.12%
Lucknow Super Giants                      1               175      50.00       89.12%
Lucknow Super Giants                      2               181      50.00       89.12%
Sunrisers Hyderabad                       1               165      50.00       89.12%
Sunrisers Hyderabad 

In [6]:
def predict_team_score(team_name, innings, mae):
    """Predict a team's innings score and report how often the model is within `mae` runs.

    Reads notebook-level state: `team_mapping` (encoded team value -> team name),
    the fitted `model`, and the test-set `y_pred` / `y_test`.

    Args:
        team_name: Team name; must be one of the values in `team_mapping`.
        innings: Innings value passed to the model as the 'innings' feature.
        mae: Margin (in runs) used as the absolute-error threshold.

    Returns:
        A `(predicted_score, confidence)` tuple: the model's prediction with the
        fractional part dropped by `int()`, and the percentage of test-set
        predictions whose absolute error is at most `mae`.

    Raises:
        ValueError: If `team_name` is not among the values of `team_mapping`.
    """
    if team_name not in team_mapping.values():
        raise ValueError(f"Team '{team_name}' not found in the dataset.")

    # First encoded value whose mapped name equals the requested team
    team_id = next(code for code, name in team_mapping.items() if name == team_name)

    # Single-row input with the same column names the model was fitted on
    model_input = pd.DataFrame([[team_id, innings]], columns=['BattingTeam', 'innings'])
    raw_prediction = model.predict(model_input)[0]

    # Percentage of test-set predictions that fall within the margin
    hit_rate = np.mean(np.abs(y_pred - y_test) <= mae)

    return int(raw_prediction), hit_rate * 100

# Try the function with Sunrisers Hyderabad, innings 2, and a 40-run margin.
# The margin has its own name so that this cell does not rebind `mae`, which the
# evaluation cell uses for the model's measured mean absolute error.
team_name = "Sunrisers Hyderabad"
innings = 2
margin_of_error = 40

predicted_score, confidence = predict_team_score(team_name, innings, margin_of_error)
print(f"Predicted Score for {team_name} in Innings {innings}: {predicted_score}")
print(f"Confidence Level: {confidence:.2f}%")

Predicted Score for Sunrisers Hyderabad in Innings 2: 149
Confidence Level: 83.16%


In [7]:
import joblib

# Write the notebook's current model state to files in the working directory.
# Each call serialises one object; the last call's return value is what the cell displays.
joblib.dump(model, "model.pkl")
# NOTE(review): `le` is whichever LabelEncoder is currently bound — presumably the one
# fitted on 'BattingTeam' in the innings-level cell; confirm cells were run in order.
joblib.dump(le, "label_encoder.pkl")
# Test-set predictions and true targets, as used for the margin-based "confidence" figures
joblib.dump(y_pred, "y_pred.pkl")
joblib.dump(y_test, "y_test.pkl")

['y_test.pkl']