In [13]:
# Data Engineering and Feature Enhancement
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

In [14]:
# Load dataset (limit to first 2000 rows for speed, adjust as needed)
data = pd.read_csv("/kaggle/input/cricket/over_features.csv")
data = data.iloc[:2000]

# Check for missing values and drop any rows with missing values
print("Missing values in data:")
print(data.isnull().sum())
data.dropna(inplace=True)

# Remove identifier
data = data.drop(columns=["match_id"])

# Encode categorical columns: 'team' and 'match_phase'
team_mapping = {team: idx for idx, team in enumerate(data["team"].unique())}
data["team_encoded"] = data["team"].map(team_mapping)
phase_mapping = {phase: idx for idx, phase in enumerate(data["match_phase"].unique())}
data["match_phase_encoded"] = data["match_phase"].map(phase_mapping)

# Feature Engineering: Add enhanced features
data["pressure_index"] = data["dot_ball_pressure"] * data["required_desired_run_rate"]
data["wicket_pressure"] = data["number_of_wickets_lost"] * data["required_desired_run_rate"]
data["late_over_flag"] = (data["over"] > 15).astype(int)
data["bowler_pressure"] = data["current_bowler_economy"] * (data["bowler_wickets_in_match"] + 1)
data["aggressiveness_index"] = data["striker_strike_rate"] * (data["striker_boundaries_hit"] + 1)

# Drop the original categorical columns if not needed further
data = data.drop(columns=["team", "match_phase"])

# Define features and target
X = data.drop(columns=["wicket_next_over"])
y = data["wicket_next_over"]

# Train-Test Split (using stratification)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)


Missing values in data:
match_id                             0
innings                              0
team                                 0
over                                 0
balls_faced_by_striker               0
striker_strike_rate                  0
striker_boundaries_hit               0
dot_ball_pressure                    0
current_bowler_economy               0
bowler_wickets_in_match              0
total_overs_completed                0
overs_since_last_wicket              0
number_of_wickets_lost               0
required_desired_run_rate            0
current_run_rate                     0
wickets_lost_last_3_overs            0
number_of_boundaries_last_3_overs    0
number_of_dot_balls_last_over        0
powerplay_overs_remaining            0
match_phase                          0
wicket_next_over                     0
dtype: int64


In [15]:
# Define evaluation metric function
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return acc, prec, rec, f1

# Neural Network Implementation with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Determine input dimension from our feature-enhanced data
input_dim = X_train.shape[1]

class ImprovedWicketPredictor(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedWicketPredictor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 8)
        self.fc5 = nn.Linear(8, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.5)
        self.batch_norm1 = nn.BatchNorm1d(64)
        self.batch_norm2 = nn.BatchNorm1d(32)
        self.batch_norm3 = nn.BatchNorm1d(16)
        self.batch_norm4 = nn.BatchNorm1d(8)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.batch_norm1(x)
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.batch_norm2(x)
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.batch_norm3(x)
        x = self.dropout(x)
        x = self.relu(self.fc4(x))
        x = self.batch_norm4(x)
        x = self.fc5(x)
        x = self.sigmoid(x)
        return x

def NN_predict(X_train, X_test, Y_train, Y_test, epochs=100, patience=10):
    # Convert training and test data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_train_tensor = torch.tensor(Y_train.to_numpy(), dtype=torch.float32).view(-1, 1)
    y_test_tensor = torch.tensor(Y_test.to_numpy(), dtype=torch.float32).view(-1, 1)
    
    # Create DataLoader for the training data
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    # Instantiate the model, loss function, and optimizer
    model = ImprovedWicketPredictor(input_dim)
    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    
    best_accuracy = 0
    no_improvement = 0
    
    # Training loop with early stopping
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Evaluate on the training set for early stopping
        model.eval()
        with torch.no_grad():
            y_train_pred = model(X_train_tensor).round()
            train_acc = accuracy_score(y_train_tensor.numpy(), y_train_pred.numpy())
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.4f}")
        
        if train_acc > best_accuracy:
            best_accuracy = train_acc
            no_improvement = 0
        else:
            no_improvement += 1
        
        if no_improvement >= patience:
            print("Early stopping triggered.")
            break
    
    # Final evaluation on the test set
    model.eval()
    with torch.no_grad():
        y_test_pred = model(X_test_tensor).round()
    
    return y_test_pred, model

In [16]:
# Run the Neural Network on the feature-enhanced, scaled data
y_test_pred, nn_model = NN_predict(X_train_scaled, X_test_scaled, y_train, y_test)
acc, prec, rec, f1 = calculate_metrics(y_test, y_test_pred.numpy())
print("\nNeural Network Test Metrics:")
print(f" Accuracy: {acc:.4f}")
print(f" Precision: {prec:.4f}")
print(f" Recall: {rec:.4f}")
print(f" F1 Score: {f1:.4f}")

Epoch 1/100, Loss: 0.6818, Train Accuracy: 0.7000
Epoch 2/100, Loss: 0.6550, Train Accuracy: 0.7181
Epoch 3/100, Loss: 0.6267, Train Accuracy: 0.7194
Epoch 4/100, Loss: 0.6241, Train Accuracy: 0.7225
Epoch 5/100, Loss: 0.6105, Train Accuracy: 0.7275
Epoch 6/100, Loss: 0.6060, Train Accuracy: 0.7262
Epoch 7/100, Loss: 0.6018, Train Accuracy: 0.7256
Epoch 8/100, Loss: 0.5908, Train Accuracy: 0.7250
Epoch 9/100, Loss: 0.5968, Train Accuracy: 0.7244
Epoch 10/100, Loss: 0.5907, Train Accuracy: 0.7244
Epoch 11/100, Loss: 0.5922, Train Accuracy: 0.7238
Epoch 12/100, Loss: 0.5911, Train Accuracy: 0.7238
Epoch 13/100, Loss: 0.5960, Train Accuracy: 0.7238
Epoch 14/100, Loss: 0.5901, Train Accuracy: 0.7225
Epoch 15/100, Loss: 0.5863, Train Accuracy: 0.7231
Early stopping triggered.

Neural Network Test Metrics:
 Accuracy: 0.7225
 Precision: 0.5000
 Recall: 0.0180
 F1 Score: 0.0348
