# in this file we are building a model 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout # type: ignore
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pandas as pd

# Load the cleaned dataset
cleaned_data_path = "data/cleaned_commentary_data.csv"
df_cleaned = pd.read_csv(cleaned_data_path)
print("Cleaned data loaded successfully!")

# Display the first few rows to confirm
print(df_cleaned.head())


  df_cleaned = pd.read_csv(cleaned_data_path)


Cleaned data loaded successfully!
  PlayType_description  Batting_Team_id Batting_Team_name Bowling_Team_id  \
0                  run                2         Australia               7   
1               no run                2         Australia               7   
2                 four                2         Australia               7   
3              leg bye                2         Australia               7   
4               no run                2         Australia               7   

  Bowling_Team_name      Innings  Total_Runs_on_delivery  \
0          Pakistan  1st innings                       1   
1          Pakistan  1st innings                       0   
2          Pakistan  1st innings                       4   
3          Pakistan  1st innings                       1   
4          Pakistan  1st innings                       0   

                                          Commentary  \
0  on the pads to start from Amir, no swing, work...   
1  drifts down leg this time, 

In [3]:
# Check for missing values
print("Missing values in the dataset:")
print(df_cleaned.isnull().sum())

# Display dataset shape
print(f"Dataset shape: {df_cleaned.shape}")

# Check total missing values in the entire dataset
total_missing = df_cleaned.isnull().sum().sum()
print(f"Total missing values in the dataset: {total_missing}")



Missing values in the dataset:
PlayType_description    0
Batting_Team_id         0
Batting_Team_name       0
Bowling_Team_id         0
Bowling_Team_name       0
                       ..
Runs                    0
Wickets                 0
Commentary_sentiment    0
Commentary_length       0
Event                   0
Length: 64, dtype: int64
Dataset shape: (321918, 64)
Total missing values in the dataset: 0


# Step 1: Data Splitting

In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into train, validation, and test sets
X = df_cleaned.drop(columns=["Commentary", "Commentary_short"])  # Features
y = df_cleaned["Commentary"]  # Target

# 70% Train, 15% Validation, 15% Test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")


Training set: (225342, 62), (225342,)
Validation set: (48288, 62), (48288,)
Test set: (48288, 62), (48288,)


In [5]:
print(X_train.value_counts())


PlayType_description  Batting_Team_id  Batting_Team_name  Bowling_Team_id  Bowling_Team_name  Innings      Total_Runs_on_delivery  Score  Bowler_id  Bowler_name        Bowler_maiden  Bowler_balls  Bowler_wickets  Bowler_over  Bowler_conceded  Other_Bowler_id  Other_Bowler_name   Other_Bowler_maidens  Other_Bowler_balls  Other_Bowler_wickets  Other_Bowler_overs  Other_Bowler_conceded  Batsman_id  Batsman_name       Batsman_runs  Batsman_balls_faced  Batsman_four  Batsman_sixes  Other_Batsman_id  Other_Batsman_name  Other_Batsman_runs  Other_Batsman_balls_faced  Other_Batsman_four  Other_Batsman_sixes  Over_ball  Over_complete  Over_maiden  Over_no_Ball  Over_wide  Over_byes  Over_leg_byes  Over_number  Over_runs  Over_wickets  Dismissal_is_true  Dismissal_fielder_iskeeper  Dismissal_text  Innings_run_rate  Innings_byes  Innings_balls  Innings_no_balls  Innings_wickets  Innings_leg_byes  Innings_session  Innings_day  Innings_wides  Innings_runs  Runs  Wickets  Commentary_sentiment  Comme

In [6]:
print(y_train.value_counts())

Commentary
no commentory available                                                                                     2227
length on off stump, blocked on the front foot                                                               107
full outside off, left alone                                                                                  85
back of a length outside off, left alone                                                                      81
length outside off, left alone                                                                                81
                                                                                                            ... 
good length, angling in,<strong> bouncing and seaming away just past the outside edge</strong>                 1
stands tall and knocks this short of a length delivery past the bowler's right off the inside of the bat       1
tossed, on leg, tries to flick, misses , takes the pad to the off side               

# step 2 text processor 

In [7]:
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to C:\Users\Vandan
[nltk_data]     Prajapati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

# Download NLTK data (if not already installed)
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize tools
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # 1. Remove special characters and digits
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # 2. Lowercase the text
    text = text.lower()
    
    # 3. Tokenize text
    tokens = word_tokenize(text)
    
    # 4. Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # 5. Rejoin tokens into a single string
    return " ".join(tokens)

# Apply preprocessing to the "Commentary" column
df_cleaned["Commentary_cleaned"] = df_cleaned["Commentary"].apply(preprocess_text)

# Tokenization and padding
# Initialize tokenizer
max_words = 10000  # Vocabulary size
max_sequence_length = 50  # Max length for sequences
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")  # Out-of-vocabulary token
tokenizer.fit_on_texts(df_cleaned["Commentary_cleaned"])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df_cleaned["Commentary_cleaned"])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding="post", truncating="post")

# Summary
print("Sample Original Commentary:", df_cleaned["Commentary"].iloc[0])
print("Sample Cleaned Commentary:", df_cleaned["Commentary_cleaned"].iloc[0])
print("Sample Tokenized Sequence:", sequences[0])
print("Sample Padded Sequence:", padded_sequences[0])


[nltk_data] Downloading package stopwords to C:\Users\Vandan
[nltk_data]     Prajapati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vandan
[nltk_data]     Prajapati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vandan
[nltk_data]     Prajapati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample Original Commentary: on the pads to start from Amir, no swing, worked down to fine leg to get off the mark.
Sample Cleaned Commentary: pad start amir swing worked fine leg get mark
Sample Tokenized Sequence: [26, 207, 810, 105, 120, 37, 4, 12, 362]
Sample Padded Sequence: [ 26 207 810 105 120  37   4  12 362   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


# tranformation of data from categorical to numerical data 

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Encode categorical columns
bowler_encoder = LabelEncoder()
batsman_encoder = LabelEncoder()
df_cleaned["Bowler_name_encoded"] = bowler_encoder.fit_transform(df_cleaned["Bowler_name"].astype(str))
df_cleaned["Batsman_name_encoded"] = batsman_encoder.fit_transform(df_cleaned["Batsman_name"].astype(str))

# Scale numerical columns
scaler = StandardScaler()
df_cleaned["Bowler_over_scaled"] = scaler.fit_transform(df_cleaned[["Bowler_over"]])
df_cleaned["Innings_run_rate_scaled"] = scaler.fit_transform(df_cleaned[["Innings_run_rate"]])

# Combine features
X_features = np.hstack((
    df_cleaned[["Bowler_name_encoded", "Batsman_name_encoded"]].values,  # Encoded categorical features
    df_cleaned[["Bowler_over_scaled", "Innings_run_rate_scaled"]].values  # Scaled numerical features
))
y_encoded = tokenizer.texts_to_sequences(df_cleaned["Commentary_cleaned"])  # Target text sequences

print("X_features shape:", X_features.shape)
print("y_encoded length:", len(y_encoded))


X_features shape: (321918, 4)
y_encoded length: 321918


# Data spliting 

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

# Split the data into training, validation, and test sets
padded_sequences_train, padded_sequences_temp, X_train_features, X_temp_features, y_train, y_temp = train_test_split(
    padded_sequences, X_features, y_encoded, test_size=0.3, random_state=42
)
padded_sequences_val, padded_sequences_test, X_val_features, X_test_features, y_val, y_test = train_test_split(
    padded_sequences_temp, X_temp_features, y_temp, test_size=0.5, random_state=42
)

# Pad the target sequences
max_target_sequence_length = 50
y_train_padded = pad_sequences(y_train, maxlen=max_target_sequence_length, padding="post")
y_val_padded = pad_sequences(y_val, maxlen=max_target_sequence_length, padding="post")
y_test_padded = pad_sequences(y_test, maxlen=max_target_sequence_length, padding="post")

# Ensure no extra dimensions
print("y_train_padded shape:", y_train_padded.shape)  # Should be (batch_size, sequence_length)



y_train_padded shape: (225342, 50)


In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf


# Pad the target sequences
max_target_sequence_length = 50
y_train_padded = pad_sequences(y_train, maxlen=max_target_sequence_length, padding="post")
y_val_padded = pad_sequences(y_val, maxlen=max_target_sequence_length, padding="post")
y_test_padded = pad_sequences(y_test, maxlen=max_target_sequence_length, padding="post")

# Pad the target sequences
max_target_sequence_length = 50
y_train_padded = pad_sequences(y_train, maxlen=max_target_sequence_length, padding="post")
y_val_padded = pad_sequences(y_val, maxlen=max_target_sequence_length, padding="post")
y_test_padded = pad_sequences(y_test, maxlen=max_target_sequence_length, padding="post")

# Ensure no extra dimensions
print("y_train_padded shape:", y_train_padded.shape)
print("y_val_padded shape:", y_val_padded.shape)
print("y_test_padded shape:", y_test_padded.shape)

# Ensure no extra dimensions
print("y_train_padded shape before adjustment:", y_train_padded.shape)
if len(y_train_padded.shape) > 2:
    y_train_padded = np.squeeze(y_train_padded)
print("y_train_padded shape after adjustment:", y_train_padded.shape)

print("y_val_padded shape before adjustment:", y_val_padded.shape)
if len(y_val_padded.shape) > 2:
    y_val_padded = np.squeeze(y_val_padded)
print("y_val_padded shape after adjustment:", y_val_padded.shape)

print("y_test_padded shape before adjustment:", y_test_padded.shape)
if len(y_test_padded.shape) > 2:
    y_test_padded = np.squeeze(y_test_padded)
print("y_test_padded shape after adjustment:", y_test_padded.shape)


y_train_padded shape: (225342, 50)
y_val_padded shape: (48288, 50)
y_test_padded shape: (48288, 50)
y_train_padded shape before adjustment: (225342, 50)
y_train_padded shape after adjustment: (225342, 50)
y_val_padded shape before adjustment: (48288, 50)
y_val_padded shape after adjustment: (48288, 50)
y_test_padded shape before adjustment: (48288, 50)
y_test_padded shape after adjustment: (48288, 50)


#  Model Architecture


In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate, BatchNormalization, Reshape, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def debug_shape(tensor, message):
    """Debug helper to print tensor shapes during graph execution."""
    return tf.keras.layers.Lambda(
        lambda x: tf.print(message, tf.shape(x), summarize=-1) or x,
        output_shape=lambda input_shape: input_shape  # Ensure output shape matches input shape
    )(tensor)

def build_model(vocab_size, max_sequence_length, num_additional_features):
    text_input = Input(shape=(max_sequence_length,), name="Text_Input")
    embedding = Embedding(input_dim=vocab_size, output_dim=128)(text_input)
    lstm = LSTM(256, return_sequences=False, name="LSTM_Layer")(embedding)
    lstm = debug_shape(lstm, "Shape of LSTM output before RepeatVector:")

    # Expand LSTM output
    lstm = RepeatVector(max_sequence_length)(lstm)
    lstm = debug_shape(lstm, "Shape of LSTM output after RepeatVector:")

    additional_input = Input(shape=(num_additional_features,), name="Additional_Features")
    additional_input = debug_shape(additional_input, "Shape of additional input:")

    # Expand and tile additional features to match sequence dimensions
    additional_input_expanded = Dense(256, activation="relu")(additional_input)
    additional_input_expanded = debug_shape(additional_input_expanded, "Shape after Dense layer for additional input:")
    additional_input_expanded = RepeatVector(max_sequence_length)(additional_input_expanded)
    additional_input_expanded = debug_shape(additional_input_expanded, "Shape after RepeatVector:")

    # Combine LSTM output and expanded features
    combined = Concatenate(name="Feature_Concatenation")([lstm, additional_input_expanded])
    combined = debug_shape(combined, "Shape after concatenation:")

    # Fully Connected Layers
    x = Dense(128, activation="relu", name="Dense_Layer_1")(combined)
    x = BatchNormalization(name="Batch_Norm_1")(x)
    x = Dropout(0.3, name="Dropout_1")(x)
    x = Dense(64, activation="relu", name="Dense_Layer_2")(x)
    x = BatchNormalization(name="Batch_Norm_2")(x)
    x = Dropout(0.3, name="Dropout_2")(x)

    output = Dense(vocab_size, activation="softmax", name="Output_Layer")(x)

    model = Model(inputs=[text_input, additional_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model


# Build and summarize the model
vocab_size = 10000
max_sequence_length = 50
num_additional_features = X_features.shape[1]

model = build_model(vocab_size, max_sequence_length, num_additional_features)
model.summary()




In [13]:
print("Shape of padded_sequences_train:", padded_sequences_train.shape)
print("Shape of X_train_features:", X_train_features.shape)



Shape of padded_sequences_train: (225342, 50)
Shape of X_train_features: (225342, 4)


In [14]:
print("Shape of y_train_padded:", y_train_padded.shape)


Shape of y_train_padded: (225342, 50)


In [15]:
print("Shape of padded_sequences_train:", padded_sequences_train.shape)
print("Shape of X_train_features:", X_train_features.shape)
print("Shape of y_train_padded:", y_train_padded.shape)
print("Model Summary:")
model.summary()


Shape of padded_sequences_train: (225342, 50)
Shape of X_train_features: (225342, 4)
Shape of y_train_padded: (225342, 50)
Model Summary:


In [16]:
y_train_padded = tf.expand_dims(y_train_padded, axis=-1)
y_val_padded = tf.expand_dims(y_val_padded, axis=-1)


In [17]:
from tensorflow.keras.layers import Reshape

def build_model(vocab_size, max_sequence_length, num_additional_features):
    text_input = Input(shape=(max_sequence_length,), name="Text_Input")
    embedding = Embedding(input_dim=vocab_size, output_dim=128)(text_input)
    lstm = LSTM(256, return_sequences=True, name="LSTM_Layer")(embedding)

    additional_input = Input(shape=(num_additional_features,), name="Additional_Features")
    additional_input_expanded = Dense(256, activation="relu")(additional_input)
    additional_input_expanded = RepeatVector(max_sequence_length)(additional_input_expanded)

    # Combine LSTM output and expanded features
    combined = Concatenate(name="Feature_Concatenation")([lstm, additional_input_expanded])

    # Fully Connected Layers
    x = Dense(128, activation="relu", name="Dense_Layer_1")(combined)
    x = BatchNormalization(name="Batch_Norm_1")(x)
    x = Dropout(0.3, name="Dropout_1")(x)
    x = Dense(64, activation="relu", name="Dense_Layer_2")(x)
    x = BatchNormalization(name="Batch_Norm_2")(x)
    x = Dropout(0.3, name="Dropout_2")(x)

    output = Dense(vocab_size, activation="softmax", name="Output_Layer")(x)
    output = Reshape((max_sequence_length * vocab_size,), name="Reshaped_Output")(output)

    model = Model(inputs=[text_input, additional_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

# Build and summarize the model
vocab_size = 10000
max_sequence_length = 50
num_additional_features = X_features.shape[1]

model = build_model(vocab_size, max_sequence_length, num_additional_features)
model.summary()

In [18]:
# Flatten target sequences
y_train_flattened = y_train_padded.flatten()
y_val_flattened = y_val_padded.flatten()
y_test_flattened = y_test_padded.flatten()

print("Shape of y_train_flattened:", y_train_flattened.shape)


AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'flatten'

In [64]:
print("Shape of y_train_padded:", y_train_padded.shape)  # Should match (225342, 50)
print("Shape of padded_sequences_train:", padded_sequences_train.shape)  # Should match (225342, 50)
print("Shape of X_train_features:", X_train_features.shape)  # Should match (225342, 4)


Shape of y_train_padded: (225342, 50)
Shape of padded_sequences_train: (225342, 50)
Shape of X_train_features: (225342, 4)


In [74]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the target sequences to match model output shape
y_train_one_hot = to_categorical(y_train_padded, num_classes=vocab_size)
y_val_one_hot = to_categorical(y_val_padded, num_classes=vocab_size)
y_test_one_hot = to_categorical(y_test_padded, num_classes=vocab_size)

print("Shape of y_train_one_hot:", y_train_one_hot.shape)  # Should be (batch_size, sequence_length, vocab_size)
print("Shape of y_val_one_hot:", y_val_one_hot.shape)
print("Shape of y_test_one_hot:", y_test_one_hot.shape)


MemoryError: Unable to allocate 839. GiB for an array with shape (11267100, 10000) and data type float64

In [73]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf

# Define callbacks
checkpoint_path = "best_model.h5"
callbacks = [
    ModelCheckpoint(checkpoint_path, monitor="val_loss", save_best_only=True, verbose=1),
    EarlyStopping(monitor="val_loss", patience=5, verbose=1, restore_best_weights=True),
]

# Train the model
history = model.fit(
    [padded_sequences_train, X_train_features],  # Features
    y_train_one_hot,                             # One-hot encoded targets
    validation_data=([padded_sequences_val, X_val_features], y_val_one_hot),
    batch_size=32,                               # Adjust batch size if necessary
    epochs=20,
    callbacks=callbacks
)


NameError: name 'y_train_one_hot' is not defined

In [19]:
import pandas as pd

# Load the cleaned dataset
cleaned_data_path = "data/cleaned_commentary_data.csv"
df_cleaned = pd.read_csv(cleaned_data_path)
print("Cleaned data loaded successfully!")

# Display the first few rows to confirm
print(df_cleaned.head())


  df_cleaned = pd.read_csv(cleaned_data_path)


Cleaned data loaded successfully!
  PlayType_description  Batting_Team_id Batting_Team_name Bowling_Team_id  \
0                  run                2         Australia               7   
1               no run                2         Australia               7   
2                 four                2         Australia               7   
3              leg bye                2         Australia               7   
4               no run                2         Australia               7   

  Bowling_Team_name      Innings  Total_Runs_on_delivery  \
0          Pakistan  1st innings                       1   
1          Pakistan  1st innings                       0   
2          Pakistan  1st innings                       4   
3          Pakistan  1st innings                       1   
4          Pakistan  1st innings                       0   

                                          Commentary  \
0  on the pads to start from Amir, no swing, work...   
1  drifts down leg this time, 