<a href="https://colab.research.google.com/github/udaydaroch/InternationalT20GameOutcomePredictor/blob/main/MatchPredictionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import requests
import zipfile
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.tree import export_graphviz
import graphviz

# URL of the zip file
url = 'https://cricsheet.org/downloads/t20s_json.zip'
zip_path = 't20s_json.zip'
extract_path = 't20s_json/'

# Download the zip file.
# - timeout: without one, requests can block forever on a stalled connection.
# - raise_for_status: fail here on an HTTP error instead of writing an error
#   page to disk and hitting a confusing BadZipFile further down.
# - stream + iter_content: write the archive in chunks rather than holding the
#   whole download in memory.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# Unzip the downloaded file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Function to load a single JSON file and extract match-level data
def load_json(file_path):
    """Load a single match JSON file and extract match-level data.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A dict with the keys ``team1``, ``team2``, ``toss_winner``,
        ``winner``, ``gender`` and ``venue``, or ``None`` when the match's
        ``outcome`` entry is absent or has no ``winner`` key.

    Raises:
        KeyError: If ``info``, ``toss``/``winner``, ``teams``, ``gender`` or
            ``venue`` is missing from a match that has a winner.
        ValueError: If ``teams`` does not hold exactly two entries.
    """
    # JSON text is UTF-8; without an explicit encoding, open() falls back to
    # the platform locale and can mis-decode (or fail on) non-ASCII team and
    # venue names.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    match_info = data['info']
    outcome = match_info.get('outcome', {})

    if 'winner' not in outcome:
        return None  # Skip matches with no outcome recorded

    toss_winner = match_info['toss']['winner']
    team1, team2 = match_info['teams']

    return {
        'team1': team1,
        'team2': team2,
        'toss_winner': toss_winner,
        'winner': outcome['winner'],
        'gender': match_info['gender'],
        'venue': match_info['venue'],
    }

# List all JSON files in the extracted folder.
# os.listdir() returns entries in arbitrary, platform-dependent order; sort
# them so the DataFrame rows (and therefore the seeded train/test split
# further down) are the same on every run and every machine.
file_paths = sorted(
    os.path.join(extract_path, file)
    for file in os.listdir(extract_path)
    if file.endswith('.json')
)

# Load data from all JSON files; load_json returns None for matches without a
# recorded winner, and those are skipped.
all_matches_data = [
    match_data
    for match_data in map(load_json, file_paths)
    if match_data
]

# Fail with a clear message instead of an obscure KeyError during encoding
# when nothing usable was extracted.
if not all_matches_data:
    raise ValueError(f"No usable match files found in {extract_path!r}")

# Convert list of dictionaries to a DataFrame
df = pd.DataFrame(all_matches_data)

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df.dropna(inplace=True)

# One-hot encode every categorical input column
categorical_columns = ['team1', 'team2', 'toss_winner', 'gender', 'venue']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Separate the label column from the model inputs
target = df['winner']
features = df_encoded.drop(columns=['winner'])

# Map winner names to integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(target)
target_encoded = label_encoder.transform(target)

# Sanity check: both must have the same number of rows
print(f"Features shape: {features.shape}")
print(f"Target shape: {target_encoded.shape}")

# Hold out 20% of the matches for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_encoded,
    test_size=0.2,
    random_state=42,
)

# Fit the random forest on the training portion
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Persist the fitted model to disk
model_path = 'random_forest_model.joblib'
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

# Predict
y_pred = model.predict(X_test)

# Evaluate the model.
# Per-class precision/recall/F1 are undefined for classes that never appear in
# the predictions or in the test labels. Score those cases as 0 for all three
# metrics: using zero_division=1 for precision only would credit
# never-predicted classes with perfect precision, and leaving recall/F1 at the
# default emits an undefined-metric warning.
metric_options = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **metric_options)
recall = recall_score(y_test, y_pred, **metric_options)
f1 = f1_score(y_test, y_pred, **metric_options)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Save the combined DataFrame to a single JSON Lines file
output_path = 'combined_matches.jsonl'
df.to_json(output_path, orient='records', lines=True)
print(f"Combined data saved to {output_path}")

# Example: Predict a new match
new_match = {
    'team1': 'India',
    'team2': 'Pakistan',
    'toss_winner': 'India',
    'gender': 'male',
    'venue': 'Dubai International Cricket Stadium'
}
new_match_df = pd.DataFrame([new_match])
# Align the one-hot columns with the training features; categories the model
# never saw simply end up as all-zero columns.
new_match_encoded = pd.get_dummies(new_match_df).reindex(columns=features.columns, fill_value=0)

# Predict probability
probability = model.predict_proba(new_match_encoded)
class_probabilities = probability[0]

# Team name for each column of predict_proba (columns follow model.classes_,
# which holds the encoded labels seen during training).
class_teams = label_encoder.inverse_transform(model.classes_)

# The classifier chooses among every team in the data set, so its top class is
# not guaranteed to be one of the two sides playing. Restrict the choice to
# the participants; fall back to the overall top class only when the model
# assigns neither of them any probability.
participants = {new_match['team1'], new_match['team2']}
candidate_indices = [
    index
    for index, team in enumerate(class_teams)
    if team in participants and class_probabilities[index] > 0
]
if not candidate_indices:
    candidate_indices = range(len(class_teams))
best_index = max(candidate_indices, key=lambda index: class_probabilities[index])

predicted_winner = label_encoder.inverse_transform([model.classes_[best_index]])
# Raw model probability of the chosen team (not renormalised over the two sides).
predicted_probability = class_probabilities[best_index]

print(f'Predicted winner: {predicted_winner[0]} with probability {predicted_probability*100:.2f}%')

# Visualize one of the trees in the RandomForest
estimator = model.estimators_[0]
feature_names = features.columns

# Class names must line up with the classes the forest was trained on.
# model.classes_ can be a subset of label_encoder.classes_ (a team whose wins
# all landed in the test split), in which case passing label_encoder.classes_
# directly would label tree nodes with the wrong teams.
class_names = label_encoder.inverse_transform(model.classes_).tolist()

# Export as dot file
dot_file = 'tree.dot'
export_graphviz(estimator, out_file=dot_file, feature_names=feature_names, class_names=class_names, rounded=True, proportion=False, precision=2, filled=True)

# Load the dot source once and reuse it for both rendering and display
graph = graphviz.Source.from_file(dot_file)

# Convert to png (requires the Graphviz executables); writes 'tree.png' and
# removes the intermediate source copy.
graph.render('tree', format='png', cleanup=True)

# Display in jupyter notebook: leaving the object as the cell's last
# expression renders it inline (view() would try to launch an external viewer).
graph


team1          0
team2          0
toss_winner    0
winner         0
gender         0
venue          0
dtype: int64
Features shape: (3450, 695)
Target shape: (3450,)
Model saved to random_forest_model.joblib
Accuracy: 0.6391304347826087
Precision: 0.6723437071099034
Recall: 0.6391304347826087
F1 Score: 0.6338456696662924
Combined data saved to combined_matches.jsonl
Predicted winner: India with probability 68.50%


  _warn_prf(average, modifier, msg_start, len(result))


'tree.dot.pdf'