Cricket Test Match Prediction
==============

**Author:** *Tom*

This is a personal project to build a model that predicts the outcome of Test matches in cricket using a random forest (an ensemble of decision trees).

In [None]:
# import required libraries
import requests
import sys
sys.path.append('../')    

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.inspection import permutation_importance

from scipy.stats import randint

import matplotlib as plt

import seaborn as sns

from bs4 import BeautifulSoup

import pandas as pd

import re

import numpy as np

Chapter 1: Data Scraping and Cleaning
---
Data scraping, cleaning, and calculation of metrics to assess team performance.

In [None]:
def CleanDate(date_str):
    """Reduce a match date range to its start date, formatted ``'<Mon> <day> <year>'``.

    Supported inputs (examples):
      * ``'Jan 3-7, 2021'``            -> ``'Jan 3 2021'``
      * ``'Jul 30-Aug 3, 2015'``       -> ``'Jul 30 2015'``
      * ``'Dec 31, 2022-Jan 4, 2023'`` -> ``'Dec 31 2022'`` (start has its own year)
      * ``'Jan 3, 2021'``              -> ``'Jan 3 2021'`` (single day, no range)

    Raises ``IndexError`` when neither the start nor the end part carries a
    ``', <year>'`` suffix.
    """
    parts = date_str.split('-')
    start = parts[0].strip()

    # When the start already includes its year (year-crossing match or a
    # single-day entry), appending the end year would give a malformed date
    # such as 'Dec 31, 2022 2023'.
    if ', ' in start:
        return start.replace(', ', ' ')

    # Otherwise the year is only written once, after the end day.
    year = parts[1].split(', ')[1].strip()
    return start + ' ' + year

def get_boundary_runs(df):
    """Return the total runs represented by the '4s' and '6s' columns of *df*.

    Every column is first coerced to numbers (unparseable cells become NaN and
    are ignored by the sums). The '6s' counts are weighted by 6 and the '4s'
    counts by 4, then everything in the frame is added up. The caller's frame
    is left untouched because the coercion produces a new frame.
    """
    numeric = df.apply(pd.to_numeric, errors='coerce')

    # Convert boundary counts into the runs they are worth.
    for column, runs_per_hit in (('6s', 6), ('4s', 4)):
        numeric[column] = numeric[column] * runs_per_hit

    column_totals = numeric.sum()
    return column_totals.sum()


Functions to scrape specific match stats, handling exceptions and then combining into a single set of stats for later analysis.

In [None]:
def _first_number(text, pattern):
    """Return the first capture group of *pattern* in *text* as a number.

    Used instead of ``eval`` on scraped text: a token containing a decimal
    point becomes a ``float``, any other token an ``int``. Raises
    ``AttributeError`` when the pattern does not match.
    """
    token = re.search(pattern, text).group(1)
    return float(token) if '.' in token else int(token)


def get_match_stats(link):
    """Scrape one scorecard page and total each team's batting and bowling figures.

    Parameters
    ----------
    link : str
        URL of a scorecard page.

    Returns
    -------
    dict or None
        ``{'Test # <number>': {'<team>_batting': {...}, '<team>_bowling': {...}}}``
        where each inner dict holds ``runs``, ``overs``, ``wickets`` and
        ``boundaries`` summed over that team's innings. ``None`` is returned
        (after printing the reason) when the page cannot be parsed into that
        shape. Errors from the HTTP request itself are not caught.
    """
    from io import StringIO

    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')

    teams = soup.find_all('span', class_='ds-text-title-xs ds-font-bold ds-capitalize')
    info_spans = soup.find_all('span', class_='ds-text-tight-s ds-font-medium ds-text-typo ds-underline ds-decoration-ui-stroke hover:ds-text-typo-primary hover:ds-decoration-ui-stroke-primary ds-block')

    # The Test number is the last word of the span mentioning 'Test no.'.
    test_number = None
    for span in info_spans:
        if 'Test no.' in span.text:
            test_number = span.text.split()[-1]

    # One entry per innings, in the order the innings appear on the page.
    # Built outside the loop above so it exists even when no info span is found.
    batting_order = [t.text for t in teams]

    try:
        if test_number is None:
            raise ValueError("no 'Test no.' entry found on the page")

        # The bowling side of each innings is the other of the two teams.
        bowling_order = [batting_order[0] if t == batting_order[1] else batting_order[1] for t in batting_order]

        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        match_data = pd.read_html(StringIO(response.text))

        # Batting tables are recognised by their 'SR' column, bowling tables by 'ECON'.
        scorecards = [table for table in match_data if 'SR' in table.columns]
        batting_stats = [table.loc[table['Batting'] == 'TOTAL'] for table in scorecards]
        boundary_stats = [table.iloc[:-1] for table in scorecards]
        bowling_stats = [table for table in match_data if 'ECON' in table.columns]

        # The TOTAL row is rendered to text and the first number after
        # whitespace is taken, exactly as before but without eval().
        # NOTE(review): overs are summed as plain decimals; if the page uses
        # cricket notation ('.4' = four balls) this is approximate - confirm.
        runs = [_first_number(str(stats['R']), r'\s(\d+)') for stats in batting_stats]
        overs = [_first_number(str(stats['Unnamed: 1']), r'\s(\d+(\.\d+)?)') for stats in batting_stats]
        boundaries = [get_boundary_runs(stats[['4s', '6s']]) for stats in boundary_stats]
        wickets = [stats[pd.to_numeric(stats['W'], errors='coerce').notna()]['W'].astype('int64').sum() for stats in bowling_stats]

        stat_names = ('runs', 'overs', 'wickets', 'boundaries')
        team_names = set(batting_order)
        data = {
            f'{team}_{role}': dict.fromkeys(stat_names, 0)
            for role in ('bowling', 'batting')
            for team in team_names
        }

        # Credit every innings to the batting side's batting totals and to the
        # bowling side's bowling totals. Keys are built exactly rather than
        # matched by substring, so one team name contained in another cannot
        # be double counted.
        for i, (batting_team, bowling_team) in enumerate(zip(batting_order, bowling_order)):
            innings = {
                'runs': runs[i],
                'overs': overs[i],
                'wickets': wickets[i],
                'boundaries': boundaries[i],
            }
            for side in (f'{batting_team}_batting', f'{bowling_team}_bowling'):
                for stat, value in innings.items():
                    data[side][stat] += value

        return {'Test # ' + test_number: data}

    except Exception as exc:
        # Best effort: unparseable pages are skipped by the caller. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # stop a long scraping run.
        print(f"Could not parse scorecard {link}: {exc!r}")
        return None

Takes the output of the above function and derives 7 statistics to rate team performance:
- Batting Stats
    1. Run Rate
        - Runs scored per over
    2. Batting Average
        - Runs scored per wicket
    3. Boundary Percentage
        - Percentage of runs scored through boundaries - i.e. fours and sixes
    4. Strike Rate
        - Runs scored per 100 balls
- Bowling Stats

    5. Bowling Average
        - Runs conceded per wicket taken
    6. Economy Rate
        - Runs conceded per over
    7. Balls per Wicket
        - Number of balls bowled to take a wicket


In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def get_evaluation_metrics(match_stats):
    """Derive the seven per-team performance metrics for one match.

    Parameters
    ----------
    match_stats : dict
        Maps ``'<team>_batting'`` / ``'<team>_bowling'`` to a dict with the
        keys ``runs``, ``overs``, ``wickets`` and (for batting) ``boundaries``.

    Returns
    -------
    dict
        ``{team: {'RR', 'batting_average', 'BP', 'SR', 'bowling_average',
        'ER', 'balls_per_wicket'}}``. A metric stays ``None`` when the
        matching batting/bowling entry is absent. A metric whose denominator
        is zero (e.g. no wickets taken) is NaN; previously this raised
        ``ZeroDivisionError`` for Python numbers and produced ``inf`` with a
        RuntimeWarning for NumPy numbers.
    """
    teams = {key.split('_')[0] for key in match_stats}
    team_metrics = {
        team: {
            'RR': None,
            'batting_average': None,
            'BP': None,
            'SR': None,
            'bowling_average': None,
            'ER': None,
            'balls_per_wicket': None,
        }
        for team in teams
    }

    for key, stats in match_stats.items():
        # The team is the text before the first underscore; matching it
        # exactly avoids updating a second team whose name happens to be a
        # substring of the key.
        team, _, role = key.partition('_')
        is_bowling = 'bowling' in role
        if not is_bowling and 'batting' not in role:
            continue

        metrics = team_metrics[team]
        runs = stats['runs']
        overs = stats['overs']
        wickets = stats['wickets']
        balls = overs * 6

        if is_bowling:
            metrics['bowling_average'] = _safe_ratio(runs, wickets)
            metrics['ER'] = _safe_ratio(runs, overs)
            metrics['balls_per_wicket'] = _safe_ratio(balls, wickets)
        else:
            metrics['RR'] = _safe_ratio(runs, overs)
            metrics['batting_average'] = _safe_ratio(runs, wickets)
            metrics['BP'] = _safe_ratio(stats['boundaries'], runs)
            metrics['SR'] = _safe_ratio(runs, balls) * 100

    return team_metrics


Data Scraping
===

Scrape data from the ESPNcricinfo Test match records.

URL: https://www.espncricinfo.com/records/format/test-matches-1

In [None]:
# Decade index pages (2020s and 2010s) of Test match results by year.
years_url = [
    'https://www.espncricinfo.com/records/decade/team-match-results-year/2020s-202/test-matches-1',
    'https://www.espncricinfo.com/records/decade/team-match-results-year/2010s-201/test-matches-1',
]

# Download each decade page; the responses are parsed in the following cell.
data = list(map(requests.get, years_url))

In [None]:
# Parse every downloaded decade page.
soup = [BeautifulSoup(page.text, 'html.parser') for page in data]

# Gather the href of every anchor carrying the link class, across all pages.
links = [
    anchor.get('href')
    for parsed_page in soup
    for anchor in parsed_page.find_all('a', class_='ds-inline-flex ds-items-start ds-leading-none')
]

# Keep only the links that point at a records page.
links = [href for href in links if 'records' in href]

# The hrefs are site-relative, so prefix the host to get full URLs.
year_urls = [f'https://www.espncricinfo.com{href}' for href in links]

In [None]:
# Download and parse every per-year results page.
data = list(map(requests.get, year_urls))
soup = [BeautifulSoup(page.text, 'html.parser') for page in data]

# Gather the href of every anchor carrying the link class, across all pages.
links = [
    anchor.get('href')
    for parsed_page in soup
    for anchor in parsed_page.find_all('a', class_='ds-inline-flex ds-items-start ds-leading-none')
]

# Keep only the links that lead to a match scorecard.
links = [href for href in links if 'full-scorecard' in href]

# The hrefs are site-relative, so prefix the host to get full URLs.
match_urls = [f'https://www.espncricinfo.com{href}' for href in links]

Clean Data
===
Cleaning the data and renaming columns for easier analysis. Codes are assigned to grounds and oppositions. A target is assigned with 1 for a win, 0 for a draw and -1 for a loss. An important point to note is that each match appears twice (once from each side's perspective), and only one side's current performance is analysed in each row; addressing this is a possible improvement to the model.

In [None]:
from io import StringIO

# First table of every per-year results page, stacked into one frame.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
matches = [pd.read_html(StringIO(requests.get(yr).text))[0] for yr in year_urls]
matches = pd.concat(matches)
matches = matches.drop(['Margin'], axis=1)

# Placeholder columns for the per-team metrics filled in later.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
matches[['RR', 'batting_average', 'BP', 'SR', 'bowling_average', 'ER', 'balls_per_wicket']] = np.nan

# Duplicate every match so it appears once from each side's point of view:
# matches1 keeps 'Team 1' as the team, matches2 swaps the two sides.
columns = ['Team', 'Opposition', 'Winner', 'Ground', 'Match Date']
matches1 = matches.rename(columns={'Team 1' : 'Team', 'Team 2': 'Opposition'})
matches2 = matches.rename(columns={'Team 2' : 'Team', 'Team 1': 'Opposition'})
matches2 = matches2[columns + [c for c in matches2.columns if c not in columns]]
matches = pd.concat([matches1, matches2], ignore_index=True)

# Reduce the date range to its start date, order chronologically and keep
# matches from 2015 onwards.
matches['Match Date'] = pd.to_datetime(matches['Match Date'].apply(CleanDate))
matches = matches.sort_values('Match Date')
matches = matches[matches['Match Date'] >= '2015-01-01']

# Integer encodings of the categorical features.
matches['ground_code'] = matches['Ground'].astype('category').cat.codes
matches['day_code'] = matches['Match Date'].dt.dayofweek
matches['opp_code'] = matches['Opposition'].astype('category').cat.codes

# Target: 1 when 'Team' won, -1 when 'Opposition' won, 0 otherwise.
matches['target'] = matches.apply(
    lambda row: 1 if row['Team'] == row['Winner']
                else -1 if row['Opposition'] == row['Winner']
                else 0,
    axis=1,
)

The individual match data is scraped; where exceptions are raised, the match is reported and skipped.

In [None]:
# Scrape every scorecard, collecting the per-match totals keyed by Test label.
match_data = {}
for url in match_urls:
    print(url)
    match_stats = get_match_stats(url)

    # get_match_stats returns None for pages it could not parse; report and skip.
    if match_stats is None:
        print(f"No data returned for URL: {url}")
        continue

    match_data.update(match_stats)

https://www.espncricinfo.com/series/sri-lanka-in-sa-2020-21-1237354/south-africa-vs-sri-lanka-1st-test-1237356/full-scorecard
https://www.espncricinfo.com/series/india-in-australia-2020-21-1223867/australia-vs-india-2nd-test-1223870/full-scorecard
https://www.espncricinfo.com/series/new-zealand-v-pakistan-2020-21-1233950/new-zealand-vs-pakistan-1st-test-1233962/full-scorecard
https://www.espncricinfo.com/series/india-in-australia-2020-21-1223867/australia-vs-india-1st-test-1223869/full-scorecard
https://www.espncricinfo.com/series/west-indies-in-nz-2020-21-1233943/new-zealand-vs-west-indies-2nd-test-1233958/full-scorecard
https://www.espncricinfo.com/series/west-indies-in-nz-2020-21-1233943/new-zealand-vs-west-indies-1st-test-1233957/full-scorecard
https://www.espncricinfo.com/series/england-v-pakistan-2020-1198227/england-vs-pakistan-3rd-test-1198243/full-scorecard
https://www.espncricinfo.com/series/england-v-pakistan-2020-1198227/england-vs-pakistan-2nd-test-1198242/full-scorecard
h

Compute the above-defined match statistics for each team.

In [None]:
# Derive the per-team evaluation metrics for every scraped match,
# keeping the same Test label as the key.
matches_stats = {
    test_label: get_evaluation_metrics(team_totals)
    for test_label, team_totals in match_data.items()
}


  team_metrics[t]['bowling_average'] = stats['runs'] / stats['wickets']
  team_metrics[t]['balls_per_wicket'] = stats['overs'] * 6 / stats['wickets']
  team_metrics[t]['batting_average'] = stats['runs'] / stats['wickets']


In [None]:
# Copy each team's metrics into the row of `matches` for that Test and team.
for test, match_stats in matches_stats.items():
    # Rows belonging to this Test; only metric columns are written below,
    # so this selection stays valid for the whole iteration.
    same_test = matches['Scorecard'] == test
    for team, team_stats in match_stats.items():
        rows = same_test & (matches['Team'] == team)
        for stat, stat_value in team_stats.items():
            matches.loc[rows, stat] = stat_value

Save the dataframe for future analysis as the above steps take ~ 15 minutes to run

In [None]:
# Persist the assembled frame so the scraping steps need not be re-run.
output_path = 'data/matches.pkl'
matches.to_pickle(output_path)