In [166]:
# src = "https://cricsheet.org/"
import json
import os
import polars as pl
root = "/home/tommy/Desktop/tbs-dev/cric-search/science/data"

In [167]:
jmb = "JM Bairstow"
btf = "BT Foakes"

In [168]:
from typing import TypedDict

class Extras(TypedDict):
    wides: int | None
    legbyes: int | None
    byes: int | None
    noballs: int | None
    penalty: int | None

class Runs(TypedDict):
    batter: int
    extras: int
    total: int

class Delivery(TypedDict):
    batter: str
    bowler: str
    extras: Extras | None
    non_striker: str
    runs: Runs

class Over(TypedDict):
    over: int
    deliveries: list[Delivery]

class Inning(TypedDict):
    team: str
    overs: list[Over]

In [169]:
from dataclasses import dataclass

@dataclass
class Data:
    byes: int
    catches: int
    net: int
    runs: int
    stumpings: int

    @classmethod
    def new(cls) -> "Data":
        return cls(0, 0, 0, 0, 0)

    def __str__(self) -> str:
        return f"Runs: {self.runs}, Byes: {self.byes}, Net: {self.net}, Catches: {self.catches}, Stumpings: {self.stumpings}"
    
    def __repr__(self) -> str:
        return self.__str__()
    
    def __add__(self, other: "Data") -> "Data":
        return Data(
            self.byes + other.byes,
            self.catches + other.catches,
            self.net + other.net,
            self.runs + other.runs,
            self.stumpings + other.stumpings,
        )
    
    def __sub__(self, other: "Data") -> "Data":
        return Data(
            self.byes - other.byes,
            self.catches - other.catches,
            self.net - other.net,
            self.runs - other.runs,
            self.stumpings - other.stumpings,
        )
    
    def efficiency(self) -> float | None:
        ''' Only none when there is no net contribution, e.g. 0 runs scored and 0 byes conceded '''
        denom = self.runs + self.byes
        if denom == 0:
            return None
        return self.runs / denom
    
    def dismissals(self) -> int:
        return self.catches + self.stumpings
    

@dataclass
class Player:
    name: str
    matches: list[Data]
    total: Data

    def dismissal_weighted_average(self) -> float:
        ''' The summed net contribution weighted by the dismissals across all matches '''
        return sum([match.dismissals() * match.net for match in self.matches]) / sum([match.dismissals() for match in self.matches])

    def dismissals_per_match(self) -> float:
        return self.total.dismissals() / len(self.matches)
    
    def runs_per_match(self) -> float:
        return self.total.runs / len(self.matches)
    
    def byes_per_match(self) -> float:
        return self.total.byes / len(self.matches)

    def to_df(self) -> pl.DataFrame:
        x: list[int] = []
        runs: list[int] = []
        byes: list[int] = []
        net: list[int] = []
        efficiency: list[float] = []
        for i, match in enumerate(self.matches):
            x.append(i)
            runs.append(match.runs)
            byes.append(match.byes)
            net.append(match.net)
            efficiency.append(match.efficiency())
        return pl.DataFrame({
            "test #": x,
            "runs": runs,
            "byes": byes,
            "net": net,
            "efficiency": efficiency
        })

@dataclass
class Match:
    innings: list[Inning]

In [170]:
df: pl.DataFrame = pl.DataFrame()
jmb_matches: list[Match] = []
btf_matches: list[Match] = []
for subdir, dirs, files in os.walk(f"{root}/tests_json"):
    for file in files:
        try:
            with open(f"{subdir}/{file}", "r") as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            print(data)
            print(e)
            continue
        players = data["info"]["players"]
        if "England" not in players:
            continue
        english_players = players["England"]
        jmb_playing = jmb in english_players
        btf_playing = btf in english_players
        if not jmb_playing and not btf_playing:
            continue
        innings: list[Inning] = data["innings"]
        # If Foakes is playing he's keeping, otherwise Bairstow is
        if btf_playing:
            btf_matches.append(Match(innings))
        elif jmb_playing:
            jmb_matches.append(Match(innings))
        else:
            raise Exception("Unreachable")
        

{'meta': {'data_version': '1.1.0', 'created': '2022-12-29', 'revision': 1}, 'info': {'balls_per_over': 6, 'city': 'Melbourne', 'dates': ['2022-12-26', '2022-12-27', '2022-12-28', '2022-12-29'], 'event': {'name': 'South Africa tour of Australia', 'match_number': 2}, 'gender': 'male', 'match_type': 'Test', 'match_type_number': 2485, 'officials': {'match_referees': ['RB Richardson'], 'reserve_umpires': ['PJ Gillespie'], 'tv_umpires': ['CB Gaffaney'], 'umpires': ['PR Reiffel', 'RA Kettleborough']}, 'outcome': {'winner': 'Australia', 'by': {'innings': 1, 'runs': 182}}, 'player_of_match': ['DA Warner'], 'players': {'South Africa': ['D Elgar', 'SJ Erwee', 'TB de Bruyn', 'T Bavuma', 'K Zondo', 'K Verreynne', 'M Jansen', 'KA Maharaj', 'K Rabada', 'A Nortje', 'L Ngidi'], 'Australia': ['DA Warner', 'UT Khawaja', 'M Labuschagne', 'SPD Smith', 'TM Head', 'C Green', 'AT Carey', 'PJ Cummins', 'NM Lyon', 'MA Starc', 'SM Boland']}, 'registry': {'people': {'A Nortje': 'acdc62f5', 'AT Carey': '69d03465',

In [171]:
def calculate_data(matches: list[Match], player: str) -> Player:
    matches_data: list[Data] = []
    total = Data.new()
    for match in matches:
        match_data = Data.new()
        for inning in match.innings:
            is_fielding = inning["team"] != "England"
            for over in inning["overs"]:
                for delivery in over["deliveries"]:
                    if delivery["batter"] == player:
                        match_data.runs += delivery["runs"]["batter"]
                        match_data.net += delivery["runs"]["batter"]
                    elif is_fielding:
                        if (wickets := delivery.get("wickets")) is not None:
                            for wicket in wickets:
                                if (fielders := wicket.get("fielders")) is not None:
                                    if player in [fielder["name"] for fielder in fielders]:
                                        if wicket["kind"] == "caught":
                                            match_data.catches += 1
                                        if wicket["kind"] == "stumped":
                                            match_data.stumpings += 1
                        if (extras := delivery.get("extras")) is not None:
                            if (byes := extras.get("byes")) is not None:
                                match_data.byes += byes
                                match_data.net -= byes

        total += match_data
        matches_data.append(match_data)
        
    return Player(player, matches_data, total)

In [172]:
jmb_data = calculate_data(jmb_matches, jmb)
jmb_data.total

Runs: 4675, Byes: 575, Net: 4100, Catches: 212, Stumpings: 13

In [173]:
btf_data = calculate_data(btf_matches, btf)
btf_data.total

Runs: 934, Byes: 125, Net: 809, Catches: 57, Stumpings: 6

In [174]:
import plotly.express as px

joined = (
    jmb_data
        .to_df()
        .join(
            btf_data.to_df(),
            on="test #",
            how="outer",
        )
        .rename({
            "runs": f"{jmb} runs",
            "byes": f"{jmb} byes",
            "net": f"{jmb} net",
            "efficiency": f"{jmb} efficiency",
            "runs_right": f"{btf} runs",
            "byes_right": f"{btf} byes",
            "net_right": f"{btf} net",
            "efficiency_right": f"{btf} efficiency",
        })
)

fig = px.line(
    data_frame=joined.to_pandas(),
    x="test #",
    y=[f"{jmb} efficiency", f"{btf} efficiency"],
    title=f"{jmb} vs {btf} efficiency (JMB Fitness: {jmb_data.dismissal_weighted_average():.2f} | BTF Fitness: {btf_data.dismissal_weighted_average():.2f})",
    line_shape="spline",
)
fig.update_layout(
    font=dict(
        family="Lato, sans-serif",
        size=24,
        color="#7f7f7f"
    )
)

In [175]:
print(
    jmb_data.dismissals_per_match(),
    btf_data.dismissals_per_match(),
)
print(
    jmb_data.runs_per_match(),
    btf_data.runs_per_match(),
)
print(
    jmb_data.byes_per_match(),
    btf_data.byes_per_match(),
)

2.8125 3.15
58.4375 46.7
7.1875 6.25
