In [2]:
from collections import namedtuple
from functools import partial
from itertools import product
import pandas as pd
from toolz import groupby, pipe
from typing import Iterator, List, Text, Tuple

In [3]:
def process_first_innings_df(path='ODIs/win_probability_1st_innings.csv'):
    """Load and clean the first-innings win-probability table.

    Processing steps:
      1. Keep only rows where ``Ball == 6``.
      2. Recompute the runs as ``overs_balls * RunRate`` (truncated to int)
         and keep only rows whose ``Runs`` column agrees with that value.
      3. Sort by ``(Over, Wickets, Runs)``.
      4. For repeated ``(Over, Wickets, Runs)`` combinations keep the row(s)
         with the highest ``win_probability`` (exact ties are all kept).

    Args:
        path: Location of the CSV file. Defaults to the path the notebook
            has always read from.

    Returns:
        A DataFrame with the columns ``Over, Wickets, Runs, win_probability,
        score, score_lo, score_hi`` plus the helper column ``max_repeat``.
    """
    raw = pd.read_csv(path)
    # we only want the elements at the end of the over; .copy() makes the
    # column assignments below operate on an independent frame rather than
    # on a slice of `raw` (which triggers SettingWithCopyWarning).
    df = raw[raw.Ball == 6].copy()
    # there seems to be some spurious data in the Runs column - Cleaning that up
    # NOTE(review): astype(int) truncates. If RunRate is stored rounded, the
    # product can land just below the true integer and a valid row would be
    # dropped by the equality filter below - confirm against the data.
    df['Runs1'] = (df['overs_balls'] * df['RunRate']).astype(int)
    df = df[['Over', 'Wickets', 'Runs1', 'Runs',
             'win_probability', 'score',
             'score_lo', 'score_hi',
             'overs_balls', 'RunRate']].drop_duplicates()
    df = df[df.Runs == df.Runs1]
    df = df[['Over', 'Wickets', 'Runs', 'win_probability', 'score', 'score_lo', 'score_hi']]
    # sort_values returns a new frame, so adding a column afterwards is safe
    df = df.sort_values(by=['Over', 'Wickets', 'Runs'])
    # If there are repeated elements, get the max win probability element
    df['max_repeat'] = (df.groupby(['Over', 'Wickets', 'Runs'])['win_probability']
                          .transform('max'))
    df = df[df.win_probability == df.max_repeat]
    return df
    
def process_second_innings_df(path='ODIs/win_probability_2nd_innings.csv'):
    """Load and clean the second-innings win-probability table.

    Processing steps:
      1. Keep only rows where ``Ball == 6``.
      2. Recompute the runs as ``Target - (50 - overs_balls) * RunRate``
         (truncated to int) and keep only rows whose ``Runs`` column agrees.
      3. Sort by ``(Over, Wickets, Runs)`` and, for repeated combinations,
         keep the row(s) with the highest ``win_probability``.
      4. Re-express the state as ``Over = 50 - Over``,
         ``Runs = Target - Runs`` and ``Wickets = 10 - Wickets``.
      5. Overwrite ``score``, ``score_lo`` and ``score_hi`` with ``-1``.

    Args:
        path: Location of the CSV file. Defaults to the path the notebook
            has always read from.

    Returns:
        A DataFrame with the columns ``Over, Wickets, Runs, Target,
        win_probability, score, score_lo, score_hi`` plus the helper column
        ``max_repeat``.

    Raises:
        AssertionError: if the re-expressed ``Over`` values do not span
            0..49 or the re-expressed ``Wickets`` values do not span 0..10.
    """
    raw = pd.read_csv(path)
    # we only want the elements at the end of the over; .copy() makes the
    # column assignments below operate on an independent frame rather than
    # on a slice of `raw` (which triggers SettingWithCopyWarning).
    df = raw[raw.Ball == 6].copy()
    # there seems to be some spurious data in the Runs column - Cleaning that up
    # recall that run rate is the required run rate for the second innings
    df['Runs1'] = (df.Target - ((50 - df['overs_balls']) * df['RunRate'])).astype(int)
    df = df[['Over', 'Wickets', 'Runs1', 'Runs', 'Target',
             'win_probability', 'score',
             'score_lo', 'score_hi',
             'overs_balls', 'RunRate']].drop_duplicates()
    df = df[df.Runs == df.Runs1]
    df = df[['Over', 'Wickets', 'Runs', 'Target', 'win_probability', 'score', 'score_lo', 'score_hi']]
    df = df.sort_values(by=['Over', 'Wickets', 'Runs'])
    # If there are repeated elements, get the max win probability element
    # NOTE(review): the grouping ignores Target, so rows that share
    # (Over, Wickets, Runs) but chase different targets compete with each
    # other even though they map to different states after the
    # re-expression below - confirm this is intended.
    df['max_repeat'] = (df.groupby(['Over', 'Wickets', 'Runs'])['win_probability']
                          .transform('max'))
    # .copy() again: the state columns are rewritten on the filtered frame
    df = df[df.win_probability == df.max_repeat].copy()

    # Now we want to change the meaning of the state
    df['Over'] = 50 - df['Over']
    df['Runs'] = df['Target'] - df['Runs']
    df['Wickets'] = 10 - df['Wickets']

    # Sanity checks on the range of the re-expressed state
    assert df.Wickets.max() == 10, "expected a maximum Wickets value of 10"
    assert df.Over.max() == 49, "expected a maximum Over value of 49"
    assert df.Over.min() == 0, "expected a minimum Over value of 0"
    assert df.Wickets.min() == 0, "expected a minimum Wickets value of 0"

    df['score'] = -1
    df['score_lo'] = -1
    df['score_hi'] = -1
    return df

In [5]:
# Lookup key for a match situation. The meaning of the fields depends on how
# the source frame was prepared (process_second_innings_df re-expresses them
# as 50 - Over, 10 - Wickets and Target - Runs).
State = namedtuple(
    'State',
    ('Over', 'Wicket', 'Runs'),
)
# Value stored for a State: the win probability plus the `score`, `score_lo`
# and `score_hi` columns (exposed here as mean / lo / hi).
WinProbability = namedtuple(
    'WinProbability',
    ('win_probability', 'mean', 'lo', 'hi'),
)

def extract_win_probability(row) -> Tuple[State, WinProbability]:
    """Convert one data-frame row into a ``(State, WinProbability)`` pair.

    ``row`` must expose the attributes ``Over``, ``Wickets``, ``Runs``,
    ``win_probability``, ``score``, ``score_lo`` and ``score_hi`` (for
    example a row produced by ``DataFrame.itertuples()``).
    """
    key = State(Over=row.Over, Wicket=row.Wickets, Runs=row.Runs)
    value = WinProbability(
        win_probability=row.win_probability,
        mean=row.score,
        lo=row.score_lo,
        hi=row.score_hi,
    )
    return key, value

def match(pairs: List[Tuple[State, WinProbability]], 
          attribute_value: int,
          attribute_name: Text) -> List[Tuple[State, WinProbability]]:
    """Return the pairs whose state attribute is nearest to a target value.

    The pairs are grouped by ``getattr(state, attribute_name)``; the group
    whose key has the smallest absolute distance to ``attribute_value`` is
    returned. When two keys are equally close, the one that appears first
    in ``pairs`` wins.

    Args:
        pairs: ``(State, WinProbability)`` tuples to search.
        attribute_value: Target value for the attribute.
        attribute_name: Name of the ``State`` field to compare.

    Returns:
        The sub-list of ``pairs`` (original order preserved) sharing the
        nearest attribute value, or an empty list when ``pairs`` is empty.
    """
    # Nothing to match against: hand back an empty list so the caller can
    # decide how to react, instead of failing with a bare IndexError.
    if not pairs:
        return []
    grped = groupby(lambda pair: getattr(pair[0], attribute_name), pairs)
    # min() over the group keys replaces a full sort; like the stable sort it
    # replaces, it keeps the first-encountered key on ties.
    nearest_value = min(grped, key=lambda attr_val: abs(attr_val - attribute_value))
    return grped[nearest_value]

def closest(state: State, pairs: List[Tuple[State, WinProbability]]) -> WinProbability:
    """Return the win probability of the known state nearest to ``state``.

    The candidates are narrowed one attribute at a time, in this order:
    nearest ``Over``, then nearest ``Runs`` among those, then nearest
    ``Wicket`` among those. The first surviving pair supplies the result.
    """
    candidates = match(pairs, attribute_value=state.Over, attribute_name='Over')
    candidates = match(candidates, attribute_value=state.Runs, attribute_name="Runs")
    candidates = match(candidates, attribute_value=state.Wicket, attribute_name='Wicket')
    assert len(candidates) > 0
    _, nearest_probability = candidates[0]
    return nearest_probability

def fill_wp_matrix(pairs: List[Tuple[State, WinProbability]],
                  innings: int) -> Iterator[Tuple[State, WinProbability]]:
    """Yield a win probability for every state on a dense grid.

    The grid covers overs 0..49 when ``innings == 2`` and 1..50 otherwise,
    wickets 0..10 and runs 0..499. Each grid state is paired with the result
    of ``closest(state, pairs)``. States are produced with the over varying
    slowest and the runs varying fastest.
    """
    if innings == 2:
        overs = range(0, 50)
    else:
        overs = range(1, 51)
    for over in overs:
        for wicket in range(0, 11):
            for run in range(0, 500):
                grid_state = State(over, wicket, run)
                yield grid_state, closest(grid_state, pairs)
        
def write_to_file(location: Text, innings: int, df: pd.DataFrame) -> None: 
    """Write the dense win-probability grid for ``df`` to a tab-separated file.

    Every row of ``df`` is converted to a ``(State, WinProbability)`` pair,
    the grid is generated with ``fill_wp_matrix`` and each grid entry is
    written as one line: over, wicket, runs, win probability, mean, lo, hi.
    The running row count is printed every 27500 rows as a progress marker.

    Args:
        location: Path of the output file (overwritten if it exists).
        innings: Innings number forwarded to ``fill_wp_matrix``.
        df: Frame whose rows are fed to ``extract_win_probability``.
    """
    pairs = list(map(extract_win_probability, df.itertuples()))
    line_template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n"
    with open(location, "w") as out_file:
        grid = fill_wp_matrix(pairs, innings=innings)
        for count, (state, probability) in enumerate(grid):
            if count % 27500 == 0:
                print(count)
            out_file.write(line_template.format(
                state.Over, state.Wicket, state.Runs,
                probability.win_probability, probability.mean,
                probability.lo, probability.hi))

In [6]:
# Build the second-innings lookup table and write it to disk. The innings
# number is defined once and passed through, so the variable and the argument
# cannot drift apart.
innings = 2
write_to_file("ODIs/probs_i2_odi.txt", innings, process_second_innings_df())

0
27500
55000
82500
110000
137500
165000
192500
220000
247500


In [7]:
# Rebuild the cleaned second-innings frame and its (State, WinProbability)
# pairs for interactive inspection.
df = process_second_innings_df()
pairs = list(map(extract_win_probability, df.itertuples()))

In [10]:
# Spot-check a single extracted (State, WinProbability) pair, indexed from the
# end of the list.
pairs[-102]

(State(Over=1, Wicket=4, Runs=2),
 WinProbability(win_probability=0.02080609680676171, mean=-1, lo=-1, hi=-1))

In [None]:
# closest() needs a State and the list of pairs; the cell was left unfinished
# (`St` is undefined). Query the state shown in the previous cell's output.
closest(State(Over=1, Wicket=4, Runs=2), pairs)