## Using GLM

In [2]:
import os
import numpy as np
import pandas as pd
from datetime import datetime,time
import utils as u
import signals
from tqdm import tqdm

In [9]:
def signal_data_pipeline(event_id):
    """Load one event's odds CSV from ``glm/`` and reshape it for signal detection.

    Parameters
    ----------
    event_id : str
        File stem of the CSV under ``glm/``; its first 8 characters are parsed
        as a ``%Y%m%d`` date when the ``minutes`` column is converted.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``u.remove_empty_rows`` leaves no rows. Otherwise a frame
        whose ``line`` / ``chl_hi`` / ``chl_low`` columns come from
        ``u.lowest_odd`` applied to all ``[chl_line, chl_hi, chl_low]`` triples
        recorded for the same ``minutes`` value, and whose ``minutes`` column
        holds ``datetime`` objects.
    """
    frame = u.remove_empty_rows(pd.read_csv('glm/' + event_id + '.csv'))

    # Nothing left after cleaning: signal "no data" to the caller instead of
    # letting the steps below fail on an empty frame.
    if frame.empty:
        return None

    # Bundle every row's line and both odds into one list ...
    frame['line_odds'] = frame.apply(lambda row: [row.chl_line, row.chl_hi, row.chl_low], axis=1)
    # ... then gather all bundles that share the same timestamp.
    per_minute = (
        frame[['minutes', 'line_odds']]
        .groupby('minutes')['line_odds']
        .apply(list)
        .reset_index(name='line_odds')
    )

    match_columns = [
        'event_id',
        'minutes',
        'total_corner',
        'status',
        'home_score',
        'away_score',
        'home_odd',
        'draw_odd',
        'away_odd',
        'nts_home',
        'nts_no',
        'nts_away',
    ]
    merged = per_minute.merge(frame[match_columns], how='inner', on='minutes')

    # u.lowest_odd is indexed below as [0] -> line, [1] -> chl_hi, [-1] -> chl_low.
    # NOTE(review): presumably it picks the bundle with the lowest odd; confirm in utils.
    merged['min_odds_info'] = merged.line_odds.apply(u.lowest_odd)
    merged['line'] = merged.min_odds_info.apply(lambda info: info[0])
    merged['chl_low'] = merged.min_odds_info.apply(lambda info: info[-1])
    merged['chl_hi'] = merged.min_odds_info.apply(lambda info: info[1])
    merged = merged.drop(columns=['line_odds'])

    # Prefix the clock time with the event date so that it parses to a full datetime.
    event_date = event_id[:8]
    merged['minutes'] = merged['minutes'].apply(
        lambda clock: datetime.strptime(event_date + clock, "%Y%m%d%H:%M:%S")
    )
    return merged


def signal_rules(event_id, data, t, min_peak_change, max_peak_change=1.01):
    """Mark rows where ``chl_low`` rose and turn peak-to-peak changes into signals.

    A row is a "peak" when its ``chl_low`` is higher than the previous row's.
    Each peak is compared with the previous peak (``peak_change``, rounded to
    4 decimals) and gets a signal:

    * ``1``  when ``peak_change < min_peak_change`` (predict the result ends
      below the line),
    * ``-1`` when ``peak_change > max_peak_change`` (predict above the line),
    * ``0``  otherwise, including the first peak, whose change is NaN.

    Parameters
    ----------
    event_id : str
        Event identifier; its first 8 characters are parsed as ``%Y%m%d``.
    data : pandas.DataFrame
        Needs ``chl_low`` and a datetime-like ``minutes`` column. The helper
        columns ``odd_change`` and ``peak`` are added to this frame in place.
    t : datetime.time
        Only peaks at or after this time on the event date are returned.
    min_peak_change : float
        Upper bound on ``peak_change`` for a ``1`` signal.
    max_peak_change : float, optional
        Lower bound on ``peak_change`` for a ``-1`` signal (default ``1.01``).

    Returns
    -------
    pandas.DataFrame
        The peak rows at or after the cutoff, with ``peak_change`` and
        ``signal`` columns added; may be empty.
    """
    # Row-over-row ratio of the low odds; > 1 means the odds went up.
    data['odd_change'] = data.chl_low / data.chl_low.shift(1)
    data['peak'] = np.where(data.odd_change > 1, 1, 0)

    # Explicit copy: adding columns to a filtered slice is ambiguous in pandas
    # (SettingWithCopyWarning) and must not depend on view/copy semantics.
    peaks = data[data.peak == 1].copy()
    peak_ratio = peaks.chl_low / peaks.chl_low.shift(1)
    peaks['peak_change'] = peak_ratio.apply(lambda x: round(x, 4))

    # signal = 1 means that we predict results will be lower than chl_line, -1 vice versa
    peaks['signal'] = np.where(peaks.peak_change < min_peak_change, 1,
                               np.where(peaks.peak_change > max_peak_change, -1, 0))

    # Drop peaks that occur before the cutoff time on the event's date.
    cutoff = datetime.combine(datetime.strptime(event_id[:8], "%Y%m%d"), t)
    return peaks[peaks.minutes >= cutoff]


def return_calc(
    signal_list,
    result_path='/Users/TysonWu/dev/odds-crawl-app/odds-crawl-app/development/result_collection/match_corner_result.csv',
):
    """Join signals with final corner results and compute the per-bet return.

    Parameters
    ----------
    signal_list : pandas.DataFrame
        One row per signal with at least ``event_id``, ``signal``, ``line``,
        ``chl_low`` and ``chl_hi``.
    result_path : str, optional
        CSV with ``event_id``, ``result_corner`` and ``league`` columns.
        Defaults to the location that was previously hard-coded, so existing
        calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Signals that have a known result, sorted by ``event_id`` with a fresh
        index, plus:

        * ``correct_prediction``: 1/0 for a placed bet, ``None`` when
          ``signal == 0``;
        * ``return``: ``odds - 1`` for a winning bet, ``-1`` for a losing
          bet, ``0`` when no bet was placed (rounded to 2 decimals).
    """
    # merge current results
    result = pd.read_csv(result_path)
    scored = signal_list.merge(result[['event_id', 'result_corner', 'league']],
                               how='inner', on='event_id')

    # exclude games without results; copy so the column assignments below act
    # on an independent frame rather than on a filtered slice
    scored = scored[scored.result_corner.notna()].copy()

    bet_low = scored.signal == 1
    no_bet = scored.signal == 0
    low_wins = scored.line > scored.result_corner
    high_wins = scored.line < scored.result_corner

    # Any signal other than 1 or 0 is treated as a bet on the high side.
    scored['correct_prediction'] = np.where(bet_low,
                                            np.where(low_wins, 1, 0),
                                            np.where(no_bet, None,
                                                     np.where(high_wins, 1, 0)))

    # calculate return
    won = scored.correct_prediction == 1
    scored['return'] = np.where(bet_low,
                                np.where(won, scored.chl_low - 1, -1),
                                np.where(no_bet, 0,
                                         np.where(won, scored.chl_hi - 1, -1)))
    scored['return'] = scored['return'].apply(lambda x: round(x, 2))
    return scored.sort_values(by='event_id').reset_index(drop=True)


def signal_analysis(t=time(1,30,0), min_peak_change=0.98):
    """Run the pipeline for every 2020 event CSV in ``glm/`` and score the signals.

    For each event the data pipeline and signal rules are applied, and only
    the first peak row at or after the cutoff time is kept. The collected
    rows are then scored with ``return_calc``.

    Parameters
    ----------
    t : datetime.time, optional
        Cutoff time passed to ``signal_rules`` (default 01:30:00).
    min_peak_change : float, optional
        Threshold passed to ``signal_rules`` (default 0.98).

    Returns
    -------
    pandas.DataFrame
        Scored signals ordered by the date prefix of ``event_id`` and then by
        the integer that follows its 11th character. An empty DataFrame is
        returned when no event produced a signal.
    """
    # Only real CSV files whose name contains '2020'; the stem is the event id.
    event_ids = [os.path.splitext(file)[0]
                 for file in os.listdir('glm/')
                 if file.endswith('.csv') and '2020' in file]

    first_peaks = []
    for event_id in tqdm(event_ids):
        data = signal_data_pipeline(event_id)
        # signal_data_pipeline returns None when nothing is left after cleaning
        if data is None:
            continue
        peaks = signal_rules(event_id, data, t, min_peak_change)
        if not peaks.empty:
            first_peaks.append(peaks.iloc[[0]])

    # Previously this case crashed inside return_calc because None was passed on.
    if not first_peaks:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    signal_list = pd.concat(first_peaks, ignore_index=True)
    scored = return_calc(signal_list)

    # Event ids look like '20200301SUN13': sort by date, then by match number
    # numerically (a plain string sort would put SUN13 before SUN3).
    scored = (
        scored
        .assign(date=scored.event_id.str[:8],
                number=scored.event_id.str[11:].astype(int))
        .sort_values(by=['date', 'number'])
        .reset_index(drop=True)
        .drop(columns=['date', 'number'])
    )
    return scored


def graph_profit(signal):
    """Plot the running total of the ``return`` column against the frame's index.

    Delegates the drawing to ``u.graph`` with the title 'Profit over games'.
    """
    cumulative_return = signal['return'].cumsum()
    u.graph(signal.index, cumulative_return, 'Profit over games')

In [25]:
# Run the whole pipeline with the default cutoff time and peak-change threshold.
s = signal_analysis()

100%|██████████| 11/11 [00:02<00:00,  4.49it/s]


In [26]:
# Display the scored signal table returned by signal_analysis().
s

Unnamed: 0,minutes,event_id,total_corner,status,home_score,away_score,home_odd,draw_odd,away_odd,nts_home,...,chl_low,chl_hi,odd_change,peak,peak_change,signal,result_corner,league,correct_prediction,return
0,2020-03-01 01:33:45,20200301SUN3,5,Match Status : 2nd Half In Progress;Scoring In...,2,2,1.92,1.94,11.5,1.84,...,1.22,3.8,1.008264,1,0.8905,1,8.0,Spanish Division 1,1.0,0.22
1,2020-03-01 01:33:36,20200301SUN9,5,Match Status : 2nd Half In Progress;Scoring In...,2,1,1.46,3.16,8.0,4.3,...,1.53,2.33,1.254098,1,0.9745,1,7.0,German Division 1,1.0,0.53
2,2020-03-01 01:35:01,20200301SUN13,11,Match Status : 2nd Half In Progress;Scoring In...,1,0,1.18,4.45,20.0,3.35,...,1.5,2.4,1.293103,1,0.9554,1,11.0,Spanish Division 1,1.0,0.5
3,2020-03-01 01:36:58,20200301SUN17,15,Match Status : 2nd Half In Progress;Scoring In...,1,1,5.9,1.51,3.55,4.75,...,1.48,2.45,1.275862,1,0.9737,1,21.0,Eng Premier,0.0,-1.0
4,2020-03-01 01:34:36,20200301SUN18,9,Match Status : 2nd Half In Progress;Scoring In...,2,3,21.0,4.4,1.2,3.32,...,1.48,2.45,1.275862,1,0.9427,1,9.0,Eng Premier,1.0,0.48
5,2020-03-01 01:36:22,20200301SUN24,6,Match Status : 2nd Half In Progress;Scoring In...,1,1,6.8,1.39,3.85,5.8,...,1.47,2.48,1.300885,1,0.98,0,7.0,Spanish Division 1,,0.0
6,2020-03-01 01:33:16,20200301SUN31,10,Match Status : 2nd Half In Progress;Scoring In...,1,2,80.0,8.5,1.04,6.4,...,1.5,2.4,1.282051,1,0.9868,0,16.0,Eng League Cup,,0.0
7,2020-03-01 01:32:31,20200301SUN32,9,Match Status : 2nd Half In Progress;Scoring In...,1,3,100.0,21.0,1.02,3.95,...,1.48,2.45,1.275862,1,0.9737,1,9.0,Italian Division 1,1.0,0.48
8,2020-03-01 01:31:49,20200301SUN42,9,Match Status : 2nd Half In Progress;Scoring In...,1,0,1.16,4.6,22.0,3.04,...,1.43,2.6,1.265487,1,0.9533,1,10.0,Spanish Division 1,1.0,0.43
9,2020-03-01 01:33:58,20200301SUN48,10,Match Status : 2nd Half In Progress;Scoring In...,1,0,1.05,7.8,50.0,1.7,...,1.5,2.4,1.25,1,0.9804,0,15.0,US Football League,,0.0


In [27]:
import math

# Target column: 1 when the final corner count is below the line, otherwise 0.
s['actual_result'] = np.where(s.result_corner < s.line, 1, 0) # 1: low wins, 0: hi wins

# Absolute goal difference between the two teams.
s['home_away_diff'] = (
    s.home_score.apply(lambda goals: int(goals)) - s.away_score.apply(lambda goals: int(goals))
).abs()

# Reciprocal of each odds column, stored next to it with a `_p` suffix.
for odds_col in ['home_odd', 'draw_odd', 'away_odd', 'nts_home', 'nts_no', 'nts_away']:
    s[odds_col + '_p'] = 1/s[odds_col]

# Gap between the line and the corners already taken, rounded down to an integer.
s['corner_diff'] = s.line.sub(s.total_corner).apply(math.floor)

In [28]:
# Display the table again, now with the feature columns added in the previous cell.
s

Unnamed: 0,minutes,event_id,total_corner,status,home_score,away_score,home_odd,draw_odd,away_odd,nts_home,...,return,actual_result,home_away_diff,home_odd_p,draw_odd_p,away_odd_p,nts_home_p,nts_no_p,nts_away_p,corner_diff
0,2020-03-01 01:33:45,20200301SUN3,5,Match Status : 2nd Half In Progress;Scoring In...,2,2,1.92,1.94,11.5,1.84,...,0.22,1,0,0.520833,0.515464,0.086957,0.543478,0.47619,0.111111,3
1,2020-03-01 01:33:36,20200301SUN9,5,Match Status : 2nd Half In Progress;Scoring In...,2,1,1.46,3.16,8.0,4.3,...,0.53,1,1,0.684932,0.316456,0.125,0.232558,0.431034,0.469484,3
2,2020-03-01 01:35:01,20200301SUN13,11,Match Status : 2nd Half In Progress;Scoring In...,1,0,1.18,4.45,20.0,3.35,...,0.5,1,1,0.847458,0.224719,0.05,0.298507,0.540541,0.292398,2
3,2020-03-01 01:36:58,20200301SUN17,15,Match Status : 2nd Half In Progress;Scoring In...,1,1,5.9,1.51,3.55,4.75,...,-1.0,0,0,0.169492,0.662252,0.28169,0.210526,0.598802,0.322581,2
4,2020-03-01 01:34:36,20200301SUN18,9,Match Status : 2nd Half In Progress;Scoring In...,2,3,21.0,4.4,1.2,3.32,...,0.48,1,1,0.047619,0.227273,0.833333,0.301205,0.591716,0.238095,2
5,2020-03-01 01:36:22,20200301SUN24,6,Match Status : 2nd Half In Progress;Scoring In...,1,1,6.8,1.39,3.85,5.8,...,0.0,1,0,0.147059,0.719424,0.25974,0.172414,0.675676,0.285714,1
6,2020-03-01 01:33:16,20200301SUN31,10,Match Status : 2nd Half In Progress;Scoring In...,1,2,80.0,8.5,1.04,6.4,...,0.0,0,1,0.0125,0.117647,0.961538,0.15625,0.414938,0.561798,2
7,2020-03-01 01:32:31,20200301SUN32,9,Match Status : 2nd Half In Progress;Scoring In...,1,3,100.0,21.0,1.02,3.95,...,0.48,1,2,0.01,0.047619,0.980392,0.253165,0.502513,0.37594,2
8,2020-03-01 01:31:49,20200301SUN42,9,Match Status : 2nd Half In Progress;Scoring In...,1,0,1.16,4.6,22.0,3.04,...,0.43,1,1,0.862069,0.217391,0.045455,0.328947,0.5,0.30303,2
9,2020-03-01 01:33:58,20200301SUN48,10,Match Status : 2nd Half In Progress;Scoring In...,1,0,1.05,7.8,50.0,1.7,...,0.0,0,1,0.952381,0.128205,0.02,0.588235,0.34965,0.192308,3


In [34]:
# Keep only the rows that have a value in 'nts_home_p'.
s2 = s[s['nts_home_p'].notna()]

# Predictors for the model: score/corner gaps, reciprocal odds and the two
# odds-movement ratios.
feature_cols = [
    'corner_diff', 'home_away_diff', 'home_odd_p',
    'draw_odd_p', 'away_odd_p', 'nts_home_p',
    'nts_no_p', 'nts_away_p', 'odd_change', 'peak_change',
]
X = s2[feature_cols]
# Target kept as a one-column frame.
y = s2[['actual_result']]

# NOTE(review): both model fits below are disabled, so this cell does not
# create `logit_model`; any later cell that uses it needs one of them restored
# (together with the matching import).
# logmodel = LogisticRegression()
# logmodel.fit(X, y)

# import statsmodels.api as sm
# logit_model = sm.Logit(y, sm.add_constant(X)).fit()

In [22]:
# NOTE(review): `logit_model` is not assigned by any active code visible in this
# notebook (the only `sm.Logit(...).fit()` call is commented out), and this
# cell's execution count (22) is lower than the cells above it, so it
# presumably relies on stale kernel state and will raise NameError on a fresh
# Restart & Run All. TODO confirm and restore the fit.
print(logit_model.summary())