In [16]:
import numpy as np
import pandas as pd
import pickle
import random

In [17]:
data=pickle.load(open('data.pkl','rb'))

In [18]:
print("len(data) %d"%len(data))
print("len(result.groupby('name')) %d"%len(data.groupby('name')))

len(data) 14051
len(result.groupby('name')) 1005


In [19]:
samples=[]
for i,row in data.iterrows():
    sample={}
    name,date,minute=row[['name','date','minute']]
    row=row['data']
    sample['name']=name
    sample['date']=date
    sample['league']=row['league']
    sample['minute']=row['minute']
    #for c in ['name','corner','yellow','red','throw','freekick','goal']:
    for c in ['name','corner','yellow','red','goal']:
        if len(row[c])!=2:
            sample[c+'_h'],sample[c+'_a']=np.NaN, np.NaN
        else:
            sample[c+'_h'],sample[c+'_a']=row[c][0],row[c][1]
    if 'titles' in row:
        for i,c in enumerate(row['titles']):
            sample[c+'_h'],sample[c+'_a']=row['stats'][0,i],row['stats'][1,i]
    #for c in ['Fulltime Result','Double Chance','Half Time Result','1st Goal']:
    for c in ['Fulltime Result','Double Chance']:
        sample[c+'_h']=row['odds'][c][0] if c in row['odds'].keys() else np.NaN
        sample[c+'_d']=row['odds'][c][1] if c in row['odds'].keys() else np.NaN
        sample[c+'_a']=row['odds'][c][2] if c in row['odds'].keys() else np.NaN
    samples.append(pd.Series(sample))
samples=pd.concat(samples,axis=1,sort=False).T

In [20]:
#clean
samples=samples.dropna(subset=['goal_h','goal_a'])
print('len(samples) %d'%len(samples))

len(samples) 14049


In [21]:
final_minute=85

def add_label(group):
    if group.iloc[-1]['goal_h']>group.iloc[-1]['goal_a']: label=1
    elif group.iloc[-1]['goal_h']==group.iloc[-1]['goal_a']: label=0
    elif group.iloc[-1]['goal_h']<group.iloc[-1]['goal_a']: label=-1
    group['label']=label
    return group

def add_orig_odds(group):
    if group.iloc[0]['minute']!=0:
        group['Orig Fulltime Result_h']=np.NaN
        group['Orig Fulltime Result_d']=np.NaN
        group['Orig Fulltime Result_a']=np.NaN
    else:
        group['Orig Fulltime Result_h']=group.iloc[0]['Fulltime Result_h']
        group['Orig Fulltime Result_d']=group.iloc[0]['Fulltime Result_d']
        group['Orig Fulltime Result_a']=group.iloc[0]['Fulltime Result_a']
    return group

def filter_minute(group):
    group=group.sort_values('minute')
    result=[]
    for i,row in group.iterrows():
        if not result:
            result.append(row)
        elif row['minute']>=result[-1]['minute']+5:
            result.append(row)
    result=pd.concat(result,axis=1,sort=False).T
    return result

def process_group(group,name=None,date=None):
    group=group.sort_values('minute')
    if group.iloc[-1]['minute']<final_minute:
        #print(name)
        return None
    group=add_label(group)
    group=add_orig_odds(group)
    group=filter_minute(group)
    return group

groups=samples.groupby(['name','date'])
result=[process_group(group,name,date) for (name,date), group in groups]
samples_gp=[x for x in result if x is not None ]
samples_concat=pd.concat(result,sort=False)
print('samples_concat.shape %d'%samples_concat.shape[0])
print("len(samples_gp) %d"%len(samples_gp))

samples_concat.shape 6955
len(samples_gp) 667


In [22]:
#clean again
gp_cleaned=[]
for group in samples_gp:
    #group=group.dropna(subset=['Fulltime Result_h', 'Fulltime Result_d','Fulltime Result_a'])
    #group=group.dropna(subset=['Orig Fulltime Result_%s'%i for i in ['h','d','a']])
    #group=group.dropna(subset=['Possession %_h','Possession %_a'])
    group=group.drop(['Double Chance_h','Double Chance_d','Double Chance_a'],axis=1)
    group=group.dropna()
    gp_cleaned.append(group)

```
s=pd.concat(gp_cleaned,sort=False)
print('len %d'%len(s))
print('s.groupby len %d'%len(s.groupby(['name','date'])))
s.isna().sum()
```

In [None]:
#random sample from samples_gp
num_each_group=5
gp_final=[]
for group in gp_cleaned:
    group=group[(group['minute']<80)&(group['minute']>30)]
    if group.empty:
        continue
    idx=[random.randrange(0,len(group)) for _ in range(num_each_group)]
    group=group.iloc[idx].set_index(['name','date','league','name_h','name_a'])
    gp_final.append(group)
print('len(gp_final) %d'%len(gp_final))

In [29]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score as acc
kfold=KFold(20)
acc_train,acc_test,fullmatch=[],[],[]
for idx_train, idx_test in kfold.split(gp_final):
    gp_train=[gp_final[idx] for idx in idx_train]
    gp_test=[gp_final[idx] for idx in idx_test]
    gp_train=pd.concat(gp_train,sort=False)
    gp_test=pd.concat(gp_test,sort=False)
    y_train,y_test=gp_train['label'].astype(int),gp_test['label'].astype(int)
    x_train,x_test=gp_train.drop(['label'],axis=1),gp_test.drop(['label'],axis=1)
    cls=SVC(kernel='linear')
    cls.fit(x_train,y_train)
    y_pred_train=cls.predict(x_train)
    y_pred_test=cls.predict(x_test)
    prob_pred_test=cls.decision_function(x_test)
    acc_train.append(acc(y_train,y_pred_train))
    acc_test.append(acc(y_test,y_pred_test))
    
    #full_match
    prob_first=np.max(prob_pred_test,axis=1)
    prob_second=np.median(prob_pred_test,axis=1)
    prob=prob_first/prob_second
    sel=prob>np.median(prob)
    y_test_sel=y_test[sel]
    y_pred_test_sel=y_test[sel]
    x_test_sel=x_test[sel]
    bingo=y_test.values==y_pred_test
    fm=[]
    for i in range(len(y_test_sel)):
        if not bingo[i]:
            fm.append(-1)
        else:
            if y_pred_test_sel[i]==1: fm.append(x_test_sel.iloc[i]['Fulltime Result_h'])
            if y_pred_test_sel[i]==0: fm.append(x_test_sel.iloc[i]['Fulltime Result_d'])
            if y_pred_test_sel[i]==-1:fm.append(x_test_sel.iloc[i]['Fulltime Result_a'])
    fullmatch.append(np.mean(fm))
    
    
print('acc_train:%s'%acc_train)
print('mean:%f'%np.mean(acc_train))
print('acc_test:%s'%acc_test)
print('mean:%f'%np.mean(acc_test))
print('fullmatch:%s'%fullmatch)
print('mean:%f'%np.mean(fullmatch))

acc_train:[0.7578544061302682, 0.7417624521072796, 0.7371647509578544, 0.7762452107279694, 0.7440613026819923, 0.7563218390804598, 0.7478927203065134, 0.7310344827586207, 0.7440613026819923, 0.7432950191570882, 0.7386973180076628, 0.7318007662835249, 0.7379310344827587, 0.7371647509578544, 0.7310344827586207, 0.7412213740458016, 0.7412213740458016, 0.7633587786259542, 0.7351145038167939, 0.7633587786259542]
mean:0.745030
acc_test:[0.4857142857142857, 0.5714285714285714, 0.7428571428571429, 0.5571428571428572, 0.7571428571428571, 0.5714285714285714, 0.7428571428571429, 0.7, 0.5428571428571428, 0.7142857142857143, 0.6571428571428571, 0.7428571428571429, 0.6714285714285714, 0.8571428571428571, 0.6142857142857143, 0.7230769230769231, 0.8, 0.4307692307692308, 0.5692307692307692, 0.5538461538461539]
mean:0.650275
fullmatch:[0.7901428571428571, 0.822857142857143, 0.4009142857142856, 1.0179999999999998, 0.6338484848484848, 1.0736857142857141, 0.24840000000000004, 0.5551142857142858, 1.10679411

In [31]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
from sklearn.model_selection import train_test_split

In [32]:
gp_train,gp_test=train_test_split(gp_final)
gp_train=pd.concat(gp_train,sort=False)
gp_test=pd.concat(gp_test,sort=False)
y_train,y_test=gp_train['label'].astype(int),gp_test['label'].astype(int)
x_train,x_test=gp_train.drop(['label'],axis=1),gp_test.drop(['label'],axis=1)
odds_train,odds_test=x_train[['Fulltime Result_h','Fulltime Result_d','Fulltime Result_a']],x_test[['Fulltime Result_h','Fulltime Result_d','Fulltime Result_a']]

In [33]:
def model(x):
    net=slim.fully_connected(x,5)
    net=slim.fully_connected(x,3)
    return net

In [34]:
def get_loss(y_pred,y_true,odds):
    y_pred_max=tf.reduce_max(y_pred,axis=1,keep_dims=True)
    y_pred=tf.where(
        tf.equal(y_pred_max, y_pred), 
        tf.constant(1, shape=y_pred.shape), 
        tf.constant(0, shape=y_pred.shape)
    )
    mask=tf.equal(y_pred,y_true)
    earns=tf.where(mask,odds,-1)
    return tf.reduce_mean(earns)    

In [39]:
dataset = tf.data.Dataset.from_tensor_slices((x_train.values.astype(np.float32), y_train.values.astype(np.float32), odds_train.values.astype(np.float32)))
dataset = dataset.shuffle(50).repeat().batch(10)
iter = dataset.make_one_shot_iterator()
x,y,odds=iter.get_next()

with slim.arg_scope([slim.fully_connected],normalizer_fn=slim.batch_norm):
    output=model(x)

loss=get_loss(output,y,odds)