## 1. prepare

In [1]:
import pandas as pd
import os
import numpy as np
# train data
summary_folder_path = '../../datasets/train_data/summary'
mining_folder_path = '../../datasets/train_data/data_mining'
result_path = '../../datasets/results'
games = { 'baseball':'/baseball/mlb',
         'iceball': '/iceball/nhl',
         'soccer_champion':'/soccer/champ_league',
         'scoccer_england':'/soccer/epl',
         'soccer_major':'/soccer/majorleague'}
# folder name is games, result name is games_results.csv

games_list = ['baseball','iceball']
soccer_list = ['soccer_champion','scoccer_england','soccer_major']

soccer_features_name = ['maximum', 'minimum', 'ave', 'ave_normalized', 
            'start', 'start_normalized', 'end', 'end_normalized', 'start2end',
            'start2end_normalized', 'start2max', 'start2max_normalized', 'start2min', 'start2min_normalized', 
            'std', 'no_price', 'length', 
            'up_num', 'down_num', 'up_rate', 'down_rate', 'duration', 
            'bias_max', 'bias_min', 'bias_ave', 'bias_st', 'draw_odds','away_odds']

games_features_name = ['maximum', 'minimum', 'ave', 'ave_normalized', 
            'start', 'start_normalized', 'end', 'end_normalized', 'start2end',
            'start2end_normalized', 'start2max', 'start2max_normalized', 'start2min', 'start2min_normalized', 
            'std', 'no_price', 'length', 
            'up_num', 'down_num', 'up_rate', 'down_rate', 'duration', 
            'bias_max', 'bias_min', 'bias_ave', 'bias_st']

## 2. models
logistic regression,
KNN, 
Naive Bayes,
DT,
RF,
AdaBoost,
Gradient tree boosting,
ANN

In [53]:
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
# classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
# regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor


"""
input: 
training: matrix of features, target: two or three target
validate: matrix of features, target: two or three target

output: 
acc, auc, or others
"""

#  train ml models
# input: features and labels
# output: all acc
def predict_soccer_result(features, label):
    acc_dic = {}
#     knn
    knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=60))
    acc_dic['knn'] = cross_val_score(knn, features, label, cv=5).max()
#     logistic regression
    logreg = make_pipeline(preprocessing.StandardScaler(), LogisticRegression(C=0.3))
    acc_dic['logistic_regression'] = cross_val_score(logreg, features, label, cv=5).max()
#     naive bayes
    bayes = make_pipeline(preprocessing.StandardScaler(), BernoulliNB())
    acc_dic['bayes'] = cross_val_score(bayes, features, label, cv=5).max()
#     decision tree   
    dt = make_pipeline(preprocessing.StandardScaler(), DecisionTreeClassifier(random_state=0,max_depth=2))
    acc_dic['decision_tree'] = cross_val_score(dt, features, label, cv=5).max()
#     random forest
    rf = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(max_depth=5, random_state=0))
    acc_dic['random_forest'] = cross_val_score(rf, features, label, cv=5).max()
#     adaboost
    adaboost = make_pipeline(preprocessing.StandardScaler(), AdaBoostClassifier(n_estimators=230))
    acc_dic['AdaBoost'] = cross_val_score(adaboost, features, label, cv=5).max()
#     gradient boosting
    gb = make_pipeline(preprocessing.StandardScaler(),
                        GradientBoostingClassifier(n_estimators=240, 
                                                   learning_rate=1.0,
                                                   max_depth=1, 
                                                   random_state=0))
    acc_dic['GradientBoosting'] = cross_val_score(gb, features, label, cv=5).max()    

    return acc_dic


def regression_next(features,label):
    mse_dic = {}
#     lasso
    alpha = 0.02
    lasso = make_pipeline(preprocessing.StandardScaler(), Lasso(alpha=alpha))
    mse_dic['Lasso'] = cross_val_score(lasso, features, label, scoring='r2', cv=5).max()
#     elasticnet
    elasticnet = make_pipeline(preprocessing.StandardScaler(), ElasticNet(alpha=alpha, l1_ratio=0.7))
    mse_dic['ElasticNet'] = cross_val_score(elasticnet, features, label, scoring='r2', cv=5).max()   
#     bayes regression
    bayesianRidge = make_pipeline(preprocessing.StandardScaler(), BayesianRidge())
    mse_dic['BayesianRidge'] = cross_val_score(bayesianRidge, features, label, scoring='r2', cv=5).max()     
#     svm regression
    svr = make_pipeline(preprocessing.StandardScaler(), svm.SVR(C=0.01))
    mse_dic['svm'] = cross_val_score(svr, features, label, scoring='r2', cv=5).max()   
#     KNN regression
    knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsRegressor(n_neighbors=60))
    mse_dic['KNN'] = cross_val_score(knn, features, label, scoring='r2', cv=5).max()     
#     decision tree   
    dt = make_pipeline(preprocessing.StandardScaler(), DecisionTreeRegressor(random_state=0,max_depth=2))
    mse_dic['decision_tree'] = cross_val_score(dt, features, label, scoring='r2', cv=5).max()
#     random forest
    rf = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(max_depth=5, random_state=0))
    mse_dic['random_forest'] = cross_val_score(rf, features, label, scoring='r2', cv=5).max()
#     adaboost
    adaboost = make_pipeline(preprocessing.StandardScaler(), AdaBoostRegressor(n_estimators=230))
    mse_dic['AdaBoost'] = cross_val_score(adaboost, features, label,scoring='r2', cv=5).max()
#     gradient boosting
    gb = make_pipeline(preprocessing.StandardScaler(),
                        GradientBoostingRegressor(n_estimators=240, 
                                                   learning_rate=1.0,
                                                   max_depth=1, 
                                                   random_state=0))
    mse_dic['GradientBoosting'] = cross_val_score(gb, features, label, scoring='r2', cv=5).max()   
#     MLP
    mlp = make_pipeline(preprocessing.StandardScaler(), MLPRegressor(hidden_layer_sizes=(10,),
                                       activation='logistic',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.002))
    mse_dic['MLP'] = cross_val_score(mlp, features, label, scoring='r2', cv=5).max()  
    
    return mse_dic



'\ninput: \ntraining: matrix of features, target: two or three target\nvalidate: matrix of features, target: two or three target\n\noutput: \nacc, auc, or others\n'

## 3. data

## soccer

In [3]:
def random_guess(sec_no,series):
    count = 0
    for i in series.index.tolist():
        if series.loc[i,'end'+sec_no] > series.loc[i,'draw_odds'+sec_no] and \
        series.loc[i,'end'+sec_no] > series.loc[i,'away_odds'+sec_no]:
            if int(series.loc[i,'home']) == 1:
                count +=1
        else:
            if int(series.loc[i,'home']) == 0:
                count +=1
    return count/len(unbiased)

def summary(sec_name,sec_no):
    results = pd.DataFrame()
    dic = predict_soccer_result(unbiased[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,unbiased)
    pred = pd.Series(dic)
    pred.name = 'unbiased'
    results = results.append(pred)

    dic = predict_soccer_result(hourly_corrected[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,hourly_corrected)
    pred = pd.Series(dic)
    pred.name = 'hourly_corrected'
    results = results.append(pred)
    
    dic = predict_soccer_result(daily_corrected[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,daily_corrected)
    pred = pd.Series(dic)
    pred.name = 'daily_corrected'
    results = results.append(pred)

    dic = predict_soccer_result(global_corrected[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,global_corrected)
    pred = pd.Series(dic) 
    pred.name = 'global_corrected'
    results = results.append(pred)
    return results


In [4]:
# soccer
game_name = 'soccer'

unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)
# all target home, draw , away
target = unbiased[['home','draw','away']]
target['target'] = (target['home']+target['draw']*2+target['away']*3).apply(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [96]:

# risk analysis
# high risk index / low risk index

risk_value = 'no_risk'
labels = target['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in soccer_features_name])
all_features.extend([i+'_2' for i in soccer_features_name])
all_features.extend([i+'_3' for i in soccer_features_name])
all_features.extend([i+'_all' for i in soccer_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


In [113]:
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased[['home','draw','away']]
target['target'] = (target['home']+target['draw']*2+target['away']*3).apply(int)


risk_value = 'high_risk'
high_risk = unbiased[((unbiased['start_all']<0.7) & (unbiased['start_all']>0.3))].index.tolist()
len(high_risk)
unbiased = unbiased.loc[high_risk]
hourly_corrected = hourly_corrected.loc[high_risk]
daily_corrected = daily_corrected.loc[high_risk]
global_corrected = global_corrected.loc[high_risk]
target = target.loc[high_risk]
labels = target['home'].apply(int).values
# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in soccer_features_name])
all_features.extend([i+'_2' for i in soccer_features_name])
all_features.extend([i+'_3' for i in soccer_features_name])
all_features.extend([i+'_all' for i in soccer_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


3180

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


In [114]:
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased[['home','draw','away']]
target['target'] = (target['home']+target['draw']*2+target['away']*3).apply(int)


risk_value = 'low_risk'
high_risk = unbiased[((unbiased['start_all']>=0.7) | (unbiased['start_all']<=0.3))].index.tolist()
len(high_risk)
unbiased = unbiased.loc[high_risk]
hourly_corrected = hourly_corrected.loc[high_risk]
daily_corrected = daily_corrected.loc[high_risk]
global_corrected = global_corrected.loc[high_risk]
target = target.loc[high_risk]
labels = target['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in soccer_features_name])
all_features.extend([i+'_2' for i in soccer_features_name])
all_features.extend([i+'_3' for i in soccer_features_name])
all_features.extend([i+'_all' for i in soccer_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


1115

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


## iceball

In [115]:
def random_guess(sec_no,series):
    count = 0
    for i in series.index.tolist():
        if series.loc[i,'end'+sec_no] >= 0.5:
            if int(series.loc[i,'home']) == 1:
                count +=1
        else:
            if int(series.loc[i,'home']) == 0:
                count +=1
    return count/len(unbiased)

def summary(sec_name,sec_no):
    results = pd.DataFrame()
    dic = predict_soccer_result(unbiased[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,unbiased)
    pred = pd.Series(dic)
    pred.name = 'unbiased'
    results = results.append(pred)

    dic = predict_soccer_result(hourly_corrected[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,hourly_corrected)
    pred = pd.Series(dic)
    pred.name = 'hourly_corrected'
    results = results.append(pred)
    
    dic = predict_soccer_result(daily_corrected[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,daily_corrected)
    pred = pd.Series(dic)
    pred.name = 'daily_corrected'
    results = results.append(pred)

    dic = predict_soccer_result(global_corrected[sec_name].values, labels)
    dic['odds_guess'] = random_guess(sec_no,global_corrected)
    pred = pd.Series(dic) 
    pred.name = 'global_corrected'
    results = results.append(pred)
    return results

In [118]:
game_name = 'iceball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased['home']

risk_value = 'low_risk'
high_risk = unbiased[((unbiased['start_all']>=0.7) | (unbiased['start_all']<=0.3))].index.tolist()
len(high_risk)
unbiased = unbiased.loc[high_risk]
hourly_corrected = hourly_corrected.loc[high_risk]
daily_corrected = daily_corrected.loc[high_risk]
global_corrected = global_corrected.loc[high_risk]
target = target.loc[high_risk]
labels = unbiased['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in games_features_name])
all_features.extend([i+'_2' for i in games_features_name])
all_features.extend([i+'_3' for i in games_features_name])
all_features.extend([i+'_all' for i in games_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

286

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


In [119]:
game_name = 'iceball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased['home']

risk_value = 'high_risk'
high_risk = unbiased[((unbiased['start_all']<0.7) & (unbiased['start_all']>0.3))].index.tolist()
len(high_risk)
unbiased = unbiased.loc[high_risk]
hourly_corrected = hourly_corrected.loc[high_risk]
daily_corrected = daily_corrected.loc[high_risk]
global_corrected = global_corrected.loc[high_risk]
target = target.loc[high_risk]
labels = unbiased['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in games_features_name])
all_features.extend([i+'_2' for i in games_features_name])
all_features.extend([i+'_3' for i in games_features_name])
all_features.extend([i+'_all' for i in games_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

6663

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


In [120]:
game_name = 'iceball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased['home']

risk_value = 'no_risk'

labels = unbiased['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in games_features_name])
all_features.extend([i+'_2' for i in games_features_name])
all_features.extend([i+'_3' for i in games_features_name])
all_features.extend([i+'_all' for i in games_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


## baseball

In [121]:
game_name = 'baseball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased['home']

risk_value = 'no_risk'

labels = unbiased['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in games_features_name])
all_features.extend([i+'_2' for i in games_features_name])
all_features.extend([i+'_3' for i in games_features_name])
all_features.extend([i+'_all' for i in games_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


In [122]:
game_name = 'baseball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased['home']

risk_value = 'high_risk'
high_risk = unbiased[((unbiased['start_all']<0.7) & (unbiased['start_all']>0.3))].index.tolist()
len(high_risk)
unbiased = unbiased.loc[high_risk]
hourly_corrected = hourly_corrected.loc[high_risk]
daily_corrected = daily_corrected.loc[high_risk]
global_corrected = global_corrected.loc[high_risk]
target = target.loc[high_risk]
labels = unbiased['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in games_features_name])
all_features.extend([i+'_2' for i in games_features_name])
all_features.extend([i+'_3' for i in games_features_name])
all_features.extend([i+'_all' for i in games_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

11743

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


In [123]:
game_name = 'baseball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# all target home, draw , away
target = unbiased['home']

risk_value = 'low_risk'
high_risk = unbiased[((unbiased['start_all']>=0.7) | (unbiased['start_all']<=0.3))].index.tolist()
len(high_risk)
unbiased = unbiased.loc[high_risk]
hourly_corrected = hourly_corrected.loc[high_risk]
daily_corrected = daily_corrected.loc[high_risk]
global_corrected = global_corrected.loc[high_risk]
target = target.loc[high_risk]
labels = unbiased['home'].apply(int).values

# sec1
print('sec1 to predict:')
tag = '_1'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# # sec3
print('sec3 to predict:')
tag = '_3'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

# all
print('all to predict:')
tag = '_all'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary(sec1_name,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'.csv')

print('all features to predict:')
tag = '_all'
all_features =[]
all_features.extend([i+'_1' for i in games_features_name])
all_features.extend([i+'_2' for i in games_features_name])
all_features.extend([i+'_3' for i in games_features_name])
all_features.extend([i+'_all' for i in games_features_name])
results_1 = summary(all_features,tag)
results_1.to_csv(result_path+'/home_win/'+game_name+'/'+risk_value+tag+'features.csv')

318

sec1 to predict:
sec2 to predict:
sec3 to predict:
all to predict:
all features to predict:


### predict direction

In [11]:
def direction_find(x):
    if x>0:
        return 1
    elif x==0:
        return 0
    else:
        return -1
def summary_direction_results(sec_name,sec_no,next_no):
    results = pd.DataFrame()
    target = (unbiased['end'+next_no] - unbiased['end'+sec_no]).apply(direction_find)
    dic = predict_soccer_result(unbiased[sec_name].values, target)
    pred = pd.Series(dic)
    pred.name = 'unbiased'
    results = results.append(pred)

    
#     target = (hourly_corrected['end'+next_no] - hourly_corrected['end'+sec_no]).apply(direction_find)
    dic = predict_soccer_result(hourly_corrected[sec_name].values, target)
    pred = pd.Series(dic)
    pred.name = 'hourly_corrected'
    results = results.append(pred)

#     target = (daily_corrected['end'+next_no] - daily_corrected['end'+sec_no]).apply(direction_find)
    dic = predict_soccer_result(daily_corrected[sec_name].values, target)
    pred = pd.Series(dic)
    pred.name = 'daily_corrected'
    results = results.append(pred)

#     target = (global_corrected['end'+next_no] - global_corrected['end'+sec_no]).apply(direction_find)
    dic = predict_soccer_result(global_corrected[sec_name].values, target)
    pred = pd.Series(dic) 
    pred.name = 'global_corrected'
    results = results.append(pred)
    return results

# soccer
game_name = 'soccer'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# sec1
print('sec1 to predict:')
tag = '_1'
sec_name = [i+tag for i in soccer_features_name]
results_1 = summary_direction_results(sec_name,tag,'_2')
results_1.to_csv(result_path+'/direction/'+game_name+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary_direction_results(sec_name,tag,'_3')
results_1.to_csv(result_path+'/direction/'+game_name+tag+'.csv')

sec1 to predict:
sec2 to predict:


In [12]:
# iceball
game_name = 'iceball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# sec1
print('sec1 to predict:')
tag = '_1'
sec_name = [i+tag for i in games_features_name]
results_1 = summary_direction_results(sec_name,tag,'_2')
results_1.to_csv(result_path+'/direction/'+game_name+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary_direction_results(sec_name,tag,'_3')
results_1.to_csv(result_path+'/direction/'+game_name+tag+'.csv')

sec1 to predict:
sec2 to predict:


In [13]:
# baseball
game_name = 'baseball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# sec1
print('sec1 to predict:')
tag = '_1'
sec_name = [i+tag for i in games_features_name]
results_1 = summary_direction_results(sec_name,tag,'_2')
results_1.to_csv(result_path+'/direction/'+game_name+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary_direction_results(sec_name,tag,'_3')
results_1.to_csv(result_path+'/direction/'+game_name+tag+'.csv')

sec1 to predict:
sec2 to predict:


### regression: next period price

In [59]:
def summary_price_results(sec_name,sec_no,next_no):
    results = pd.DataFrame()
    target = unbiased['end'+sec_no]
    dic = regression_next(unbiased[sec_name].values, target.values)
    pred = pd.Series(dic)
    pred.name = 'unbiased'
    results = results.append(pred)

    
#     target = (hourly_corrected['end'+next_no] - hourly_corrected['end'+sec_no]).apply(direction_find)
    dic = regression_next(hourly_corrected[sec_name].values, target.values)
    pred = pd.Series(dic)
    pred.name = 'hourly_corrected'
    results = results.append(pred)

#     target = (daily_corrected['end'+next_no] - daily_corrected['end'+sec_no]).apply(direction_find)
    dic = regression_next(daily_corrected[sec_name].values, target.values)
    pred = pd.Series(dic)
    pred.name = 'daily_corrected'
    results = results.append(pred)

#     target = (global_corrected['end'+next_no] - global_corrected['end'+sec_no]).apply(direction_find)
    dic = regression_next(global_corrected[sec_name].values, target.values)
    pred = pd.Series(dic) 
    pred.name = 'global_corrected'
    results = results.append(pred)
    return results

# soccer
game_name = 'soccer'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# sec1
print('sec1 to predict:')
tag = '_1'
sec_name = [i+tag for i in soccer_features_name]
results_1 = summary_price_results(sec_name,tag,'_2')
results_1.to_csv(result_path+'/price/'+game_name+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in soccer_features_name]
results_1 = summary_price_results(sec_name,tag,'_3')
results_1.to_csv(result_path+'/price/'+game_name+tag+'.csv')

sec1 to predict:
sec2 to predict:


In [None]:
game_name = 'iceball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# sec1
print('sec1 to predict:')
tag = '_1'
sec_name = [i+tag for i in games_features_name]
results_1 = summary_price_results(sec_name,tag,'_2')
results_1.to_csv(result_path+'/price/'+game_name+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary_price_results(sec_name,tag,'_3')
results_1.to_csv(result_path+'/price/'+game_name+tag+'.csv')

sec1 to predict:


In [None]:
game_name = 'baseball'
unbiased = pd.read_csv(mining_folder_path+'/'+game_name+'_unbiased.csv').iloc[:,1:].fillna(0)
hourly_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_hourly_corrected.csv').iloc[:,1:].fillna(0)
daily_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_daily_corrected.csv').iloc[:,1:].fillna(0)
global_corrected = pd.read_csv(mining_folder_path+'/'+game_name+'_global_corrected.csv').iloc[:,1:].fillna(0)

# sec1
print('sec1 to predict:')
tag = '_1'
sec_name = [i+tag for i in games_features_name]
results_1 = summary_price_results(sec_name,tag,'_2')
results_1.to_csv(result_path+'/price/'+game_name+tag+'.csv')

# sec2
print('sec2 to predict:')
tag = '_2'
sec1_name = [i+tag for i in games_features_name]
results_1 = summary_price_results(sec_name,tag,'_3')
results_1.to_csv(result_path+'/price/'+game_name+tag+'.csv')