In [1]:
import numpy as np
import pandas as pd
from timeit import default_timer as timer
from multiprocessing import Pool
import sys
import os

# Make the sibling ``utilities`` directory importable so the project-local
# feature-creation module can be loaded by the import that follows.
# The directory is built with os.path.join (correct separator on every OS) and
# the membership test checks the very path that gets appended, so re-running
# the cell never adds a duplicate entry and never skips the append by mistake.
module_path = os.path.abspath(os.path.join('..'))
utilities_path = os.path.join(module_path, "utilities")
if utilities_path not in sys.path:
    sys.path.append(utilities_path)
    
from Oliver_four_factors_features_creation import features_creation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the preprocessed match table and convert 'Match Date' to datetimes so
# the date comparisons used further down operate on real timestamps.
data = (
    pd.read_csv('../data/preprocess_data.csv', sep=',')
    .assign(**{'Match Date': lambda frame: pd.to_datetime(frame['Match Date'])})
)

In [3]:
# Preview the first five rows of the loaded match table.
data.head(n=5)

Unnamed: 0,Tournament,Home Team,Away Team,Match Date,Winner Team,Winner Home Or Away,Phase,Final Score,Extra Periods,Team,...,opptSTL/TO,oppt_Game_Score,Points difference,Finals,Last 32,Playoffs,Quarter-Finals,Regular Season,Semifinals,Top 16
0,Basket League,KAOD,Aris,2013-10-12,Aris,Away,Regular Season,58-66,0.0,KAOD,...,0.571429,39.8,-8.0,0,0,0,0,1,0,0
1,Basket League,PAOK,Kolossos Rhodes,2013-10-12,PAOK,Home,Regular Season,88-81,0.0,PAOK,...,0.444444,46.3,7.0,0,0,0,0,1,0,0
2,Liga ACB,Bilbao,Zaragoza,2013-10-12,Zaragoza,Away,Regular Season,77-86,0.0,Bilbao,...,1.0,70.0,-9.0,0,0,0,0,1,0,0
3,Basket League,Trikala,Kifisia,2013-10-12,Kifisia,Away,Regular Season,64-69,0.0,Trikala,...,0.25,40.0,-5.0,0,0,0,0,1,0,0
4,Liga ACB,Tenerife,Obradoiro,2013-10-12,Tenerife,Home,Regular Season,76-74,0.0,Tenerife,...,0.571429,48.6,2.0,0,0,0,0,1,0,0


### Oliver’s Four Factors, measured for both offense and defense, give eight factors overall:


* Offensive Factors
    - Effective Field Goal Percentage
    - Turnover Percentage
    - Offensive Rebound Percentage
    - Free Throw Rate
* Defensive Factors
    - Opponent’s Effective Field Goal Percentage
    - Opponent’s Turnover Percentage
    - Opponent’s Offensive Rebound Percentage
    - Opponent’s Free Throw Rate


# Greek Basket League

In [4]:
# Build the feature table for the Greek "Basket League" with a pool of six
# worker processes, and report how long the step took.
# NOTE(review): features_creation is a project-local helper; presumably `Date`
# is the earliest match date for which features are produced -- confirm.
start = timer()
# Standard multiprocessing guard: only the main module creates a pool.
if __name__ == '__main__':
    pool = Pool(processes=6)
    try:
        greece_features = features_creation().run_features_creations(
            pool=pool,
            Data_Frame=data,
            Tournament="Basket League",
            Date="2014-8-01",
        )
        # Success path: stop accepting new tasks before the teardown below.
        pool.close()
    finally:
        # Always reap the workers, even when feature creation raised, so a
        # failed run does not leave orphaned processes behind.
        pool.terminate()
        pool.join()
end = timer()
# default_timer returns seconds, so the printed value is elapsed minutes.
print("time:")
print((end - start) / 60)

time:
0.0738585


In [5]:
# Preview the first five rows of the Greek league feature table.
greece_features.head(n=5)

Unnamed: 0,home_EFG%,home_TO%,home_OREB%,home_FTRate,away_EFG%,away_TO%,away_OREB%,away_FTRate,Team Result
0,51.491269,15.333681,22.593132,0.736295,47.679526,15.005399,24.957344,0.714559,1.0
1,53.392916,15.384917,27.523827,0.661094,43.518186,14.525156,30.296659,0.640294,1.0
2,45.769466,21.686776,26.930714,0.687929,47.232598,18.759117,26.430274,0.732545,0.0
3,53.738395,15.286959,29.167086,0.667636,51.927985,17.605095,18.297129,0.750579,1.0
4,44.959301,13.11887,24.974233,0.729764,48.750853,17.421278,31.623343,0.669728,0.0


In [6]:
# Predictors are every column except the last; the target is the last column.
x_greece, y_greece = greece_features.iloc[:, :-1], greece_features.iloc[:, -1]

# Raw "Basket League" rows dated after 2014-8-01; used below to locate the
# train/test boundary.
x1_greece = data.loc[
    lambda df: (df['Tournament'] == "Basket League") & (df['Match Date'] > "2014-8-01")
]

In [7]:
# Correlation of every column with 'Team Result', listed from the most
# positive to the most negative.
corr_df = (
    greece_features.corr()['Team Result']
    .reset_index()
    .sort_values(by=['Team Result'], ascending=False)
)
corr_df

Unnamed: 0,index,Team Result
8,Team Result,1.0
0,home_EFG%,0.152342
2,home_OREB%,0.129894
5,away_TO%,0.112521
3,home_FTRate,-0.014841
7,away_FTRate,-0.096302
1,home_TO%,-0.143397
6,away_OREB%,-0.188759
4,away_EFG%,-0.241093


In [8]:
# Chronological hold-out: the first n rows train the model, the rest test it,
# where n is the number of filtered raw rows dated before 2017-8-1.
# NOTE(review): this assumes greece_features rows line up one-to-one and in
# the same order with x1_greece -- confirm against features_creation.
n_train_greece = len(x1_greece[x1_greece["Match Date"] < "2017-8-1"])
x_train_greece, x_test_greece = x_greece[:n_train_greece], x_greece[n_train_greece:]
y_train_greece, y_test_greece = (
    y_greece.iloc[:n_train_greece],
    y_greece.iloc[n_train_greece:],
)

In [11]:
# Fit a logistic regression (fixed random_state, iteration cap raised to
# 10000) on the Greek training rows and score it on the held-out rows.
lgr = LogisticRegression(random_state=1, max_iter=10000).fit(x_train_greece, y_train_greece)
y_pre_lgr = lgr.predict(x_test_greece)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Test Accuracy:", round(accuracy_score(y_test_greece, y_pre_lgr), 3))


Test Accuracy: 0.75


# Liga ACB

In [12]:
# Build the feature table for the Spanish "Liga ACB" with a pool of six
# worker processes, and report how long the step took.
# NOTE(review): features_creation is a project-local helper; presumably `Date`
# is the earliest match date for which features are produced -- confirm.
start = timer()
# Standard multiprocessing guard: only the main module creates a pool.
if __name__ == '__main__':
    pool = Pool(processes=6)
    try:
        spain_features = features_creation().run_features_creations(
            pool=pool,
            Data_Frame=data,
            Tournament="Liga ACB",
            Date="2014-8-01",
        )
        # Success path: stop accepting new tasks before the teardown below.
        pool.close()
    finally:
        # Always reap the workers, even when feature creation raised, so a
        # failed run does not leave orphaned processes behind.
        pool.terminate()
        pool.join()
end = timer()
# default_timer returns seconds, so the printed value is elapsed minutes.
print("time:")
print((end - start) / 60)

time:
0.10089339


In [13]:
# Predictors are every column except the last; the target is the last column.
x_spain, y_spain = spain_features.iloc[:, :-1], spain_features.iloc[:, -1]

# Raw "Liga ACB" rows dated after 2014-8-01; used below to locate the
# train/test boundary.
x1_spain = data.loc[
    lambda df: (df['Tournament'] == "Liga ACB") & (df['Match Date'] > "2014-8-01")
]

In [15]:
# Correlation of every column with 'Team Result', listed from the most
# positive to the most negative.
corr_df = (
    spain_features.corr()['Team Result']
    .reset_index()
    .sort_values(by=['Team Result'], ascending=False)
)
corr_df

Unnamed: 0,index,Team Result
8,Team Result,1.0
0,home_EFG%,0.199799
5,away_TO%,0.109327
2,home_OREB%,0.108891
3,home_FTRate,0.072428
7,away_FTRate,-0.036979
6,away_OREB%,-0.085816
1,home_TO%,-0.097952
4,away_EFG%,-0.175237


In [16]:
# Chronological hold-out: the first n rows train the model, the rest test it,
# where n is the number of filtered raw rows dated before 2017-8-1.
# NOTE(review): this assumes spain_features rows line up one-to-one and in
# the same order with x1_spain -- confirm against features_creation.
n_train_spain = len(x1_spain[x1_spain["Match Date"] < "2017-8-1"])
x_train_spain, x_test_spain = x_spain[:n_train_spain], x_spain[n_train_spain:]
y_train_spain, y_test_spain = (
    y_spain.iloc[:n_train_spain],
    y_spain.iloc[n_train_spain:],
)

In [17]:
# Fit a logistic regression (fixed random_state, iteration cap raised to
# 10000) on the Spanish training rows and score it on the held-out rows.
lgr = LogisticRegression(random_state=1, max_iter=10000).fit(x_train_spain, y_train_spain)
y_pre_lgr = lgr.predict(x_test_spain)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Test Accuracy:", round(accuracy_score(y_test_spain, y_pre_lgr), 3))

Test Accuracy: 0.645


# Euroleague

In [18]:
# Build the feature table for the "Euroleague" with a pool of six worker
# processes, and report how long the step took.
# NOTE(review): features_creation is a project-local helper; presumably `Date`
# is the earliest match date for which features are produced -- confirm.
start = timer()
# Standard multiprocessing guard: only the main module creates a pool.
if __name__ == '__main__':
    pool = Pool(processes=6)
    try:
        euroleague_features = features_creation().run_features_creations(
            pool=pool,
            Data_Frame=data,
            Tournament="Euroleague",
            Date="2014-8-01",
        )
        # Success path: stop accepting new tasks before the teardown below.
        pool.close()
    finally:
        # Always reap the workers, even when feature creation raised, so a
        # failed run does not leave orphaned processes behind.
        pool.terminate()
        pool.join()
end = timer()
# default_timer returns seconds, so the printed value is elapsed minutes.
print("time:")
print((end - start) / 60)

time:
0.08436802333333351


In [19]:
# Predictors are every column except the last; the target is the last column.
x_el, y_el = euroleague_features.iloc[:, :-1], euroleague_features.iloc[:, -1]

# Raw "Euroleague" rows dated after 2014-8-01; used below to locate the
# train/test boundary.
x1_el = data.loc[
    lambda df: (df['Tournament'] == "Euroleague") & (df['Match Date'] > "2014-8-01")
]

In [20]:
# Correlation of every column with 'Team Result', listed from the most
# positive to the most negative.
corr_df = (
    euroleague_features.corr()['Team Result']
    .reset_index()
    .sort_values(by=['Team Result'], ascending=False)
)
corr_df

Unnamed: 0,index,Team Result
8,Team Result,1.0
0,home_EFG%,0.094286
3,home_FTRate,0.033476
2,home_OREB%,0.020997
1,home_TO%,-0.020392
5,away_TO%,-0.031722
7,away_FTRate,-0.058639
6,away_OREB%,-0.081041
4,away_EFG%,-0.162858


In [21]:
# Chronological hold-out: the first n rows train the model, the rest test it,
# where n is the number of filtered raw rows dated before 2017-8-1.
# NOTE(review): this assumes euroleague_features rows line up one-to-one and
# in the same order with x1_el -- confirm against features_creation.
n_train_el = len(x1_el[x1_el["Match Date"] < "2017-8-1"])
x_train_el, x_test_el = x_el[:n_train_el], x_el[n_train_el:]
y_train_el, y_test_el = (
    y_el.iloc[:n_train_el],
    y_el.iloc[n_train_el:],
)

In [22]:
# Fit a logistic regression (fixed random_state, iteration cap raised to
# 10000) on the Euroleague training rows and score it on the held-out rows.
lgr = LogisticRegression(random_state=1, max_iter=10000).fit(x_train_el, y_train_el)
y_pre_lgr = lgr.predict(x_test_el)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Test Accuracy:", round(accuracy_score(y_test_el, y_pre_lgr), 3))

Test Accuracy: 0.627


# Eurocup

In [23]:
# Build the feature table for the "Eurocup" with a pool of six worker
# processes, and report how long the step took.
# NOTE(review): features_creation is a project-local helper; presumably `Date`
# is the earliest match date for which features are produced -- confirm.
start = timer()
# Standard multiprocessing guard: only the main module creates a pool.
if __name__ == '__main__':
    pool = Pool(processes=6)
    try:
        eurocup_features = features_creation().run_features_creations(
            pool=pool,
            Data_Frame=data,
            Tournament="Eurocup",
            Date="2014-8-01",
        )
        # Success path: stop accepting new tasks before the teardown below.
        pool.close()
    finally:
        # Always reap the workers, even when feature creation raised, so a
        # failed run does not leave orphaned processes behind.
        pool.terminate()
        pool.join()
end = timer()
# default_timer returns seconds, so the printed value is elapsed minutes.
print("time:")
print((end - start) / 60)

time:
0.07766827166666654


In [24]:
# Predictors are every column except the last; the target is the last column.
x_ec, y_ec = eurocup_features.iloc[:, :-1], eurocup_features.iloc[:, -1]

# Raw "Eurocup" rows dated after 2014-8-01; used below to locate the
# train/test boundary.
x1_ec = data.loc[
    lambda df: (df['Tournament'] == "Eurocup") & (df['Match Date'] > "2014-8-01")
]

In [25]:
# Correlation of every column with 'Team Result', listed from the most
# positive to the most negative.
corr_df = (
    eurocup_features.corr()['Team Result']
    .reset_index()
    .sort_values(by=['Team Result'], ascending=False)
)
corr_df

Unnamed: 0,index,Team Result
8,Team Result,1.0
5,away_TO%,0.026446
2,home_OREB%,0.016191
4,away_EFG%,-0.031412
7,away_FTRate,-0.035486
0,home_EFG%,-0.036308
3,home_FTRate,-0.041624
6,away_OREB%,-0.108612
1,home_TO%,-0.111494


In [26]:
# Chronological hold-out: the first n rows train the model, the rest test it,
# where n is the number of filtered raw rows dated before 2017-8-1.
# NOTE(review): this assumes eurocup_features rows line up one-to-one and in
# the same order with x1_ec -- confirm against features_creation.
n_train_ec = len(x1_ec[x1_ec["Match Date"] < "2017-8-1"])
x_train_ec, x_test_ec = x_ec[:n_train_ec], x_ec[n_train_ec:]
y_train_ec, y_test_ec = (
    y_ec.iloc[:n_train_ec],
    y_ec.iloc[n_train_ec:],
)

In [27]:
# Fit a logistic regression (fixed random_state, iteration cap raised to
# 10000) on the Eurocup training rows and score it on the held-out rows.
lgr = LogisticRegression(random_state=1, max_iter=10000).fit(x_train_ec, y_train_ec)
y_pre_lgr = lgr.predict(x_test_ec)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Test Accuracy:", round(accuracy_score(y_test_ec, y_pre_lgr), 3))

Test Accuracy: 0.62
