In [17]:
from itertools import chain, combinations
import numpy as np # Fundamental package for scientific computing with Python
import pandas as pd
import calendar
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score
from lib.Utility import calculate_gainHAD, calculate_gain_O25
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Train the model
from sklearn.tree import DecisionTreeClassifier


def trainModel(x_train, y_train, reg=0.01):
    """Fit a liblinear logistic-regression classifier on the training data.

    Parameters
    ----------
    x_train : array-like or DataFrame
        Feature matrix, passed unchanged to ``LogisticRegression.fit``.
    y_train : array-like or Series
        Target labels aligned with ``x_train``.
    reg : float, default 0.01
        Regularisation knob. scikit-learn's ``C`` is set to ``1 / reg``, so
        the default gives ``C=100`` (weak regularisation). Must be non-zero.

    Returns
    -------
    LogisticRegression
        The fitted estimator (``fit`` returns the estimator itself).
    """
    # `multi_class='ovr'` is intentionally not passed: the argument is
    # deprecated in recent scikit-learn releases, and with the liblinear
    # solver the default ('auto') already resolves to one-vs-rest, so the
    # fitted model is the same without the deprecation warning.
    mod = LogisticRegression(C=1 / reg, solver="liblinear", random_state=42)
    return mod.fit(x_train, y_train)

In [19]:
def custom_predict(X, threshold, model):
    """Turn class-1 probabilities into 0/1 labels using a custom cut-off.

    ``model.predict_proba(X)`` is evaluated and its second column (index 1)
    is compared against ``threshold``. Rows whose value is strictly greater
    than the threshold become 1, all others 0.
    """
    second_column = model.predict_proba(X)[:, 1]
    exceeds_cutoff = second_column > threshold
    return exceeds_cutoff.astype(int)

In [20]:
# Empty results table that the sweep further down in this cell fills with one
# row per feature combination. These are the columns declared up front.
result_columns = [
    'DataSetName',
    # training-side metrics
    'TrainAccuracy', 'TrainPrecision', 'TrainF2', 'TrainRecall', 'TrainROC',
    # test-side metrics
    'TestAccuracy', 'TestPrecision', 'TestF2', 'TestRecall', 'TestROC',
    'BestParams',
]
dataframe = pd.DataFrame(columns=result_columns)

# --- Candidate feature pools --------------------------------------------------
# Three pools have been tried in this notebook. Previously all three were
# assigned to `features` one after another, so only the last one ever took
# effect. They are kept under separate names so the active choice is explicit
# and the alternatives remain available.

# Pool 1. Columns that were listed but commented out in this pool:
#   HomeGoalsCumulative, AwayGoalsCumulative, HomePointsCumulative,
#   AwayPointsCumulative, HomeGoalsConcededCumulative,
#   AwayGoalsConcededCumulative, AwayGoalGap, HomeGoalGap, AwayPointGap,
#   HomePointGap, B365D, B365H, B365A, B365>2.5, EloRatio, FormRatio,
#   RecentFormRatio, UltimoScontroDiretto, HomeWins, HomeDraws, HomeLosses,
#   AwayWins, AwayDraws, AwayLosses, HomeAvgGoalsScored, HomeAvgGoalsConceded,
#   HomeEwmaPoints, AwayAvgGoalsScored, AwayAvgGoalsConceded, AwayEwmaPoints
FEATURES_EWMA_FORM = [
    'PointsDifference',
    'ConcededGoalsDifference',
    'GoalsDifference',
    'AwayGoalsRatio',
    'HomeGoalsRatio',
    'HomeLast3Points',
    'HomeEwmaGoalsScored',
    'HomeEwmaGoalsConceded',
    'AwayLast3Points',
    'AwayEwmaGoalsScored',
    'AwayEwmaGoalsConceded',
]

# Pool 2.
FEATURES_DIFFERENCES = [
    'AwayWins', 'HomeWins',
    'PointsDifference', 'ConcededGoalsDifference',
    'HomeLast3Points', 'HomeEwmaGoalsScored', 'HomeEwmaGoalsConceded',
    'GoalsDifference', 'EloRatio',
    'Last3PointsDifference', 'GoalRatioDifference',
    'EwmaGoalsSum', 'GoalsSum',
]

# Pool 3 (the one in effect). 'PointsDifference' and 'EwmaGoalsSum' were
# commented out of this pool.
FEATURES_RECORD_AND_DIFFERENCES = [
    'AwayWins', 'HomeWins',
    'AwayDraws', 'AwayLosses',
    'HomeDraws', 'HomeLosses',
    'ConcededGoalsDifference', 'HomeLast3Points',
    'GoalsDifference', 'EloRatio',
    'Last3PointsDifference', 'GoalRatioDifference',
    'GoalsSum',
]

# The pool actually consumed by the rest of the cell.
features = list(FEATURES_RECORD_AND_DIFFERENCES)

# --- Feature combinations -----------------------------------------------------
# Every subset of `features` with at least MIN_COMBO_SIZE columns, ordered by
# size and then in itertools.combinations order. The count grows exponentially
# with the pool size: the 13 active features give 8100 combinations.
MIN_COMBO_SIZE = 3

combos = [
    combo
    for size in range(MIN_COMBO_SIZE, len(features) + 1)
    for combo in combinations(features, size)
]

# Load the merged match dataset from the Excel workbook.
data = pd.read_excel('data/mergedDataFull2.xlsx')

# Binary target: 1 when MatchGoal > 2, otherwise 0.
# NOTE(review): presumably MatchGoal is the total goals in the match, making
# this an "over 2.5 goals" label - confirm against the dataset.
# Alternative target tried earlier (draws):
#   np.where(data['FTHG'] == data['FTAG'], 1, 0)
data['Cluster'] = np.where(data['MatchGoal'] > 2, 1, 0)

# Keep only rows that have a value in 'B365>2.5'. The explicit .copy() makes
# the filtered frame independent, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
data = data[data['B365>2.5'].notna()].copy()

# Derived ratios (not part of the active feature pool). A zero denominator
# yields inf/NaN rather than an error.
data['FormRatio'] = data['HomeForm'] / data['AwayForm']
data['RecentFormRatio'] = data['HomeRecentHomeForm'] / data['AwayRecentAwayForm']

# Encode the 'UltimoScontroDiretto' column numerically. Values not in the
# mapping (including missing values) are left untouched, exactly as
# `.replace` did, but `.map` avoids the "Downcasting behavior in `replace`"
# FutureWarning that newer pandas versions emit for this line.
outcome_codes = {'H': 1, 'A': 2, 'D': 0}
data['UltimoScontroDiretto'] = data['UltimoScontroDiretto'].map(
    lambda outcome: outcome_codes.get(outcome, outcome))

# Rows missing any of these columns are dropped. Further candidates that were
# listed but commented out: HomeWins/HomeDraws/HomeLosses,
# AwayWins/AwayDraws/AwayLosses, Home/AwayAvgGoalsScored,
# Home/AwayAvgGoalsConceded, Home/AwayEwmaPoints, Home/AwayEwmaGoalsScored,
# Home/AwayEwmaGoalsConceded, UltimoScontroDiretto.
columns_to_check = [
    'HomeLast3Points',
    'AwayLast3Points',
]

data = data.dropna(subset=columns_to_check)
# for threshold in range(5, 70, 5):
# threshold /= 100

#  print("Extracting features columns and creating target variable...")

import os

from sklearn.metrics import classification_report

print('Length of combos', len(combos))

# One plain dict per feature combination. The table is built once after the
# loop instead of growing `dataframe` with pd.concat on every iteration, which
# re-copies all previous rows each time.
records = []

for idx, combo in enumerate(combos):
    feat_cols = list(combo)

    # Same split for every combination (fixed random_state), 70/30.
    x_train, x_test, y_train, y_test = train_test_split(
        data[feat_cols], data['Cluster'], test_size=0.3, random_state=42, shuffle=True)

    # The split shuffles rows; restore index order so features, targets and
    # the rows pulled from `data` below all line up position by position.
    x_train, x_test = x_train.sort_index(), x_test.sort_index()
    y_train, y_test = y_train.sort_index(), y_test.sort_index()

    model = trainModel(x_train, y_train)

    # Predictions (each computed once).
    y_train_pred = model.predict(x_train)

    # Compute the equity on the test rows.
    # Selecting by label keeps the rows in the same order as x_test, so the
    # predictions are attached to the right matches.
    test_data = data.loc[x_test.index].copy()
    test_data['prediction'] = model.predict(x_test)
    # calculate_gain_O25 is a project helper applied row by row; its rules
    # are defined in lib.Utility and are not visible here.
    test_data['gain'] = test_data.apply(calculate_gain_O25, axis=1)
    test_equity = test_data['gain'].sum()

    # Metrics. Note the differing scales, kept for compatibility with earlier
    # result files: TrainAccuracy is a fraction, TestAccuracy a percentage
    # rounded to two decimals.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = round(
        100 * accuracy_score(y_test, test_data['prediction']), 2)
    print(idx, time.strftime("%H:%M:%S", time.localtime()), test_accuracy, 'Trying with following combo', combo)

    # Get the classification report and read the per-class precision.
    # A class that is missing from the report yields NaN instead of reusing
    # the value from a previous iteration (or raising NameError).
    report = classification_report(y_test, test_data['prediction'], output_dict=True)
    precision0 = report.get('0', {}).get('precision', np.nan)
    precision1 = report.get('1', {}).get('precision', np.nan)

    # Save results.
    records.append({
        'DataSetName': str(combo),
        'TrainAccuracy': train_accuracy,
        'Precision0': precision0,
        'Precision1': precision1,
        'TestAccuracy': test_accuracy,
        # Call get_params() (previously the bound method itself was stored)
        # and stringify the dict so it is written to Excel as readable text.
        'BestParams': str(model.get_params()),
        'TestEquity': test_equity,
    })

results = pd.DataFrame(records)
if dataframe.empty:
    # Keep the predeclared columns first, followed by the columns that only
    # appear in the collected records.
    ordered_columns = list(dataframe.columns) + [
        col for col in results.columns if col not in dataframe.columns]
    dataframe = results.reindex(columns=ordered_columns)
else:
    dataframe = pd.concat([dataframe, results], ignore_index=True)

# Timestamp (UTC epoch seconds) used to give each export a unique file name.
time_stamp = calendar.timegm(time.gmtime())
# Make sure the output folder exists so a long sweep is not lost at the end.
os.makedirs("Dataframe", exist_ok=True)
dataframe.to_excel("Dataframe/"+str(time_stamp) + "all.xlsx")

# #  print(result)

  data['UltimoScontroDiretto'] = data['UltimoScontroDiretto'].replace({'H': 1, 'A': 2, 'D': 0})


Length of combos 8100
0 19:07:12 50.34 Trying with following combo ('AwayWins', 'HomeWins', 'AwayDraws')
1 19:07:12 50.34 Trying with following combo ('AwayWins', 'HomeWins', 'AwayLosses')
2 19:07:12 50.34 Trying with following combo ('AwayWins', 'HomeWins', 'HomeDraws')
3 19:07:12 50.34 Trying with following combo ('AwayWins', 'HomeWins', 'HomeLosses')
4 19:07:12 52.59 Trying with following combo ('AwayWins', 'HomeWins', 'ConcededGoalsDifference')
5 19:07:12 50.57 Trying with following combo ('AwayWins', 'HomeWins', 'HomeLast3Points')
6 19:07:13 50.72 Trying with following combo ('AwayWins', 'HomeWins', 'GoalsDifference')
7 19:07:13 51.05 Trying with following combo ('AwayWins', 'HomeWins', 'EloRatio')
8 19:07:13 50.67 Trying with following combo ('AwayWins', 'HomeWins', 'Last3PointsDifference')
9 19:07:13 50.38 Trying with following combo ('AwayWins', 'HomeWins', 'GoalRatioDifference')
10 19:07:13 51.96 Trying with following combo ('AwayWins', 'HomeWins', 'GoalsSum')
11 19:07:13 50.3

In [21]:
# dataframe.to_excel("Dataframe/"+str(time_stamp) + "all.xlsx")