In [1]:
from itertools import chain, combinations
import numpy as np # Fundamental package for scientific computing with Python
import pandas as pd
import calendar
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score
from lib.Utility import calculate_gainHAD, calculate_gain_O25
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Train the model
from sklearn.tree import DecisionTreeClassifier
def trainModel(x_train, y_train):
    # print(X_train)
    # Set regularization rate
    reg = 0.01
    # train a logistic regression model on the training set
    # mod = DecisionTreeClassifier(random_state=42
    #                             , min_samples_split=100
    #                             , class_weight={1: 1, 0: 1, 2: 1}
    #                             # , class_weight='balanced'
    #                             , max_depth=4  # , ccp_alpha= 0.00001
    #                             # , max_features= 3
    #                             ).fit(x_train, y_train)
    mod = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

    return mod



In [3]:
def custom_predict(X, threshold, model):
    probs = model.predict_proba(X) 
    return (probs[:, 1] > threshold).astype(int)
# Definisci una funzione che applica la logica descritta
def calculate_gain(row, threshold,quota_minima):
    if(row['predictions'] > threshold):
        if row['Cluster'] == row['predictions']:
            if row['predictions'] == 1 and row['B365H'] > quota_minima:
                return row['B365H']-1
            elif row['predictions'] == 2 and row['B365A'] > quota_minima:
                return row['B365A']-1
            elif row['predictions'] == 0 and row['B365D'] > quota_minima:
                return row['B365D']-1
        else:
            if((row['predictions'] == 0 and row['B365D'] > quota_minima) or (row['predictions'] == 2 and row['B365A'] > quota_minima) or (row['predictions'] == 1 and row['B365H'] > quota_minima)):
                return -1

In [4]:
dataframe = pd.DataFrame(columns=['DataSetName', 'TrainAccuracy', 'TrainPrecision', 'TrainF2', 'TrainRecall',
                                  'TrainROC', 'TestAccuracy', 'TestPrecision', 'TestF2', 'TestRecall', 'TestROC', 'BestParams'])

features = [
    # 'HomeGoalsCumulative',	'AwayGoalsCumulative',	'HomePointsCumulative',
    #         'AwayPointsCumulative',	'HomeGoalsConcededCumulative',	'AwayGoalsConcededCumulative',
             'PointsDifference'
            , 'ConcededGoalsDifference'
            , 'GoalsDifference'
            , 'AwayGoalsRatio'
            , 'HomeGoalsRatio'
            , 'B365H'
            ,	'B365D'
            ,	'B365A'
            ,'AwayGoalGap'
            , 'HomeGoalGap'
            , 'AwayPointGap'
            , 'HomePointGap'
            ]

combinazioni = chain.from_iterable(combinations(
    features, r) for r in range(1, len(features) + 1))

combos = []

for idx, combinazione in enumerate(combinazioni):
    current_GMT = time.gmtime()
    time_stamp = calendar.timegm(current_GMT)

    print(idx, time.strftime("%H:%M:%S", time.localtime()), ':', combinazione)
    combos.append(combinazione)

print(combos)

data = pd.read_excel('data/mergedDataFull2.xlsx')
# data['Cluster'] = np.where(data['FTHG'] == data['FTAG'], 1, 0)
data['Cluster'] = data['FTR'].map({'D': 0, 'H': 1, 'A': 2}) #np.where(data['FTHG'] == data['FTAG'], 1, 0)
data = data[data['B365H'].notna()]
data = data[data['B365A'].notna()]
data = data[data['B365D'].notna()]

# for threshold in range(5, 70, 5):
    # threshold /= 100
    
print("Extracting features columns and creating target variable...")

for idx, combo in enumerate(combos):
    current_GMT = time.gmtime()
    time_stamp = calendar.timegm(current_GMT)
    print(idx, time.strftime("%H:%M:%S", time.localtime()),
            'Trying with following combo', combo)
    feat_cols = [col for col in combo]

    x_train, x_test, y_train, y_test = train_test_split(
        data[feat_cols], data['Cluster'], test_size=0.3, random_state=42, shuffle=True)

    x_train = x_train.sort_index()
    x_test = x_test.sort_index()
    y_train = y_train.sort_index()
    y_test = y_test.sort_index()


    model = trainModel(x_train, y_train)

    # Predictions
    y_train_pred = model.predict(x_train.sort_index())
    # y_test_pred = model.predict(x_test.sort_index())
    text_pred_prob = model.predict_proba(x_test)
    # print('y_test_pred', y_test_pred)
    # clf.fit(x_train,y_train)

    # y_train_pred = clf.predict(x_train)
    # y_test_pred = clf.predict(x_test)

    # VN Calcolo la equity

    test_data = data[data.index.isin(x_test.index)]
    test_data = test_data.copy()
    # test_data['prediction'] = y_test_pred
    test_data['predictions'] = model.predict(x_test.sort_index())
    # print('test_data[predictions]', test_data['predictions'])
    # test_data['gain'] = np.where((test_data['FTR'] == 'D') & (test_data['predictions'] > threshold),
    #                                 test_data['B365D']-1, np.where((test_data['FTR'] != 'D') & (test_data['predictions'] > threshold), -1, 0))
    test_data['gain'] = test_data.apply(calculate_gainHAD, axis=1, args=('',))

    test_equity = test_data['gain'].sum()
    # print('test_data',test_data)

    print("Train metrics...")
    # Metrics
    # train_accuracy = accuracy_score(y_train, y_train_pred)
    # train_precision = precision_score(y_train, y_train_pred)

    print("Test metrics...")
    test_accuracy = round(100*accuracy_score(y_test, test_data['predictions']), 2)
    # test_precision = precision_score(y_test, y_test_pred)
    # test_f1 = f1_score(y_test, y_test_pred)

    # Save Results
    new_row = pd.Series({
        'DataSetName': str(combo),
        # 'TrainAccuracy': train_accuracy,
        # 'TrainPrecision': train_precision,
        'TestAccuracy': test_accuracy,
        # 'TestPrecision': test_precision,
        # 'TestF1': test_f1,
        'BestParams': model.get_params,
        'TestEquity': test_equity,
        # 'TestCagr': test_cagr,
        # 'TestMaxDD': test_max_dd,
        # 'Eseguiti': numOp
    })
    print(new_row)
    dataframe = pd.concat(
        [dataframe, new_row.to_frame().T], ignore_index=True)

dataframe.to_excel("Dataframe/"+str(time_stamp) + "all.xlsx")

# print(result)

0 18:08:42 : ('PointsDifference',)
1 18:08:42 : ('ConcededGoalsDifference',)
2 18:08:42 : ('GoalsDifference',)
3 18:08:42 : ('AwayGoalsRatio',)
4 18:08:42 : ('HomeGoalsRatio',)
5 18:08:42 : ('B365H',)
6 18:08:42 : ('B365D',)
7 18:08:42 : ('B365A',)
8 18:08:42 : ('AwayGoalGap',)
9 18:08:42 : ('HomeGoalGap',)
10 18:08:42 : ('AwayPointGap',)
11 18:08:42 : ('HomePointGap',)
12 18:08:42 : ('PointsDifference', 'ConcededGoalsDifference')
13 18:08:42 : ('PointsDifference', 'GoalsDifference')
14 18:08:42 : ('PointsDifference', 'AwayGoalsRatio')
15 18:08:42 : ('PointsDifference', 'HomeGoalsRatio')
16 18:08:42 : ('PointsDifference', 'B365H')
17 18:08:42 : ('PointsDifference', 'B365D')
18 18:08:42 : ('PointsDifference', 'B365A')
19 18:08:42 : ('PointsDifference', 'AwayGoalGap')
20 18:08:42 : ('PointsDifference', 'HomeGoalGap')
21 18:08:42 : ('PointsDifference', 'AwayPointGap')
22 18:08:42 : ('PointsDifference', 'HomePointGap')
23 18:08:42 : ('ConcededGoalsDifference', 'GoalsDifference')
24 18:08:4