In [1]:
import argparse
import os

import wandb
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from evaluation.metrics import evaluate
from tqdm import tqdm
import torch
from util import ThresholdClassifier

# Single Step With Prior

In [8]:
from sklearn.metrics import f1_score


def eval_single_step(model_name, with_prior=True):
    model_path = os.path.join('../storage', model_name)
    if not os.path.exists(model_path):
        return None

    test_set = pd.read_csv(os.path.join(model_path, 'test.csv'))
    dev_set = pd.read_csv(os.path.join(model_path, 'val.csv'))
    if not with_prior:
        if 'fst_step_pred' in test_set.columns:
            test_set['pred'] = test_set['fst_step_pred']
            dev_set['pred'] = dev_set['fst_step_pred']
        else:
            # if not within the range of [0, 1], then use logit to transform
            if test_set['fst_step_scores'].max() > 1 or test_set['fst_step_scores'].min() < 0:
                import math
                def sigmoid(x):
                    return 1 / (1 + math.exp(-x))
                test_set['fst_step_scores'] = test_set['fst_step_scores'].apply(sigmoid)

            if dev_set['fst_step_scores'].max() > 1 or dev_set['fst_step_scores'].min() < 0:
                dev_set['fst_step_scores'] = dev_set['fst_step_scores'].apply(sigmoid)
    
    test_set['pred_proba'] = test_set['fst_step_scores']
    dev_set['pred_proba'] = dev_set['fst_step_scores']
    eval_dict = evaluate(test_set, with_prior=with_prior, dev_set=dev_set)
    result_df = pd.DataFrame(eval_dict, index=[0])
    # no column limit
    pd.set_option('display.max_columns', None)
    # print(result_df)
    # upload to wandb
    # wandb.log(eval_dict)
    return eval_dict

def train_and_test_two_step(model_name, predict_model_name, with_prior=False, baseline=False):
    model_path = os.path.join('../storage', model_name)
    train_set = pd.read_csv(os.path.join(model_path, 'train.csv'))
    val_set = pd.read_csv(os.path.join(model_path, 'val.csv'))
    test_set = pd.read_csv(os.path.join(model_path, 'test.csv'))
    
    # wandb.init(project="Risky-Trader-Prediction",
    #             entity="uoe-turing",
    #             name="Result-{}-{}".format(model_name, predict_model_name),
    #             tags=['predict', 'two-step'],
    #             )
    if not baseline:
        variables = ['PerFTSE20', 'AVGPTS3_20', 'SharpeRatio20', 'DurationRate20',
                     'ProfitRate20', 'WinTradeRate20', 'ProfitxDur20', 'PassAvgReturn',
                     'AvgShortSales20', 'TradFQ20', 'Period', 'accountid',
                     'NumTrades', 'AvgOpen20', 'DurationRatio20', 'OrderCloseRate20', 'fst_step_scores']
    else:
        variables = ['PerFTSE20', 'AVGPTS3_20', 'SharpeRatio20', 'DurationRate20',
                     'ProfitRate20', 'WinTradeRate20', 'ProfitxDur20', 'PassAvgReturn',
                     'AvgShortSales20', 'TradFQ20', 'Period', 'accountid',
                     'NumTrades', 'AvgOpen20', 'DurationRatio20', 'OrderCloseRate20']

    X_train = train_set[variables]
    y_train = train_set["anomaly"]

    if predict_model_name == 'rf':
        model = RandomForestClassifier(n_jobs=-1, max_depth=7, verbose=0).fit(X_train, y_train)
    elif predict_model_name == 'xgb':
        model = XGBClassifier(n_estimators=1000).fit(X_train, y_train, verbose=False)
    elif predict_model_name == 'svm':
        from sklearn.svm import SVC
        model = SVC(probability=True).fit(X_train, y_train)
        # feature_important = model.get_booster().get_score(importance_type='weight')
        # keys = list(feature_important.keys())
        # values = list(feature_important.values())
        # 
        # data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)
        # data.nlargest(40, columns="score").plot(kind='barh', figsize = (20,10)) ## plot top 40 features
        
    elif predict_model_name == 'lgbm':
        # disable log for lgbm
        model = LGBMClassifier(verbose=-1, n_estimators=1000).fit(X_train, y_train)
    else:
        raise Exception('Model not implemented')
    
    # get feature importance for xgb
    feature_important = model.feature_importances_
    keys = variables
    values = feature_important
    data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)
    data.nlargest(40, columns="score").plot(kind='barh', figsize = (20,10)) ## plot top 40 features
    plt.ylabel('Features')
    plt.xlabel('Feature Importance Score')
    model_name_map = {'rf': 'Random Forest', 'xgb': 'XGBoost Classifier (10^3)', 'lgbm': 'LightGBM Classifier (10^3)', 'svm': 'SVM'}
    plt.title(model_name_map[predict_model_name])
    # plt.show()
    plt.savefig(f'{predict_model_name}_feature_importance.pdf')
    print(data)
    X_test = test_set[variables]
    
    try:
        y_proba = model.predict_proba(X_test).iloc[:,1].values
    except:
        y_proba = model.predict_proba(X_test)[:, 1]
    
    def find_optimal_threshold(y_true, y_proba):
        best_threshold = 0
        best_f1 = 0
        # grid search from 0.1 to 1 with step 0.1
        for threshold in [i / 10.0 for i in range(1, 10)]:
            y_pred = [1 if proba >= threshold else 0 for proba in y_proba]
            f1 = f1_score(y_true, y_pred)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        return best_threshold, best_f1
    
    best_threshold, _ = find_optimal_threshold(test_set['anomaly'], y_proba)
    y_pred = [1 if proba >= best_threshold else 0 for proba in y_proba]
    test_set['pred'] = y_pred
    test_set['pred_proba'] = y_proba
    eval_dict = evaluate(test_set, with_prior=with_prior, dev_set=None)
    # upload to wandb
    # wandb.log(eval_dict)

    return eval_dict, model

def visualise_anomalies(model_name, with_prior=True):
    model_path = os.path.join('../storage', model_name)
    test_set = pd.read_csv(os.path.join(model_path, 'test.csv'))
    dev_set = pd.read_csv(os.path.join(model_path, 'val.csv'))
    if not with_prior:
        if 'fst_step_pred' in test_set.columns:
            test_set['pred'] = test_set['fst_step_pred']
        else:
            # if not within the range of [0, 1], then use logit to transform
            if test_set['fst_step_scores'].max() > 1 or test_set['fst_step_scores'].min() < 0:
                import math
                def sigmoid(x):
                    return 1 / (1 + math.exp(-x))
                test_set['fst_step_scores'] = test_set['fst_step_scores'].apply(sigmoid)
    
    test_set['pred_proba'] = test_set['fst_step_scores']
    dev_set['pred_proba'] = dev_set['fst_step_scores']
    eval_dict = evaluate(test_set, with_prior=with_prior, dev_set=dev_set)
    anomaly_profit = eval_dict['anomaly_profit']
    # kde plot
    return eval_dict['anomaly_profit']

In [3]:
fold = 3
ori_model_name = [
    f'rf_benchmark_fold{str(fold)}', f'xgb_benchmark_fold{str(fold)}', f'lgbm_benchmark_fold{str(fold)}',
    f'deepsad_benchmark_fold{str(fold)}', f'deepisolationforest_benchmark_fold{str(fold)}', f'feawad_benchmark_fold{str(fold)}', f'slad_benchmark_fold{str(fold)}',
    f'sour_100_binary_fold{str(fold)}', f'tabtransformer_benchmark_fold{str(fold)}',
    f'lambdamart_100_binary_1000_trees_fold{str(fold)}', f'lambdamart_100_binary_10000_trees_fold{str(fold)}'
]
ori_proposed_model_names = [
    f'rankformer_100_binary_softmax_fold{str(fold)}', f'rankformer_100_binary_lambdaloss_fold{str(fold)}', f'pariskranker_100_binary_graph_fold{str(fold)}', f'pariskranker_100_binary_softmax_fold{str(fold)}'
]
for proposed_model_name in ori_proposed_model_names:
    model_path = os.path.join('../storage', proposed_model_name)
    if not os.path.isdir(model_path):
        continue
    # only list the subfolders name without absolute path
    subfolders = [f"{proposed_model_name}/{f.name}" for f in os.scandir(model_path) if f.is_dir()]
    ori_model_name.extend(subfolders)

In [4]:
result_df = pd.DataFrame()

for name in tqdm(ori_model_name):
    eval_dict = eval_single_step(name, with_prior=True)
    if eval_dict is not None:
        result_df = result_df._append(pd.DataFrame(eval_dict, index=[name]))

result_df.sort_values(by='f1_score', ascending=False)
result_df

100%|██████████| 14/14 [00:02<00:00,  4.97it/s]


Unnamed: 0,f1_score,PnL,auc_score,precision,sensitivity,specificity
rf_benchmark_fold3,0.9699,-70193.1,0.9999,0.9405,0.9405,0.9994
xgb_benchmark_fold3,0.9761,-55661.53,0.9999,0.9527,0.9527,0.9995
lgbm_benchmark_fold3,0.9735,-60714.45,0.9999,0.9475,0.9475,0.9995
deepsad_benchmark_fold3,0.9814,-40864.09,0.9998,0.9632,0.9632,0.9996
deepisolationforest_benchmark_fold3,0.6029,-722870.87,0.9664,0.2137,0.2137,0.9921
feawad_benchmark_fold3,0.7603,-406563.07,0.9944,0.5254,0.5254,0.9952
slad_benchmark_fold3,0.6037,-733141.98,0.9674,0.2154,0.2154,0.9921
sour_100_binary_fold3,0.4949,-1068217.55,0.1024,0.0,0.0,0.9899
tabtransformer_benchmark_fold3,0.9682,-64283.37,0.9995,0.937,0.937,0.9994
lambdamart_100_binary_1000_trees_fold3,0.9735,-60172.53,0.9999,0.9475,0.9475,0.9995


In [5]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
result_df = pd.DataFrame()

for name in tqdm(ori_model_name):
    eval_dict = eval_single_step(name, with_prior=False)
    if eval_dict is not None:
        result_df = result_df._append(pd.DataFrame(eval_dict, index=[name]))
result_df.sort_values(by='f1_score', ascending=False)
result_df

100%|██████████| 14/14 [00:17<00:00,  1.24s/it]


Unnamed: 0,f1_score,PnL,auc_score,precision,sensitivity,specificity
rf_benchmark_fold3,0.9715,-66672.92,0.9999,0.9487,0.9387,0.9995
xgb_benchmark_fold3,0.9773,-54497.74,0.9999,0.9833,0.9282,0.9998
lgbm_benchmark_fold3,0.9763,-55887.26,0.9999,0.9814,0.9264,0.9998
deepsad_benchmark_fold3,0.567,-765955.46,0.9998,0.0999,1.0,0.909
deepisolationforest_benchmark_fold3,0.6217,-512311.97,0.9664,0.1724,0.5166,0.975
feawad_benchmark_fold3,0.5686,-1401114.91,0.9944,0.1013,1.0,0.9104
slad_benchmark_fold3,0.6098,-545442.64,0.9674,0.1474,0.6637,0.9612
sour_100_binary_fold3,0.4975,-1047720.2,0.1024,0.0,0.0,1.0
tabtransformer_benchmark_fold3,0.9744,-50196.76,0.9995,0.9168,0.9842,0.9991
lambdamart_100_binary_1000_trees_fold3,0.9742,-58702.87,0.9999,0.954,0.944,0.9995


In [11]:
all_results = []
for fold in [1, 2, 3]:
    model_name = [
        f'rf_benchmark_fold{fold}', f'xgb_benchmark_fold{fold}', f'lgbm_benchmark_fold{fold}',
        f'deepsad_benchmark_fold{fold}', f'deepisolationforest_benchmark_fold{fold}', f'feawad_benchmark_fold{fold}', f'slad_benchmark_fold{fold}',
        f'sour_100_binary_fold{fold}', f'tabtransformer_benchmark_fold{fold}',
        f'lambdamart_100_binary_1000_trees_fold{fold}', f'lambdamart_100_binary_10000_trees_fold{fold}', f'lambdamart_pabce_10000_fold{fold}'
    ]
    proposed_model_names = [
        f'rankformer_100_binary_softmax_fold{fold}', f'rankformer_100_binary_lambdaloss_fold{fold}',
        f'pariskranker_100_binary_graph_fold{fold}'
    ]
    for proposed_model_name in proposed_model_names:
        model_path = os.path.join('../storage', proposed_model_name)
        if not os.path.isdir(model_path):
            continue
        subfolders = [f"{proposed_model_name}/{f.name}" for f in os.scandir(model_path) if f.is_dir()]
        model_name.extend(subfolders)

    for name in tqdm(model_name):
        eval_dict = eval_single_step(name, with_prior=True)
        if eval_dict is not None:
            eval_dict["model"] = name.rsplit('_fold', 1)[0]  # Strip fold suffix
            eval_dict["fold"] = fold
            all_results.append(eval_dict)

# Combine and compute stats
df_all = pd.DataFrame(all_results)
grouped = df_all.groupby("model").agg(['mean'])

# Flatten column names
grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]
grouped = grouped.sort_values(by='f1_score_mean', ascending=False)

grouped

100%|██████████| 15/15 [00:04<00:00,  3.69it/s]
100%|██████████| 15/15 [00:04<00:00,  3.74it/s]
100%|██████████| 15/15 [00:04<00:00,  3.74it/s]


Unnamed: 0_level_0,f1_score_mean,PnL_mean,auc_score_mean,precision_mean,sensitivity_mean,specificity_mean,fold_mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pariskranker_100_binary_graph,0.987033,-31368.39,0.9998,0.9743,0.9743,0.999767,2.0
rankformer_100_binary_softmax,0.982,-43821.78,0.999933,0.9644,0.9644,0.999633,2.0
deepsad_benchmark,0.976667,-52499.95,0.999867,0.953867,0.953867,0.999533,2.0
xgb_benchmark,0.9755,-58149.26,0.9999,0.951567,0.951567,0.9995,2.0
lgbm_benchmark,0.9729,-63659.04,0.9999,0.9463,0.9463,0.9995,2.0
lambdamart_100_binary_10000_trees,0.969367,-72817.16,0.9999,0.9393,0.9393,0.999367,2.0
lambdamart_100_binary_1000_trees,0.969367,-72757.35,0.9999,0.9393,0.9393,0.999367,2.0
rf_benchmark,0.968167,-74770.6,0.999833,0.937,0.937,0.999367,2.0
lambdamart_pabce_10000,0.966667,-83895.95,0.999067,0.934033,0.934033,0.999333,2.0
tabtransformer_benchmark,0.959333,-88705.93,0.9971,0.919467,0.919467,0.9992,2.0


In [12]:
all_results = []
for fold in [1, 2, 3]:
    model_name = [
        f'rf_benchmark_fold{fold}', f'xgb_benchmark_fold{fold}', f'lgbm_benchmark_fold{fold}',
        f'deepsad_benchmark_fold{fold}', f'deepisolationforest_benchmark_fold{fold}', f'feawad_benchmark_fold{fold}', f'slad_benchmark_fold{fold}',
        f'sour_100_binary_fold{fold}', f'tabtransformer_benchmark_fold{fold}',
        f'lambdamart_100_binary_1000_trees_fold{fold}', f'lambdamart_100_binary_10000_trees_fold{fold}', f'lambdamart_pabce_10000_fold{fold}'
    ]
    proposed_model_names = [
        f'rankformer_100_binary_softmax_fold{fold}', f'rankformer_100_binary_lambdaloss_fold{fold}',
        f'pariskranker_100_binary_graph_fold{fold}'
    ]
    for proposed_model_name in proposed_model_names:
        model_path = os.path.join('../storage', proposed_model_name)
        if not os.path.isdir(model_path):
            continue
        subfolders = [f"{proposed_model_name}/{f.name}" for f in os.scandir(model_path) if f.is_dir()]
        model_name.extend(subfolders)

    for name in tqdm(model_name):
        eval_dict = eval_single_step(name, with_prior=False)
        if eval_dict is not None:
            eval_dict["model"] = name.rsplit('_fold', 1)[0]  # Strip fold suffix
            eval_dict["fold"] = fold
            all_results.append(eval_dict)

# Combine and compute stats
df_all = pd.DataFrame(all_results)
grouped = df_all.groupby("model").agg(['mean'])

# Flatten column names
grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]
grouped = grouped.sort_values(by='f1_score_mean', ascending=False)

grouped

100%|██████████| 15/15 [00:12<00:00,  1.22it/s]
100%|██████████| 15/15 [00:12<00:00,  1.21it/s]
100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


Unnamed: 0_level_0,f1_score_mean,PnL_mean,auc_score_mean,precision_mean,sensitivity_mean,specificity_mean,fold_mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pariskranker_100_binary_graph,0.985633,-35332.36,0.9998,0.970767,0.972567,0.999733,2.0
rankformer_100_binary_softmax,0.981133,-45342.84,0.999933,0.958067,0.9673,0.999567,2.0
lgbm_benchmark,0.972233,-66277.72,0.9999,0.968167,0.922933,0.9997,2.0
xgb_benchmark,0.9718,-67644.6,0.9999,0.975833,0.914767,0.999767,2.0
lambdamart_100_binary_10000_trees,0.969867,-72533.09,0.9999,0.9565,0.9247,0.9996,2.0
lambdamart_100_binary_1000_trees,0.9679,-77118.5,0.9999,0.941933,0.9317,0.9994,2.0
rf_benchmark,0.964867,-85672.36,0.999833,0.968333,0.896067,0.999733,2.0
tabtransformer_benchmark,0.959333,-89030.83,0.9971,0.900733,0.939267,0.998967,2.0
rankformer_100_binary_lambdaloss,0.9533,-104037.6,0.999733,0.8883,0.928767,0.9988,2.0
feawad_benchmark,0.696467,-657491.5,0.992133,0.305767,0.830733,0.962867,2.0


# Two Step Prediction

In [6]:
# from tqdm import tqdm
#
# result_df = pd.DataFrame()
#
# for name in tqdm(model_name):
#     for predict_model_name in ['xgb']:
#         eval_dict, _ = train_and_test_two_step(name, predict_model_name, with_prior=False)
#         result_df = result_df._append(pd.DataFrame(eval_dict, index=[name + '-' + predict_model_name]))
#
# result_df.to_csv('two_step_xgb.csv')
# result_df.sort_values(by='f1_score', ascending=False)

In [13]:
from tqdm import tqdm

result_df = pd.DataFrame()

for name in tqdm(model_name):
    for predict_model_name in ['lgbm']:
        eval_dict, _ = train_and_test_two_step(name, predict_model_name, with_prior=False)
        result_df = result_df._append(pd.DataFrame(eval_dict, index=[name + '-' + predict_model_name]))

result_df.to_csv('two_step_lgbm.csv')
result_df.sort_values(by='f1_score', ascending=False)

  0%|          | 0/15 [00:00<?, ?it/s]


KeyError: "['PerFTSE20', 'AVGPTS3_20', 'SharpeRatio20', 'DurationRate20', 'ProfitRate20', 'WinTradeRate20', 'ProfitxDur20', 'PassAvgReturn', 'AvgShortSales20', 'TradFQ20', 'Period', 'accountid', 'NumTrades', 'AvgOpen20', 'DurationRatio20', 'OrderCloseRate20'] not in index"

In [None]:
# from tqdm import tqdm
#
# result_df = pd.DataFrame()
#
# for name in tqdm(model_name):
#     for predict_model_name in ['rf']:
#         eval_dict, _ = train_and_test_two_step(name, predict_model_name, with_prior=False)
#         result_df = result_df._append(pd.DataFrame(eval_dict, index=[name + '-' + predict_model_name]))
#
# result_df.to_csv('two_step_rf.csv')
# result_df.sort_values(by='f1_score', ascending=False)

In [None]:
# result_df = pd.DataFrame()
#
# def test_baseline(baseline_name):
#     train_set = pd.read_csv("../data/lcg_train.csv")
#     val_set = pd.read_csv("../data/lcg_val.csv")
#     test_set = pd.read_csv("../data/lcg_test.csv")
#
#
#     variables = ['PerFTSE20', 'AVGPTS3_20', 'SharpeRatio20', 'DurationRate20',
#                  'ProfitRate20', 'WinTradeRate20', 'ProfitxDur20', 'PassAvgReturn',
#                  'AvgShortSales20', 'TradFQ20', 'Period', 'accountid',
#                  'NumTrades', 'AvgOpen20', 'DurationRatio20', 'OrderCloseRate20']
#
#     X_train = train_set[variables]
#     y_train = train_set["anomaly"]
#
#     if baseline_name == 'rf':
#         model = RandomForestClassifier(n_jobs=-1, max_depth=7, verbose=0).fit(X_train, y_train)
#     elif baseline_name == 'xgb':
#         model = XGBClassifier(n_estimators=1000).fit(X_train, y_train, verbose=False)
#
#     elif baseline_name == 'lgbm':
#         # disable log for lgbm
#         model = LGBMClassifier(verbose=-1, n_estimators=1000).fit(X_train, y_train)
#     else:
#         raise Exception('Model not implemented')
#
#
#     X_test = test_set[variables]
#
#     try:
#         y_proba = model.predict_proba(X_test).iloc[:,1].values
#     except:
#         y_proba = model.predict_proba(X_test)[:, 1]
#
#     def find_optimal_threshold(y_true, y_proba):
#         best_threshold = 0
#         best_f1 = 0
#         # grid search from 0.1 to 1 with step 0.1
#         for threshold in [i / 10.0 for i in range(1, 10)]:
#             y_pred = [1 if proba >= threshold else 0 for proba in y_proba]
#             f1 = f1_score(y_true, y_pred)
#             if f1 > best_f1:
#                 best_f1 = f1
#                 best_threshold = threshold
#         return best_threshold, best_f1
#
#     best_threshold, _ = find_optimal_threshold(test_set['anomaly'], y_proba)
#     y_pred = [1 if proba >= best_threshold else 0 for proba in y_proba]
#     test_set['pred'] = y_pred
#     test_set['pred_proba'] = y_proba
#     test_set['fst_step_scores'] = y_proba
#     eval_dict = evaluate(test_set, with_prior=False, dev_set=None)
#     return eval_dict
#
# for predict_model_name in tqdm(['rf', 'xgb', 'lgbm']):
#         eval_dict = test_baseline(predict_model_name)
#         result_df = result_df._append(pd.DataFrame(eval_dict, index=[predict_model_name]))
#
# result_df.sort_values(by='f1_score', ascending=False)

In [None]:
# import matplotlib
# matplotlib.rc('font', size=16)
# eval_dict, model = train_and_test_two_step("embed_graph_rankformer_200_binary_graph/30", 'lgbm', with_prior=False)