# features test

In [1]:
import os
import pandas as pd
import numpy as np
import re
import ast
import json
# Input locations, relative to this notebook: the extracted records are read from
# <DATA_DIR>/Algo_Design_Data/final_extracted.json.
DATA_DIR = '../../data/Evaluation_CoTs/'
storage_dir = os.path.join(DATA_DIR, 'Algo_Design_Data')
file_store_path = os.path.join(DATA_DIR, 'Algo_Design_Data', 'final_extracted.json')
import sys
sys.path.insert(0, '../')
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


# Ablation Study 2: Compare the effects of different IDV score models

In [8]:
from CS_based_early_stopping import trained_LR_model, CS_early_stopping
import itertools
from itertools import combinations


# Grid searched by the early-stopping sweep. The order of both lists is significant:
# the sweep breaks ties in favour of the configuration it encounters first.
N_values = [2, 3, 4, 5]
threshold_values = [0.1,0.3,0.2,0.4,0.5]

# Candidate input features for the confidence model.
features = ['LEN', 'QUA_IM', 'DIF_IV', 'SIM_INPUT', 'SIM_COT_BIGRAM']

# Every non-empty subset of `features`, smallest subsets first (2**5 - 1 = 31 lists).
feature_combinations = [
    list(subset)
    for size in range(1, len(features) + 1)
    for subset in combinations(features, size)
]

# Load the JSON-lines records and drop every row whose Model is 'gpt-4'.
df = (
    pd.read_json(file_store_path, lines=True)
    .loc[lambda frame: frame.Model != 'gpt-4']
    .reset_index(drop=True)
)

# Baseline: sum of SC_correctness divided by the number of remaining rows.
base_acc = df['SC_correctness'].sum() / df.shape[0]

# Filled by the sweep cell: feature-subset tuple -> selected parameter summary.
results = dict()

In [9]:


# Ablation sweep: for every feature subset, fit the confidence model and grid-search
# the early-stopping hyper-parameters (N, threshold).
#
# Selection rule per feature subset:
#   * preferred: among configurations whose CS accuracy strictly exceeds the baseline
#     accuracy of `df_cs`, the one with the fewest mean CS steps (first seen wins ties);
#   * fallback:  if no configuration beats the baseline, the one with the highest
#     CS accuracy (first seen wins ties).
# NOTE(review): selection and reporting both use the same `df_cs`, so the reported
# accuracy looks like it is tuned on the evaluation data -- confirm this is intended.
for feature_li in feature_combinations:
    try:
        print(f"Running experiments for feature set: {feature_li}")
        df_cs, roc = trained_LR_model(df, feature_li, report_auroc=True)
        # Rebinds the notebook-level `base_acc`: the baseline is measured on the frame
        # returned by trained_LR_model, not on the full `df`.
        base_acc = df_cs.SC_correctness.sum() / len(df_cs)
        print(base_acc)
        min_steps = float('inf')
        max_cs_accuracy = 0
        best_params = None
        # Reset for every feature set; otherwise a fallback left over from the previous
        # subset could be stored under this subset's key (or raise NameError on the
        # very first subset) when no configuration reaches a CS accuracy above 0.
        max_params = None

        for N, threshold in itertools.product(N_values, threshold_values):
            df_final = CS_early_stopping(df=df_cs, threshold=threshold, N=N)
            cs_accuracy = df_final.CS_correctness.sum() / len(df_final)
            steps = df_final.CS_steps.mean()
            # One summary per configuration, shared by both selection rules below.
            candidate = {
                'rocauc': roc,
                'N': N,
                'threshold': threshold,
                'steps': steps,
                'cs_acc': cs_accuracy
            }

            # Preferred rule: beats the baseline with strictly fewer steps than so far.
            if base_acc < cs_accuracy and steps < min_steps:
                min_steps = steps
                best_params = candidate

            # Fallback rule: track the most accurate configuration seen so far.
            if cs_accuracy > max_cs_accuracy:
                max_cs_accuracy = cs_accuracy
                max_params = candidate

        selected_params = best_params if best_params is not None else max_params
        if selected_params is None:
            # Every configuration scored a CS accuracy of 0: there is nothing to report.
            print(f"Skipping feature set {feature_li}: no configuration produced a usable result.")
        else:
            results[tuple(feature_li)] = selected_params
    except np.linalg.LinAlgError:
        # The model fit failed on a singular matrix; move on to the next subset.
        print(f"Skipping feature set {feature_li} due to singular matrix.")

print("Final results:")
for feature_set, params in results.items():
    print(f"Feature set: {feature_set}")
    print(f"ROCAUC: {params['rocauc']}, N: {params['N']}, Threshold: {params['threshold']}, Steps: {params['steps']}, CS Accuracy: {params['cs_acc']}")

# TO DO
# 1. WHAT SHOULD BE THE N/THRESHOLD OF OUR BEST MODEL??
# 2. Feature selection: is it necessary to do this? Should it be based on ROC AUC? (note that correctness is just an approximation of hallucination)
# 3. How to justify the use of the IDV confidence model? (the current results suggest that a higher ROC AUC is not always better, but it is necessary)
# 4. Should the cost analysis be done as a timed Python script or as a theoretical analysis?


Running experiments for feature set: ['LEN']
Optimization terminated successfully.
         Current function value: 0.658740
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:            Correctness   No. Observations:               250960
Model:                          Logit   Df Residuals:                   250958
Method:                           MLE   Df Model:                            1
Date:                Wed, 29 May 2024   Pseudo R-squ.:                 0.01886
Time:                        15:33:02   Log-Likelihood:            -1.6532e+05
converged:                       True   LL-Null:                   -1.6849e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0222      0.007      3.101      0.002       0.

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.5681688900197942
0.43159851301115243
SC_ACC : 0.43159851301115243
ES_ACC : 0.43159851301115243
CS_ACC : 0.3929368029739777
SC_Avg_Steps : 40
ES_Avg_Steps : 15.52453531598513
CS_Avg_Steps : 1.0107806691449814
ASC_Avg_Steps : 13.35910780669145
ASC_ACC : 0.43308550185873607
SC_ACC : 0.43159851301115243
ES_ACC : 0.43159851301115243
CS_ACC : 0.3966542750929368
SC_Avg_Steps : 40
ES_Avg_Steps : 15.52453531598513
CS_Avg_Steps : 2.87360594795539
ASC_Avg_Steps : 13.35910780669145
ASC_ACC : 0.43308550185873607
SC_ACC : 0.43159851301115243
ES_ACC : 0.43159851301115243
CS_ACC : 0.39330855018587363
SC_Avg_Steps : 40
ES_Avg_Steps : 15.52453531598513
CS_Avg_Steps : 1.1513011152416357
ASC_Avg_Steps : 13.35910780669145
ASC_ACC : 0.43308550185873607
SC_ACC : 0.43159851301115243
ES_ACC : 0.43159851301115243
CS_ACC : 0.3929368029739777
SC_Avg_Steps : 40
ES_Avg_Steps : 15.52453531598513
CS_Avg_Steps : 5.957620817843866
ASC_Avg_Steps : 13.35910780669145
ASC_ACC : 0.43308550185873607
SC_

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.5018227890000243
0.44052044609665425
SC_ACC : 0.44052044609665425
ES_ACC : 0.441635687732342
CS_ACC : 0.38847583643122674
SC_Avg_Steps : 40
ES_Avg_Steps : 15.923048327137547
CS_Avg_Steps : 1.0
ASC_Avg_Steps : 13.602973977695168
ASC_ACC : 0.43940520446096654
SC_ACC : 0.44052044609665425
ES_ACC : 0.441635687732342
CS_ACC : 0.38810408921933087
SC_Avg_Steps : 40
ES_Avg_Steps : 15.923048327137547
CS_Avg_Steps : 1.0070631970260222
ASC_Avg_Steps : 13.602973977695168
ASC_ACC : 0.43940520446096654
SC_ACC : 0.44052044609665425
ES_ACC : 0.441635687732342
CS_ACC : 0.38810408921933087
SC_Avg_Steps : 40
ES_Avg_Steps : 15.923048327137547
CS_Avg_Steps : 1.0070631970260222
ASC_Avg_Steps : 13.602973977695168
ASC_ACC : 0.43940520446096654
SC_ACC : 0.44052044609665425
ES_ACC : 0.441635687732342
CS_ACC : 0.20929368029739778
SC_Avg_Steps : 40
ES_Avg_Steps : 15.923048327137547
CS_Avg_Steps : 40.0
ASC_Avg_Steps : 13.602973977695168
ASC_ACC : 0.43940520446096654
SC_ACC : 0.440520446096654

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6256168019987508
0.445724907063197
SC_ACC : 0.445724907063197
ES_ACC : 0.44646840148698885
CS_ACC : 0.4382899628252788
SC_Avg_Steps : 40
ES_Avg_Steps : 15.555762081784387
CS_Avg_Steps : 3.3111524163568773
ASC_Avg_Steps : 13.298884758364313
ASC_ACC : 0.445724907063197
SC_ACC : 0.445724907063197
ES_ACC : 0.44646840148698885
CS_ACC : 0.4382899628252788
SC_Avg_Steps : 40
ES_Avg_Steps : 15.555762081784387
CS_Avg_Steps : 3.3111524163568773
ASC_Avg_Steps : 13.298884758364313
ASC_ACC : 0.445724907063197
SC_ACC : 0.445724907063197
ES_ACC : 0.44646840148698885
CS_ACC : 0.4382899628252788
SC_Avg_Steps : 40
ES_Avg_Steps : 15.555762081784387
CS_Avg_Steps : 3.3111524163568773
ASC_Avg_Steps : 13.298884758364313
ASC_ACC : 0.445724907063197
SC_ACC : 0.445724907063197
ES_ACC : 0.44646840148698885
CS_ACC : 0.4382899628252788
SC_Avg_Steps : 40
ES_Avg_Steps : 15.555762081784387
CS_Avg_Steps : 3.3111524163568773
ASC_Avg_Steps : 13.298884758364313
ASC_ACC : 0.445724907063197
SC_ACC : 0.

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.5986919340811416
0.4349442379182156
SC_ACC : 0.4349442379182156
ES_ACC : 0.43717472118959105
CS_ACC : 0.4018587360594795
SC_Avg_Steps : 40
ES_Avg_Steps : 15.676579925650557
CS_Avg_Steps : 1.0
ASC_Avg_Steps : 13.261710037174721
ASC_ACC : 0.43531598513011155
SC_ACC : 0.4349442379182156
ES_ACC : 0.43717472118959105
CS_ACC : 0.4070631970260223
SC_Avg_Steps : 40
ES_Avg_Steps : 15.676579925650557
CS_Avg_Steps : 1.7048327137546468
ASC_Avg_Steps : 13.261710037174721
ASC_ACC : 0.43531598513011155
SC_ACC : 0.4349442379182156
ES_ACC : 0.43717472118959105
CS_ACC : 0.4052044609665427
SC_Avg_Steps : 40
ES_Avg_Steps : 15.676579925650557
CS_Avg_Steps : 1.0360594795539033
ASC_Avg_Steps : 13.261710037174721
ASC_ACC : 0.43531598513011155
SC_ACC : 0.4349442379182156
ES_ACC : 0.43717472118959105
CS_ACC : 0.3814126394052045
SC_Avg_Steps : 40
ES_Avg_Steps : 15.676579925650557
CS_Avg_Steps : 13.339033457249071
ASC_Avg_Steps : 13.261710037174721
ASC_ACC : 0.43531598513011155
SC_ACC : 0.43

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6320711806676031
0.42936802973977695
SC_ACC : 0.42936802973977695
ES_ACC : 0.43271375464684014
CS_ACC : 0.3825278810408922
SC_Avg_Steps : 40
ES_Avg_Steps : 15.838289962825279
CS_Avg_Steps : 1.0
ASC_Avg_Steps : 13.463197026022305
ASC_ACC : 0.43048327137546466
SC_ACC : 0.42936802973977695
ES_ACC : 0.43271375464684014
CS_ACC : 0.4133828996282528
SC_Avg_Steps : 40
ES_Avg_Steps : 15.838289962825279
CS_Avg_Steps : 2.1319702602230484
ASC_Avg_Steps : 13.463197026022305
ASC_ACC : 0.43048327137546466
SC_ACC : 0.42936802973977695
ES_ACC : 0.43271375464684014
CS_ACC : 0.4144981412639405
SC_Avg_Steps : 40
ES_Avg_Steps : 15.838289962825279
CS_Avg_Steps : 2.0226765799256508
ASC_Avg_Steps : 13.463197026022305
ASC_ACC : 0.43048327137546466
SC_ACC : 0.42936802973977695
ES_ACC : 0.43271375464684014
CS_ACC : 0.40483271375464686
SC_Avg_Steps : 40
ES_Avg_Steps : 15.838289962825279
CS_Avg_Steps : 11.939405204460966
ASC_Avg_Steps : 13.463197026022305
ASC_ACC : 0.43048327137546466
SC_ACC 

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.5648056186777022
0.4345724907063197
SC_ACC : 0.4345724907063197
ES_ACC : 0.43643122676579926
CS_ACC : 0.3992565055762082
SC_Avg_Steps : 40
ES_Avg_Steps : 15.69182156133829
CS_Avg_Steps : 1.0092936802973977
ASC_Avg_Steps : 13.476208178438661
ASC_ACC : 0.43271375464684014
SC_ACC : 0.4345724907063197
ES_ACC : 0.43643122676579926
CS_ACC : 0.3973977695167286
SC_Avg_Steps : 40
ES_Avg_Steps : 15.69182156133829
CS_Avg_Steps : 3.00817843866171
ASC_Avg_Steps : 13.476208178438661
ASC_ACC : 0.43271375464684014
SC_ACC : 0.4345724907063197
ES_ACC : 0.43643122676579926
CS_ACC : 0.40111524163568774
SC_Avg_Steps : 40
ES_Avg_Steps : 15.69182156133829
CS_Avg_Steps : 1.1568773234200744
ASC_Avg_Steps : 13.476208178438661
ASC_ACC : 0.43271375464684014
SC_ACC : 0.4345724907063197
ES_ACC : 0.43643122676579926
CS_ACC : 0.4003717472118959
SC_Avg_Steps : 40
ES_Avg_Steps : 15.69182156133829
CS_Avg_Steps : 5.838289962825279
ASC_Avg_Steps : 13.476208178438661
ASC_ACC : 0.43271375464684014
SC_A

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6789248135756595
0.41858736059479557
SC_ACC : 0.41858736059479557
ES_ACC : 0.420817843866171
CS_ACC : 0.42825278810408923
SC_Avg_Steps : 40
ES_Avg_Steps : 15.898884758364312
CS_Avg_Steps : 3.64907063197026
ASC_Avg_Steps : 13.535687732342007
ASC_ACC : 0.41858736059479557
SC_ACC : 0.41858736059479557
ES_ACC : 0.420817843866171
CS_ACC : 0.42973977695167287
SC_Avg_Steps : 40
ES_Avg_Steps : 15.898884758364312
CS_Avg_Steps : 4.600371747211896
ASC_Avg_Steps : 13.535687732342007
ASC_ACC : 0.41858736059479557
SC_ACC : 0.41858736059479557
ES_ACC : 0.420817843866171
CS_ACC : 0.42862453531598516
SC_Avg_Steps : 40
ES_Avg_Steps : 15.898884758364312
CS_Avg_Steps : 3.717472118959108
ASC_Avg_Steps : 13.535687732342007
ASC_ACC : 0.41858736059479557
SC_ACC : 0.41858736059479557
ES_ACC : 0.420817843866171
CS_ACC : 0.43197026022304835
SC_Avg_Steps : 40
ES_Avg_Steps : 15.898884758364312
CS_Avg_Steps : 6.170260223048327
ASC_Avg_Steps : 13.535687732342007
ASC_ACC : 0.41858736059479557
SC

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6145836258925844
0.4256505576208178
SC_ACC : 0.4256505576208178
ES_ACC : 0.42713754646840146
CS_ACC : 0.3825278810408922
SC_Avg_Steps : 40
ES_Avg_Steps : 15.457992565055761
CS_Avg_Steps : 1.007806691449814
ASC_Avg_Steps : 13.304089219330855
ASC_ACC : 0.42602230483271375
SC_ACC : 0.4256505576208178
ES_ACC : 0.42713754646840146
CS_ACC : 0.38884758364312266
SC_Avg_Steps : 40
ES_Avg_Steps : 15.457992565055761
CS_Avg_Steps : 3.4308550185873607
ASC_Avg_Steps : 13.304089219330855
ASC_ACC : 0.42602230483271375
SC_ACC : 0.4256505576208178
ES_ACC : 0.42713754646840146
CS_ACC : 0.3814126394052045
SC_Avg_Steps : 40
ES_Avg_Steps : 15.457992565055761
CS_Avg_Steps : 1.333457249070632
ASC_Avg_Steps : 13.304089219330855
ASC_ACC : 0.42602230483271375
SC_ACC : 0.4256505576208178
ES_ACC : 0.42713754646840146
CS_ACC : 0.39442379182156134
SC_Avg_Steps : 40
ES_Avg_Steps : 15.457992565055761
CS_Avg_Steps : 9.172118959107806
ASC_Avg_Steps : 13.304089219330855
ASC_ACC : 0.42602230483271375

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6422246075975453
0.41524163568773237
SC_ACC : 0.41524163568773237
ES_ACC : 0.4163568773234201
CS_ACC : 0.3825278810408922
SC_Avg_Steps : 40
ES_Avg_Steps : 15.440520446096654
CS_Avg_Steps : 1.0219330855018587
ASC_Avg_Steps : 13.323791821561338
ASC_ACC : 0.41263940520446096
SC_ACC : 0.41524163568773237
ES_ACC : 0.4163568773234201
CS_ACC : 0.38661710037174724
SC_Avg_Steps : 40
ES_Avg_Steps : 15.440520446096654
CS_Avg_Steps : 4.237546468401487
ASC_Avg_Steps : 13.323791821561338
ASC_ACC : 0.41263940520446096
SC_ACC : 0.41524163568773237
ES_ACC : 0.4163568773234201
CS_ACC : 0.38327137546468404
SC_Avg_Steps : 40
ES_Avg_Steps : 15.440520446096654
CS_Avg_Steps : 1.3022304832713754
ASC_Avg_Steps : 13.323791821561338
ASC_ACC : 0.41263940520446096
SC_ACC : 0.41524163568773237
ES_ACC : 0.4163568773234201
CS_ACC : 0.3966542750929368
SC_Avg_Steps : 40
ES_Avg_Steps : 15.440520446096654
CS_Avg_Steps : 8.378066914498142
ASC_Avg_Steps : 13.323791821561338
ASC_ACC : 0.412639405204460

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6229082614051027
0.43717472118959105
SC_ACC : 0.43717472118959105
ES_ACC : 0.43643122676579926
CS_ACC : 0.42862453531598516
SC_Avg_Steps : 40
ES_Avg_Steps : 15.375092936802973
CS_Avg_Steps : 3.2799256505576206
ASC_Avg_Steps : 13.207434944237919
ASC_ACC : 0.43605947955390334
SC_ACC : 0.43717472118959105
ES_ACC : 0.43643122676579926
CS_ACC : 0.42862453531598516
SC_Avg_Steps : 40
ES_Avg_Steps : 15.375092936802973
CS_Avg_Steps : 3.2847583643122675
ASC_Avg_Steps : 13.207434944237919
ASC_ACC : 0.43605947955390334
SC_ACC : 0.43717472118959105
ES_ACC : 0.43643122676579926
CS_ACC : 0.42862453531598516
SC_Avg_Steps : 40
ES_Avg_Steps : 15.375092936802973
CS_Avg_Steps : 3.2847583643122675
ASC_Avg_Steps : 13.207434944237919
ASC_ACC : 0.43605947955390334
SC_ACC : 0.43717472118959105
ES_ACC : 0.43643122676579926
CS_ACC : 0.42862453531598516
SC_Avg_Steps : 40
ES_Avg_Steps : 15.375092936802973
CS_Avg_Steps : 3.2847583643122675
ASC_Avg_Steps : 13.207434944237919
ASC_ACC : 0.4360594

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.5905327726941458
0.41821561338289964
SC_ACC : 0.41821561338289964
ES_ACC : 0.41895910780669143
CS_ACC : 0.38104089219330856
SC_Avg_Steps : 40
ES_Avg_Steps : 15.713382899628252
CS_Avg_Steps : 1.0022304832713755
ASC_Avg_Steps : 13.646096654275093
ASC_ACC : 0.4174721189591078
SC_ACC : 0.41821561338289964
ES_ACC : 0.41895910780669143
CS_ACC : 0.38996282527881043
SC_Avg_Steps : 40
ES_Avg_Steps : 15.713382899628252
CS_Avg_Steps : 1.7293680297397769
ASC_Avg_Steps : 13.646096654275093
ASC_ACC : 0.4174721189591078
SC_ACC : 0.41821561338289964
ES_ACC : 0.41895910780669143
CS_ACC : 0.38661710037174724
SC_Avg_Steps : 40
ES_Avg_Steps : 15.713382899628252
CS_Avg_Steps : 1.1026022304832714
ASC_Avg_Steps : 13.646096654275093
ASC_ACC : 0.4174721189591078
SC_ACC : 0.41821561338289964
ES_ACC : 0.41895910780669143
CS_ACC : 0.38438661710037175
SC_Avg_Steps : 40
ES_Avg_Steps : 15.713382899628252
CS_Avg_Steps : 12.074721189591077
ASC_Avg_Steps : 13.646096654275093
ASC_ACC : 0.4174721189

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.63017014723767
0.42973977695167287
SC_ACC : 0.42973977695167287
ES_ACC : 0.42825278810408923
CS_ACC : 0.3907063197026022
SC_Avg_Steps : 40
ES_Avg_Steps : 15.794052044609666
CS_Avg_Steps : 1.0033457249070632
ASC_Avg_Steps : 13.625650557620817
ASC_ACC : 0.42862453531598516
SC_ACC : 0.42973977695167287
ES_ACC : 0.42825278810408923
CS_ACC : 0.404089219330855
SC_Avg_Steps : 40
ES_Avg_Steps : 15.794052044609666
CS_Avg_Steps : 2.128996282527881
ASC_Avg_Steps : 13.625650557620817
ASC_ACC : 0.42862453531598516
SC_ACC : 0.42973977695167287
ES_ACC : 0.42825278810408923
CS_ACC : 0.40557620817843865
SC_Avg_Steps : 40
ES_Avg_Steps : 15.794052044609666
CS_Avg_Steps : 2.0308550185873604
ASC_Avg_Steps : 13.625650557620817
ASC_ACC : 0.42862453531598516
SC_ACC : 0.42973977695167287
ES_ACC : 0.42825278810408923
CS_ACC : 0.41263940520446096
SC_Avg_Steps : 40
ES_Avg_Steps : 15.794052044609666
CS_Avg_Steps : 11.759851301115242
ASC_Avg_Steps : 13.625650557620817
ASC_ACC : 0.4286245353159

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6913389590690696
0.41858736059479557
SC_ACC : 0.41858736059479557
ES_ACC : 0.4178438661710037
CS_ACC : 0.4144981412639405
SC_Avg_Steps : 40
ES_Avg_Steps : 15.603345724907063
CS_Avg_Steps : 3.4014869888475836
ASC_Avg_Steps : 13.25278810408922
ASC_ACC : 0.41821561338289964
SC_ACC : 0.41858736059479557
ES_ACC : 0.4178438661710037
CS_ACC : 0.4144981412639405
SC_Avg_Steps : 40
ES_Avg_Steps : 15.603345724907063
CS_Avg_Steps : 3.404089219330855
ASC_Avg_Steps : 13.25278810408922
ASC_ACC : 0.41821561338289964
SC_ACC : 0.41858736059479557
ES_ACC : 0.4178438661710037
CS_ACC : 0.4144981412639405
SC_Avg_Steps : 40
ES_Avg_Steps : 15.603345724907063
CS_Avg_Steps : 3.4014869888475836
ASC_Avg_Steps : 13.25278810408922
ASC_ACC : 0.41821561338289964
SC_ACC : 0.41858736059479557
ES_ACC : 0.4178438661710037
CS_ACC : 0.40817843866171005
SC_Avg_Steps : 40
ES_Avg_Steps : 15.603345724907063
CS_Avg_Steps : 5.562453531598513
ASC_Avg_Steps : 13.25278810408922
ASC_ACC : 0.41821561338289964
SC

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7152464341862736
0.43940520446096654
SC_ACC : 0.43940520446096654
ES_ACC : 0.43977695167286246
CS_ACC : 0.43382899628252786
SC_Avg_Steps : 40
ES_Avg_Steps : 15.656877323420074
CS_Avg_Steps : 3.3717472118959106
ASC_Avg_Steps : 13.104460966542751
ASC_ACC : 0.43977695167286246
SC_ACC : 0.43940520446096654
ES_ACC : 0.43977695167286246
CS_ACC : 0.4323420074349442
SC_Avg_Steps : 40
ES_Avg_Steps : 15.656877323420074
CS_Avg_Steps : 4.428252788104089
ASC_Avg_Steps : 13.104460966542751
ASC_ACC : 0.43977695167286246
SC_ACC : 0.43940520446096654
ES_ACC : 0.43977695167286246
CS_ACC : 0.43382899628252786
SC_Avg_Steps : 40
ES_Avg_Steps : 15.656877323420074
CS_Avg_Steps : 3.3717472118959106
ASC_Avg_Steps : 13.104460966542751
ASC_ACC : 0.43977695167286246
SC_ACC : 0.43940520446096654
ES_ACC : 0.43977695167286246
CS_ACC : 0.43271375464684014
SC_Avg_Steps : 40
ES_Avg_Steps : 15.656877323420074
CS_Avg_Steps : 5.334944237918216
ASC_Avg_Steps : 13.104460966542751
ASC_ACC : 0.4397769516

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6308688790527719
0.43866171003717475
SC_ACC : 0.43866171003717475
ES_ACC : 0.4390334572490706
CS_ACC : 0.39516728624535313
SC_Avg_Steps : 40
ES_Avg_Steps : 15.683271375464685
CS_Avg_Steps : 1.0148698884758365
ASC_Avg_Steps : 13.438289962825278
ASC_ACC : 0.4382899628252788
SC_ACC : 0.43866171003717475
ES_ACC : 0.4390334572490706
CS_ACC : 0.408550185873606
SC_Avg_Steps : 40
ES_Avg_Steps : 15.683271375464685
CS_Avg_Steps : 4.574721189591078
ASC_Avg_Steps : 13.438289962825278
ASC_ACC : 0.4382899628252788
SC_ACC : 0.43866171003717475
ES_ACC : 0.4390334572490706
CS_ACC : 0.4107806691449814
SC_Avg_Steps : 40
ES_Avg_Steps : 15.683271375464685
CS_Avg_Steps : 1.4817843866171003
ASC_Avg_Steps : 13.438289962825278
ASC_ACC : 0.4382899628252788
SC_ACC : 0.43866171003717475
ES_ACC : 0.4390334572490706
CS_ACC : 0.41821561338289964
SC_Avg_Steps : 40
ES_Avg_Steps : 15.683271375464685
CS_Avg_Steps : 11.232342007434944
ASC_Avg_Steps : 13.438289962825278
ASC_ACC : 0.4382899628252788
S

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6777882571084848
0.4382899628252788
SC_ACC : 0.4382899628252788
ES_ACC : 0.43717472118959105
CS_ACC : 0.43382899628252786
SC_Avg_Steps : 40
ES_Avg_Steps : 15.784758364312268
CS_Avg_Steps : 3.4308550185873607
ASC_Avg_Steps : 13.425278810408923
ASC_ACC : 0.4379182156133829
SC_ACC : 0.4382899628252788
ES_ACC : 0.43717472118959105
CS_ACC : 0.43643122676579926
SC_Avg_Steps : 40
ES_Avg_Steps : 15.784758364312268
CS_Avg_Steps : 4.346096654275093
ASC_Avg_Steps : 13.425278810408923
ASC_ACC : 0.4379182156133829
SC_ACC : 0.4382899628252788
ES_ACC : 0.43717472118959105
CS_ACC : 0.4349442379182156
SC_Avg_Steps : 40
ES_Avg_Steps : 15.784758364312268
CS_Avg_Steps : 3.509293680297398
ASC_Avg_Steps : 13.425278810408923
ASC_ACC : 0.4379182156133829
SC_ACC : 0.4382899628252788
ES_ACC : 0.43717472118959105
CS_ACC : 0.43531598513011155
SC_Avg_Steps : 40
ES_Avg_Steps : 15.784758364312268
CS_Avg_Steps : 5.742007434944238
ASC_Avg_Steps : 13.425278810408923
ASC_ACC : 0.4379182156133829
SC

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6122483644272863
0.4278810408921933
SC_ACC : 0.4278810408921933
ES_ACC : 0.42973977695167287
CS_ACC : 0.39219330855018586
SC_Avg_Steps : 40
ES_Avg_Steps : 15.50631970260223
CS_Avg_Steps : 1.0096654275092938
ASC_Avg_Steps : 13.137174721189592
ASC_ACC : 0.42825278810408923
SC_ACC : 0.4278810408921933
ES_ACC : 0.42973977695167287
CS_ACC : 0.3992565055762082
SC_Avg_Steps : 40
ES_Avg_Steps : 15.50631970260223
CS_Avg_Steps : 3.321933085501859
ASC_Avg_Steps : 13.137174721189592
ASC_ACC : 0.42825278810408923
SC_ACC : 0.4278810408921933
ES_ACC : 0.42973977695167287
CS_ACC : 0.39144981412639407
SC_Avg_Steps : 40
ES_Avg_Steps : 15.50631970260223
CS_Avg_Steps : 1.300743494423792
ASC_Avg_Steps : 13.137174721189592
ASC_ACC : 0.42825278810408923
SC_ACC : 0.4278810408921933
ES_ACC : 0.42973977695167287
CS_ACC : 0.4063197026022305
SC_Avg_Steps : 40
ES_Avg_Steps : 15.50631970260223
CS_Avg_Steps : 9.391078066914497
ASC_Avg_Steps : 13.137174721189592
ASC_ACC : 0.42825278810408923
SC_

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6502978943577011
0.4401486988847584
SC_ACC : 0.4401486988847584
ES_ACC : 0.4390334572490706
CS_ACC : 0.40483271375464686
SC_Avg_Steps : 40
ES_Avg_Steps : 15.571003717472118
CS_Avg_Steps : 1.0252788104089219
ASC_Avg_Steps : 13.394795539033458
ASC_ACC : 0.44052044609665425
SC_ACC : 0.4401486988847584
ES_ACC : 0.4390334572490706
CS_ACC : 0.4
SC_Avg_Steps : 40
ES_Avg_Steps : 15.571003717472118
CS_Avg_Steps : 4.717472118959108
ASC_Avg_Steps : 13.394795539033458
ASC_ACC : 0.44052044609665425
SC_ACC : 0.4401486988847584
ES_ACC : 0.4390334572490706
CS_ACC : 0.4
SC_Avg_Steps : 40
ES_Avg_Steps : 15.571003717472118
CS_Avg_Steps : 1.4096654275092937
ASC_Avg_Steps : 13.394795539033458
ASC_ACC : 0.44052044609665425
SC_ACC : 0.4401486988847584
ES_ACC : 0.4390334572490706
CS_ACC : 0.41598513011152416
SC_Avg_Steps : 40
ES_Avg_Steps : 15.571003717472118
CS_Avg_Steps : 9.48810408921933
ASC_Avg_Steps : 13.394795539033458
ASC_ACC : 0.44052044609665425
SC_ACC : 0.4401486988847584
ES_AC

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7053280391235832
0.4345724907063197
SC_ACC : 0.4345724907063197
ES_ACC : 0.43048327137546466
CS_ACC : 0.4390334572490706
SC_Avg_Steps : 40
ES_Avg_Steps : 15.379553903345725
CS_Avg_Steps : 3.784014869888476
ASC_Avg_Steps : 13.026765799256506
ASC_ACC : 0.43382899628252786
SC_ACC : 0.4345724907063197
ES_ACC : 0.43048327137546466
CS_ACC : 0.44275092936802973
SC_Avg_Steps : 40
ES_Avg_Steps : 15.379553903345725
CS_Avg_Steps : 5.078066914498141
ASC_Avg_Steps : 13.026765799256506
ASC_ACC : 0.43382899628252786
SC_ACC : 0.4345724907063197
ES_ACC : 0.43048327137546466
CS_ACC : 0.4390334572490706
SC_Avg_Steps : 40
ES_Avg_Steps : 15.379553903345725
CS_Avg_Steps : 3.8520446096654277
ASC_Avg_Steps : 13.026765799256506
ASC_ACC : 0.43382899628252786
SC_ACC : 0.4345724907063197
ES_ACC : 0.43048327137546466
CS_ACC : 0.4434944237918216
SC_Avg_Steps : 40
ES_Avg_Steps : 15.379553903345725
CS_Avg_Steps : 6.654646840148699
ASC_Avg_Steps : 13.026765799256506
ASC_ACC : 0.43382899628252786


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7233039359681711
0.4308550185873606
SC_ACC : 0.4308550185873606
ES_ACC : 0.43048327137546466
CS_ACC : 0.4349442379182156
SC_Avg_Steps : 40
ES_Avg_Steps : 15.15278810408922
CS_Avg_Steps : 3.4773234200743492
ASC_Avg_Steps : 13.114126394052045
ASC_ACC : 0.4312267657992565
SC_ACC : 0.4308550185873606
ES_ACC : 0.43048327137546466
CS_ACC : 0.43531598513011155
SC_Avg_Steps : 40
ES_Avg_Steps : 15.15278810408922
CS_Avg_Steps : 5.485873605947956
ASC_Avg_Steps : 13.114126394052045
ASC_ACC : 0.4312267657992565
SC_ACC : 0.4308550185873606
ES_ACC : 0.43048327137546466
CS_ACC : 0.4390334572490706
SC_Avg_Steps : 40
ES_Avg_Steps : 15.15278810408922
CS_Avg_Steps : 3.6267657992565057
ASC_Avg_Steps : 13.114126394052045
ASC_ACC : 0.4312267657992565
SC_ACC : 0.4308550185873606
ES_ACC : 0.43048327137546466
CS_ACC : 0.4312267657992565
SC_Avg_Steps : 40
ES_Avg_Steps : 15.15278810408922
CS_Avg_Steps : 7.668773234200743
ASC_Avg_Steps : 13.114126394052045
ASC_ACC : 0.4312267657992565
SC_ACC 

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6371763150722161
0.42862453531598516
SC_ACC : 0.42862453531598516
ES_ACC : 0.42602230483271375
CS_ACC : 0.3892193308550186
SC_Avg_Steps : 40
ES_Avg_Steps : 15.738289962825279
CS_Avg_Steps : 1.0531598513011153
ASC_Avg_Steps : 13.61375464684015
ASC_ACC : 0.42936802973977695
SC_ACC : 0.42862453531598516
ES_ACC : 0.42602230483271375
CS_ACC : 0.3996282527881041
SC_Avg_Steps : 40
ES_Avg_Steps : 15.738289962825279
CS_Avg_Steps : 5.199628252788104
ASC_Avg_Steps : 13.61375464684015
ASC_ACC : 0.42936802973977695
SC_ACC : 0.42862453531598516
ES_ACC : 0.42602230483271375
CS_ACC : 0.3869888475836431
SC_Avg_Steps : 40
ES_Avg_Steps : 15.738289962825279
CS_Avg_Steps : 2.1018587360594796
ASC_Avg_Steps : 13.61375464684015
ASC_ACC : 0.42936802973977695
SC_ACC : 0.42862453531598516
ES_ACC : 0.42602230483271375
CS_ACC : 0.40929368029739777
SC_Avg_Steps : 40
ES_Avg_Steps : 15.738289962825279
CS_Avg_Steps : 9.737546468401487
ASC_Avg_Steps : 13.61375464684015
ASC_ACC : 0.4293680297397769



                           Logit Regression Results                           
Dep. Variable:            Correctness   No. Observations:               250960
Model:                          Logit   Df Residuals:                   250956
Method:                           MLE   Df Model:                            3
Date:                Wed, 29 May 2024   Pseudo R-squ.:                  0.1452
Time:                        15:34:27   Log-Likelihood:            -1.4362e+05
converged:                      False   LL-Null:                   -1.6800e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1165      0.016    -68.674      0.000      -1.148      -1.085
QUA_IM        -1.2144      0.097    -12.554      0.000      -1.404      -1.025
DIF_IV       -22.5184    427.260     -0.053      0.9

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


SC_ACC : 0.44535315985130114
ES_ACC : 0.4479553903345725
CS_ACC : 0.43048327137546466
SC_Avg_Steps : 40
ES_Avg_Steps : 15.767657992565056
CS_Avg_Steps : 3.2929368029739776
ASC_Avg_Steps : 13.463940520446096
ASC_ACC : 0.44535315985130114
SC_ACC : 0.44535315985130114
ES_ACC : 0.4479553903345725
CS_ACC : 0.43048327137546466
SC_Avg_Steps : 40
ES_Avg_Steps : 15.767657992565056
CS_Avg_Steps : 3.304460966542751
ASC_Avg_Steps : 13.463940520446096
ASC_ACC : 0.44535315985130114
SC_ACC : 0.44535315985130114
ES_ACC : 0.4479553903345725
CS_ACC : 0.43048327137546466
SC_Avg_Steps : 40
ES_Avg_Steps : 15.767657992565056
CS_Avg_Steps : 3.300743494423792
ASC_Avg_Steps : 13.463940520446096
ASC_ACC : 0.44535315985130114
SC_ACC : 0.44535315985130114
ES_ACC : 0.4479553903345725
CS_ACC : 0.43977695167286246
SC_Avg_Steps : 40
ES_Avg_Steps : 15.767657992565056
CS_Avg_Steps : 6.126022304832714
ASC_Avg_Steps : 13.463940520446096
ASC_ACC : 0.44535315985130114
SC_ACC : 0.44535315985130114
ES_ACC : 0.447955390334572

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7159823402429485
0.4342007434944238
SC_ACC : 0.4342007434944238
ES_ACC : 0.43717472118959105
CS_ACC : 0.42602230483271375
SC_Avg_Steps : 40
ES_Avg_Steps : 15.536802973977695
CS_Avg_Steps : 3.354646840148699
ASC_Avg_Steps : 13.486245353159852
ASC_ACC : 0.43271375464684014
SC_ACC : 0.4342007434944238
ES_ACC : 0.43717472118959105
CS_ACC : 0.4245353159851301
SC_Avg_Steps : 40
ES_Avg_Steps : 15.536802973977695
CS_Avg_Steps : 4.411152416356877
ASC_Avg_Steps : 13.486245353159852
ASC_ACC : 0.43271375464684014
SC_ACC : 0.4342007434944238
ES_ACC : 0.43717472118959105
CS_ACC : 0.42602230483271375
SC_Avg_Steps : 40
ES_Avg_Steps : 15.536802973977695
CS_Avg_Steps : 3.358364312267658
ASC_Avg_Steps : 13.486245353159852
ASC_ACC : 0.43271375464684014
SC_ACC : 0.4342007434944238
ES_ACC : 0.43717472118959105
CS_ACC : 0.4308550185873606
SC_Avg_Steps : 40
ES_Avg_Steps : 15.536802973977695
CS_Avg_Steps : 5.180297397769516
ASC_Avg_Steps : 13.486245353159852
ASC_ACC : 0.43271375464684014


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.643655341058799
0.4513011152416357
SC_ACC : 0.4513011152416357
ES_ACC : 0.4527881040892193
CS_ACC : 0.4018587360594795
SC_Avg_Steps : 40
ES_Avg_Steps : 14.958364312267658
CS_Avg_Steps : 1.013011152416357
ASC_Avg_Steps : 12.69182156133829
ASC_ACC : 0.45055762081784384
SC_ACC : 0.4513011152416357
ES_ACC : 0.4527881040892193
CS_ACC : 0.4230483271375465
SC_Avg_Steps : 40
ES_Avg_Steps : 14.958364312267658
CS_Avg_Steps : 4.360594795539034
ASC_Avg_Steps : 12.69182156133829
ASC_ACC : 0.45055762081784384
SC_ACC : 0.4513011152416357
ES_ACC : 0.4527881040892193
CS_ACC : 0.42490706319702604
SC_Avg_Steps : 40
ES_Avg_Steps : 14.958364312267658
CS_Avg_Steps : 1.4486988847583644
ASC_Avg_Steps : 12.69182156133829
ASC_ACC : 0.45055762081784384
SC_ACC : 0.4513011152416357
ES_ACC : 0.4527881040892193
CS_ACC : 0.4256505576208178
SC_Avg_Steps : 40
ES_Avg_Steps : 14.958364312267658
CS_Avg_Steps : 11.252416356877323
ASC_Avg_Steps : 12.69182156133829
ASC_ACC : 0.45055762081784384
SC_ACC :

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7213538896080434
0.4434944237918216
SC_ACC : 0.4434944237918216
ES_ACC : 0.4434944237918216
CS_ACC : 0.44721189591078064
SC_Avg_Steps : 40
ES_Avg_Steps : 15.256133828996283
CS_Avg_Steps : 3.470631970260223
ASC_Avg_Steps : 12.970631970260223
ASC_ACC : 0.44423791821561337
SC_ACC : 0.4434944237918216
ES_ACC : 0.4434944237918216
CS_ACC : 0.45762081784386616
SC_Avg_Steps : 40
ES_Avg_Steps : 15.256133828996283
CS_Avg_Steps : 4.239776951672862
ASC_Avg_Steps : 12.970631970260223
ASC_ACC : 0.44423791821561337
SC_ACC : 0.4434944237918216
ES_ACC : 0.4434944237918216
CS_ACC : 0.44721189591078064
SC_Avg_Steps : 40
ES_Avg_Steps : 15.256133828996283
CS_Avg_Steps : 3.4817843866171003
ASC_Avg_Steps : 12.970631970260223
ASC_ACC : 0.44423791821561337
SC_ACC : 0.4434944237918216
ES_ACC : 0.4434944237918216
CS_ACC : 0.44535315985130114
SC_Avg_Steps : 40
ES_Avg_Steps : 15.256133828996283
CS_Avg_Steps : 8.651301115241635
ASC_Avg_Steps : 12.970631970260223
ASC_ACC : 0.44423791821561337
S

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.697699323119736
0.43308550185873607
SC_ACC : 0.43308550185873607
ES_ACC : 0.43382899628252786
CS_ACC : 0.4349442379182156
SC_Avg_Steps : 40
ES_Avg_Steps : 15.66728624535316
CS_Avg_Steps : 3.627881040892193
ASC_Avg_Steps : 13.348698884758365
ASC_ACC : 0.43159851301115243
SC_ACC : 0.43308550185873607
ES_ACC : 0.43382899628252786
CS_ACC : 0.4345724907063197
SC_Avg_Steps : 40
ES_Avg_Steps : 15.66728624535316
CS_Avg_Steps : 5.059851301115241
ASC_Avg_Steps : 13.348698884758365
ASC_ACC : 0.43159851301115243
SC_ACC : 0.43308550185873607
ES_ACC : 0.43382899628252786
CS_ACC : 0.43531598513011155
SC_Avg_Steps : 40
ES_Avg_Steps : 15.66728624535316
CS_Avg_Steps : 3.725650557620818
ASC_Avg_Steps : 13.348698884758365
ASC_ACC : 0.43159851301115243
SC_ACC : 0.43308550185873607
ES_ACC : 0.43382899628252786
CS_ACC : 0.42825278810408923
SC_Avg_Steps : 40
ES_Avg_Steps : 15.66728624535316
CS_Avg_Steps : 6.484014869888476
ASC_Avg_Steps : 13.348698884758365
ASC_ACC : 0.43159851301115243


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7236823717897872
0.43531598513011155
SC_ACC : 0.43531598513011155
ES_ACC : 0.43531598513011155
CS_ACC : 0.43531598513011155
SC_Avg_Steps : 40
ES_Avg_Steps : 15.350929368029739
CS_Avg_Steps : 3.4178438661710038
ASC_Avg_Steps : 13.129368029739776
ASC_ACC : 0.43605947955390334
SC_ACC : 0.43531598513011155
ES_ACC : 0.43531598513011155
CS_ACC : 0.4368029739776952
SC_Avg_Steps : 40
ES_Avg_Steps : 15.350929368029739
CS_Avg_Steps : 5.081412639405205
ASC_Avg_Steps : 13.129368029739776
ASC_ACC : 0.43605947955390334
SC_ACC : 0.43531598513011155
ES_ACC : 0.43531598513011155
CS_ACC : 0.4379182156133829
SC_Avg_Steps : 40
ES_Avg_Steps : 15.350929368029739
CS_Avg_Steps : 3.553531598513011
ASC_Avg_Steps : 13.129368029739776
ASC_ACC : 0.43605947955390334
SC_ACC : 0.43531598513011155
ES_ACC : 0.43531598513011155
CS_ACC : 0.4301115241635688
SC_Avg_Steps : 40
ES_Avg_Steps : 15.350929368029739
CS_Avg_Steps : 7.356505576208178
ASC_Avg_Steps : 13.129368029739776
ASC_ACC : 0.4360594795539

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.6524367206403068
0.4356877323420074
SC_ACC : 0.4356877323420074
ES_ACC : 0.43531598513011155
CS_ACC : 0.3940520446096654
SC_Avg_Steps : 40
ES_Avg_Steps : 15.449814126394052
CS_Avg_Steps : 1.0364312267657994
ASC_Avg_Steps : 13.291449814126395
ASC_ACC : 0.4345724907063197
SC_ACC : 0.4356877323420074
ES_ACC : 0.43531598513011155
CS_ACC : 0.40111524163568774
SC_Avg_Steps : 40
ES_Avg_Steps : 15.449814126394052
CS_Avg_Steps : 4.763940520446097
ASC_Avg_Steps : 13.291449814126395
ASC_ACC : 0.4345724907063197
SC_ACC : 0.4356877323420074
ES_ACC : 0.43531598513011155
CS_ACC : 0.39144981412639407
SC_Avg_Steps : 40
ES_Avg_Steps : 15.449814126394052
CS_Avg_Steps : 1.7144981412639406
ASC_Avg_Steps : 13.291449814126395
ASC_ACC : 0.4345724907063197
SC_ACC : 0.4356877323420074
ES_ACC : 0.43531598513011155
CS_ACC : 0.4141263940520446
SC_Avg_Steps : 40
ES_Avg_Steps : 15.449814126394052
CS_Avg_Steps : 9.71449814126394
ASC_Avg_Steps : 13.291449814126395
ASC_ACC : 0.4345724907063197
SC_



                           Logit Regression Results                           
Dep. Variable:            Correctness   No. Observations:               250960
Model:                          Logit   Df Residuals:                   250955
Method:                           MLE   Df Model:                            4
Date:                Wed, 29 May 2024   Pseudo R-squ.:                  0.1638
Time:                        15:35:01   Log-Likelihood:            -1.4116e+05
converged:                      False   LL-Null:                   -1.6880e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -1.1363      0.022    -50.540      0.000      -1.180      -1.092
LEN               -0.0462      0.001    -39.004      0.000      -0.049      -0.044
DIF_IV           -20.0945    129.910

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


SC_ACC : 0.42379182156133827
ES_ACC : 0.42267657992565055
CS_ACC : 0.4263940520446097
SC_Avg_Steps : 40
ES_Avg_Steps : 15.914869888475836
CS_Avg_Steps : 3.288475836431227
ASC_Avg_Steps : 13.4182156133829
ASC_ACC : 0.4230483271375465
SC_ACC : 0.42379182156133827
ES_ACC : 0.42267657992565055
CS_ACC : 0.4241635687732342
SC_Avg_Steps : 40
ES_Avg_Steps : 15.914869888475836
CS_Avg_Steps : 5.250557620817844
ASC_Avg_Steps : 13.4182156133829
ASC_ACC : 0.4230483271375465
SC_ACC : 0.42379182156133827
ES_ACC : 0.42267657992565055
CS_ACC : 0.4301115241635688
SC_Avg_Steps : 40
ES_Avg_Steps : 15.914869888475836
CS_Avg_Steps : 3.4230483271375465
ASC_Avg_Steps : 13.4182156133829
ASC_ACC : 0.4230483271375465
SC_ACC : 0.42379182156133827
ES_ACC : 0.42267657992565055
CS_ACC : 0.42602230483271375
SC_Avg_Steps : 40
ES_Avg_Steps : 15.914869888475836
CS_Avg_Steps : 7.703717472118959
ASC_Avg_Steps : 13.4182156133829
ASC_ACC : 0.4230483271375465
SC_ACC : 0.42379182156133827
ES_ACC : 0.42267657992565055
CS_ACC :

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7120629397493203
0.4263940520446097
SC_ACC : 0.4263940520446097
ES_ACC : 0.42602230483271375
CS_ACC : 0.42156133828996284
SC_Avg_Steps : 40
ES_Avg_Steps : 15.372862453531598
CS_Avg_Steps : 3.23271375464684
ASC_Avg_Steps : 13.123048327137546
ASC_ACC : 0.4256505576208178
SC_ACC : 0.4263940520446097
ES_ACC : 0.42602230483271375
CS_ACC : 0.4342007434944238
SC_Avg_Steps : 40
ES_Avg_Steps : 15.372862453531598
CS_Avg_Steps : 3.9479553903345725
ASC_Avg_Steps : 13.123048327137546
ASC_ACC : 0.4256505576208178
SC_ACC : 0.4263940520446097
ES_ACC : 0.42602230483271375
CS_ACC : 0.42156133828996284
SC_Avg_Steps : 40
ES_Avg_Steps : 15.372862453531598
CS_Avg_Steps : 3.2401486988847585
ASC_Avg_Steps : 13.123048327137546
ASC_ACC : 0.4256505576208178
SC_ACC : 0.4263940520446097
ES_ACC : 0.42602230483271375
CS_ACC : 0.4368029739776952
SC_Avg_Steps : 40
ES_Avg_Steps : 15.372862453531598
CS_Avg_Steps : 7.765055762081785
ASC_Avg_Steps : 13.123048327137546
ASC_ACC : 0.4256505576208178
SC_

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


The AUROC score is: 0.7348129380531436
0.4379182156133829
SC_ACC : 0.4379182156133829
ES_ACC : 0.43940520446096654
CS_ACC : 0.4301115241635688
SC_Avg_Steps : 40
ES_Avg_Steps : 15.415613382899629
CS_Avg_Steps : 3.5401486988847584
ASC_Avg_Steps : 13.288475836431227
ASC_ACC : 0.43866171003717475
SC_ACC : 0.4379182156133829
ES_ACC : 0.43940520446096654
CS_ACC : 0.43940520446096654
SC_Avg_Steps : 40
ES_Avg_Steps : 15.415613382899629
CS_Avg_Steps : 5.805576208178438
ASC_Avg_Steps : 13.288475836431227
ASC_ACC : 0.43866171003717475
SC_ACC : 0.4379182156133829
ES_ACC : 0.43940520446096654
CS_ACC : 0.43197026022304835
SC_Avg_Steps : 40
ES_Avg_Steps : 15.415613382899629
CS_Avg_Steps : 3.7215613382899626
ASC_Avg_Steps : 13.288475836431227
ASC_ACC : 0.43866171003717475
SC_ACC : 0.4379182156133829
ES_ACC : 0.43940520446096654
CS_ACC : 0.4412639405204461
SC_Avg_Steps : 40
ES_Avg_Steps : 15.415613382899629
CS_Avg_Steps : 8.104460966542751
ASC_Avg_Steps : 13.288475836431227
ASC_ACC : 0.4386617100371747

In [5]:
# Display the current value of `base_acc` (fraction of rows with SC_correctness set).
# NOTE(review): `base_acc` is reassigned inside the feature-set loop in the cell above,
# so the number shown here depends on which cell ran last — confirm after a clean re-run.
base_acc

0.44275092936802973

In [10]:
# Report, per feature subset, the configuration stored in `results`:
# first the subset itself, then its AUROC and early-stopping statistics.
for feature_set, params in results.items():
    print(
        f"Feature set: {feature_set}",
        f"ROCAUC: {params['rocauc']}, N: {params['N']}, Threshold: {params['threshold']}, Steps: {params['steps']}, CS Accuracy: {params['cs_acc']}",
        sep="\n",
    )

Feature set: ('LEN',)
ROCAUC: 0.5681688900197942, N: 5, Threshold: 0.1, Steps: 4.027509293680297, CS Accuracy: 0.416728624535316
Feature set: ('QUA_IM',)
ROCAUC: 0.5018227890000243, N: 5, Threshold: 0.1, Steps: 4.0, CS Accuracy: 0.420817843866171
Feature set: ('DIF_IV',)
ROCAUC: 0.6256168019987508, N: 4, Threshold: 0.1, Steps: 6.052044609665428, CS Accuracy: 0.4594795539033457
Feature set: ('SIM_INPUT',)
ROCAUC: 0.5986919340811416, N: 5, Threshold: 0.2, Steps: 4.092193308550186, CS Accuracy: 0.43382899628252786
Feature set: ('SIM_COT_BIGRAM',)
ROCAUC: 0.6320711806676031, N: 5, Threshold: 0.1, Steps: 4.0, CS Accuracy: 0.41933085501858736
Feature set: ('LEN', 'QUA_IM')
ROCAUC: 0.5648056186777022, N: 5, Threshold: 0.1, Steps: 4.020817843866171, CS Accuracy: 0.41821561338289964
Feature set: ('LEN', 'DIF_IV')
ROCAUC: 0.6789248135756595, N: 2, Threshold: 0.1, Steps: 3.64907063197026, CS Accuracy: 0.42825278810408923
Feature set: ('LEN', 'SIM_INPUT')
ROCAUC: 0.6145836258925844, N: 5, Threshol

# Logistic Regression Feature Selection

In [21]:
# Feature columns and the `Correctness` target used by the feature-selection study.
# NOTE(review): `feature_li` is not defined in this cell; it is whatever the kernel last
# bound it to (the RFECV output further down ranks eight features) — define it explicitly
# here so the section survives Restart & Run All.
X = df[feature_li]
y = df['Correctness']

In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif
import os
import sys
import statsmodels.api as sm
# Row position separating the first 80% of `df` from the rest; the next cell recomputes it.
split_idx = int(len(df) * 0.8)

In [23]:
# Ordered (unshuffled) hold-out split: leading rows train, trailing rows test.
split_idx = int(0.8 * len(df))  # Assuming an 80/20 train/test split

# Splitting data: features first, then the matching `Correctness` labels.
X_train, X_test = df[feature_li].iloc[:split_idx], df[feature_li].iloc[split_idx:]
y_train, y_test = df['Correctness'].iloc[:split_idx], df['Correctness'].iloc[split_idx:]

In [24]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


# Recursive feature elimination with 5-fold stratified cross-validation,
# dropping one feature per step and scoring each subset by ROC AUC.
model = LogisticRegression(solver='liblinear')
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',  # Adjust scoring to your needs
).fit(X_train, y_train)

# Summary of what the cross-validated elimination kept.
print("Optimal number of features:", rfecv.n_features_)
print("Ranking of features:", rfecv.ranking_)
print("Selected features:", X_train.columns[rfecv.support_])

Optimal number of features: 7
Ranking of features: [1 1 1 1 1 1 2 1]
Selected features: Index(['QUA_IM', 'DIF_IV', 'SIM_COT_BIGRAM', 'SIM_COT_AGG', 'SIM_AC_BIGRAM',
       'SIM_INPUT', 'SIM_AC_PW'],
      dtype='object')


In [12]:
def logistic_regression_full_features(X_train, y_train, X_test, y_test):
    """Fit a statsmodels Logit on every column of ``X_train`` and report test AUROC.

    Args:
        X_train: Training feature table (all columns are used as regressors).
        y_train: Training labels passed to ``sm.Logit`` as the dependent variable.
        X_test: Held-out feature table with the same columns as ``X_train``.
        y_test: Held-out labels used for the AUROC.

    Returns:
        None. The fitted-model summary and the AUROC are printed.
    """
    # has_constant='add' always prepends the intercept column. The default
    # ('skip') silently omits it when any existing column happens to be constant
    # in that split, which would give train and test different design matrices.
    X_train_full = sm.add_constant(X_train, has_constant='add')
    X_test_full = sm.add_constant(X_test, has_constant='add')

    model_full = sm.Logit(y_train, X_train_full)
    result_full = model_full.fit()

    # Predicted probabilities on the held-out rows, scored by ranking quality.
    y_pred_proba_full = result_full.predict(X_test_full)
    auroc_full = roc_auc_score(y_test, y_pred_proba_full)

    print("\nLogistic Regression with all features:")
    print(result_full.summary())
    print(f'AUROC with all features: {auroc_full}')

def feature_selection_and_evaluation(X_train, X_test, y_train, y_test, n_features_to_select=6):
    """Select features with RFE, refit a statsmodels Logit on them, and report test AUROC.

    Note the argument order (``X_train, X_test, y_train, y_test``) differs from
    ``logistic_regression_full_features``.

    Args:
        X_train: Training feature DataFrame; its column names label the selection.
        X_test: Held-out features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Held-out labels used for the AUROC.
        n_features_to_select: Upper bound on the number of features RFE keeps
            (capped at the number of available columns). Defaults to 6.

    Returns:
        None. The selected features, model summary and AUROC are printed.
    """
    estimator = LogisticRegression(C = 0.1, penalty = 'l1', solver='liblinear')
    selector = RFE(estimator, n_features_to_select=min(n_features_to_select, X_train.shape[1]))
    selector.fit(X_train, y_train)

    selected_features = X_train.columns[selector.get_support()]
    print("\nFeatures selected by RFE:", selected_features.tolist())

    # Re-attach column names (and the original row index) to the reduced matrices so
    # the regression summary lists real feature names instead of x1..xN.
    X_train_selected = pd.DataFrame(
        selector.transform(X_train), columns=selected_features, index=X_train.index
    )
    X_test_selected = pd.DataFrame(
        selector.transform(X_test), columns=selected_features, index=getattr(X_test, "index", None)
    )

    # has_constant='add' always prepends the intercept; the default ('skip') drops it
    # whenever a selected column is constant in that split, misaligning train and test.
    X_train_selected = sm.add_constant(X_train_selected, has_constant='add')
    X_test_selected = sm.add_constant(X_test_selected, has_constant='add')

    model_rfe = sm.Logit(y_train, X_train_selected)
    result_rfe = model_rfe.fit()
    print(result_rfe.summary())

    y_pred_proba_rfe = result_rfe.predict(X_test_selected)
    auroc_rfe = roc_auc_score(y_test, y_pred_proba_rfe)
    print(f'AUROC with RFE selected features: {auroc_rfe}')

# Run the full feature logistic regression
# (argument order here is X_train, y_train, X_test, y_test)
logistic_regression_full_features(X_train, y_train, X_test, y_test)

# Run the feature selection and evaluation using RFE
# (argument order differs: X_train, X_test, y_train, y_test)
feature_selection_and_evaluation(X_train, X_test, y_train, y_test)


         Current function value: 0.510322
         Iterations: 35

Logistic Regression with all features:
                           Logit Regression Results                           
Dep. Variable:            Correctness   No. Observations:               305696
Model:                          Logit   Df Residuals:                   305687
Method:                           MLE   Df Model:                            8
Date:                Mon, 27 May 2024   Pseudo R-squ.:                  0.2565
Time:                        10:21:27   Log-Likelihood:            -1.5600e+05
converged:                      False   LL-Null:                   -2.0982e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -1.9641      0.023    -85.600      0.000      -2.009      -1.919
QUA_IM       

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):



Features selected by RFE: ['QUA_IM', 'DIF_IV', 'SIM_COT_BIGRAM', 'SIM_COT_AGG', 'SIM_INPUT', 'SIM_AC_AGG']
         Current function value: 0.517449
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            Correctness   No. Observations:               305696
Model:                          Logit   Df Residuals:                   305689
Method:                           MLE   Df Model:                            6
Date:                Mon, 27 May 2024   Pseudo R-squ.:                  0.2461
Time:                        10:21:31   Log-Likelihood:            -1.5818e+05
converged:                      False   LL-Null:                   -2.0982e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2103      0.019   -

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


# Grid Search on LR

In [16]:
import numpy as np
from sklearn.metrics import roc_auc_score
import warnings

# Silence the repeated "is_sparse is deprecated" FutureWarning that floods the earlier
# cell outputs. `message` is a regex matched against the start of the warning text.
warnings.filterwarnings("ignore", category=FutureWarning, message="is_sparse is deprecated")

def customized_LR(coe, intercept, X):
    """Score the rows of ``X`` with a logistic model whose parameters are set by hand.

    Args:
        coe: Mapping from feature name to coefficient; it must contain an entry
            for every column of ``X`` (extra entries are ignored).
        intercept: Scalar added to the linear term.
        X: Feature DataFrame; its column names pick the coefficients, in column order.

    Returns:
        numpy array with ``sigmoid(X @ coefficients + intercept)``, one value per row.

    Raises:
        KeyError: if one or more columns of ``X`` have no coefficient in ``coe``.
    """
    missing = [feature for feature in X.columns if feature not in coe]
    if missing:
        raise KeyError(f"No coefficient supplied for feature(s): {missing}")

    coe_array = np.array([coe[feature] for feature in X.columns])
    z = np.dot(X, coe_array) + intercept
    # sigmoid(z) == exp(-log(1 + exp(-z))). logaddexp evaluates the log term without
    # overflowing for very negative z, unlike the direct 1 / (1 + exp(-z)) form.
    pred_proba = np.exp(-np.logaddexp(0, -z))
    return pred_proba

def evaluate_params(coe, intercept, X_train, y_train, X_test, y_test):
    """Return ``(train_auroc, test_auroc)`` for one hand-set parameter combination.

    Each split is scored with ``customized_LR`` using the given coefficients and
    intercept, and the resulting probabilities are compared with the labels via
    ``roc_auc_score``.
    """
    train_auroc, test_auroc = (
        roc_auc_score(labels, customized_LR(coe, intercept, features))
        for features, labels in ((X_train, y_train), (X_test, y_test))
    )
    return train_auroc, test_auroc

def grid_search_customized_LR(X_train, y_train, X_test, y_test, verbose=True):
    """Exhaustively score a fixed grid of hand-set logistic-regression parameters.

    Every combination of three intercepts and three candidate values per feature is
    evaluated with ``evaluate_params``; the combination with the highest test AUROC
    is kept (first one wins on ties).

    NOTE(review): the winner is chosen on the test split, so the reported test AUROC
    is an optimistic estimate. AUROC depends only on the ranking of the scores, so
    candidates differing only in the intercept should score the same unless the
    sigmoid saturates.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for model selection.
        verbose: When True (default) print one line per evaluated combination.

    Returns:
        dict with keys ``'intercept'`` and ``'coe'`` (feature name -> coefficient)
        for the best combination, or None if no combination produced a comparable score.
    """
    intercept_range = np.linspace(-3, -1, 3)
    coe_ranges = {
        'QUA_IM': np.linspace(-2, 0, 3),
        'DIF_IV': np.linspace(-10, 0, 3),
        'SIM_COT_BIGRAM': np.linspace(-2, 0, 3),
        'SIM_COT_AGG': np.linspace(1, 3, 3),
        'SIM_INPUT': np.linspace(0, 2, 3),
        'SIM_AC_AGG': np.linspace(1, 3, 3)
    }

    feature_names = list(coe_ranges)
    # All coefficient combinations, one row per combination and one column per feature.
    # Built once (it does not depend on the intercept) and in the original enumeration
    # order, so tie-breaking is unchanged.
    coe_grid = np.array(np.meshgrid(*coe_ranges.values())).T.reshape(-1, len(feature_names))

    best_params = None
    # Initialised up front so the final report cannot hit an unbound name, and
    # -inf so that a candidate scoring exactly 0 is still recorded.
    best_train_auroc = None
    best_test_auroc = -np.inf

    for intercept in intercept_range:
        for coe_values in coe_grid:
            coe = dict(zip(feature_names, coe_values))
            train_auroc, test_auroc = evaluate_params(coe, intercept, X_train, y_train, X_test, y_test)

            if verbose:
                print(f"AUROC (Train): {train_auroc:.4f}, AUROC (Test): {test_auroc:.4f}, Parameters: intercept={intercept}, coe={coe}")

            if test_auroc > best_test_auroc:
                best_train_auroc = train_auroc
                best_test_auroc = test_auroc
                best_params = {'intercept': intercept, 'coe': coe}

    print("\nBest parameters found by grid search:")
    print(best_params)
    print("Best training AUROC score:", best_train_auroc)
    print("Best test AUROC score:", best_test_auroc)

    return best_params



# Run the grid search on the customized logistic regression
# The six names below match the keys of `coe_ranges` inside grid_search_customized_LR;
# customized_LR looks coefficients up by column name, so the two must stay in sync.
feature_li = ['QUA_IM', 'DIF_IV', 'SIM_COT_BIGRAM', 'SIM_COT_AGG', 'SIM_INPUT', 'SIM_AC_AGG']
# Narrow the existing train/test frames to those columns (rebinds X_train / X_test in place).
X_train = X_train[feature_li]
X_test = X_test[feature_li]
best_estimator = grid_search_customized_LR(X_train, y_train, X_test, y_test)

AUROC (Train): 0.7323, AUROC (Test): 0.6472, Parameters: intercept=-3.0, coe={'QUA_IM': -2.0, 'DIF_IV': -10.0, 'SIM_COT_BIGRAM': -2.0, 'SIM_COT_AGG': 1.0, 'SIM_INPUT': 0.0, 'SIM_AC_AGG': 1.0}
AUROC (Train): 0.7323, AUROC (Test): 0.6472, Parameters: intercept=-3.0, coe={'QUA_IM': -2.0, 'DIF_IV': -5.0, 'SIM_COT_BIGRAM': -2.0, 'SIM_COT_AGG': 1.0, 'SIM_INPUT': 0.0, 'SIM_AC_AGG': 1.0}
AUROC (Train): 0.5948, AUROC (Test): 0.4842, Parameters: intercept=-3.0, coe={'QUA_IM': -2.0, 'DIF_IV': 0.0, 'SIM_COT_BIGRAM': -2.0, 'SIM_COT_AGG': 1.0, 'SIM_INPUT': 0.0, 'SIM_AC_AGG': 1.0}
AUROC (Train): 0.7323, AUROC (Test): 0.6467, Parameters: intercept=-3.0, coe={'QUA_IM': -1.0, 'DIF_IV': -10.0, 'SIM_COT_BIGRAM': -2.0, 'SIM_COT_AGG': 1.0, 'SIM_INPUT': 0.0, 'SIM_AC_AGG': 1.0}
AUROC (Train): 0.7323, AUROC (Test): 0.6467, Parameters: intercept=-3.0, coe={'QUA_IM': -1.0, 'DIF_IV': -5.0, 'SIM_COT_BIGRAM': -2.0, 'SIM_COT_AGG': 1.0, 'SIM_INPUT': 0.0, 'SIM_AC_AGG': 1.0}
AUROC (Train): 0.5949, AUROC (Test): 0.4830,

# Evaluation on test data (Adaptive Self Consistency)

In [29]:
# Load the adaptive-consistency outputs (JSON Lines) used as the evaluation set.
# NOTE(review): this rebinds DATA_DIR and storage_dir, which the first cell of the
# notebook set to the Evaluation_CoTs location; re-running earlier cells afterwards
# without re-running the first cell will read from this directory instead.
DATA_DIR = '../../data/adaptive_consistency_outputs/'
storage_dir = os.path.join(DATA_DIR, 'Algo_Design_Data')
file_asc_path = os.path.join(storage_dir, 'final_asc_extracted.json')
df_asc = pd.read_json(file_asc_path, lines=True)
# prepare_df is not defined in this notebook (presumably from the `utils` star import —
# TODO confirm what it does to the feature columns).
df_asc = prepare_df(df_asc,feature_li)

In [30]:
def evaluate_params(estimator, df):
    """Return the AUROC of a hand-set estimator on ``df``.

    ``estimator`` is a dict with ``'coe'`` and ``'intercept'`` entries; the feature
    columns are taken from the notebook-level ``feature_li`` and the labels from
    the ``Correctness`` column.

    NOTE(review): this reuses the name of the six-argument ``evaluate_params`` that
    ``grid_search_customized_LR`` calls, so re-running the grid search after this
    cell will fail on the argument count — consider a distinct name.
    """
    features, labels = df[feature_li], df['Correctness']
    scores = customized_LR(estimator['coe'], estimator['intercept'], features)
    return roc_auc_score(labels, scores)
evaluate_params(best_estimator, df_asc)

0.9011956214257395

In [31]:
# Fit a logistic regression model on the training data
from sklearn.linear_model import LogisticRegression

# Training features restricted to the current feature list; labels are used as-is.
X_train_subset = X_train[feature_li]

lr_model = LogisticRegression(random_state=42).fit(X_train_subset, y_train)

# Evaluate the fitted logistic regression model on the new data
# NOTE(review): X_test / y_test are rebound to the ASC frame here, replacing the
# 20% hold-out created earlier — cells above will see the ASC data if re-run.
X_test, y_test = df_asc[feature_li], df_asc['Correctness']

# Column 1 of predict_proba corresponds to the second entry of lr_model.classes_.
y_test_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]
test_auroc_lr = roc_auc_score(y_test, y_test_pred_proba_lr)
print("Test AUROC score (Logistic Regression):", test_auroc_lr)

Test AUROC score (Logistic Regression): 0.8994950338486661


In [35]:
from CS_based_early_stopping import CS_early_stopping
from IDV_CS_Model import customized_LR_model

In [37]:


df_with_features = pd.read_json(file_asc_path, lines=True)

# Define the features list
feature_li = [
    # 'LEN',
    # 'QUA_IM',
    'DIF_IV',
    'SIM_COT_BIGRAM',
    'SIM_COT_AGG',
    'SIM_COT_PW',
    'SIM_AC_BIGRAM',
    'SIM_AC_AGG',
    # 'SIM_AC_PW',
]
# Hand-set logistic-regression parameters, passed alongside feature_li.
# NOTE(review): these six values sit inside the grid-search ranges for
# ['QUA_IM', 'DIF_IV', 'SIM_COT_BIGRAM', 'SIM_COT_AGG', 'SIM_INPUT', 'SIM_AC_AGG'],
# which is NOT the list above — confirm how customized_LR_model pairs them with features.
coe = [0, -10, -2, 3, 1, 2]
intercept = -1
assert len(coe) == len(feature_li), "coe must provide exactly one coefficient per feature"
# df_cs = trained_LR_model(df_with_features, feature_li, report_auroc=True)
df_cs_clr = customized_LR_model(df_with_features,feature_li,coe, intercept, report_auroc=True)
# NOTE(review): identical to the call above; the name suggests a fitted-LR variant was intended.
df_cs_lr = customized_LR_model(df_with_features,feature_li,coe, intercept, report_auroc=True)
# Early-stopping parameters
N = 3
threshold = 0.5
stop_mechanism = 'PositiveN'

# Applying early stopping mechanism
# Use the frame scored in this cell. The previous `df_cs` is not defined here and would
# silently pick up whatever an earlier cell left in the kernel.
df_final = CS_early_stopping(df=df_cs_clr, threshold=threshold, N=N, stop_mechanism=stop_mechanism)

Coefficients: [0, -10, -2, 3, 1, 2]
Intercept: -1
The AUROC score is: 0.7443992436703339
SC_ACC : 0.8276
ES_ACC : 0.8277
CS_ACC : 0.7671
SC_Avg_Steps : 40
ES_Avg_Steps : 11.612
CS_Avg_Steps : 39.9906
ASC_Avg_Steps : 9.4727
ASC_ACC : 0.8273
