In [None]:
import os
import numpy as np
import pandas as pd

### Root directory and the metric columns to maximize and minimize during evaluation

In [None]:
# Metric columns where a larger value is better.
columns_max = [
    'train_accuracy', 'test_accuracy', 'roc_auc',
    'precision_0', 'recall_0', 'f1_0',
    'precision_1', 'recall_1', 'f1_1',
    'ks_stat',
    'tp', 'tn',
]

# Alternative (smaller) metric sets that can be swapped in for columns_max:
# columns_max = ['train_accuracy', 'test_accuracy', 'roc_auc',
#                'f1_0', 'f1_1', 'ks_stat','tp', 'tn']
#
# columns_max = ['tp', 'tn']

# Metric columns where a smaller value is better.
columns_min = [
    'fp',
    'fn',
]

# Number of top-ranked rows kept per metric column.
top_n = 1000

# Header used when the evaluation metrics are stored to csv.
columns_name = [
    'method', 'k_neighbour',
    'train_accuracy', 'test_accuracy', 'roc_auc',
    'precision_0', 'recall_0', 'f1_0',
    'precision_1', 'recall_1', 'f1_1',
    'ks_stat', 'p_value',
    'tp', 'tn', 'fp', 'fn',
]

# Directory that is scanned for the per-method evaluation csv files.
root_dir = "./class_imbalance_methods_evaluations/"

# Function to search for the best value of k (number of neighbours) for each class-balancing method

In [None]:
def best_k_value(file_loc, top_n, max_cols=None, min_cols=None):
    """Select the rows of an evaluation csv whose k ranks in the top ``top_n`` for every metric.

    For each column in ``max_cols`` the ``top_n`` rows with the largest values
    are taken, and for each column in ``min_cols`` the ``top_n`` rows with the
    smallest values. Only rows whose ``k_neighbour`` value appears in *all* of
    those per-column selections are kept.

    Parameters
    ----------
    file_loc : str
        Path of the csv file to read. It must contain a ``k_neighbour`` column,
        a ``ks_stat`` column and every column named in ``max_cols``/``min_cols``.
    top_n : int
        Number of best rows considered per metric column.
    max_cols : list of str, optional
        Columns to maximize. Defaults to the notebook-level ``columns_max``.
    min_cols : list of str, optional
        Columns to minimize. Defaults to the notebook-level ``columns_min``.

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``ks_stat`` in ascending order. The frame
        is empty when no ``k_neighbour`` value is shared by every metric's
        selection, or when no metric columns are given.
    """
    # Fall back to the notebook-level metric lists when none are passed in.
    if max_cols is None:
        max_cols = columns_max
    if min_cols is None:
        min_cols = columns_min

    results = pd.read_csv(file_loc)

    # (column, ascending) pairs: descending sort puts the largest values first
    # for "maximize" columns, ascending puts the smallest first for "minimize".
    criteria = [(col, False) for col in max_cols] + [(col, True) for col in min_cols]

    # One set of candidate k values per metric column.
    candidate_sets = [
        set(results.sort_values(by=[col], ascending=ascending)[:top_n]['k_neighbour'])
        for col, ascending in criteria
    ]

    # Take a true intersection over all metrics. The earlier loop restarted
    # from the next metric's set whenever the running intersection became
    # empty, which could return k values that are NOT common to every metric.
    common_k = set.intersection(*candidate_sets) if candidate_sets else set()

    selected = results[results['k_neighbour'].isin(common_k)]

    # NOTE(review): ks_stat is listed among the columns to maximize in this
    # notebook, yet rows are ordered ascending here, so taking the first row
    # yields the lowest ks_stat among the candidates - confirm this is intended.
    return selected.sort_values(by=['ks_stat'], ascending=True)

# Start Point

In [None]:
# Only evaluation files belonging to this approach are processed.
# Change this value based on the approach, i.e. '1' or '2'.
target_approach = '2'

# Maps METHOD name -> list of candidate k values (in the order returned by best_k_value).
best_rows_values = {}

# Best row of every processed file; concatenated once after the loop instead
# of growing a DataFrame with pd.concat on every iteration.
selected_frames = []

for file in sorted(os.listdir(root_dir)):
    # File names are split on '_': part [1] is the approach, part [2] the method.
    name_parts = file.split('_')

    # Skip non-csv files, csv files whose name has too few '_'-separated parts
    # (these used to raise IndexError), and files of other approaches.
    if file[-3:] != 'csv' or len(name_parts) < 3 or name_parts[1] != target_approach:
        continue

    approach = name_parts[1]
    method_name = name_parts[2].upper()

    best_rows = best_k_value(os.path.join(root_dir, file), top_n)
    print(file, len(best_rows))

    best_rows_values[method_name] = list(best_rows['k_neighbour'].values)

    # Label every candidate row with the approach and method it came from.
    best_rows.insert(0, 'method', [f'Approach_{approach}_{method_name}'] * len(best_rows))

    # Keep only the first row (best_k_value returns rows sorted by ks_stat ascending).
    selected_frames.append(best_rows[:1])

if selected_frames:
    df = pd.concat(selected_frames)
    # Keep the csv header columns first, followed by any extra columns found in the files.
    extra_columns = [col for col in df.columns if col not in columns_name]
    df = df.reindex(columns=list(columns_name) + extra_columns)
else:
    # No matching file: keep an empty frame with the expected header.
    df = pd.DataFrame([], columns=columns_name)

In [None]:
# Persist the per-method candidate k values as a NumPy .npy file.
# The payload is a dict, so NumPy stores it as a pickled object; it can be read
# back with np.load(path, allow_pickle=True).item().
best_neighbour_path = os.path.join(root_dir, 'best_neighbour_values.npy')
np.save(best_neighbour_path, best_rows_values)