In [1]:
import json
import pandas as pd
import os
from tqdm.notebook import tqdm

### This notebook uses the LOFO feature rankings to create 499 subsets each for the training and test data: subset X contains the top X ranked features, and `feature0` is always included.

In [2]:
# Load the feature rankings.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform's default text encoding.
with open('../metrics/accuracy_rankings.json', 'r', encoding='utf-8') as f:
    rankings = json.load(f)

In [3]:
# Read the training data from its CSV file and list the columns it contains.
train_csv_path = '../data/X_train_encoded.csv'
data = pd.read_csv(train_csv_path)
print(data.columns)

Index(['feature0', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5',
       'feature6', 'feature7', 'feature8', 'feature9',
       ...
       'feature490', 'feature491', 'feature492', 'feature493', 'feature494',
       'feature495', 'feature496', 'feature497', 'feature498', 'feature499'],
      dtype='object', length=500)


In [4]:
# Training subsets are written here; the folder (and any missing parent
# folders) is created if it does not exist yet.
output_dir = '../data/ranked_subsets/train_features_csv'
os.makedirs(name=output_dir, exist_ok=True)

In [5]:
# Turn rankings into a list and extract column names.
# `rankings` is read as {"1": idx, "2": idx, ...}: each rank maps to the
# position of a column in `data`, and rank "1" is treated as the top feature.
feature_list = [int(rankings[str(i)]) for i in range(1, len(rankings) + 1)]
all_columns = data.columns.tolist()

# Fail fast on a rankings file that does not match this frame: a negative index
# would silently wrap around, and a repeated index would write a column twice.
assert all(0 <= idx < len(all_columns) for idx in feature_list), \
    "rankings contain an index outside the column range"
assert len(set(feature_list)) == len(feature_list), \
    "rankings contain a repeated feature index"

# Resolve ranks to column names once instead of on every loop iteration.
ranked_columns = [all_columns[idx] for idx in feature_list]

# Largest "top X" subset to write (subsets 1..499).
MAX_TOP_FEATURES = 499

# Generate CSV files for TOP "X" features.
for i in tqdm(range(1, MAX_TOP_FEATURES + 1), desc="Generating CSV files"):
    top_features = ranked_columns[:i]
    # 'feature0' is kept in every subset; when it is not among the top i it
    # goes first. Choosing the columns up front avoids an in-place insert on
    # the selected frame.
    if 'feature0' not in top_features:
        top_features = ['feature0'] + top_features
    selected_data = data[top_features]

    # Save to CSV.
    output_file = os.path.join(output_dir, f'top_{i}_features.csv')
    selected_data.to_csv(output_file, index=False)
print("All CSV files have been generated.")

Generating CSV files:   0%|          | 0/500 [00:00<?, ?it/s]

All CSV files have been generated.


In [6]:
# Read the test data. low_memory=False makes pandas parse the file in one pass
# rather than in chunks, which avoids mixed-type inference within a column.
test_csv_path = '../data/X_test_encoded.csv'
test = pd.read_csv(test_csv_path, low_memory=False)

In [7]:
# Test subsets are written here; the folder (and any missing parent folders)
# is created if it does not exist yet.
output_dir = '../data/ranked_subsets/test_features_csv'
os.makedirs(name=output_dir, exist_ok=True)

In [8]:
# From this point on `data` refers to the test frame; the training frame
# loaded earlier is no longer reachable under this name.
data = test

In [9]:
# Turn rankings into a list and extract column names.
# `rankings` is read as {"1": idx, "2": idx, ...}: each rank maps to the
# position of a column, and rank "1" is treated as the top feature.
# The test frame is read through `test` directly so this cell does not depend
# on which frame the shared name `data` currently points to.
# NOTE(review): assumes the test CSV has the same column order as the frame the
# rankings were computed on -- TODO confirm.
feature_list = [int(rankings[str(i)]) for i in range(1, len(rankings) + 1)]
all_columns = test.columns.tolist()

# Fail fast on a rankings file that does not match this frame: a negative index
# would silently wrap around, and a repeated index would write a column twice.
assert all(0 <= idx < len(all_columns) for idx in feature_list), \
    "rankings contain an index outside the column range"
assert len(set(feature_list)) == len(feature_list), \
    "rankings contain a repeated feature index"

# Resolve ranks to column names once instead of on every loop iteration.
ranked_columns = [all_columns[idx] for idx in feature_list]

# Largest "top X" subset to write (subsets 1..499).
MAX_TOP_FEATURES = 499

# Generate CSV files for TOP "X" features.
for i in tqdm(range(1, MAX_TOP_FEATURES + 1), desc="Generating CSV files"):
    top_features = ranked_columns[:i]
    # 'feature0' is kept in every subset; when it is not among the top i it
    # goes first. Choosing the columns up front avoids an in-place insert on
    # the selected frame.
    if 'feature0' not in top_features:
        top_features = ['feature0'] + top_features
    selected_data = test[top_features]

    # Save to CSV
    output_file = os.path.join(output_dir, f'test_{i}_features.csv')
    selected_data.to_csv(output_file, index=False)
print("All CSV files have been generated.")

Generating CSV files:   0%|          | 0/500 [00:00<?, ?it/s]

All CSV files have been generated.
