In [1]:
import os
import pandas as pd
import numpy as np
from itertools import combinations

### This notebook creates the feature subsets for LOFO (Leave One Feature Out) ranking. Each exported CSV is missing exactly one feature column.

In [2]:
# Load both datasets.
X_train = pd.read_csv('../data/X_train_encoded.csv', low_memory=False)
X_test = pd.read_csv('../data/X_test_encoded.csv', low_memory=False)

# The subset CSVs written later are named after the dropped column, so the train
# and test frames must expose the same column labels in the same order -- a
# matching column *count* alone would not catch renamed or reordered columns.
assert len(X_train.columns) == len(X_test.columns), \
    "train/test column counts differ"
assert list(X_train.columns) == list(X_test.columns), \
    "train/test column names or order differ"

In [3]:
# Bind the working frame `df` to the training split; the subset-export cell
# that follows reads `df`, not `X_train`, so this assignment selects its input.
# Make sure it has the correct shape: prints the (rows, columns) tuple.
df = X_train
print(df.shape)

(25546, 500)


In [9]:
def subsets(df):
    """
    Generate all leave-one-out feature sets for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are treated as the feature list.

    Returns
    -------
    list of list
        One list of column labels per column of ``df``; each list contains
        every column except one, in the original column order. The first
        entry omits the last column and the final entry omits the first
        column (the same order ``itertools.combinations(cols, n - 1)``
        produces). A frame with no columns yields an empty list instead of
        raising.
    """
    all_features = df.columns.tolist()
    n_features = len(all_features)
    # Drop by position, walking from the last column to the first, so the
    # result is well defined even when column labels repeat.
    return [
        all_features[:dropped] + all_features[dropped + 1:]
        for dropped in reversed(range(n_features))
    ]

# One column list per leave-one-out subset of the current working frame.
feat_sets = subsets(df)

# Make CSV directory.
output_dir = "../data/train_subsets"
os.makedirs(output_dir, exist_ok=True)

# Iterate until all subsets are exported to CSV.
# The total is taken from the data rather than hard-coded, so the progress
# message stays correct for any number of columns.
n_subsets = len(feat_sets)
all_columns = set(df.columns)  # loop-invariant; used to find the dropped column
for i, feat_set in enumerate(feat_sets):
    subset_df = df[feat_set]
    # The single column absent from this subset names the output file.
    removed_feature = all_columns - set(feat_set)
    filename = f"train_without_{list(removed_feature)[0]}.csv"
    filepath = os.path.join(output_dir, filename)
    subset_df.to_csv(filepath, index=False)
    print(f"Exported subset {i+1}/{n_subsets} to {filepath}")

Exported subset 1/499 to ../data/train_subsets/train_without_feature499.csv
Exported subset 2/499 to ../data/train_subsets/train_without_feature498.csv
Exported subset 3/499 to ../data/train_subsets/train_without_feature497.csv
Exported subset 4/499 to ../data/train_subsets/train_without_feature496.csv
Exported subset 5/499 to ../data/train_subsets/train_without_feature495.csv
Exported subset 6/499 to ../data/train_subsets/train_without_feature494.csv
Exported subset 7/499 to ../data/train_subsets/train_without_feature493.csv
Exported subset 8/499 to ../data/train_subsets/train_without_feature492.csv
Exported subset 9/499 to ../data/train_subsets/train_without_feature491.csv
Exported subset 10/499 to ../data/train_subsets/train_without_feature490.csv
Exported subset 11/499 to ../data/train_subsets/train_without_feature489.csv
Exported subset 12/499 to ../data/train_subsets/train_without_feature488.csv
Exported subset 13/499 to ../data/train_subsets/train_without_feature487.csv
Exported

In [10]:
# Rebind the working frame `df` to the test split; from this cell onward `df`
# no longer refers to X_train, and the export cell that follows reads `df`.
# Make sure it has the correct shape: prints the (rows, columns) tuple.
df = X_test
print(df.shape)

(2831, 500)


In [11]:
# NOTE(review): this redefines the `subsets` helper already defined in the
# training-export cell; consider defining it once and reusing it.
def subsets(df):
    """
    Generate all leave-one-out feature sets for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are treated as the feature list.

    Returns
    -------
    list of list
        One list of column labels per column of ``df``; each list contains
        every column except one, in the original column order. The first
        entry omits the last column and the final entry omits the first
        column (the same order ``itertools.combinations(cols, n - 1)``
        produces). A frame with no columns yields an empty list instead of
        raising.
    """
    all_features = df.columns.tolist()
    n_features = len(all_features)
    # Drop by position, walking from the last column to the first, so the
    # result is well defined even when column labels repeat.
    return [
        all_features[:dropped] + all_features[dropped + 1:]
        for dropped in reversed(range(n_features))
    ]

# One column list per leave-one-out subset of the current working frame.
feat_sets = subsets(df)

# Make CSV directory.
output_dir = "../data/test_subsets"
os.makedirs(output_dir, exist_ok=True)

# Iterate until all subsets are exported to CSV.
# The total is taken from the data rather than hard-coded, so the progress
# message stays correct for any number of columns.
n_subsets = len(feat_sets)
all_columns = set(df.columns)  # loop-invariant; used to find the dropped column
for i, feat_set in enumerate(feat_sets):
    subset_df = df[feat_set]
    # The single column absent from this subset names the output file.
    removed_feature = all_columns - set(feat_set)
    filename = f"test_without_{list(removed_feature)[0]}.csv"
    filepath = os.path.join(output_dir, filename)
    subset_df.to_csv(filepath, index=False)
    print(f"Exported subset {i+1}/{n_subsets} to {filepath}")

Exported subset 1/499 to ../data/test_subsets/test_without_feature499.csv
Exported subset 2/499 to ../data/test_subsets/test_without_feature498.csv
Exported subset 3/499 to ../data/test_subsets/test_without_feature497.csv
Exported subset 4/499 to ../data/test_subsets/test_without_feature496.csv
Exported subset 5/499 to ../data/test_subsets/test_without_feature495.csv
Exported subset 6/499 to ../data/test_subsets/test_without_feature494.csv
Exported subset 7/499 to ../data/test_subsets/test_without_feature493.csv
Exported subset 8/499 to ../data/test_subsets/test_without_feature492.csv
Exported subset 9/499 to ../data/test_subsets/test_without_feature491.csv
Exported subset 10/499 to ../data/test_subsets/test_without_feature490.csv
Exported subset 11/499 to ../data/test_subsets/test_without_feature489.csv
Exported subset 12/499 to ../data/test_subsets/test_without_feature488.csv
Exported subset 13/499 to ../data/test_subsets/test_without_feature487.csv
Exported subset 14/499 to ../data/