In [2]:
%matplotlib inline
import numpy as np
import pandas as pd

from collections import defaultdict

from utils import *
from config import X_TRAIN_MP, Y_TRAIN_PATH, UNDER_DIR, MASK_DIR, MODS_DIR

In [48]:
# Load the training features and labels from the locations imported from config.
# NOTE(review): asarray=False presumably makes read_csv_mp return a DataFrame
# rather than an ndarray -- confirm in utils.
X_train = read_csv_mp(X_TRAIN_MP, asarray=False)
# header=None: the first row of the label file is read as data, not as column names.
y_train = pd.read_csv(Y_TRAIN_PATH, header=None)

In [55]:
# Variance thresholds; for each value a previously saved mask is looked up via
# mask_path('var', 'var', v) by the functions below.
var_thresholds = (0.1, 0.05)
# Correlation cut-offs; each one is passed as `threshold` to the correlation filter.
# Both tuples are used as default-argument values, so they are bound when the
# functions below are defined.
corr_thresholds = (0.7, 0.8, 0.9, 0.95)

def save_corr_mask(corr, prefix, directory='var-corr', corr_thresholds=corr_thresholds, random_state=None, verbose=True):
    """Compute and save one correlation-filter mask per threshold.

    Parameters
    ----------
    corr
        Correlation matrix, handed unchanged to ``corr_filter_basic``.
    prefix
        Forwarded to ``save_mask`` as its third positional argument.
    directory
        Forwarded to ``save_mask`` as its second positional argument.
    corr_thresholds
        Iterable of cut-offs; each is used both as the filter ``threshold``
        and as the fourth positional argument of ``save_mask``.
    random_state
        Forwarded to ``save_mask`` by keyword.
    verbose
        When true, print a progress line before each threshold.
    """
    for cutoff in corr_thresholds:
        if verbose:
            print(f"Beginning correlation threshold: {cutoff:.2f}")

        # TODO: use smarter correlation filtering method
        save_mask(
            corr_filter_basic(corr, threshold=cutoff),
            directory,
            prefix,
            cutoff,
            random_state=random_state,
        )

def save_var_corr_mask(data, suffix, var_thresholds=var_thresholds, verbose=True, *args, **kwargs):
    """Save correlation masks computed on variance-filtered columns of ``data``.

    For every variance threshold the previously saved variance mask is loaded
    from ``mask_path('var', 'var', v)``, the selected columns of ``data`` are
    passed to ``corrcoef``, and the result is handed to ``save_corr_mask``.

    Parameters
    ----------
    data
        2-D array-like; converted with ``np.asarray`` and indexed column-wise
        with the boolean variance mask.
    suffix
        Appended (after an underscore) to the stem of the variance-mask path
        to form the prefix given to ``save_corr_mask``.
    var_thresholds
        Sequence of variance thresholds to process (``len`` is taken).
    verbose
        When true, print progress lines; also forwarded to ``save_corr_mask``.
        When false, nothing is printed by this function.
    *args, **kwargs
        Forwarded to ``save_corr_mask``. Extra positional arguments land on
        its parameters after ``prefix`` (``directory`` first); keyword
        arguments such as ``random_state`` pass through unchanged.
    """
    # np.asarray returns an existing base ndarray untouched and converts
    # anything else, which is what the explicit type check used to do.
    X = np.asarray(data)
    last_index = len(var_thresholds) - 1
    for i, v in enumerate(var_thresholds):
        if verbose:
            print(f"BEGINNING VARIANCE THRESHOLD: {v:.3f}")
        mpath = mask_path('var', 'var', v)
        var_mask = np.loadtxt(mpath, delimiter=',', dtype=bool)
        corr = corrcoef(X[:, var_mask])
        save_corr_mask(corr, mpath.stem + '_' + suffix, *args, verbose=verbose, **kwargs)
        if verbose:
            print("VARIANCE THRESHOLD COMPLETE.")
            # Blank separator between thresholds; kept under `verbose` so a
            # quiet run really prints nothing.
            if i != last_index:
                print('')
            
def print_var_corr_num_features(suffix, var_thresholds=var_thresholds, corr_thresholds=corr_thresholds, random_state=None):
    """Print how many features each saved variance / correlation mask keeps.

    Parameters
    ----------
    suffix
        Same suffix that was used when the correlation masks were saved; it is
        appended (after an underscore) to the stem of the variance-mask path.
    var_thresholds
        Sequence of variance thresholds whose masks are loaded from
        ``mask_path('var', 'var', v)``.
    corr_thresholds
        Iterable of correlation thresholds whose masks are loaded from
        ``mask_path('var-corr', <prefix>, c, random_state=random_state)``.
    random_state
        Forwarded to ``mask_path`` for the correlation masks. Defaults to
        ``None``, the same default ``save_corr_mask`` uses when saving. It
        must not default to a notebook global: default values are evaluated
        once, when the ``def`` runs, so such a default either raises
        ``NameError`` on a fresh kernel or freezes a stale value.
    """
    last_index = len(var_thresholds) - 1
    for i, v in enumerate(var_thresholds):
        mpath = mask_path('var', 'var', v)
        var_mask = np.loadtxt(mpath, delimiter=',', dtype=bool)
        print(f"Variance Threshold: {v:.2f}\t\tNumber of features: {np.sum(var_mask)}")
        prefix = mpath.stem + '_' + suffix
        for c in corr_thresholds:
            corr_mask_path = mask_path('var-corr', prefix, c, random_state=random_state)
            mask = np.loadtxt(corr_mask_path, delimiter=',', dtype=bool)
            print(f"Correlation Threshold: {c:.2f}\t\tNumber of features: {np.sum(mask)}")
        # Blank line between variance-threshold groups, but not after the last.
        if i != last_index:
            print('')

# "Vanilla" data (no oversampling, undersampling)

In [26]:
# No random_state is passed here, so save_corr_mask uses its default (None)
# when handing the masks to save_mask.
save_var_corr_mask(X_train, 'original')

BEGINNING VARIANCE THRESHOLD: 0.100
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.

BEGINNING VARIANCE THRESHOLD: 0.050
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.



In [42]:
# The 'original' masks were saved without a random_state (i.e. None), so ask
# for None explicitly instead of relying on the function's default value.
print_var_corr_num_features('original', random_state=None)

Variance Threshold: 0.10		Number of features: 2670
Correlation Threshold: 0.70		Number of features: 352
Correlation Threshold: 0.80		Number of features: 412
Correlation Threshold: 0.90		Number of features: 457
Correlation Threshold: 0.95		Number of features: 469

Variance Threshold: 0.05		Number of features: 9850
Correlation Threshold: 0.70		Number of features: 1276
Correlation Threshold: 0.80		Number of features: 1916
Correlation Threshold: 0.90		Number of features: 2672
Correlation Threshold: 0.95		Number of features: 2997


# Oversampled data (3 classes)

In [49]:
# Seed passed to oversample and reused by the following cells when saving and
# loading the 'over' masks.
random_state = 0
# Only the first returned item is kept; the second (presumably the resampled
# labels -- confirm in utils.oversample) is discarded.
X_over, _ = oversample(X_train, y=y_train, random_state=random_state)

In [52]:
# random_state travels through **kwargs to save_corr_mask and on to save_mask.
save_var_corr_mask(X_over, 'over', random_state=random_state)

BEGINNING VARIANCE THRESHOLD: 0.100
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.

BEGINNING VARIANCE THRESHOLD: 0.050
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.



In [56]:
# Use the same random_state that was given when the 'over' masks were saved.
print_var_corr_num_features('over', random_state=random_state)

Variance Threshold: 0.10		Number of features: 2670
Correlation Threshold: 0.70		Number of features: 276
Correlation Threshold: 0.80		Number of features: 372
Correlation Threshold: 0.90		Number of features: 454
Correlation Threshold: 0.95		Number of features: 471

Variance Threshold: 0.05		Number of features: 9850
Correlation Threshold: 0.70		Number of features: 815
Correlation Threshold: 0.80		Number of features: 1627
Correlation Threshold: 0.90		Number of features: 2608
Correlation Threshold: 0.95		Number of features: 3005


# Undersampled data - splits (3 classes)

In [57]:
def under_data_path(class_id, split_id, *, random_state):
    """Return the CSV location of one undersampled split.

    The result is ``UNDER_DIR / 'random_<random_state>' / '<class_id>' /
    '<split_id>.csv'``. ``random_state`` must be passed by keyword.
    """
    seed_dir = UNDER_DIR / f'random_{random_state}'
    class_dir = seed_dir / f'{class_id}'
    return class_dir / f'{split_id}.csv'

In [58]:
# Number of undersampled splits requested from undersample_split.
num_splits = 10
random_state = 0
# The consuming loop unpacks every element of `splits` as a 2-item pair and
# uses only the first item.
splits = undersample_split(X_train, y=y_train, num_splits=num_splits, random_state=random_state)

In [None]:
# Correlation cut-offs evaluated for every undersampled split.
thresholds = [0.7, 0.8, 0.9, 0.95]
# threshold -> list of masks, one per split, in split order.
threshold_to_mask = defaultdict(list)

for i, (X_under, _) in enumerate(splits):
    print(f"Beginning split: {i}")

    # NOTE(review): the correlation is computed on X_under as-is (no variance
    # mask, no np.asarray), unlike save_var_corr_mask -- confirm this is intended.
    corr = corrcoef(X_under)

    for threshold in thresholds:
        print(f"Beginning threshold: {threshold}")
        # NOTE(review): this calls corr_filter, while save_corr_mask calls
        # corr_filter_basic -- confirm which helper is wanted here.
        split_mask = corr_filter(corr, threshold=threshold)
        threshold_to_mask[threshold].append(split_mask)

    print("Split complete.\n")

# One file per threshold; column j holds the mask from split j.
for threshold, mask_list in threshold_to_mask.items():
    stacked_masks = np.column_stack(mask_list)
    save_mask(stacked_masks, 'var-corr', 'under', threshold, random_state=random_state)

In [None]:
for threshold in thresholds:
    # The saved 'under' mask is read back as integers; it was written with one
    # column per split (np.column_stack at save time).
    under_mask_file = mask_path('var-corr', 'under', threshold, random_state=random_state)
    mask = np.loadtxt(under_mask_file, delimiter=',', dtype=int)
    # A row counts only if it is non-zero in every split's column.
    mask_all = mask.all(axis=1)
    print(f"Threshold: {threshold:.2f}\t\tNumber of features: {np.sum(mask_all)}")

# Oversampled data (2 classes \[merged smoker + former\])

In [59]:
random_state = 0
# Presumably relabels classes 1 and 2 of column 0 as a single class (the
# section heading says smoker + former are merged) -- confirm in utils.merge_classes.
y_merge = merge_classes(y_train, [1, 2], col=0)
# Rebinds X_over: from here on it holds the oversampled data for the merged labels.
X_over, _ = oversample(X_train, y=y_merge, random_state=random_state)

In [60]:
# Masks for the merged-label oversampled data are saved under the 'merged_over' suffix.
save_var_corr_mask(X_over, 'merged_over', random_state=random_state)

BEGINNING VARIANCE THRESHOLD: 0.100
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.

BEGINNING VARIANCE THRESHOLD: 0.050
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.


In [61]:
# Same suffix and random_state as the save call for 'merged_over'.
print_var_corr_num_features('merged_over', random_state=random_state)

Variance Threshold: 0.10		Number of features: 2670
Correlation Threshold: 0.70		Number of features: 338
Correlation Threshold: 0.80		Number of features: 405
Correlation Threshold: 0.90		Number of features: 459
Correlation Threshold: 0.95		Number of features: 469

Variance Threshold: 0.05		Number of features: 9850
Correlation Threshold: 0.70		Number of features: 1225
Correlation Threshold: 0.80		Number of features: 1886
Correlation Threshold: 0.90		Number of features: 2657
Correlation Threshold: 0.95		Number of features: 2980


# Undersampled data (2 classes)

In [62]:
random_state = 0
# Same merge_classes call as in the merged oversampling section; repeated so
# this section does not depend on that cell having run.
y_merge = merge_classes(y_train, [1, 2], col=0)
# Only the first returned item is kept; the second is discarded.
X_under, _ = undersample(X_train, y=y_merge, random_state=random_state)

In [63]:
# Masks for the merged-label undersampled data are saved under the 'merged_under' suffix.
save_var_corr_mask(X_under, 'merged_under', random_state=random_state)

BEGINNING VARIANCE THRESHOLD: 0.100
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.

BEGINNING VARIANCE THRESHOLD: 0.050
Beginning correlation threshold: 0.70
Beginning correlation threshold: 0.80
Beginning correlation threshold: 0.90
Beginning correlation threshold: 0.95
VARIANCE THRESHOLD COMPLETE.


In [64]:
# Same suffix and random_state as the save call for 'merged_under'.
print_var_corr_num_features('merged_under', random_state=random_state)

Variance Threshold: 0.10		Number of features: 2670
Correlation Threshold: 0.70		Number of features: 334
Correlation Threshold: 0.80		Number of features: 396
Correlation Threshold: 0.90		Number of features: 454
Correlation Threshold: 0.95		Number of features: 469

Variance Threshold: 0.05		Number of features: 9850
Correlation Threshold: 0.70		Number of features: 1291
Correlation Threshold: 0.80		Number of features: 1933
Correlation Threshold: 0.90		Number of features: 2664
Correlation Threshold: 0.95		Number of features: 3000
