# MLP Clustering Significance Stability over Multiple Trainings

### Before Running on Perceptron

In `src/train_nn.py`, go to line 46 and change the path `./models/` to `/scratch/<your username>/models`

Do not forget to undo this change!

DO NOT COMMIT THIS CHANGE TO GITHUB!!!

In [1]:
%load_ext autoreload
%autoreload 1

In [7]:
import sys
sys.path.append('..')

import random
import subprocess
from multiprocessing import Pool

import numpy as np
import matplotlib.pylab as plt
import pandas as pd
from tqdm import tqdm, trange
from IPython import display

from src.visualization import draw_mlp_clustering_report, run_double_spectral_cluster, run_spectral_cluster
from src.utils import get_weights_paths, build_clustering_results, cohen_d_stats
from src.experiment_tagging import get_model_path, MODEL_TAG_LOOKUP
# from src.spectral_cluster_model import SHUFFLE_METHODS
SHUFFLE_METHODS = ['layer', 'layer_nonzero']

In [3]:
# Number of trained models analysed per tag (the last N_TRAINED_MODELS model paths are used).
N_TRAINED_MODELS = 10

# We are doing 320 experiments (= 32 models x 10 trainings each),
# and we want the expected number of false positives to be <= 1.
# So we can set n_shuffles to 320: a result is called significant only when its
# p-value is the smallest attainable one, 1 / (320 + 1) = 1/321, which is < 1/320.
# If we make it too large, let's say 1000 shuffles...
# ... we might reduce the statistical power (the probability of correctly rejecting H0 when H1 is true)!
N_SHUFFLES = 320

# Number of worker processes used for training, one per GPU.
N_GPUS = 4

# Maps every model tag to the shell command that trains a model for that tag.
# Commented-out entries are experiments that are currently disabled.
TAGS_COMMANDS = {
    'MNIST': 'python -m src.train_nn with mlp_config dataset_name=mnist',
    'CIFAR10': 'python -m src.train_nn with mlp_config dataset_name=cifar10 pruning_epochs=40',
#    'LINE': 'python -m src.train_nn with mlp_config dataset_name=line',
    'FASHION': 'python -m src.train_nn with mlp_config dataset_name=fashion',
    'MNIST+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=mnist with_dropout=True',
    'CIFAR10+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=cifar10 epochs=100 pruning_epochs=40 with_dropout=True dropout_rate=0.2',
#    'LINE+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=line with_dropout=True',
    'FASHION+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=fashion with_dropout=True',
#    'LINE-MNIST': 'python -m src.train_nn with mlp_config dataset_name=line-mnist',
#    'LINE-CIFAR10': 'python -m src.train_nn with mlp_config dataset_name=line-cifar10 epochs=30 pruning_epochs=40',
#    'MNIST-CIFAR10': 'python -m src.train_nn with mlp_config dataset_name=mnist-cifar10 epochs=30 pruning_epochs=40',
#    'LINE-MNIST-SEPARATED': 'python -m src.train_nn with mlp_config dataset_name=line-mnist-separated',
#    'LINE-CIFAR10-SEPARATED': 'python -m src.train_nn with mlp_config dataset_name=line-cifar10-separated epochs=30 pruning_epochs=40',
#    'MNIST-CIFAR10-SEPARATED': 'python -m src.train_nn with mlp_config dataset_name=mnist-cifar10-separated epochs=30 pruning_epochs=40',
#    'LINE-MNIST+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=line-mnist with_dropout=True',
#    'LINE-CIFAR10+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=line-cifar10 epochs=30 pruning_epochs=40 with_dropout=True dropout_rate=0.2',
#    'MNIST-CIFAR10+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=mnist-cifar10 epochs=30 pruning_epochs=40 with_dropout=True dropout_rate=0.2',
#    'LINE-MNIST-SEPARATED+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=line-mnist-separated with_dropout=True',
#    'LINE-CIFAR10-SEPARATED+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=line-cifar10-separated epochs=30 pruning_epochs=40 with_dropout=True dropout_rate=0.2',
#    'MNIST-CIFAR10-SEPARATED+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=mnist-cifar10-separated epochs=30 pruning_epochs=40 with_dropout=True dropout_rate=0.2',
#    'RANDOM': 'python -m src.train_nn with mlp_config dataset_name=random',
#    'RANDOM+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=random with_dropout=True',
#    'MNIST-x1.5-EPOCHS': 'python -m src.train_nn with mlp_config dataset_name=mnist epochs=30',
#    'MNIST-x1.5-EPOCHS+DROPOUT':'python -m src.train_nn with mlp_config dataset_name=mnist epochs=30 with_dropout=True',
#    'MNIST-x2-EPOCHS':'python -m src.train_nn with mlp_config dataset_name=mnist epochs=40',
#    'MNIST-x2-EPOCHS+DROPOUT':'python -m src.train_nn with mlp_config dataset_name=mnist epochs=40 with_dropout=True',
#    'MNIST-x10-EPOCHS': 'python -m src.train_nn with mlp_config dataset_name=mnist epochs=200',
#    'MNIST-x10-EPOCHS+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=mnist epochs=200 with_dropout=True',
#    'RANDOM-x50-EPOCHS': 'python -m src.train_nn with mlp_config dataset_name=random epochs=1000',
#    'RANDOM-x50-EPOCHS+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=random epochs=1000 with_dropout=True',
#    'RANDOM-OVERFITTING': 'python -m src.train_nn with mlp_config dataset_name=random epochs=100 pruning_epochs=100 shuffle=False n_train=3000',
#    'RANDOM-OVERFITTING+DROPOUT': 'python -m src.train_nn with mlp_config dataset_name=random epochs=100 pruning_epochs=100 shuffle=False n_train=3000 with_dropout=True'
}

_DATASET_NAME_PREFIX = 'dataset_name='


def _dataset_name_from_command(command):
    """Return the value of the `dataset_name=` argument of a training command.

    The argument is looked up by its key rather than by its position in the
    command, so reordering or adding arguments cannot silently produce a
    wrong dataset tag.

    Raises:
        ValueError: if the command has no `dataset_name=` argument.
    """
    for token in command.split():
        if token.startswith(_DATASET_NAME_PREFIX):
            return token[len(_DATASET_NAME_PREFIX):]
    raise ValueError(f'no `dataset_name=` argument in command: {command!r}')


# Dataset name of every enabled tag, in the same order as TAGS_COMMANDS.
DATASETS_TAGS = [_dataset_name_from_command(command) for command in TAGS_COMMANDS.values()]

In [4]:
# Training is disabled by default; flip the condition to (re)train all models.
if False:
    import os

    def train_model_multiply(command, n_models_per_command, gpu_id=None):
        """Run the training `command` `n_models_per_command` times, sequentially.

        The command is executed from the parent directory (`cd ..`).
        When `gpu_id` is given it is exported as CUDA_VISIBLE_DEVICES for the
        child process; when it is None the variable is left untouched.
        """
        # Extend the current environment instead of replacing it, so the child
        # process keeps PATH and the rest of the caller's environment.
        env = dict(os.environ)
        if gpu_id is not None:
            env['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

        for _ in range(n_models_per_command):
            actual_command = f'cd .. && {command}'#' > /dev/null 2>&1'
            print(actual_command)
            subprocess.run(actual_command, shell=True, env=env)


    def train_models(commands, n_models_per_command, gpu_id=None):
        """Train `n_models_per_command` models for every command in `commands`."""
        for command in commands:
            train_model_multiply(command, n_models_per_command, gpu_id)


    # Shuffle so that long-running commands are spread randomly over the GPUs.
    commands = list(TAGS_COMMANDS.values())
    random.shuffle(commands)

    # Deal the commands round-robin: worker `gpu_id` gets every N_GPUS-th
    # command. This also works when len(commands) is not a multiple of N_GPUS.
    with Pool(N_GPUS) as p:
        p.starmap(train_models,
                  [(commands[gpu_id::N_GPUS], N_TRAINED_MODELS, gpu_id)
                   for gpu_id in range(N_GPUS)])

In [17]:
# Clustering results, keyed first by shuffle method and then by model tag.
results = {}

for shuffle_method in SHUFFLE_METHODS:
    results[shuffle_method] = {}

    # DATASETS_TAGS is aligned with TAGS_COMMANDS; wrapping it in tqdm shows per-tag progress.
    for model_tag, dataset_tag in zip(TAGS_COMMANDS, tqdm(DATASETS_TAGS)):
        model_paths = get_model_path(model_tag, filter_='all', model_base_path='/scratch/shlomi/models/')
        weights_filename = f'{dataset_tag}-mlp-pruned-weights.pckl'

        # Only the last N_TRAINED_MODELS model paths of this tag are analysed.
        results[shuffle_method][model_tag] = [
            run_spectral_cluster(path / weights_filename,
                                 n_samples=N_SHUFFLES,
                                 shuffle_method=shuffle_method)
            for path in tqdm(model_paths[-N_TRAINED_MODELS:])
        ]





  0%|          | 0/6 [00:00<?, ?it/s][A[A[A[A




  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A




 10%|█         | 1/10 [00:35<05:14, 35.00s/it][A[A[A[A[A




 20%|██        | 2/10 [01:11<04:42, 35.37s/it][A[A[A[A[A




 30%|███       | 3/10 [01:47<04:08, 35.56s/it][A[A[A[A[A




 40%|████      | 4/10 [02:22<03:32, 35.49s/it][A[A[A[A[A




 50%|█████     | 5/10 [02:59<02:59, 35.81s/it][A[A[A[A[A




 60%|██████    | 6/10 [03:35<02:23, 35.94s/it][A[A[A[A[A




 70%|███████   | 7/10 [04:11<01:47, 35.96s/it][A[A[A[A[A




 80%|████████  | 8/10 [04:47<01:11, 35.88s/it][A[A[A[A[A




 90%|█████████ | 9/10 [05:23<00:35, 35.91s/it][A[A[A[A[A




100%|██████████| 10/10 [05:58<00:00, 35.89s/it][A[A[A[A[A




 17%|█▋        | 1/6 [05:58<29:54, 358.86s/it][A[A[A[A




  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A




 10%|█         | 1/10 [00:35<05:21, 35.69s/it][A[A[A[A[A




 20%|██        | 2/10 [01:10<04:43, 

 20%|██        | 2/10 [01:15<04:59, 37.48s/it][A[A[A[A[A[A





 30%|███       | 3/10 [01:53<04:24, 37.78s/it][A[A[A[A[A[A





 40%|████      | 4/10 [02:32<03:48, 38.13s/it][A[A[A[A[A[A





 50%|█████     | 5/10 [03:11<03:11, 38.39s/it][A[A[A[A[A[A





 60%|██████    | 6/10 [03:49<02:32, 38.19s/it][A[A[A[A[A[A





 70%|███████   | 7/10 [04:26<01:53, 37.98s/it][A[A[A[A[A[A





 80%|████████  | 8/10 [05:05<01:16, 38.04s/it][A[A[A[A[A[A





 90%|█████████ | 9/10 [05:42<00:37, 37.83s/it][A[A[A[A[A[A





100%|██████████| 10/10 [06:20<00:00, 38.00s/it][A[A[A[A[A[A





 83%|████████▎ | 5/6 [31:19<06:17, 377.01s/it][A[A[A[A[A





  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A





 10%|█         | 1/10 [00:34<05:11, 34.58s/it][A[A[A[A[A[A





 20%|██        | 2/10 [01:09<04:37, 34.74s/it][A[A[A[A[A[A





 30%|███       | 3/10 [01:44<04:04, 34.87s/it][A[A[A[A[A[A





 40%|████      | 4/10 [02:19<0

In [18]:
def build_all_models_per_shuffle_method_df(single_shuffle_method_results, n_shuffles=None):
    """Collect the per-model metrics of one shuffle method into a single DataFrame.

    Args:
        single_shuffle_method_results: mapping from model tag to a sequence of
            2-tuples; the second item of every tuple is a dict of metrics that
            contains at least a 'percentile' entry.
        n_shuffles: number of shuffles the percentiles were computed with.
            Defaults to the notebook-level N_SHUFFLES.

    Returns:
        A DataFrame with one row per trained model, the metrics as columns,
        a 'model' column holding the tag, and a boolean 'is_sig' column that is
        True when the percentile equals the smallest attainable value,
        1 / (n_shuffles + 1).
    """
    if n_shuffles is None:
        n_shuffles = N_SHUFFLES

    model_dfs = []

    for model_tag, model_results in single_shuffle_method_results.items():
        # Keep only the metrics dict of every (_, metrics) result pair.
        _, metrics = zip(*model_results)
        model_dfs.append(pd.DataFrame(list(metrics)).assign(model=model_tag))

    df = pd.concat(model_dfs, ignore_index=True)

    # isclose rather than ==, because the percentile is a float.
    df['is_sig'] = np.isclose(df['percentile'], 1 / (n_shuffles + 1))

    return df

# One metrics table per shuffle method.
layer_df, layer_nonzero_df = (
    build_all_models_per_shuffle_method_df(results[method_name])
    for method_name in ('layer', 'layer_nonzero')
)

In [19]:
layer_df.head()

Unnamed: 0,ave_in_out,mean,n_samples,ncut,percentile,stdev,test_acc,test_loss,train_acc,train_loss,z_score,model,is_sig
0,0.497503,2.037381,320,2.005006,0.065421,0.018056,0.9835,0.095332,1.0,4.8e-05,-1.793088,MNIST,False
1,0.481929,2.035213,320,2.036808,0.439252,0.014743,0.9847,0.090697,1.0,5.6e-05,0.108146,MNIST,False
2,0.509254,2.042848,320,1.981663,0.003115,0.012102,0.983,0.099833,1.0,5.9e-05,-5.055921,MNIST,True
3,0.508688,2.041044,320,1.982774,0.003115,0.020362,0.9828,0.099546,1.0,6.2e-05,-2.861703,MNIST,True
4,0.49217,2.04541,320,2.015783,0.003115,0.015201,0.9845,0.097756,1.0,8.1e-05,-1.949092,MNIST,True


In [20]:
layer_nonzero_df.head()

Unnamed: 0,ave_in_out,mean,n_samples,ncut,percentile,stdev,test_acc,test_loss,train_acc,train_loss,z_score,model,is_sig
0,0.497503,2.0716,320,2.005006,0.003115,0.026752,0.9835,0.095332,1.0,4.8e-05,-2.48931,MNIST,True
1,0.481929,2.082343,320,2.036808,0.003115,0.030976,0.9847,0.090697,1.0,5.6e-05,-1.470031,MNIST,True
2,0.509254,2.082176,320,1.981663,0.003115,0.031357,0.983,0.099833,1.0,5.9e-05,-3.205439,MNIST,True
3,0.508688,2.047636,320,1.982774,0.003115,0.017703,0.9828,0.099546,1.0,6.2e-05,-3.663943,MNIST,True
4,0.49217,2.083524,320,2.015783,0.003115,0.020555,0.9845,0.097756,1.0,8.1e-05,-3.295645,MNIST,True


In [22]:
# Columns that describe the trained model itself.
MODEL_DATA_COLUMNS = ['model',
                      'train_acc', 'train_loss', 'test_acc', 'test_loss',
                      'ncut', 'ave_in_out',
                      'n_samples']

# Columns that are computed separately for each shuffle method.
SHUFFLING_DATA_COLUMNS = ['mean', 'percentile', 'stdev', 'z_score', 'is_sig']

# The frames are merged on their index below, so their model-level columns
# must be identical row by row; checked once, before merging.
assert layer_df[MODEL_DATA_COLUMNS].equals(layer_nonzero_df[MODEL_DATA_COLUMNS]), \
    'model-level columns differ between the two shuffle methods'

# Shuffle statistics of both methods side by side, disambiguated by suffix.
two_shuffling_method_df = pd.merge(layer_df[SHUFFLING_DATA_COLUMNS],
                                     layer_nonzero_df[SHUFFLING_DATA_COLUMNS],
                                     left_index=True, right_index=True,
                                     suffixes=('_layer', '_layer_nonzero'))

df = pd.merge(layer_df[MODEL_DATA_COLUMNS], two_shuffling_method_df,
              left_index=True, right_index=True,)

# Per row: cohen_d_stats on the (mean, stdev, n_samples) of the two shuffle methods.
df['cohen_d'] = df.apply(lambda r: cohen_d_stats(r['mean_layer'], r['stdev_layer'], r['n_samples'],
                                 r['mean_layer_nonzero'], r['stdev_layer_nonzero'], r['n_samples']),
                                axis=1)

df

Unnamed: 0,model,train_acc,train_loss,test_acc,test_loss,ncut,ave_in_out,n_samples,mean_layer,percentile_layer,stdev_layer,z_score_layer,is_sig_layer,mean_layer_nonzero,percentile_layer_nonzero,stdev_layer_nonzero,z_score_layer_nonzero,is_sig_layer_nonzero,cohen_d
0,MNIST,1.0,4.8e-05,0.9835,0.095332,2.005006,0.497503,320,2.037381,0.065421,0.018056,-1.793088,False,2.0716,0.003115,0.026752,-2.48931,True,0.561531
1,MNIST,1.0,5.6e-05,0.9847,0.090697,2.036808,0.481929,320,2.035213,0.439252,0.014743,0.108146,False,2.082343,0.003115,0.030976,-1.470031,True,0.533618
2,MNIST,1.0,5.9e-05,0.983,0.099833,1.981663,0.509254,320,2.042848,0.003115,0.012102,-5.055921,True,2.082176,0.003115,0.031357,-3.205439,True,0.534062
3,MNIST,1.0,6.2e-05,0.9828,0.099546,1.982774,0.508688,320,2.041044,0.003115,0.020362,-2.861703,True,2.047636,0.003115,0.017703,-3.663943,True,0.641595
4,MNIST,1.0,8.1e-05,0.9845,0.097756,2.015783,0.49217,320,2.04541,0.003115,0.015201,-1.949092,True,2.083524,0.003115,0.020555,-3.295645,True,0.612506
5,MNIST,1.0,4.8e-05,0.9835,0.095626,1.976777,0.511748,320,2.04958,0.003115,0.017383,-4.188122,True,2.049504,0.003115,0.031693,-2.294742,True,0.534956
6,MNIST,1.0,5.6e-05,0.9838,0.087238,1.960098,0.520357,320,2.039954,0.003115,0.013662,-5.845216,True,2.045483,0.190031,0.100837,-0.846761,False,0.334619
7,MNIST,1.0,6.1e-05,0.9838,0.100444,2.001155,0.499423,320,2.044081,0.003115,0.016928,-2.535831,True,2.073917,0.003115,0.026112,-2.786532,True,0.568041
8,MNIST,1.0,4.7e-05,0.9828,0.093142,1.969089,0.515698,320,2.041622,0.003115,0.019876,-3.649229,True,2.025723,0.003115,0.014974,-3.782219,True,0.67577
9,MNIST,1.0,4.9e-05,0.9833,0.102283,2.073477,0.464563,320,2.050418,0.937695,0.019004,1.213369,False,2.098487,0.096573,0.015635,-1.599635,False,0.660279


In [None]:
# df = pd.read_csv('../results/mlp-clustering-stability-two-shuffling-methods-all-samples.csv', index_col=0)

In [None]:
# All per-model rows, grouped by model tag.
df_grpby_model = df.groupby('model')

# Names of the suffixed statistic columns, e.g. 'mean_layer', 'mean_layer_nonzero', ...
TWO_SHUFFLE_STATS_COLUMNS = [
    f'{stat}_{shuffle_method}'
    for stat in ['mean', 'stdev', 'z_score', 'percentile', 'is_sig']
    for shuffle_method in SHUFFLE_METHODS
]

# Mean and standard deviation over the trained models of every tag.
stats_mean_std = df_grpby_model[['ncut'] + TWO_SHUFFLE_STATS_COLUMNS].agg(['mean', 'std'])
# Number of rows (trained models) in every group.
models_count = df_grpby_model.size().rename('n_models')
# Average accuracies per tag.
accuracy_mean = df_grpby_model[['train_acc', 'test_acc']].agg('mean')

statistics_df = pd.concat([stats_mean_std, models_count, accuracy_mean], axis=1)

In [None]:
statistics_df

In [None]:
# TODO: ask for a code review
# std = np.sqrt(np.mean(x**2) - np.mean(x)**2)

def calc_overall_stdev(row, suffix=''):
    """Combine per-model means and standard deviations into one overall standard deviation.

    Uses std = sqrt(E[x**2] - E[x]**2), where E[x**2] is the average of
    (stdev**2 + mean**2) over the entries and E[x] is the average of the means.

    Args:
        row: mapping-like object (e.g. a DataFrame of one group) whose
            f'stdev{suffix}' and f'mean{suffix}' entries hold the per-model values.
        suffix: suffix appended to the 'stdev' and 'mean' keys.

    Returns:
        The overall standard deviation; never NaN for finite inputs.
    """
    stdevs = row[f'stdev{suffix}']
    means = row[f'mean{suffix}']

    # stdev**2 + mean**2 is the mean of squares of each entry; average them.
    mean_of_squares = np.mean(stdevs**2 + means**2)
    # Subtract the square of the overall mean to get the overall variance.
    variance = mean_of_squares - np.mean(means)**2

    # Rounding can leave a tiny negative variance when the true value is ~0;
    # clip it so the square root does not return NaN.
    return np.sqrt(np.maximum(variance, 0.0))

In [None]:
# Column suffixes of the two shuffle methods.
SHUFFLE_METHOD_SUFFIX = ['_layer', '_layer_nonzero']

# calc_overall_stdev applied to every model group, once per shuffle method.
for suffix in SHUFFLE_METHOD_SUFFIX:
    overall_std = df_grpby_model.apply(calc_overall_stdev, suffix)
    statistics_df[f'overall_std{suffix}'] = overall_std

# A tag is "stable" when the mean of is_sig is exactly 1, i.e. every trained model is significant.
for suffix in SHUFFLE_METHOD_SUFFIX:
    sig_fraction = statistics_df[(f'is_sig{suffix}', 'mean')]
    statistics_df[f'is_stable{suffix}'] = (sig_fraction == 1)

# Order the rows like the tags in TAGS_COMMANDS.
statistics_df = statistics_df.loc[TAGS_COMMANDS.keys(), :]

In [None]:
statistics_df

In [None]:
#df.to_csv('../results/mlp-clustering-stability-two-shuffling-methods-all-samples.csv')
#statistics_df.to_csv('../results/mlp-clustering-stability-two-shuffling-methods-statistics.csv')

### Model Stability Statistics

In [None]:
statistics_df

### Appendix - All Samples

In [None]:
df

### Unpruned results without shuffling
#### TODO: Refactor
- [ ] Combine with the unpruned table

In [None]:
# Clustering results of the unpruned weights (no shuffling), keyed by model tag.
unpruned_results = {}

for model_tag, dataset_tag in zip(TAGS_COMMANDS, tqdm(DATASETS_TAGS)):
    model_paths = get_model_path(model_tag, filter_='all', model_base_path='/scratch/shlomi/models/')
    weights_filename = f'{dataset_tag}-mlp-unpruned-weights.pckl'

    # Only the last N_TRAINED_MODELS model paths of this tag are analysed.
    unpruned_results[model_tag] = [
        run_spectral_cluster(path / weights_filename, with_shuffle=False)
        for path in tqdm(model_paths[-N_TRAINED_MODELS:])
    ]

# One DataFrame of metrics per model tag, labelled with that tag.
unpruned_model_dfs = []

for model_tag, model_results in unpruned_results.items():
    _, metrics = zip(*model_results)
    tagged_metrics_df = pd.DataFrame(metrics).assign(model=model_tag)
    unpruned_model_dfs.append(tagged_metrics_df)

unpruned_df = pd.concat(unpruned_model_dfs, ignore_index=True)

unpruned_df_grpby_model = unpruned_df.groupby('model')

# Per tag: mean/std of ncut, number of models, and average accuracies.
unpruned_ncut_stats = unpruned_df_grpby_model[['ncut']].agg(['mean', 'std'])
unpruned_models_count = unpruned_df_grpby_model.size().rename('n_models')
unpruned_accuracy_mean = unpruned_df_grpby_model[['train_acc', 'test_acc']].agg('mean')

unpruned_statistics_df = pd.concat(
    [unpruned_ncut_stats, unpruned_models_count, unpruned_accuracy_mean],
    axis=1)

# Order the rows like the tags in TAGS_COMMANDS.
unpruned_statistics_df = unpruned_statistics_df.loc[TAGS_COMMANDS.keys(), :]

unpruned_statistics_df

In [None]:
# Write the unpruned per-model rows and their per-tag statistics to CSV files.
for frame, csv_path in [
    (unpruned_df, '../results/mlp-clustering-stability-all-samples-unpruned.csv'),
    (unpruned_statistics_df, '../results/mlp-clustering-stability-statistics-unpruned.csv'),
]:
    frame.to_csv(csv_path)