# Table 1

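Permutation tests: for each session, a one-sided permutation test of the difference in mean log(normalized differentiation) between natural and artificial stimuli, summarized as the number of significant sessions per layer and area.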

In [1]:
from datetime import datetime
from pathlib import Path

import matplotlib
import numpy as np
import pandas as pd
import scipy
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

import analysis
from metadata import METADATA

In [2]:
# Register tqdm with pandas so that `progress_apply` is available on pandas
# objects (it is used for the per-session permutation tests further down)
tqdm.pandas()

## Load data

In [3]:
# Read the main results table from its parquet file
df = pd.read_parquet('results/main.parquet')

In [4]:
# Unscrambled vs. scrambled: keep only the rows selected by the
# `stimulus_is_scrambled_pair` column
scrambled_pair_mask = df['stimulus_is_scrambled_pair']
subset = df.loc[scrambled_pair_mask]

## Permutation test functions

In [5]:
from numpy.random import default_rng, SeedSequence
# Fixed seed so that the permutation draws are reproducible. The value is a
# high-quality seed that was created once with `SeedSequence().entropy`.
SEED = 9531952262819343520803317503529205167
RNG = default_rng(seed=SEED)

In [6]:
def sample(values, labels, statistic, rng=None):
    """Evaluate `statistic` once under a random relabelling of the data.

    Parameters
    ----------
    values
        Passed to `statistic` unchanged.
    labels
        Object exposing a `.values` array (e.g. a pandas Series). It is not
        modified; a shuffled copy of `labels.values` is handed to `statistic`.
    statistic : callable
        Called as ``statistic(values, shuffled_labels)``.
    rng : numpy.random.Generator, optional
        Generator used to draw the permutation. Defaults to the module-level
        `RNG`, which keeps existing three-argument calls working while
        allowing a dedicated generator to be injected for reproducible,
        isolated runs.

    Returns
    -------
    Whatever `statistic` returns for the shuffled labels.
    """
    if rng is None:
        rng = RNG
    label_array = labels.values
    # Shuffle the labels by indexing with a random permutation of positions
    shuffled_labels = label_array[rng.permutation(len(label_array))]
    # Return the evaluation
    return statistic(values, shuffled_labels)

In [7]:
def permutation_test(data, value_col=None, label_col=None, statistic=None, n=10_000):
    """Run a one-sided permutation test of `statistic` on `data`.

    Parameters
    ----------
    data
        Table indexable by column name (e.g. one group of a pandas groupby).
    value_col, label_col
        Names of the columns in `data` holding the values and the labels.
    statistic : callable
        Called as ``statistic(values, labels)``; its `__name__` is recorded
        in the result.
    n : int
        Number of label permutations used to build the null distribution.

    Returns
    -------
    pandas.Series
        `p_value`, `observed`, `statistic` (the callable's name) and
        `response_variable`, followed by the `describe()` summary of the
        null distribution.

    Raises
    ------
    ValueError
        If `value_col`, `label_col` or `statistic` is not provided.

    Notes
    -----
    The p-value is the plain fraction of permutation statistics that are
    greater than or equal to the observed one; no "+1" correction is applied,
    so it can be exactly 0.
    """
    # All arguments must be keyword arguments to use `progress_apply`, so the
    # required ones default to None and are validated here instead
    if value_col is None or label_col is None or statistic is None:
        raise ValueError("must provide value_col, label_col and statistic")

    values = data[value_col]
    labels = data[label_col]

    # Get the observed response
    observed = statistic(values, labels)

    if np.ndim(observed) == 0 and pd.isna(observed):
        # The observed statistic is undefined. Any comparison against NaN is
        # False, which would otherwise yield p == 0 and mark this test as
        # maximally significant. Report NaN instead and skip the sampling,
        # which could not produce a meaningful p-value anyway.
        p_value = np.nan
        null_distribution = pd.Series([], dtype=float)
    else:
        # Sample n times to get a null distribution
        null_distribution = pd.Series([
            sample(values, labels, statistic)
            for _ in range(n)
        ])

        # The p value is the fraction of permutation statistics that are
        # at least as extreme as the observed statistic
        # (one-sided test; not taking absolute value of the observed statistic)
        p_value = np.mean(null_distribution >= observed)

    return pd.concat([
        pd.Series({
            'p_value': p_value,
            'observed': observed,
            'statistic': statistic.__name__,
            'response_variable': value_col,
        }),
        null_distribution.describe(),
    ])

## Perform permutation tests

In [8]:
# Name of the response-variable column that the permutation tests are run on
response = 'log(normalized differentiation)'

In [9]:
%%time

def mean_natural_artificial_difference(values, labels):
    """Test statistic: `analysis.mean_difference` with `a='natural'` and
    `b='artificial'`.

    NOTE(review): presumably mean(natural) - mean(artificial); confirm the
    sign convention against `analysis.mean_difference`.
    """
    difference = analysis.mean_difference(
        values, labels, a='natural', b='artificial'
    )
    return difference


# Run the permutation test independently for every session, with a progress
# bar. The test settings are passed as keyword arguments because
# `progress_apply` forwards them to `permutation_test` by name.
test_kwargs = dict(
    value_col=response,
    label_col='stimulus_type',
    statistic=mean_natural_artificial_difference,
    n=20_000,
)
sessions = subset.groupby('session')
permutation_test_results = sessions.progress_apply(permutation_test, **test_kwargs)

  0%|          | 0/44 [00:00<?, ?it/s]

CPU times: user 8min 15s, sys: 806 ms, total: 8min 16s
Wall time: 8min 15s


In [10]:
# `permutation_test` returns a Series per session, so the grouped apply
# already yields a DataFrame with one row per session and a `p_value` column.
# It is written out as-is: setting `.name` on it and re-wrapping it in
# `pd.DataFrame` had no effect.
permutation_test_results.to_parquet('results/permutation_tests.parquet')

In [11]:
# Attach the per-session metadata, joining on `session` (default inner join).
# NOTE: this rebinds `permutation_test_results`, so the cell should only be
# run once per kernel session.
permutation_test_results = pd.merge(permutation_test_results, METADATA, on='session')

## Summarize

In [12]:
# Significance threshold that the permutation-test p-values are compared to
ALPHA = 0.05

In [13]:
def is_significant(data):
    """Flag which entries of `data.p_value` are at or below `ALPHA`."""
    p_values = data.p_value
    return p_values <= ALPHA

In [14]:
def _count_significant(group):
    """Count the significant rows and the total rows within one group."""
    return pd.Series({
        'significant': is_significant(group).sum(),
        'total': len(group),
    })


# One row per (layer, area) combination
grouped_by_layer_and_area = permutation_test_results.groupby(['layer', 'area'])
summary = grouped_by_layer_and_area.apply(_count_significant)

In [15]:
# Destination of the wide-format summary table, which is written as CSV
output_path = 'results/permutation_tests_summary.csv'

In [16]:
# Convert to wide format

def _format_counts(counts):
    """Render each row of `counts` as the string 'significant / total'."""
    return (
        counts
        .astype(dict(significant=str, total=str))
        .agg(' / '.join, axis=1)
    )


# One row per layer, one column per area
summary_wide = (
    _format_counts(summary)
    .reset_index()
    .pivot(index='layer', columns='area')
)
summary_wide.columns = summary_wide.columns.droplevel(level=0)

# Margins: level 0 of the `summary` index is the layer and level 1 is the
# area, so summing within level 0 pools all areas of a layer and vice versa
all_areas = _format_counts(summary.groupby(level=0).sum())
all_layers = _format_counts(summary.groupby(level=1).sum())
grand_total = ' / '.join(summary[['significant', 'total']].sum().astype(str))

# Cast the axis labels to plain strings so the margin labels can be added
summary_wide.index = summary_wide.index.astype(str)
summary_wide.columns = summary_wide.columns.astype(str)
summary_wide.loc['All layers', :] = all_layers
summary_wide.loc[:, 'All areas'] = all_areas
# `all_areas` has no 'All layers' entry, so the corner cell would otherwise
# be left empty; fill it with the overall count
summary_wide.loc['All layers', 'All areas'] = grand_total

summary_wide.to_csv(output_path)
summary_wide

area,V1,LM,AL,PM,AM,All areas
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L2/3,1 / 3,1 / 3,3 / 3,0 / 3,3 / 3,8 / 15
L4,0 / 3,1 / 3,0 / 3,0 / 3,0 / 3,1 / 15
L5,0 / 3,0 / 3,0 / 2,0 / 3,0 / 3,0 / 14
All layers,1 / 9,2 / 9,3 / 8,0 / 9,3 / 9,
