In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
import git
from pathlib import Path
import warnings
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Repository root: the working tree of the git repo found by searching upward from the current directory.
ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)
# Switch the working directory to <repo>/utilities before the import below.
# NOTE(review): presumably this is what makes `transform_audio` importable — confirm.
os.chdir(os.path.join(ROOT_DIR, "utilities"))
# NOTE(review): star import; which names it provides is not visible from this notebook.
from transform_audio import *
# Suppress UserWarnings attributed to the numpy.lib.npyio module.
warnings.filterwarnings("ignore", category=UserWarning, module='numpy.lib.npyio')

## Choose data file

In [4]:
# --- Tunable parameters -------------------------------------------------------
transform = 'fft' #['fft', 'erb', 'stft', 'cwt']
group = 'gender' #['gender', 'emotional_intensity', 'statement']
# NOTE(review): BAND only feeds the export filenames further down; the input
# file below is always the "geometric" one. Switching BAND to 'raw' would
# therefore relabel geometric data as raw — confirm before changing it.
BAND = 'geo' #['geo','raw']
# Sub-directory of transformed-data-audio holding the input pickle. Defined
# before the load so the path is built from it instead of a duplicated literal.
BAND_TYPE = 'geometric-transformed-data'
# To disable subsampling assign np.inf directly: int(np.inf) raises OverflowError.
SUBSAMPLE_SIZE = int(1e5) #np.inf if no subsampling
# Short tag used in export filenames, e.g. 100000 -> "1e5"; "full" when not subsampling.
COMPRESSION = "full" if SUBSAMPLE_SIZE == np.inf else f"{SUBSAMPLE_SIZE:.0e}".replace("+0", "")

# --- Load the transformed coefficients ----------------------------------------
FULL_DATA_NAME = 'ravdess_oriPitch_' + transform + '_' + group + '_geometric'
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this project.
transformed_data = pd.read_pickle(os.path.join(ROOT_DIR, "transformed-data-audio", BAND_TYPE, f'{FULL_DATA_NAME}.pickle'))
# Group labels found under the 'Original-Pitch' entry; later cells use the first two.
groups = list(transformed_data['Original-Pitch'].keys())

## Symmetrize and sort data

In [5]:
def symmetrize(data):
    """Replace every entry of ``data`` with its sorted, sign-symmetric version.

    For each key, the stored array is reduced to absolute values, mirrored
    around zero (``-|x|`` followed by ``|x|``) and sorted ascending, so the
    result is twice as long as the input.

    The mapping is modified in place and the same object is returned.
    Calling this twice on the same mapping doubles the arrays again.
    """
    for key in data:
        magnitudes = np.abs(data[key])
        data[key] = np.sort(np.concatenate([-magnitudes, magnitudes]))
    return data

In [None]:
# Symmetrize every band of the first two groups, each from its OWN data.
# (Previously group 1 was assigned the result for group 0, which discarded
# group 1's coefficients and symmetrized group 0 a second time.)
# symmetrize() mutates the band dict in place, so run this cell exactly once
# per load of `transformed_data`; re-running doubles the arrays again.
for group_key in (groups[0], groups[1]):
    group_bands = transformed_data['Original-Pitch'][group_key]
    for band in group_bands:
        group_bands[band] = symmetrize(group_bands[band])

## Symmetrize and sort complex data

In [15]:
def extract_complex_coeffs(transformed_data, pitch_key, group_key):
    """Pool the real and imaginary coefficients of each band into one sorted array.

    Parameters
    ----------
    transformed_data : nested mapping indexed as ``[pitch_key][group_key]``;
        each value under it must expose ``'real'`` and ``'imag'`` entries.
    pitch_key, group_key : keys selecting the mapping of bands to process.

    Returns
    -------
    dict
        Maps the band's position (0, 1, ... in iteration order, not its
        original key) to the ascending-sorted concatenation of its ``'real'``
        and ``'imag'`` arrays.
    """
    bands = transformed_data[pitch_key][group_key]
    pooled = {}
    for position, band in enumerate(bands.values()):
        merged = np.concatenate((band['real'], band['imag']))
        pooled[position] = np.sort(merged)
    return pooled

In [16]:
# Pooled (real + imaginary) sorted coefficients per band, one dict per group.
group_0_complex_coeffs, group_1_complex_coeffs = (
    extract_complex_coeffs(transformed_data, 'Original-Pitch', selected_group)
    for selected_group in (groups[0], groups[1])
)

## Subsample coefficients per group

In [None]:
def extract_coeffs(data_source, subsample_size, coeff_type):
    """Pick one coefficient array per band and thin it to at most ``subsample_size`` points.

    Parameters
    ----------
    data_source : mapping of band -> data. For ``coeff_type`` ``'real'`` or
        ``'imag'`` each value is indexed with that key; for ``'comp'`` each
        value is used directly as the coefficient array.
    subsample_size : maximum number of coefficients kept per band. Arrays that
        are not longer than this are returned untouched, so ``np.inf`` disables
        subsampling. Finite float values (e.g. ``1e5``) are truncated to int.
    coeff_type : ``'real'``, ``'imag'`` or ``'comp'``.

    Returns
    -------
    (coeff_dict, size_dict)
        Both are keyed by the band's position in ``data_source`` (0, 1, ...).
        ``coeff_dict`` holds the (possibly subsampled) arrays; ``size_dict``
        holds ``len(coeffs) // 2`` of the array before subsampling.
        NOTE(review): the halving presumably undoes the doubling done by
        ``symmetrize`` — confirm against the consumers of the size pickles.

    Raises
    ------
    ValueError
        If ``coeff_type`` is not one of the three accepted values. This is
        checked up front, so it is reported even when ``data_source`` is empty.
    """
    if coeff_type not in ('real', 'imag', 'comp'):
        raise ValueError("coeff_type must be 'real', 'imag', or 'comp'")

    coeff_dict = {}
    size_dict = {}

    for idx, data in enumerate(data_source.values()):
        coeffs = data if coeff_type == 'comp' else data[coeff_type]
        n_coeffs = len(coeffs)

        size_dict[idx] = n_coeffs // 2

        if n_coeffs > subsample_size:
            # Evenly spaced positions from first to last element; on sorted
            # input this keeps both extremes. The comparison above guarantees
            # subsample_size is finite here, so int() cannot overflow.
            sample_idxs = np.linspace(0, n_coeffs - 1, int(subsample_size), dtype=int)
            coeff_dict[idx] = coeffs[sample_idxs]
        else:
            coeff_dict[idx] = coeffs

    return coeff_dict, size_dict

In [23]:
# Real / imaginary parts are read straight from the per-band dicts.
group_0_real, group_0_real_size = extract_coeffs(transformed_data['Original-Pitch'][groups[0]], SUBSAMPLE_SIZE, 'real')
group_0_imag, group_0_imag_size = extract_coeffs(transformed_data['Original-Pitch'][groups[0]], SUBSAMPLE_SIZE, 'imag')
# 'comp' expects each value to already be a coefficient array, so it must be
# fed the pooled real+imag arrays built by extract_complex_coeffs. Passing the
# raw band dicts would export the dicts unchanged and record a size of 1.
group_0_complex, group_0_complex_size = extract_coeffs(group_0_complex_coeffs, SUBSAMPLE_SIZE, 'comp')
group_1_real, group_1_real_size = extract_coeffs(transformed_data['Original-Pitch'][groups[1]], SUBSAMPLE_SIZE, 'real')
group_1_imag, group_1_imag_size = extract_coeffs(transformed_data['Original-Pitch'][groups[1]], SUBSAMPLE_SIZE, 'imag')
group_1_complex, group_1_complex_size = extract_coeffs(group_1_complex_coeffs, SUBSAMPLE_SIZE, 'comp')

## Export to pickle

In [26]:
EXPORT_DIR = os.path.join(ROOT_DIR, "transformed-data-audio", "subsample-data")
# pd.to_pickle does not create missing parent directories, so make sure the
# target exists before the export cells run (no-op when it already does).
os.makedirs(EXPORT_DIR, exist_ok=True)

# Subsampled coefficient dicts to export, keyed by group label and then by the
# short tag that ends up in the filename ("comp", "re", "im").
data_map = {
    groups[0]: {
        "comp": group_0_complex,
        "re": group_0_real,
        "im": group_0_imag,
    },
    groups[1]: {
        "comp": group_1_complex,
        "re": group_1_real,
        "im": group_1_imag,
    },
}

# Matching size dicts. Kept as a list of (group label, sizes) pairs because the
# export loop unpacks it that way.
data_sizes = [
    (groups[0], {
        "comp": group_0_complex_size,
        "re": group_0_real_size,
        "im": group_0_imag_size,
    }),
    (groups[1], {
        "comp": group_1_complex_size,
        "re": group_1_real_size,
        "im": group_1_imag_size,
    }),
]



In [None]:
# One pickle per (group, coefficient tag) holding the subsampled coefficients.
for group_name in data_map:
    components = data_map[group_name]
    for prefix in components:
        data = components[prefix]
        filename = f"ravdess-{COMPRESSION}{BAND}{prefix}-{transform}-{group_name}.pickle"
        target = os.path.join(EXPORT_DIR, filename)
        pd.to_pickle(data, target)

In [None]:
# One "-size" pickle per (group, coefficient tag) holding the recorded sizes.
for size_entry in data_sizes:
    group_name, components = size_entry
    for prefix in components:
        data = components[prefix]
        filename = f"ravdess-{COMPRESSION}{BAND}{prefix}-{transform}-{group_name}-size.pickle"
        filepath = os.path.join(EXPORT_DIR, filename)
        pd.to_pickle(data, filepath)