In [None]:
import os
from os.path import join

import dask.dataframe as dd
import dask.array as da
import numpy as np
import pandas as pd

from dask_ml.decomposition import IncrementalPCA

In [None]:
# Root directory of the dataset. The cells below read the parquet splits
# ('train', 'val', 'test') from it and write the PCA embeddings into its
# 'pca' subfolder.
# Set the MERLIN_DATA_PATH environment variable to run the notebook against
# another copy of the data without editing it; when the variable is unset the
# original location is used.
PATH = os.environ.get('MERLIN_DATA_PATH', '/mnt/dssmcmlfs01/merlin_cxg_2023_05_15_sf-log1p')

In [None]:
def get_count_matrix(ddf, n_genes=19331, rows_per_partition=1024):
    """Turn the 'X' column of a dask DataFrame into a 2-D dask array.

    Every partition of ``ddf['X']`` is converted to a list, stacked row-wise
    with ``np.vstack`` and wrapped in a pandas DataFrame, so each element of
    'X' is expected to be one row vector (presumably of length ``n_genes`` --
    confirm against the stored data).

    Parameters
    ----------
    ddf : dask.dataframe.DataFrame
        Frame that contains a column named 'X'.
    n_genes : int, default 19331
        Number of columns declared in the ``meta`` of the stacked frame;
        every column is declared as 'f4'.
    rows_per_partition : int or None, default 1024
        Number of rows assumed for *every* partition; used as the chunk
        lengths of the resulting array without checking the data. Pass
        ``None`` to let dask compute the real partition lengths
        (``lengths=True``), which triggers a computation but is safe when
        partitions have differing sizes.

    Returns
    -------
    dask.array.Array
        The array produced by ``to_dask_array`` on the stacked frame.
    """

    def _stack_rows(partition):
        # One output row per element of the partition.
        return pd.DataFrame(np.vstack(partition.tolist()))

    if rows_per_partition is None:
        # Let dask measure each partition instead of trusting a fixed size.
        lengths = True
    else:
        lengths = [rows_per_partition] * ddf.npartitions

    x = (
        ddf['X']
        .map_partitions(
            _stack_rows,
            meta={col: 'f4' for col in range(n_genes)}
        )
        .to_dask_array(lengths=lengths)
    )

    return x


# Compute PCA for visualization

In [None]:
# Output folder for the PCA embeddings; a no-op when it already exists.
os.makedirs(join(PATH, 'pca'), exist_ok=True)


# Number of principal components kept for the visualization embeddings.
n_comps = 50


# NOTE(review): a separate PCA is fitted on every split, so the embeddings of
# different splits do not share a common basis -- confirm this is intended.
for split in ['test', 'val', 'train']:
    x = get_count_matrix(dd.read_parquet(join(PATH, split), split_row_groups=True))
    # A fixed random_state makes the saved embedding reproducible across
    # re-runs; iterated_power is a setting of the randomized SVD solver, whose
    # result otherwise depends on the random draw.
    pca = IncrementalPCA(n_components=n_comps, iterated_power=3, random_state=0)
    # da.compute returns a tuple with one entry per argument.
    (x_pca,) = da.compute(pca.fit_transform(x))
    with open(join(PATH, 'pca', f'x_pca_{split}_{n_comps}.npy'), 'wb') as f:
        np.save(f, x_pca)


# Compute PCA for model training

In [None]:
# Output folder for the PCA embeddings; a no-op when it already exists.
os.makedirs(join(PATH, 'pca'), exist_ok=True)


# Number of principal components kept for the model-training features.
n_comps = 256


x_train = get_count_matrix(dd.read_parquet(join(PATH, 'train'), split_row_groups=True))
x_val = get_count_matrix(dd.read_parquet(join(PATH, 'val'), split_row_groups=True))
x_test = get_count_matrix(dd.read_parquet(join(PATH, 'test'), split_row_groups=True))


# A fixed random_state makes the saved features reproducible across re-runs;
# iterated_power is a setting of the randomized SVD solver, whose result
# otherwise depends on the random draw.
pca = IncrementalPCA(n_components=n_comps, iterated_power=3, random_state=0)

# The PCA is fitted on the training split only and then applied to val/test.
# fit_transform is issued as its own statement, before the transform calls,
# so the required ordering is explicit rather than depending on the
# evaluation order of a list literal.
x_pca_train = pca.fit_transform(x_train)
x_pca_val = pca.transform(x_val)
x_pca_test = pca.transform(x_test)

# Materialize the three projections in a single dask compute call.
x_pca_train, x_pca_val, x_pca_test = da.compute(x_pca_train, x_pca_val, x_pca_test)


# One .npy file per split, written to the same 'pca' folder and with the same
# file names as before.
for split_name, x_pca_split in [
    ('train', x_pca_train),
    ('val', x_pca_val),
    ('test', x_pca_test),
]:
    with open(join(PATH, 'pca', f'x_pca_training_{split_name}_split_{n_comps}.npy'), 'wb') as f:
        np.save(f, x_pca_split)
