# PCA with missing data

The data we analyze in this notebook is based on the paper on missing data in PCA:

Yi, X., & Latch, E. K. (2022). Nonrandom missing data can bias Principal Component Analysis inference of population genetic structure. Molecular Ecology Resources, 22, 602–611. https://doi.org/10.1111/1755-0998.13498

In [None]:
# The data is stored in RData format; the Python package pyreadr is required to load it.
# ! pip install pyreadr

In [106]:
from pandora.dataset import NumpyDataset, bootstrap_and_embed_multiple_numpy
from pandora.embedding_comparison import BatchEmbeddingComparison
from pandora.plotting import *

import pyreadr
from plotly import graph_objects as go
from sklearn.impute import SimpleImputer


def get_population_from_sample_id(sample_id):
    """Return the population label encoded in a sample ID.

    The label is the part of ``sample_id`` that precedes the first ``"."``,
    with leading and trailing whitespace removed. If the ID contains no
    ``"."``, the whole (stripped) ID is returned.
    """
    population, _, _ = sample_id.partition(".")
    return population.strip()

## p3_mig50 Data
### 20% missing data (random)

In [93]:
# load the RData file for this section and select the object stored under the key "rand"
data = pyreadr.read_r("missing_data_study_simulated_data/p3_mig50_SNP_rand_miss0.2.RData")["rand"]
# the row index holds the sample IDs; the population label is the part of the ID before the first "."
sample_ids = pd.Series(data.index)
populations = pd.Series([get_population_from_sample_id(sid) for sid in sample_ids])
input_data = data.to_numpy()

# we need to properly set the nan values in order for the imputer to work
input_data = np.nan_to_num(input_data, nan=np.nan)
# in analogy to Yi & Latch, we impute missing data using mean imputation
# (np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0)
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
input_data = imputer.fit_transform(input_data)

# initialize a numpy-based dataset and compute its PCA (we will use this for plotting later)
dataset = NumpyDataset(input_data, sample_ids, populations)
dataset.run_pca(n_components=10)
# compute 100 bootstrap replicates and their PCA embeddings
bootstraps = bootstrap_and_embed_multiple_numpy(dataset, 100, EmbeddingAlgorithm.PCA, n_components=10, seed=0, threads=4)
# compare all bootstraps and compute the pandora similarity PS
batch = BatchEmbeddingComparison([b.pca for b in bootstraps])
pandora_similarity = batch.compare()

# plain string: the message has no placeholders, so no f-string is needed (printed text is unchanged)
print("Pandora similarity for p3_mig with 20% random missing data = ", round(pandora_similarity, 2))

Pandora similarity for p3_mig with 20% random missing data =  0.57


In [94]:
# fetch the support values from the bootstrap comparison, then average them along axis 1
support_values_all = batch.get_sample_support_values()
sample_support_values = support_values_all.mean(axis=1)
# summary statistics of the averaged support values
sample_support_values.describe()

count    75.000000
mean      0.532960
std       0.022677
min       0.489362
25%       0.518158
50%       0.531965
75%       0.545458
max       0.591207
dtype: float64

In [105]:
# compare the clusterings of the bootstrap embeddings, requesting kmeans_k=3
# NOTE(review): presumably k=3 matches the three simulated populations of the p3 dataset — confirm
batch.compare_clustering(kmeans_k=3)

0.5659115310730077

In [142]:
# we don't use the Pandora plotting functions here since we want to modify a few things specifically for this dataset
def plot_embedding_populations(embedding: PCA) -> go.Figure:
    """Scatter-plot the first two dimensions of an embedding, one trace per population.

    Parameters
    ----------
    embedding
        Embedding object whose ``embedding`` attribute is a table with the columns
        ``population``, ``D0`` and ``D1``.

    Returns
    -------
    go.Figure
        A 750x750 plotly figure with ``D0`` on the x-axis ("PC 1") and ``D1`` on the
        y-axis ("PC 2"). Each population gets its own color and marker symbol.

    Notes
    -----
    Only three marker symbols are defined. They are reused cyclically, so data with
    more than three populations is still plotted completely instead of being silently
    truncated to the first three populations.
    """
    embedding_data = embedding.embedding

    fig = go.Figure()
    populations = embedding_data.population.unique()
    colors = get_distinct_colors(populations.shape[0])
    symbols = ["triangle-down", "star", "square"]

    for idx, (population, color) in enumerate(zip(populations, colors)):
        # cycle through the symbols so that no population is dropped when there are more than three
        symbol = symbols[idx % len(symbols)]
        pop_data = embedding_data.loc[lambda x: x.population == population]
        # sanity check: every unique population label must select at least one sample
        assert not pop_data.empty, population
        fig.add_trace(
            go.Scatter(
                x=pop_data["D0"],
                y=pop_data["D1"],
                mode="markers",
                name=population,
                marker=dict(
                    color=color,
                    symbol=symbol,
                    size=7,
                ),
            )
        )

    fig.update_xaxes(title="PC 1")
    fig.update_yaxes(title="PC 2")
    fig.update_layout(template="plotly_white", height=750, width=750)
    return fig


In [149]:
# PCA of the full (non-bootstrapped) dataset; update_layout returns the figure, so the calls can be chained
fig = plot_embedding_populations(dataset.pca).update_layout(
    title="unmodified p3_mig50_0.2"
)
fig

In [147]:
# PCA of the first bootstrap replicate; update_layout returns the figure, so the calls can be chained
fig = plot_embedding_populations(bootstraps[0].pca).update_layout(
    title="bootstrap p3_mig50_0.2"
)
fig