In [12]:
import pandas as pd
import numpy as np
import umap
from plotnine import *

In [48]:
# Load the baseline HI titers and keep only the rows recorded at visit "V4",
# restricted to the columns used by the rest of the analysis.
data = pd.read_csv(
    'figure-1-source-data-1-hi-titer-at-baseline.csv'
).query(
    'visit == "V4"'
)[['participant_id', 'age_at_sampling', 'strain', 'log_hi_titer']]
# Persist the filtered table (the row index is written as the first column).
data.to_csv('titers.csv')
# Keep the preview as the LAST expression: a bare expression in the middle of
# a cell is evaluated and thrown away, so it was never displayed before.
data.head()

In [21]:
def escape_similarity(df):
    """Compute the cosine similarity between every pair of participants in `df`.

    Each participant is represented by a vector holding one `log_hi_titer`
    value per strain. Every vector is scaled to unit L2 norm, so the dot
    product of two vectors is their cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table containing at least the columns `participant_id`,
        `strain` and `log_hi_titer`. These columns must not contain nulls.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both the participant ids
        (in the column order produced by the pivot) and whose entries are the
        pairwise similarities.

    Raises
    ------
    AssertionError
        If any of the three columns contains a null value.

    Notes
    -----
    * Exact duplicate rows are dropped; any remaining repeated
      (strain, participant) measurements are averaged by `pivot_table`'s
      default aggregation.
    * A strain that was not measured for a participant is filled with 0.
    * A participant whose vector is all zeros has a zero norm and therefore
      yields NaN similarities.
    """
    df = df[['participant_id', 'strain', 'log_hi_titer']].drop_duplicates()
    assert not df.isnull().any().any(), df

    # Rows are strains, columns are participants.
    pivoted_df = df.pivot_table(index='strain',
                                columns='participant_id',
                                values='log_hi_titer',
                                fill_value=0)
    participants = pivoted_df.columns.tolist()

    titers = pivoted_df.to_numpy(dtype=float)
    # Scale every participant column to unit length. Silence the 0/0 warning:
    # an all-zero column deliberately propagates as NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        unit_vectors = titers / np.linalg.norm(titers, axis=0)

    # One matrix product yields all pairwise dot products at once, replacing
    # a Python-level loop over every pair of participants.
    similarities = unit_vectors.T @ unit_vectors
    return pd.DataFrame(similarities, columns=participants, index=participants)

similarities = escape_similarity(data)
# Every pairwise similarity must be defined. The previous check used
# `.any().any()`, which passes as soon as ONE cell is non-null and so could
# not catch NaNs flowing into the dissimilarity matrix.
assert similarities.notnull().all().all(), 'similarity matrix contains NaN'
# Floating-point error can push a similarity marginally above 1; clip so that
# no dissimilarity is negative.
dissimilarities = (1 - similarities).clip(lower=0)
dissimilarities.round(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,768,769,770,771,772,773,774,775,776,777
1,0.000,0.067,0.070,0.057,0.083,0.267,0.194,0.156,0.075,0.133,...,0.143,0.186,0.063,0.046,0.084,0.093,0.048,0.150,0.254,0.122
2,0.067,0.000,0.143,0.157,0.102,0.453,0.348,0.203,0.173,0.030,...,0.268,0.068,0.082,0.124,0.196,0.129,0.132,0.079,0.394,0.279
3,0.070,0.143,0.000,0.083,0.101,0.263,0.267,0.255,0.094,0.157,...,0.170,0.170,0.185,0.065,0.107,0.068,0.031,0.118,0.258,0.042
4,0.057,0.157,0.083,0.000,0.062,0.171,0.089,0.108,0.027,0.235,...,0.064,0.257,0.170,0.050,0.068,0.143,0.057,0.170,0.113,0.104
5,0.083,0.102,0.101,0.062,0.000,0.338,0.197,0.085,0.085,0.130,...,0.162,0.135,0.164,0.087,0.120,0.129,0.118,0.102,0.208,0.196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773,0.093,0.129,0.068,0.143,0.129,0.324,0.349,0.242,0.159,0.135,...,0.228,0.169,0.137,0.061,0.194,0.000,0.068,0.149,0.327,0.136
774,0.048,0.132,0.031,0.057,0.118,0.181,0.209,0.215,0.070,0.174,...,0.114,0.206,0.139,0.029,0.092,0.068,0.000,0.143,0.204,0.057
775,0.150,0.079,0.118,0.170,0.102,0.471,0.420,0.290,0.189,0.062,...,0.308,0.051,0.260,0.170,0.231,0.149,0.143,0.000,0.387,0.249
776,0.254,0.394,0.258,0.113,0.208,0.140,0.067,0.189,0.119,0.493,...,0.059,0.484,0.352,0.195,0.143,0.327,0.204,0.387,0.000,0.223


In [44]:
# `dissimilarities` is already a square participant-by-participant distance
# matrix. With UMAP's default metric each ROW would be treated as a feature
# vector and Euclidean distances between rows would be embedded instead;
# `metric='precomputed'` makes UMAP use the supplied distances directly.
# NOTE(review): this changes the resulting coordinates relative to earlier
# runs -- confirm that embedding the cosine dissimilarities is the intent.
reducer = umap.UMAP(metric='precomputed', random_state=0)  # fixed seed for reproducibility
embedding = reducer.fit_transform(dissimilarities.to_numpy())
embedding.shape

(777, 2)

In [47]:
# Attach participant metadata to the 2-D embedding. The rows of `embedding`
# follow the order of `dissimilarities`, which is what the embedding was fit on.
umap_coords = (
    pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
    .assign(
        participant_id=dissimilarities.columns,
    )
    # `data` has one row per (participant, strain); after dropping the
    # per-strain columns the repeated rows collapse via drop_duplicates().
    .merge(data, on='participant_id', how='left')
    .drop(columns=['strain', 'log_hi_titer'])
    .drop_duplicates()
)
# Save the unrounded coordinates (the row index is written as the first column).
umap_coords.to_csv('umap.csv')
# Keep the rounded preview as the LAST expression: placed mid-cell its result
# was discarded and nothing was displayed.
umap_coords.round(3)