# IAWG Data Summary

In [None]:
# Download location: directory (relative to the current working directory)
# that the Synapse sync writes into and that the variant tables are read from.
from pathlib import Path

data_path = Path('data')

In [None]:
# Mirror the Synapse entity into the local download directory
import os

import synapseclient
import synapseutils

# Log in with the token read from the SYNAPSE_TOKEN environment variable
# (os.environ.get yields None when the variable is not set).
syn = synapseclient.Synapse()
syn.login(authToken=os.environ.get('SYNAPSE_TOKEN'))

# Sync entity 'syn68154892' into data_path
files = synapseutils.syncFromSynapse(syn, 'syn68154892', path=data_path)

In [None]:
# Path lookup: file stem (final path component without its suffix) -> path,
# for every entry directly inside the 'Variant data' folder

dfs_paths = {
    entry.stem: entry
    for entry in data_path.joinpath('Variant data').iterdir()
}

In [None]:
import pandas as pd

# Column names differ between the source tables, so for now every dataset is
# mapped by hand to its four variant columns.

# Column layouts shared by several datasets:
won_scheme = ('CHR', 'BP_HG38', 'A1', 'A2')
shendure_scheme = ('chr', 'pos', 'A1', 'A2')

# Deliberately left out of the map:
#   'Supplementary_Table_S3_cV2F_scores (1)'    -- skip for now: only has rsid
#   'encode_re2g_gold_standard_crispr_e2g_data' -- gene-wide
col_map = {
    **dict.fromkeys(
        (
            'Won_CDMPRA_variantlist',
            'Won_Nana_MPRA_variantlist',
            'Won_SCZ_MPRA_variantlist',
            'Won_ADD_MPRA_variantlist',
        ),
        won_scheme,
    ),
    **dict.fromkeys(
        (
            'Shendure_Ahituv_Kircher_cCRE_variants_Neuro_MPRA',
            'Shendure_Ahituv_Kircher_cCRE_variants_UndiffWTC11_MPRA',
        ),
        shendure_scheme,
    ),
    # Note: this table uses a 0-based position
    'IGVF_ColocBoost_Variant_Function_lipid_traits_liver': (
        'VariantChr',
        'VariantStart',
        'EffectAllele',
        'OtherAllele',
    ),
    'igvf_y2ave_variantlist': ('chr', 'pos', 'a1', 'a2'),
}

# Read file and map columns in function
def get_df_variants(name):
    """Load one variant table and normalise it to chr / pos / a1 / a2 columns.

    Parameters
    ----------
    name : str
        Dataset key; it must be present in both ``col_map`` (source column
        names) and ``dfs_paths`` (file location).

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame whose columns are exactly
        ``['chr', 'pos', 'a1', 'a2']`` in that order, with ``pos`` converted
        to a numeric dtype and both alleles upper-cased.

    Raises
    ------
    KeyError
        If ``name`` is missing from ``col_map`` or ``dfs_paths``.
    """
    canonical_cols = ['chr', 'pos', 'a1', 'a2']
    source_cols = list(col_map[name])

    df = (
        pd
        .read_table(
            dfs_paths[name],
            usecols=source_cols
        )
        .rename(columns=dict(zip(source_cols, canonical_cols)))
        # read_table ignores the order of `usecols` and keeps the file's own
        # column order. Force one fixed order so that anything built
        # positionally from this frame (e.g. a MultiIndex) has identical
        # level order for every dataset.
        .loc[:, canonical_cols]
        .assign(
            pos=lambda d: pd.to_numeric(d['pos'], downcast='integer'),
            a1=lambda d: d['a1'].str.upper(),
            a2=lambda d: d['a2'].str.upper()
        )
        .drop_duplicates()
    )

    # Manually move 0-based position to 1-based where needed
    if name == 'IGVF_ColocBoost_Variant_Function_lipid_traits_liver':
        df['pos'] = df['pos'] + 1

    return df
    

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Per-dataset membership series for the upset plot: the index is the
# MultiIndex built from the dataset's normalised variant frame and every
# value is True.
normalized_series = {
    name: pd.Series(
        data=True,
        index=pd.MultiIndex.from_frame(get_df_variants(name)),
    )
    for name in tqdm(col_map)
}

In [None]:
# Get index for dataframe: union of the variant indexes of all datasets
index_union = None
for s in tqdm(normalized_series.values()):
    if index_union is None:
        index_union = s.index
    else:
        index_union = index_union.union(s.index)

# Dataframe for upset: one boolean membership column per dataset
variants_df = pd.DataFrame(index=index_union)
for dataset, s in tqdm(normalized_series.items()):
    # Reindex with fill_value=False instead of assigning and calling fillna
    # afterwards: plain assignment introduces NaN, which turns the column
    # into object dtype and makes the later fillna depend on pandas'
    # deprecated silent downcasting. This way variants missing from the
    # dataset get False directly and the column stays a real bool column.
    variants_df[dataset] = s.reindex(index_union, fill_value=False)

In [None]:
# New column labels: "<dataset> (<sum of that dataset's column>)"
col_rename_map = {
    label: f'{label} ({membership.sum()})'
    for label, membership in variants_df.items()
}

# Count how many rows share each combination of column values,
# using the relabelled columns
variant_counts = (
    variants_df
    .rename(col_rename_map, axis='columns')
    .value_counts()
)

In [None]:
# Upset plot of the per-combination variant counts
import matplotlib.pyplot as plt
import upsetplot

fig = plt.figure(layout='constrained')
axs = upsetplot.plot(
    variant_counts,
    fig=fig,
    orientation='vertical',
    subset_size='sum',
    show_counts=True,
    totals_plot_elements=0,
)

# Put the x axis of the 'intersections' axes on a log scale
axs['intersections'].semilogx()