# IAWG Data Summary

In [None]:
# Sync data from synapse
import synapseclient 
import synapseutils 
import os

# Log in with the token held in the SYNAPSE_TOKEN environment variable, then
# sync the contents of Synapse entity syn68154892 to local storage.
syn = synapseclient.Synapse()
syn.login(authToken=os.environ.get('SYNAPSE_TOKEN'))
files = synapseutils.syncFromSynapse(syn, 'syn68154892')

In [None]:
# Load dataframes
from pathlib import Path
import pandas as pd

def _iter_cached_files(cache_root):
    """Yield the files stored two directory levels below ``cache_root``.

    Walks ``cache_root/<dir>/<dir>/<file>``. Anything that is not a directory
    at the first two levels is skipped instead of raising, and hidden files
    (names starting with '.') at the leaf level are ignored so they are not
    mistaken for datasets. Entries are visited in sorted order so the result
    does not depend on the filesystem's listing order.
    """
    for outer in sorted(cache_root.iterdir()):
        if not outer.is_dir():
            continue
        for inner in sorted(outer.iterdir()):
            if not inner.is_dir():
                continue
            for entry in sorted(inner.iterdir()):
                if entry.is_file() and not entry.name.startswith('.'):
                    yield entry


# File stem -> path for every file found in the local Synapse cache.
# If two cached files share a stem, the one visited last wins.
dfs_paths = {
    f.stem: f
    for f in _iter_cached_files(Path.home() / '.synapseCache')
}

In [None]:
# Column names differ between datasets, so for now each dataset is mapped by
# hand to its (chromosome, position, allele 1, allele 2) columns.

# Common schemes:
won_scheme = ('CHR', 'BP_HG38', 'A1', 'A2')
shendure_scheme = ('chr', 'pos', 'A1', 'A2')

col_map = {
    **dict.fromkeys(
        (
            'Won_CDMPRA_variantlist',
            'Won_Nana_MPRA_variantlist',
            'Won_SCZ_MPRA_variantlist',
            'Won_ADD_MPRA_variantlist',
        ),
        won_scheme,
    ),
    # 'Supplementary_Table_S3_cV2F_scores (1)' skip for now: only has rsid
    # Gene-wide: 'encode_re2g_gold_standard_crispr_e2g_data': ('')
    **dict.fromkeys(
        (
            'Shendure_Ahituv_Kircher_cCRE_variants_Neuro_MPRA',
            'Shendure_Ahituv_Kircher_cCRE_variants_UndiffWTC11_MPRA',
        ),
        shendure_scheme,
    ),
    # Note 0-based position
    'IGVF_ColocBoost_Variant_Function_lipid_traits_liver': (
        'VariantChr',
        'VariantStart',
        'EffectAllele',
        'OtherAllele',
    ),
    'igvf_y2ave_variantlist': ('chr', 'pos', 'a1', 'a2'),
}

# Read file and map columns in function
# Canonical column names, in the order every returned frame uses.
_VARIANT_COLUMNS = ('chr', 'pos', 'a1', 'a2')

# Datasets whose position column is 0-based and must be shifted to 1-based.
_ZERO_BASED_DATASETS = frozenset({
    'IGVF_ColocBoost_Variant_Function_lipid_traits_liver',
})


def get_df_variants(name):
    """Load one dataset's variants with normalized column names and order.

    Args:
        name: Dataset key; looked up in both ``col_map`` (source column
            names) and ``dfs_paths`` (file location).

    Returns:
        DataFrame with exactly the columns ``chr``, ``pos``, ``a1``, ``a2``,
        in that order. Positions of datasets listed in
        ``_ZERO_BASED_DATASETS`` are shifted by +1.

    Raises:
        KeyError: If ``name`` is missing from ``col_map`` or ``dfs_paths``.
    """
    source_columns = col_map[name]

    df = pd.read_table(dfs_paths[name], usecols=list(source_columns)).rename(
        columns=dict(zip(source_columns, _VARIANT_COLUMNS))
    )

    # Manually move 0-based position to 1-based where needed
    if name in _ZERO_BASED_DATASETS:
        df['pos'] = df['pos'] + 1

    # read_table ignores the order of `usecols` and returns columns in file
    # order. Force the canonical order so that a MultiIndex built from this
    # frame has the same level order for every dataset.
    return df[list(_VARIANT_COLUMNS)]
    

In [None]:
# Normalized lists for upset
# One all-True Series per dataset, indexed by the variant columns returned by
# get_df_variants. Repeated variants are dropped first: membership is a set
# property, and a Series with a duplicated index cannot be aligned onto the
# union index when the combined table is assembled.
normalized_series = {
    dataset: pd.Series(
        True,
        index=pd.MultiIndex.from_frame(
            get_df_variants(dataset).drop_duplicates()
        )
    )
    for dataset in col_map
}

In [None]:
# Get index for dataframe: start from the first dataset's index and fold the
# remaining ones in with successive unions (None when there are no datasets).
dataset_indexes = [membership.index for membership in normalized_series.values()]
index_union = dataset_indexes[0] if dataset_indexes else None
for extra_index in dataset_indexes[1:]:
    index_union = index_union.union(extra_index)

# Dataframe for upset: one column per dataset, aligned on the union index.
# Variants absent from a dataset are left as missing values in its column.
variants_df = pd.DataFrame(index=index_union)
for dataset, s in normalized_series.items():
    variants_df[dataset] = s

In [None]:
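# Earlier join-based construction of varlist_df, kept for reference only.
# It is disabled, and `normalized_dfs` is not defined in the cells shown here.
#varlist_df = pd.DataFrame().join(
#    normalized_dfs,
#    how='outer'
#).fillna(False)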

In [None]:
# Upset plot
import upsetplot

# `varlist_df` is never created (the cell that built it is commented out), so
# plot from `variants_df` instead. Its columns hold True where a dataset
# contains the variant and a missing value otherwise, so notna() yields the
# boolean membership table; value_counts() then counts variants per
# membership combination.
upsetplot.UpSet(variants_df.notna().value_counts(), subset_size='sum', show_counts=True)


In [None]:
# Normalized lists for position upset

_CV2F = 'Supplementary_Table_S3_cV2F_scores (1)'
_COLOCBOOST = 'IGVF_ColocBoost_Variant_Function_lipid_traits_liver'

# Per dataset: (chromosome column, position column, allele 1, allele 2).
# Only the first two entries are used for the position-level comparison.
pos_col_map = {
    'Won_CDMPRA_variantlist': won_scheme,
    'Won_Nana_MPRA_variantlist': won_scheme,
    'Won_SCZ_MPRA_variantlist': won_scheme,
    'Won_ADD_MPRA_variantlist': won_scheme,
    'Supplementary_Table_S3_cV2F_scores (1)': ('chr', 'BP', None, None),
    # Gene-wide: 'encode_re2g_gold_standard_crispr_e2g_data': ('')
    'Shendure_Ahituv_Kircher_cCRE_variants_Neuro_MPRA': shendure_scheme,
    'Shendure_Ahituv_Kircher_cCRE_variants_UndiffWTC11_MPRA': shendure_scheme,
    'IGVF_ColocBoost_Variant_Function_lipid_traits_liver': ('VariantChr', 'pos1based', 'EffectAllele', 'OtherAllele')
}

# `dfs` (dataset name -> loaded DataFrame) is not built by any earlier cell,
# so load the tables needed here from the cached file paths.
# NOTE(review): assumes every listed file is tab-delimited, as the other
# read_table calls in this notebook do - confirm for the cV2F table.
dfs = {
    dataset: pd.read_table(dfs_paths[dataset])
    for dataset in pos_col_map
}

# Derive a 'chr'-prefixed chromosome column for cV2F from its 'CHR' column.
dfs[_CV2F]['chr'] = 'chr' + dfs[_CV2F]['CHR'].astype(str)

# ColocBoost start positions are 0-based; derive the 1-based column that
# pos_col_map refers to.
dfs[_COLOCBOOST]['pos1based'] = dfs[_COLOCBOOST]['VariantStart'] + 1

# One frame per dataset: unique (chr, pos) pairs as the index and a single
# all-True column named after the dataset.
pos_normalized_dfs = [
    dfs[dataset][[chrom, pos]].drop_duplicates().rename(
        columns={
            chrom: 'chr',
            pos: 'pos'
        }
    ).assign(**{dataset: True}).set_index(['chr', 'pos'], drop=True)
    for dataset, (chrom, pos, _, _) in pos_col_map.items()
]

In [None]:

# Outer-join all per-dataset position frames on their (chr, pos) index, then
# mark positions a dataset does not contain as False.
joined_positions = pd.DataFrame().join(pos_normalized_dfs, how='outer')
poslist_df = joined_positions.fillna(False)

In [None]:
# Display the position-level membership table (one True/False column per dataset).
poslist_df

In [None]:
# Count positions per combination of dataset membership, then draw the
# vertical UpSet plot with the counts shown.
membership_counts = poslist_df.value_counts(list(pos_col_map))
upsetplot.UpSet(membership_counts, show_counts=True, orientation='vertical')

In [None]:
# Build upset plot

In [None]:
# Stack every regular file in the working directory into one frame, keyed by
# file name in an outer index level called 'Dataset'. Directories are
# skipped because read_table cannot open them; files are read in sorted
# order so the result does not depend on directory listing order.
combined_df = pd.concat(
    {
        f.name: pd.read_table(f)
        for f in sorted(Path().iterdir())
        if f.is_file()
    },
    names=['Dataset']
)

# Scratch space

In [None]:
# Inspect the cV2F supplementary table.
# NOTE(review): relies on `dfs` (dataset name -> DataFrame) having been
# populated by an earlier cell before this one runs.
dfs['Supplementary_Table_S3_cV2F_scores (1)']