<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
import glob
import pandas as pd

In [2]:
import re
def read_and_extract(
    files, 
    pattern,
    **kwargs
    ):
    r"""
    Read a set of per-cluster CSV files, keyed by the cluster ID found in each path.

    Parameters:
        files: Iterable of paths to dataframes to read. The input is not
            modified; paths are processed in sorted order.
        pattern: Regex pattern with group for extracting cluster id, e.g.
            r"Kmeans(\d+)\.csv"
            r"ShapeCluster(\d+)"
        kwargs: Arguments passed to :func:`pandas.read_csv`
    Returns:
        Dictionary mapping cluster ID to its dataframe. If two paths yield
        the same cluster ID, the later path (in sorted order) wins.
    Raises:
        ValueError: If ``pattern`` does not match one of the paths.
    """
    # Sort a copy so the caller's list keeps its original order.
    ordered = sorted(files)

    # Extract every cluster ID before reading anything, so a bad filename
    # fails fast instead of after the other files have been loaded.
    clusters = []
    for path in ordered:
        matches = re.findall(pattern, path)
        if not matches:
            raise ValueError(
                "pattern {!r} does not match file {!r}".format(pattern, path)
            )
        # Only the first match in the path is used.
        clusters.append(matches[0])

    cluster2df = {c: pd.read_csv(f, **kwargs) for c, f in zip(clusters, ordered)}
    return cluster2df

In [16]:
# Directory prefixes used by the cells below. Both end with '/' because
# later cells build paths by plain string concatenation (indir + "...").
# NOTE(review): these are absolute paths on one machine; anyone else has to
# edit them before the notebook can run.
# indir: where the input TCR CSV files are globbed / read from.
indir = '/Users/racng/git/paper_upn109/data/tcr/'
# outdir: where the per-compartment '*.meta.csv' files are written.
outdir = '/Users/racng/git/connectTCR/output/timeseries/'

In [4]:
# Compartment keys; every per-compartment dictionary below is keyed by these.
comps = ['csf', 'pbmc']

# read kmeans and shape clusters dataframes
# The regex patterns are raw strings so that "\d" is a regex digit class
# rather than an invalid string escape, and the "." before "csv" is escaped
# so it only matches a literal dot.
# NOTE(review): shape files are globbed directly under indir
# (e.g. "CSFShapeCluster*.csv") while kmeans files are globbed in a
# per-compartment subdirectory ("csf/SignalCSFKmeans*.csv") - confirm this
# matches the data layout.
shape_files = {c: glob.glob(indir+"{}ShapeCluster*.csv".format(c.upper()))
    for c in comps}
shapes = {c:read_and_extract(shape_files[c], r"ShapeCluster(\d+)\.csv")
    for c in comps}

kmeans_files = {c: glob.glob(indir+"{}/Signal{}Kmeans*.csv".format(c, c.upper()))
    for c in comps}
kmeans = {c:read_and_extract(kmeans_files[c], r"Kmeans(\d+)\.csv")
    for c in comps}

In [5]:
# For each compartment, build a lookup from TRB value to the ID of the shape
# cluster whose dataframe contains it. A TRB seen a second time (in the same
# or a different cluster) aborts with ValueError.
trb2shape = {}
for comp in comps:
    shape_of = {}
    for cluster_id, cluster_df in shapes[comp].items():
        for seq in cluster_df['TRB'].values:
            if seq not in shape_of:
                shape_of[seq] = cluster_id
                continue
            raise ValueError('TRB repeated')
    trb2shape[comp] = shape_of

In [6]:
# For each compartment, build a lookup from TRB value to the ID of the kmeans
# cluster whose dataframe contains it. A TRB seen a second time (in the same
# or a different cluster) aborts with ValueError.
trb2kmeans = {}
for c in comps:
    tcr_map = {}
    for i, df in kmeans[c].items():
        # Remove leading zeros from the cluster ID ("01" -> "1"). An ID made
        # only of zeros would be stripped to "", so fall back to "0".
        cluster_id = i.lstrip("0") or "0"
        for trb in df['TRB'].values:
            if trb in tcr_map:
                raise ValueError('TRB repeated')
            tcr_map[trb] = cluster_id
    trb2kmeans[c] = tcr_map

In [7]:
# Per-compartment 'TRB' column read from '<COMP>(Signal).csv' under indir.
signal = {}
for comp in comps:
    signal_csv = indir+'{}(Signal).csv'.format(comp.upper())
    signal[comp] = pd.read_csv(signal_csv)['TRB']

In [8]:
# Per-compartment 'TRB' column read from '<COMP>(Noise).csv' under indir.
noise = {}
for comp in comps:
    noise_csv = indir+'{}(Noise).csv'.format(comp.upper())
    noise[comp] = pd.read_csv(noise_csv)['TRB']

In [17]:
# Build one metadata table per compartment and save it as
# '<savename>.meta.csv' in outdir. Columns:
#   TRB          - signal TRBs followed by noise TRBs
#   Signal/Noise - which of the two input files the row came from
#   Kmeans/Shape - cluster ID looked up by TRB; NaN when the TRB is not in
#                  the corresponding lookup
meta = {}
# Output file prefix for each compartment key.
savename = {'csf':'csf', 'pbmc':'pb'}

# to_csv does not create missing directories, so make sure outdir exists.
os.makedirs(outdir, exist_ok=True)

for c in comps:
    meta[c] = pd.DataFrame({
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        'TRB': pd.concat([signal[c], noise[c]]).values,
        'Signal/Noise': ['Signal']*len(signal[c])+['Noise']*len(noise[c])
    })
    meta[c]['Kmeans'] = meta[c]['TRB'].map(trb2kmeans[c])
    meta[c]['Shape'] = meta[c]['TRB'].map(trb2shape[c])
    meta[c].to_csv(outdir+'{}.meta.csv'.format(savename[c]),index=False)

In [10]:
# Sanity check, four numbers per compartment:
#   1. total rows in the metadata table
#   2. rows with no missing value in any column
#   3. number of signal TRBs that were loaded
#   4. fully-populated rows labelled 'Signal'
for comp, frame in meta.items():
    complete = frame.dropna()
    complete_signal = (complete['Signal/Noise']=='Signal').sum()
    print(len(frame))
    print(len(complete))
    print(len(signal[comp]))
    print(complete_signal)

19043
1509
1509
1509
1048575
152
152
152


In [23]:
# TRBs of the pbmc rows whose Shape cluster ID is the string '1'.
# NOTE(review): the saved output for this cell is "KeyError: 'pb'", which this
# lookup by 'pbmc' cannot raise - presumably stale output from an earlier
# revision of the cell; re-run to refresh.
meta['pbmc'].loc[lambda frame: frame['Shape']=='1', 'TRB']

KeyError: 'pb'

In [22]:
# TRBs of the csf rows whose Shape cluster ID is the string '1'.
meta['csf'].loc[lambda frame: frame['Shape']=='1', 'TRB']

10      CASSLRASGMGTDTQYF
22         CASSWTGGSYEQYF
23         CASSTDGQGNTIYF
25         CASSFLAGTDTQYF
29        CATSRDRGAAQPQHF
30          CASSLGEGYEQYF
31            CASSTGNEQYF
32         CASSQGGGSEKLFF
33         CASSLSLLYNEQFF
36            CSAKGQGEAFF
38        CASSLSSSGANVLTF
39         CASSESTQSTEAFF
40          CASSLTVSYEQYF
41            CASSFSYEQYF
42          CSVEERESTEAFF
44          CASSYSHLETQYF
46          CASSPPLYSEAFF
47         CASSLMAGFGELFF
48        CSARDYRGRRTEAFF
52         CASSLLQVNTEAFF
53          CSARDPNHQPQHF
54       CASGWAAEPESDEQFF
55        CASSFLSGGDTEAFF
56           CAWRAQLYGYTF
57         CASSLVTLNTEAFF
59          CASSVGSQETQYF
60          CASSPLADNEQFF
61         CASSPGQENEKLFF
62        CASSTSGGANYGYTF
63       CASSTTGLVGYNEQFF
              ...        
1479       CSATTQRHYYEQYF
1480       CASSYQAYNSPLHF
1481      CASSDGGPQDEKLFF
1482      CASSLRPSGANVLTF
1483     CASSPERGAENTEAFF
1484      CASSFSYDGAYEQYF
1485     CSARDSLRPDSNTQYF
1486      CA