# Exclusion of seropositive samples

In [1]:
from datetime import datetime; print("START:", datetime.now())
import socket; print("Simons Foundation, Rusty HPC,", socket.gethostname())

START: 2021-08-18 19:06:56.286755
Simons Foundation, Rusty HPC, worker1024


In [2]:
%cd /mnt/home/zzhang/ceph/jemm
%load_ext rpy2.ipython

/mnt/ceph/users/zzhang/jemm


In [3]:
import os
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
from multiprocessing import Pool
from jemm import rmats_helper, suppa_helper
from jemm import kallisto_helper
from jemm.junction import JunctionCountTable
from jemm.transcript import TranscriptMeasureTable

In [4]:
meta = pd.read_table('./data-V9/charm_master.csv', sep="\t", low_memory=False, index_col=0)
meta = meta[meta.pid > 148]
meta = meta[meta.Sex.isin(['F', 'M'])]
print(meta.shape)

(19317, 69)


In [5]:
df_gen = {x[0]: x[1].sort_values('tp') for x in meta.groupby('pid')}

In [6]:
RBD_RATIO_CUTOFF = 1
BEFORE_INFECT_CUTOFF = 0

def check_is_exclusion(df):
    # exclude high T0 IgG RBD screening ELISA, if present
    if df.iloc[0]['tp']==0 and pd.notna(df.iloc[0]['IgG']):
        if df.iloc[0]['IgG'] > RBD_RATIO_CUTOFF:
            return 'excl', 'Has T0 pos IgG'
        else:
            return 'keep', 'Has T0 neg IgG'
    
    # if no T0, exclude if no valid serology
    sero_df = df[df['IgG'].notna()]
    if len(sero_df) == 0:
        return 'excl', 'No valid IgG'
    
    # if w/ valid serology, exclude if close to infection
    # infection timepoint
    infect_tp = df.loc[df['final']=="First", 'tp'].to_list()
    # if never infected, first sero needs to be low
    if len(infect_tp) == 0:
        if sero_df.iloc[0]['IgG'] < RBD_RATIO_CUTOFF:
            return 'keep', 'Never PCR+, First IgG low'
        else:
            return 'excl', 'Never PCR+, First IgG high'
        
    assert len(infect_tp) == 1, "multiple first for %s" % df.iloc[0].pid
    infect_tp = infect_tp[0]
    
    # if w/ valid serology, exclude if the First valid serology is high before infection
    if infect_tp - sero_df.iloc[0]['tp'] > BEFORE_INFECT_CUTOFF:
        if sero_df.iloc[0]['IgG'] > RBD_RATIO_CUTOFF:
            return 'excl', 'First IgG high before infection'
        else:
            return 'keep', 'First IgG low before infection'
    else:
        return 'excl', 'First IgG too close to infection'
        

In [7]:
excl_df = {
    pid : check_is_exclusion(df_gen[pid])
    for pid in df_gen
}

In [8]:
excl_df = pd.DataFrame.from_dict(excl_df, orient='index', columns=['fz_label', 'fz_reason'])

In [9]:
print(len(excl_df.index.unique()))

3326


# Compare with NS, YG

In [10]:
ns_1 = pd.read_table('data-V7.p1/NS.charm_pids_to_keep.txt')
ns_1['ns_label'] = 'keep'
ns_2 = pd.read_table('data-V7.p1/NS.charm_pids_to_exclude.txt')
ns_2['ns_label'] = 'excl'
ns = pd.concat([ns_1, ns_2], axis=0)
ns.index = ns.pid
print(len(ns['pid'].unique()))

3326


In [11]:
yg_1 = pd.read_table('data-V7.p1/YG.charm_pids_to_keep.txt')
yg_1['yg_label'] = 'keep'
yg_2 = pd.read_table('data-V7.p1/YG.charm_pids_to_exclude.txt')
yg_2['yg_label'] = 'excl'
yg = pd.concat([yg_1, yg_2], axis=0)
yg.index = yg.pid
yg.rename(columns={'exclude_reason': 'yg_reason'}, inplace=True)
print(len(yg['pid'].unique()))

3326


In [12]:
merged = excl_df.join(ns).join(yg[['yg_label', 'yg_reason']])

In [13]:
merged

Unnamed: 0,fz_label,fz_reason,pid,ns_label,yg_label,yg_reason
149,keep,Has T0 neg IgG,149,keep,keep,
150,keep,Has T0 neg IgG,150,keep,keep,
151,keep,Has T0 neg IgG,151,keep,keep,
152,keep,Has T0 neg IgG,152,keep,keep,
153,keep,Has T0 neg IgG,153,keep,keep,
...,...,...,...,...,...,...
3475,keep,Has T0 neg IgG,3475,keep,keep,
3476,keep,Has T0 neg IgG,3476,keep,keep,
3477,keep,Has T0 neg IgG,3477,keep,keep,
3478,keep,Has T0 neg IgG,3478,keep,keep,


In [14]:
merged.query('fz_label=="excl" and (ns_label=="keep" or yg_label=="keep")')

Unnamed: 0,fz_label,fz_reason,pid,ns_label,yg_label,yg_reason
712,excl,No valid IgG,712,keep,excl,No IgG data
1269,excl,No valid IgG,1269,keep,excl,No IgG data


In [15]:
merged.query('fz_label=="keep" and (ns_label=="excl" or yg_label=="excl")')

Unnamed: 0,fz_label,fz_reason,pid,ns_label,yg_label,yg_reason
254,keep,First IgG low before infection,254,excl,keep,


In [16]:
merged.to_csv('data-V7.p1/merged_seropos.20210814.txt', sep="\t")

# Make new meta data

In [46]:
old_meta = pd.read_table('./data-V9/charm_master.csv', sep="\t", index_col=0, low_memory=False)
old_meta.head()

Unnamed: 0_level_0,pid,Sex,T0_date,tp,paxgene_date,RNAseq_plate,contrast,manual,notes,final,...,Serum_date,VTM_date,scRNA_date,scATAC_date,virus_seq_date,Fluidigm_plate,EPIC850K,SNP_plate,Comments,company
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20_0001-T00,1,F,05/08/2020,0,5/14/2020,P1,,Late,,Exposed,...,05/20/2020,,,,,,,,,
20_0002-T00,2,F,05/08/2020,0,5/14/2020,P1,,Late,,Mild,...,05/20/2020,,,,,,,,03/23/2020,
20_0002-T28,2,F,05/08/2020,28,6/8/2020,P4,,Late,,Mild,...,06/08/2020,06/08/2020,,,,,,,03/23/2020,
20_0003-T00,3,F,05/08/2020,0,5/14/2020,P3;P4,,Late,,Moderate,...,05/20/2020,,,,,,,,03/28/2020; 03/30/2020,
20_0003-T28,3,F,05/08/2020,28,6/8/2020,P4,,Late,,Moderate,...,06/08/2020,06/08/2020,,,,,,,03/28/2020; 03/30/2020,


In [47]:
new_meta = old_meta[old_meta.pid.isin(merged.query('fz_label=="keep"').pid)]

In [61]:
new_meta['RNAseq_plate'] = ['Multiple' if type(x) is str and ';' in x else x for x in new_meta['RNAseq_plate']]
new_meta['RNAseq_plate'] = [np.nan if type(x) is str and '?' in x else x for x in new_meta['RNAseq_plate']]

new_meta[new_meta['RNAseq_plate'].notna()].rename(columns={'RNAseq_plate': 'plateNum'})[['pid', 'plateNum', 'final', 'Sex']].\
    to_csv('data-V9/charm_master.no_seropos.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [57]:
new_meta[new_meta['RNAseq_plate'].notna()].\
    query('final=="Control" or final=="First" or final=="Mid"')[['pid', 'Sex']].\
    drop_duplicates().value_counts('Sex')

Sex
M    255
F     55
dtype: int64

In [58]:
new_meta[new_meta['RNAseq_plate'].notna()][['pid','Sex','final']].drop_duplicates().\
    query('final=="Control" or final=="First" or final=="Mid" or final=="Post"'). \
    groupby(['final', 'Sex']).size()

final    Sex
Control  F       38
         M      229
First    F       31
         M       58
Mid      F       26
         M      177
Post     F       25
         M      176
dtype: int64

In [None]:
print("FINISH:", datetime.now())