# Introduction

Here I make the final preparations for DESeq2 as was done for the bulk RNA-sequencing data.

In [1]:
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# prefix = '/data/codec/production.run/'

In [3]:
# Root of the data mount; each modality (ADTs, mRNA, combined) has its own tree under prod/.
mountpoint = '/data/clue/'
prefix_adts = f'{mountpoint}prod/adts/'
prefix_mrna = f'{mountpoint}prod/mrna/'
prefix_comb = f'{mountpoint}prod/comb/'

# `ct2`

## mRNA

In [4]:
# Read the ct2 mRNA pseudobulk table from its pickle under the mRNA prefix.
counts = pd.read_pickle(f'{prefix_mrna}pkls/pseudobulk_ct2.pkl')

Just get the raw data, that's all I want.

In [5]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

T8_Naive    384
B_Mem       384
NK          384
pDC         384
B_Naive     384
HSC         384
T4_EM       384
M_cDC       384
T_Tox       384
T4_Naive    384
ncM         320
cM          320
cDC         320
Name: CT, dtype: int64

In [6]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['B_Mem', 'B_Naive', 'HSC', 'M_cDC', 'NK', 'T4_EM', 'T4_Naive',
       'T8_Naive', 'T_Tox', 'cDC', 'cM', 'ncM', 'pDC'], dtype=object)

In [7]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

Create 5 separate sets of files, one for each stimulation, that has counts for both controls and the stim conditions, and in the appropriate format for easy reading into DESeq2.

In [8]:
# Build the DESeq2 inputs for every stimulation-vs-control comparison (ct2, mRNA):
# <cond>_cts.csv is genes x samples, <cond>_col.csv is samples x metadata.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_mrna + 'vals/de/all/ct2/input/'

for cond in tqdm(['A','B','G','P','R']):
    # rows for this stimulation plus the control ('C') rows
    df = counts[counts['COND'].isin([cond, 'C'])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # gene columns only, transposed because DESeq2 expects genes as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop genes whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s_cts.csv' % cond)
    coldata.to_csv(out_dir + '%s_col.csv' % cond)

  0%|          | 0/5 [00:00<?, ?it/s]

## ADTs

In [9]:
# Read the ct2 ADT pseudobulk table from its pickle under the ADT prefix.
counts = pd.read_pickle(f'{prefix_adts}pkls/pseudobulk_ct2.pkl')

In [10]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per ADT.
counts

Unnamed: 0,CT,COND,FID,CD103|ITGAE,CD137|TNFRSF9,CD274|CD274,CD11b|ITGAM,CD39|ENTPD1,CD197|CCR7,CD20|MS4A1,...,CD49a|ITGA1,CD133|PROM,CD357|TNFRSF18,CD270|TNFRSF14,IL-21R|IL21R,B7-H4|VTCN1,CD26|DPP4,CD155|PVR,CD30|TNFRSF8,CD49E|ITGA5
0,T4_Naive,A,0,118.0,1590.0,987.0,805.0,1028.0,1907.0,236.0,...,320.0,313.0,790.0,1637.0,2019.0,2463.0,6362.0,1131.0,668.0,1248.0
1,T4_Naive,A,1,199.0,1950.0,1438.0,774.0,1329.0,2990.0,379.0,...,436.0,450.0,1017.0,1981.0,2645.0,3334.0,5773.0,1577.0,1020.0,1603.0
2,T4_Naive,A,10,501.0,5193.0,3877.0,2439.0,3893.0,6985.0,1121.0,...,1254.0,1210.0,2989.0,5679.0,7913.0,9536.0,19177.0,4672.0,2866.0,3612.0
3,T4_Naive,A,11,139.0,1732.0,1207.0,674.0,1178.0,2347.0,358.0,...,369.0,384.0,955.0,1940.0,2613.0,3226.0,5598.0,1538.0,879.0,1332.0
4,T4_Naive,A,12,203.0,2777.0,1857.0,1052.0,1891.0,3274.0,495.0,...,629.0,599.0,1515.0,2977.0,3999.0,5079.0,7580.0,2146.0,1405.0,2027.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,M_cDC,R,62,134.0,3320.0,2444.0,6729.0,1420.0,1220.0,268.0,...,602.0,324.0,909.0,2307.0,1939.0,2473.0,1786.0,1147.0,763.0,11072.0
316,M_cDC,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
317,M_cDC,R,7,353.0,6702.0,17348.0,12877.0,2107.0,1628.0,290.0,...,1105.0,342.0,1073.0,3566.0,2115.0,2496.0,3555.0,1199.0,1030.0,16470.0
318,M_cDC,R,8,148.0,2819.0,6210.0,3463.0,1342.0,1460.0,220.0,...,489.0,311.0,861.0,1944.0,1829.0,2745.0,1844.0,1073.0,779.0,8548.0


Just get the raw data, that's all I want.

In [11]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

T8_Naive    384
B_Mem       384
NK          384
pDC         384
B_Naive     384
HSC         384
T4_EM       384
M_cDC       384
T_Tox       384
T4_Naive    384
ncM         320
cM          320
cDC         320
Name: CT, dtype: int64

In [12]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['B_Mem', 'B_Naive', 'HSC', 'M_cDC', 'NK', 'T4_EM', 'T4_Naive',
       'T8_Naive', 'T_Tox', 'cDC', 'cM', 'ncM', 'pDC'], dtype=object)

In [13]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

Create 5 separate sets of files, one for each stimulation, that has counts for both controls and the stim conditions, and in the appropriate format for easy reading into DESeq2.

In [14]:
# Build the DESeq2 inputs for every stimulation-vs-control comparison (ct2, ADTs):
# <cond>_cts.csv is features x samples, <cond>_col.csv is samples x metadata.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_adts + 'vals/de/all/ct2/input/'

for cond in tqdm(['A','B','G','P','R']):
    # rows for this stimulation plus the control ('C') rows
    df = counts[counts['COND'].isin([cond, 'C'])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # feature columns only, transposed because DESeq2 expects features as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop features whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s_cts.csv' % cond)
    coldata.to_csv(out_dir + '%s_col.csv' % cond)

  0%|          | 0/5 [00:00<?, ?it/s]

# `ct3`

## mRNA

In [15]:
# Read the ct3 mRNA pseudobulk table from its pickle under the mRNA prefix.
counts = pd.read_pickle(f'{prefix_mrna}pkls/pseudobulk_ct3.pkl')

In [16]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per gene.
counts

Unnamed: 0,CT,COND,FID,AL627309.1,AL627309.3,AL669831.5,LINC00115,FAM41C,AL645608.3,SAMD11,...,AC145212.1,MAFIP,AC011043.1,AL592183.1,AC007325.4,AL354822.1,AC004556.1,AC233755.2,AC233755.1,AC240274.1
0,MAIT,A,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,MAIT,A,1,0.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0
2,MAIT,A,10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,MAIT,A,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,MAIT,A,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,cDC2,R,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6076,cDC2,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6077,cDC2,R,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6078,cDC2,R,8,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Just get the raw data, that's all I want.

In [17]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

NK_CD56++           320
T4_RO+_SELL+        320
T4_CM               320
T4_RA+_SELLint      320
T4_RA+_SELL+        320
T_CD10+             320
T_gd                320
T4_RO+_Act          320
T8_HOBIT+HELIOS+    320
T8_TEMRA            320
MAIT                320
NK_CD16+            320
T8_RA+_SELL+        320
T4_Treg_Act         320
T4_Treg_Resting     320
T8_CM               320
T8_EM               320
cDC1                320
cDC2                320
Name: CT, dtype: int64

In [18]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['MAIT', 'NK_CD16+', 'NK_CD56++', 'T4_CM', 'T4_RA+_SELL+',
       'T4_RA+_SELLint', 'T4_RO+_Act', 'T4_RO+_SELL+', 'T4_Treg_Act',
       'T4_Treg_Resting', 'T8_CM', 'T8_EM', 'T8_HOBIT+HELIOS+',
       'T8_RA+_SELL+', 'T8_TEMRA', 'T_CD10+', 'T_gd', 'cDC1', 'cDC2'],
      dtype=object)

In [19]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

Create 5 separate sets of files, one for each stimulation, that has counts for both controls and the stim conditions, and in the appropriate format for easy reading into DESeq2.

In [20]:
# Build the DESeq2 inputs for every stimulation-vs-control comparison (ct3, mRNA):
# <cond>_cts.csv is genes x samples, <cond>_col.csv is samples x metadata.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_mrna + 'vals/de/all/ct3/input/'

for cond in tqdm(['A','B','G','P','R']):
    # rows for this stimulation plus the control ('C') rows
    df = counts[counts['COND'].isin([cond, 'C'])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # gene columns only, transposed because DESeq2 expects genes as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop genes whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s_cts.csv' % cond)
    coldata.to_csv(out_dir + '%s_col.csv' % cond)

  0%|          | 0/5 [00:00<?, ?it/s]

## ADTs

In [21]:
# Read the ct3 ADT pseudobulk table from its pickle under the ADT prefix.
counts = pd.read_pickle(f'{prefix_adts}pkls/pseudobulk_ct3.pkl')

In [22]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per ADT.
counts

Unnamed: 0,CT,COND,FID,CD103|ITGAE,CD137|TNFRSF9,CD274|CD274,CD11b|ITGAM,CD39|ENTPD1,CD197|CCR7,CD20|MS4A1,...,CD49a|ITGA1,CD133|PROM,CD357|TNFRSF18,CD270|TNFRSF14,IL-21R|IL21R,B7-H4|VTCN1,CD26|DPP4,CD155|PVR,CD30|TNFRSF8,CD49E|ITGA5
0,MAIT,A,0,11.0,59.0,23.0,27.0,33.0,38.0,5.0,...,20.0,11.0,26.0,44.0,54.0,78.0,513.0,24.0,14.0,98.0
1,MAIT,A,1,30.0,376.0,203.0,145.0,181.0,257.0,52.0,...,69.0,54.0,150.0,281.0,362.0,455.0,4388.0,207.0,145.0,415.0
2,MAIT,A,10,24.0,275.0,117.0,73.0,148.0,158.0,25.0,...,43.0,48.0,110.0,201.0,290.0,336.0,2133.0,168.0,106.0,120.0
3,MAIT,A,11,20.0,306.0,133.0,82.0,162.0,156.0,25.0,...,52.0,47.0,109.0,249.0,260.0,317.0,2520.0,147.0,78.0,206.0
4,MAIT,A,12,8.0,176.0,77.0,81.0,101.0,99.0,16.0,...,31.0,25.0,62.0,137.0,141.0,228.0,1178.0,95.0,55.0,119.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,cDC2,R,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6076,cDC2,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6077,cDC2,R,7,17.0,497.0,495.0,113.0,127.0,163.0,14.0,...,107.0,27.0,81.0,175.0,124.0,178.0,600.0,53.0,61.0,954.0
6078,cDC2,R,8,0.0,96.0,85.0,7.0,19.0,49.0,5.0,...,14.0,5.0,25.0,34.0,36.0,41.0,55.0,17.0,18.0,99.0


Just get the raw data, that's all I want.

In [23]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

NK_CD56++           320
T4_RO+_SELL+        320
T4_CM               320
T4_RA+_SELLint      320
T4_RA+_SELL+        320
T_CD10+             320
T_gd                320
T4_RO+_Act          320
T8_HOBIT+HELIOS+    320
T8_TEMRA            320
MAIT                320
NK_CD16+            320
T8_RA+_SELL+        320
T4_Treg_Act         320
T4_Treg_Resting     320
T8_CM               320
T8_EM               320
cDC1                320
cDC2                320
Name: CT, dtype: int64

In [24]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['MAIT', 'NK_CD16+', 'NK_CD56++', 'T4_CM', 'T4_RA+_SELL+',
       'T4_RA+_SELLint', 'T4_RO+_Act', 'T4_RO+_SELL+', 'T4_Treg_Act',
       'T4_Treg_Resting', 'T8_CM', 'T8_EM', 'T8_HOBIT+HELIOS+',
       'T8_RA+_SELL+', 'T8_TEMRA', 'T_CD10+', 'T_gd', 'cDC1', 'cDC2'],
      dtype=object)

In [25]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

Create 5 separate sets of files, one for each stimulation, that has counts for both controls and the stim conditions, and in the appropriate format for easy reading into DESeq2.

In [26]:
# Build the DESeq2 inputs for every stimulation-vs-control comparison (ct3, ADTs):
# <cond>_cts.csv is features x samples, <cond>_col.csv is samples x metadata.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_adts + 'vals/de/all/ct3/input/'

for cond in tqdm(['A','B','G','P','R']):
    # rows for this stimulation plus the control ('C') rows
    df = counts[counts['COND'].isin([cond, 'C'])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # feature columns only, transposed because DESeq2 expects features as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop features whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s_cts.csv' % cond)
    coldata.to_csv(out_dir + '%s_col.csv' % cond)

  0%|          | 0/5 [00:00<?, ?it/s]

# IFNs

## mRNA

### ct2

In [27]:
# Read the ct2 mRNA pseudobulk table from its pickle under the mRNA prefix.
counts = pd.read_pickle(f'{prefix_mrna}pkls/pseudobulk_ct2.pkl')

In [28]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per gene.
counts

Unnamed: 0,CT,COND,FID,AL627309.1,AL627309.3,AL669831.5,LINC00115,FAM41C,AL645608.3,SAMD11,...,AC145212.1,MAFIP,AC011043.1,AL592183.1,AC007325.4,AL354822.1,AC004556.1,AC233755.2,AC233755.1,AC240274.1
0,T4_Naive,A,0,0.0,0.0,1.0,2.0,3.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0,0.0,4.0
1,T4_Naive,A,1,0.0,0.0,6.0,2.0,3.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0
2,T4_Naive,A,10,0.0,1.0,16.0,3.0,4.0,0.0,0.0,...,1.0,0.0,1.0,4.0,2.0,4.0,0.0,0.0,0.0,3.0
3,T4_Naive,A,11,0.0,0.0,0.0,0.0,4.0,1.0,0.0,...,0.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,0.0,4.0
4,T4_Naive,A,12,0.0,0.0,3.0,2.0,2.0,0.0,0.0,...,0.0,1.0,0.0,4.0,0.0,2.0,55.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,M_cDC,R,62,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,4.0
316,M_cDC,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
317,M_cDC,R,7,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,0.0,0.0
318,M_cDC,R,8,0.0,0.0,3.0,6.0,1.0,0.0,0.0,...,0.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0


Just get the raw data, that's all I want.

In [29]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

T8_Naive    384
B_Mem       384
NK          384
pDC         384
B_Naive     384
HSC         384
T4_EM       384
M_cDC       384
T_Tox       384
T4_Naive    384
ncM         320
cM          320
cDC         320
Name: CT, dtype: int64

In [30]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['B_Mem', 'B_Naive', 'HSC', 'M_cDC', 'NK', 'T4_EM', 'T4_Naive',
       'T8_Naive', 'T_Tox', 'cDC', 'cM', 'ncM', 'pDC'], dtype=object)

In [31]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

In [32]:
# Build the DESeq2 inputs for the direct B-vs-G comparison (ct2, mRNA): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_mrna + 'vals/de/IFNs/ct2/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'G-B' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('B', 'G')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # gene columns only, transposed because DESeq2 expects genes as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop genes whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

In [33]:
# Build the DESeq2 inputs for the direct G-vs-B comparison (ct2, mRNA): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_mrna + 'vals/de/IFNs/ct2/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'B-G' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('G', 'B')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # gene columns only, transposed because DESeq2 expects genes as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop genes whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

### ct3

In [34]:
# Read the ct3 mRNA pseudobulk table from its pickle under the mRNA prefix.
counts = pd.read_pickle(f'{prefix_mrna}pkls/pseudobulk_ct3.pkl')

In [35]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per gene.
counts

Unnamed: 0,CT,COND,FID,AL627309.1,AL627309.3,AL669831.5,LINC00115,FAM41C,AL645608.3,SAMD11,...,AC145212.1,MAFIP,AC011043.1,AL592183.1,AC007325.4,AL354822.1,AC004556.1,AC233755.2,AC233755.1,AC240274.1
0,MAIT,A,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,MAIT,A,1,0.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0
2,MAIT,A,10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,MAIT,A,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,MAIT,A,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,cDC2,R,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6076,cDC2,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6077,cDC2,R,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6078,cDC2,R,8,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Just get the raw data, that's all I want.

In [36]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

NK_CD56++           320
T4_RO+_SELL+        320
T4_CM               320
T4_RA+_SELLint      320
T4_RA+_SELL+        320
T_CD10+             320
T_gd                320
T4_RO+_Act          320
T8_HOBIT+HELIOS+    320
T8_TEMRA            320
MAIT                320
NK_CD16+            320
T8_RA+_SELL+        320
T4_Treg_Act         320
T4_Treg_Resting     320
T8_CM               320
T8_EM               320
cDC1                320
cDC2                320
Name: CT, dtype: int64

In [37]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['MAIT', 'NK_CD16+', 'NK_CD56++', 'T4_CM', 'T4_RA+_SELL+',
       'T4_RA+_SELLint', 'T4_RO+_Act', 'T4_RO+_SELL+', 'T4_Treg_Act',
       'T4_Treg_Resting', 'T8_CM', 'T8_EM', 'T8_HOBIT+HELIOS+',
       'T8_RA+_SELL+', 'T8_TEMRA', 'T_CD10+', 'T_gd', 'cDC1', 'cDC2'],
      dtype=object)

In [38]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

In [39]:
# Build the DESeq2 inputs for the direct B-vs-G comparison (ct3, mRNA): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_mrna + 'vals/de/IFNs/ct3/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'G-B' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('B', 'G')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # gene columns only, transposed because DESeq2 expects genes as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop genes whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

In [40]:
# Build the DESeq2 inputs for the direct G-vs-B comparison (ct3, mRNA): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_mrna + 'vals/de/IFNs/ct3/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'B-G' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('G', 'B')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # gene columns only, transposed because DESeq2 expects genes as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop genes whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

## ADTs

### ct2

In [41]:
# Read the ct2 ADT pseudobulk table from its pickle under the ADT prefix.
counts = pd.read_pickle(f'{prefix_adts}pkls/pseudobulk_ct2.pkl')

In [42]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per ADT.
counts

Unnamed: 0,CT,COND,FID,CD103|ITGAE,CD137|TNFRSF9,CD274|CD274,CD11b|ITGAM,CD39|ENTPD1,CD197|CCR7,CD20|MS4A1,...,CD49a|ITGA1,CD133|PROM,CD357|TNFRSF18,CD270|TNFRSF14,IL-21R|IL21R,B7-H4|VTCN1,CD26|DPP4,CD155|PVR,CD30|TNFRSF8,CD49E|ITGA5
0,T4_Naive,A,0,118.0,1590.0,987.0,805.0,1028.0,1907.0,236.0,...,320.0,313.0,790.0,1637.0,2019.0,2463.0,6362.0,1131.0,668.0,1248.0
1,T4_Naive,A,1,199.0,1950.0,1438.0,774.0,1329.0,2990.0,379.0,...,436.0,450.0,1017.0,1981.0,2645.0,3334.0,5773.0,1577.0,1020.0,1603.0
2,T4_Naive,A,10,501.0,5193.0,3877.0,2439.0,3893.0,6985.0,1121.0,...,1254.0,1210.0,2989.0,5679.0,7913.0,9536.0,19177.0,4672.0,2866.0,3612.0
3,T4_Naive,A,11,139.0,1732.0,1207.0,674.0,1178.0,2347.0,358.0,...,369.0,384.0,955.0,1940.0,2613.0,3226.0,5598.0,1538.0,879.0,1332.0
4,T4_Naive,A,12,203.0,2777.0,1857.0,1052.0,1891.0,3274.0,495.0,...,629.0,599.0,1515.0,2977.0,3999.0,5079.0,7580.0,2146.0,1405.0,2027.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,M_cDC,R,62,134.0,3320.0,2444.0,6729.0,1420.0,1220.0,268.0,...,602.0,324.0,909.0,2307.0,1939.0,2473.0,1786.0,1147.0,763.0,11072.0
316,M_cDC,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
317,M_cDC,R,7,353.0,6702.0,17348.0,12877.0,2107.0,1628.0,290.0,...,1105.0,342.0,1073.0,3566.0,2115.0,2496.0,3555.0,1199.0,1030.0,16470.0
318,M_cDC,R,8,148.0,2819.0,6210.0,3463.0,1342.0,1460.0,220.0,...,489.0,311.0,861.0,1944.0,1829.0,2745.0,1844.0,1073.0,779.0,8548.0


Just get the raw data, that's all I want.

In [43]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

T8_Naive    384
B_Mem       384
NK          384
pDC         384
B_Naive     384
HSC         384
T4_EM       384
M_cDC       384
T_Tox       384
T4_Naive    384
ncM         320
cM          320
cDC         320
Name: CT, dtype: int64

In [44]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['B_Mem', 'B_Naive', 'HSC', 'M_cDC', 'NK', 'T4_EM', 'T4_Naive',
       'T8_Naive', 'T_Tox', 'cDC', 'cM', 'ncM', 'pDC'], dtype=object)

In [45]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

Create two sets of files (B-G and G-B), each containing the counts for the B and G stimulation conditions, in the appropriate format for easy reading into DESeq2.

In [46]:
# Build the DESeq2 inputs for the direct B-vs-G comparison (ct2, ADTs): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_adts + 'vals/de/IFNs/ct2/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'G-B' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('B', 'G')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # feature columns only, transposed because DESeq2 expects features as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop features whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

In [47]:
# Build the DESeq2 inputs for the direct G-vs-B comparison (ct2, ADTs): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_adts + 'vals/de/IFNs/ct2/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'B-G' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('G', 'B')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # feature columns only, transposed because DESeq2 expects features as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop features whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

### ct3

In [48]:
# Read the ct3 ADT pseudobulk table from its pickle under the ADT prefix.
counts = pd.read_pickle(f'{prefix_adts}pkls/pseudobulk_ct3.pkl')

In [49]:
# Display the (truncated) table to check its layout: CT, COND and FID first, then one column per ADT.
counts

Unnamed: 0,CT,COND,FID,CD103|ITGAE,CD137|TNFRSF9,CD274|CD274,CD11b|ITGAM,CD39|ENTPD1,CD197|CCR7,CD20|MS4A1,...,CD49a|ITGA1,CD133|PROM,CD357|TNFRSF18,CD270|TNFRSF14,IL-21R|IL21R,B7-H4|VTCN1,CD26|DPP4,CD155|PVR,CD30|TNFRSF8,CD49E|ITGA5
0,MAIT,A,0,11.0,59.0,23.0,27.0,33.0,38.0,5.0,...,20.0,11.0,26.0,44.0,54.0,78.0,513.0,24.0,14.0,98.0
1,MAIT,A,1,30.0,376.0,203.0,145.0,181.0,257.0,52.0,...,69.0,54.0,150.0,281.0,362.0,455.0,4388.0,207.0,145.0,415.0
2,MAIT,A,10,24.0,275.0,117.0,73.0,148.0,158.0,25.0,...,43.0,48.0,110.0,201.0,290.0,336.0,2133.0,168.0,106.0,120.0
3,MAIT,A,11,20.0,306.0,133.0,82.0,162.0,156.0,25.0,...,52.0,47.0,109.0,249.0,260.0,317.0,2520.0,147.0,78.0,206.0
4,MAIT,A,12,8.0,176.0,77.0,81.0,101.0,99.0,16.0,...,31.0,25.0,62.0,137.0,141.0,228.0,1178.0,95.0,55.0,119.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,cDC2,R,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6076,cDC2,R,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6077,cDC2,R,7,17.0,497.0,495.0,113.0,127.0,163.0,14.0,...,107.0,27.0,81.0,175.0,124.0,178.0,600.0,53.0,61.0,954.0
6078,cDC2,R,8,0.0,96.0,85.0,7.0,19.0,49.0,5.0,...,14.0,5.0,25.0,34.0,36.0,41.0,55.0,17.0,18.0,99.0


Just get the raw data, that's all I want.

In [50]:
# Number of pseudobulk rows contributed by each cell type (CT).
counts.loc[:, 'CT'].value_counts()

NK_CD56++           320
T4_RO+_SELL+        320
T4_CM               320
T4_RA+_SELLint      320
T4_RA+_SELL+        320
T_CD10+             320
T_gd                320
T4_RO+_Act          320
T8_HOBIT+HELIOS+    320
T8_TEMRA            320
MAIT                320
NK_CD16+            320
T8_RA+_SELL+        320
T4_Treg_Act         320
T4_Treg_Resting     320
T8_CM               320
T8_EM               320
cDC1                320
cDC2                320
Name: CT, dtype: int64

In [51]:
# Sorted array of the distinct cell-type labels.
np.unique(counts['CT'].to_numpy())

array(['MAIT', 'NK_CD16+', 'NK_CD56++', 'T4_CM', 'T4_RA+_SELL+',
       'T4_RA+_SELLint', 'T4_RO+_Act', 'T4_RO+_SELL+', 'T4_Treg_Act',
       'T4_Treg_Resting', 'T8_CM', 'T8_EM', 'T8_HOBIT+HELIOS+',
       'T8_RA+_SELL+', 'T8_TEMRA', 'T_CD10+', 'T_gd', 'cDC1', 'cDC2'],
      dtype=object)

In [52]:
# Total number of missing entries in the table (0 means no NaNs anywhere).
counts.isna().to_numpy().sum()

0

Create two sets of files (B-G and G-B), each containing the counts for the B and G stimulation conditions, in the appropriate format for easy reading into DESeq2.

In [53]:
# Build the DESeq2 inputs for the direct B-vs-G comparison (ct3, ADTs): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_adts + 'vals/de/IFNs/ct3/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'G-B' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('B', 'G')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # feature columns only, transposed because DESeq2 expects features as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop features whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]

In [54]:
# Build the DESeq2 inputs for the direct G-vs-B comparison (ct3, ADTs): both stimulations
# go into one table and no control ('C') rows are included.
meta_cols = ['CT', 'COND', 'FID']
# Guard the layout the slicing below relies on, rather than trusting a bare positional offset.
assert list(counts.columns[:len(meta_cols)]) == meta_cols, 'expected CT, COND, FID as the leading columns'
out_dir = prefix_adts + 'vals/de/IFNs/ct3/input/'

# The pair drives both the row filter and the file names, so they cannot drift apart.
# NOTE(review): the filter is symmetric in the two conditions, so a 'B-G' export built the
# same way holds identical rows; presumably only the file name tells the downstream DESeq2
# script which condition is the reference -- confirm.
for cond, other in tqdm([('G', 'B')]):
    # rows belonging to either of the two stimulations
    df = counts[counts['COND'].isin([cond, other])]

    # sample names are CT-COND-FID; str() keeps the join valid if a field is not stored as text
    df.index = ['-'.join(map(str, parts)) for parts in zip(df['CT'], df['COND'], df['FID'])]

    # With the bulk data, duplicate genes had to be removed and summed; this table is
    # pseudobulked from an adata object on which var_names_make_unique was already run
    # (the author also confirmed this in a separate cell), so no de-duplication here.

    # feature columns only, transposed because DESeq2 expects features as rows
    cts = df.drop(columns=meta_cols).T

    # sample metadata, indexed by the same sample names as the columns of cts
    coldata = df[meta_cols]

    # Drop features whose total count is zero, then add a pseudocount of 1 to every entry.
    # Without this DESeq2 stopped with:
    #   every gene contains at least one zero, cannot compute log geometric means
    # NOTE(review): all-zero samples, if any, are kept and end up as all-ones columns
    # after the pseudocount -- confirm that is intended.
    cts = cts.loc[cts.sum(axis=1) > 0, :] + 1

    # export to csv
    cts.to_csv(out_dir + '%s-%s_cts.csv' % (cond, other))
    coldata.to_csv(out_dir + '%s-%s_col.csv' % (cond, other))

  0%|          | 0/1 [00:00<?, ?it/s]