# Assembling the complete ECCITE-seq dataset

In [120]:
import scanpy as sc
import pandas as pd
import seaborn as sns
pd.set_option('display.max_rows', 500)

In [5]:
data_path = '/data_volume/memento/eccite/'

In [4]:
adata = sc.read_10x_h5(data_path + 'filtered_feature_bc_matrix.h5')


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [5]:
adata.var_names_make_unique()

In [6]:
adata.obs['lane'] = adata.obs.index.str.split('-').str[-1]

In [7]:
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [8]:
adata.obs.head(5)

Unnamed: 0,lane,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt
AAACCTGAGACAGACC-1,1,2609,8691.0,472.0,5.430905
AAACCTGAGCAGGCTA-1,1,3431,15767.0,167.0,1.059174
AAACCTGAGCCAGAAC-1,1,3816,15634.0,339.0,2.168351
AAACCTGAGGTACTCT-1,1,4205,16914.0,692.0,4.091285
AAACCTGAGGTGATTA-1,1,1869,4257.0,986.0,23.16185


In [9]:
adata = adata[adata.obs.n_genes_by_counts > 100, :]
adata = adata[adata.obs.pct_counts_mt < 10, :]

In [10]:
adata

View of AnnData object with n_obs × n_vars = 63474 × 36601
    obs: 'lane', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [11]:
adata.write(data_path + 'filtered_eccite_cDNA.h5ad')

Trying to set attribute `.obs` of view, copying.
... storing 'lane' as categorical
Trying to set attribute `.var` of view, copying.
... storing 'feature_types' as categorical
Trying to set attribute `.var` of view, copying.
... storing 'genome' as categorical


In [12]:
adata.obs['original_bc'] = adata.obs.index.str.split('-').str[0]

In [16]:
for lane in range(1, 9):
    
    bcs = adata.obs.query('lane == "{}"'.format(lane))[['original_bc']]
    bcs.to_csv(data_path + 'cell_bcs/run{}.csv'.format(lane), index=False, header=False)

### Combine HTO counts

In [67]:
adata = sc.read(data_path + 'filtered_eccite_cDNA.h5ad')

In [68]:
# Read HTO assignments
df_list = []
for lane in range(1, 9):
    
    df = pd.read_csv(data_path + 'hto_counts/hto{}_out/umi_count/multi_class.csv'.format(lane), index_col=0)
    df.index = df.index.values + '-' + str(lane)
    df_list.append(df.copy())
multiseq_df = pd.concat(df_list)

In [69]:
overlap_bcs = list(set(adata.obs.index) & set(multiseq_df.index))

In [70]:
adata = adata[overlap_bcs, :].copy()

  res = method(*args, **kwargs)


In [75]:
adata.obs = adata.obs.join(multiseq_df, how='left')

In [77]:
adata.obs['MULTI_ID'].value_counts()

rep3-tx-AGGACCATCCAA        12075
rep1-tx-AGGACCATCCAA         9404
Negative                     9082
rep2-tx-TCGATAATGCGA         8507
rep4-tx-TCGATAATGCGA         8027
Doublet                      8010
rep3-ctrl-ACATGTTACCGT       1385
rep4-ctrl-GAGGCTGAGCTA       1222
rep2-ctrl-GAGGCTGAGCTA       1037
PDL1g1-ctrl-ACTGTCTAACGG      981
PDL1g2-ctrl-TAACGACGTGGT      969
unmapped                      944
rep1-ctrl-ACATGTTACCGT        683
PDL1g1-tx-GTGTGACGTATT        598
PDL1g2-tx-CACATAATGACG        539
Name: MULTI_ID, dtype: int64

In [78]:
adata = adata[adata.obs['MULTI_ID'].str.startswith('rep')].copy()

  res = method(*args, **kwargs)


In [79]:
adata.obs['replicate'] = adata.obs['MULTI_ID'].str.split('-').str[0]
adata.obs['treatment'] = adata.obs['MULTI_ID'].str.split('-').str[1]

In [82]:
adata.write(data_path + 'filtered_eccite_cDNA_hto.h5ad')

... storing 'orig.ident' as categorical
... storing 'MULTI_ID' as categorical
... storing 'MULTI_classification' as categorical
... storing 'replicate' as categorical
... storing 'treatment' as categorical


### Attach guide information

In [159]:
# Read HTO assignments
df_list = []
for lane in range(1, 9):
    
    df = pd.read_csv(data_path + 'gdo_counts/gdo{}_out/umi_count/gdo_counts.csv'.format(lane), index_col=0).T
    df.index = df.index.values + '-' + str(lane)
    df_list.append(df.copy())
gdo_df = pd.concat(df_list)

In [160]:
gdo_df = gdo_df[gdo_df.columns[:-1]]
gdo_df = gdo_df[~((gdo_df > 5).sum(axis=1) == 0)]

In [None]:
gdo_df['guide_ID'] = gdo_df.idxmax(axis=1)

In [163]:
def second_percent(row):
    
    return row.nlargest(2).values[-1]/row.nlargest(1).values[-1]

In [164]:
gdo_df['second_percent'] = gdo_df.iloc[:, :-1].apply(second_percent,axis=1)

In [165]:
filtered_gdo_df = gdo_df.query('second_percent < 0.30')

In [178]:
adata = sc.read(data_path + 'filtered_eccite_cDNA_hto.h5ad')

In [179]:
overlap_bcs = list(set(adata.obs.index) & set(filtered_gdo_df.index))

In [180]:
adata = adata[overlap_bcs, :].copy()

In [181]:
adata.shape

(28346, 36601)

In [182]:
adata.obs = adata.obs.join(filtered_gdo_df.iloc[:, -2:], how='left')

In [183]:
adata.obs['gene'] = adata.obs['guide_ID'].str.split('g').str[0]

In [184]:
# adata = adata[adata.obs['replicate'] != 'rep4']

In [185]:
adata.write(data_path + 'eccite.h5ad')

... storing 'guide_ID' as categorical
... storing 'gene' as categorical


In [190]:
adata.obs[adata.obs['gene']=='STAT1'].guide_ID.value_counts()

STAT1g3-CAGCATGTTGTACCAAAGG       296
STAT1g2-CATCCTTTGGTACAACATG       100
STAT1g1-TGGCCTGGAGTAATACTTT        46
STAT1g4-GTCAAACTCCTCAGGAGAC        16
ATF2g1-TTCATTTCTCAGCAGGGTG          0
PDCD1LG2g4-CTGCTAATGTTGAGCCTGG      0
SMAD4g2-CAGAAGGGTCCACGTATCC         0
SMAD4g1-GTGGTCACTAAGGCACCTG         0
POU2F2g4-ACGACCATTTCCCGCTTCG        0
POU2F2g3-ACCTTCAAGCAACGCCGCA        0
POU2F2g2-TTCTTGCGTCTCCGGCCGG        0
POU2F2g1-GGGCAAGCTCTACGGCAAC        0
PDL1g3-AGTTCTGCGCAGCTTCCCG          0
PDL1g1-ATATAGGTCCTTGGGAACC          0
PDCD1LG2g3-TTTGTTGTGGTGACAGGTC      0
SMAD4g4-CACCTTTACATTCCAACTG         0
PDCD1LG2g2-ACGTGAGTATTCCAGAACA      0
PDCD1LG2g1-CTGGCAGAAACTTCAGCTG      0
NTg10-TGTAGGAGCGGCGTTAGTA           0
NTg9-ATGTCTCGCCCCGATATGG            0
NTg8-TAGAGAGCGGCGCGCCTAC            0
NTg7-GCCGTTAAGCGGAAACGAT            0
NTg5-TGAACGGGCCGCGGAAGCG            0
NTg4-TGCGACGCTTAGCCTCCGT            0
NTg3-AGGACAGCAACTTCTGCCC            0
NTg2-ATAAACACTATACCATGTA            0
NTg1-ACCTCTC

In [189]:
adata.obs[adata.obs['gene']=='NT'].guide_ID.value_counts()

NTg4-TGCGACGCTTAGCCTCCGT          646
NTg9-ATGTCTCGCCCCGATATGG          515
NTg7-GCCGTTAAGCGGAAACGAT          476
NTg5-TGAACGGGCCGCGGAAGCG          419
NTg1-ACCTCTCAATGCCGTGGTT          408
NTg2-ATAAACACTATACCATGTA          308
NTg10-TGTAGGAGCGGCGTTAGTA         283
NTg3-AGGACAGCAACTTCTGCCC          279
NTg8-TAGAGAGCGGCGCGCCTAC           59
SMAD4g2-CAGAAGGGTCCACGTATCC         0
POU2F2g3-ACCTTCAAGCAACGCCGCA        0
POU2F2g4-ACGACCATTTCCCGCTTCG        0
SMAD4g1-GTGGTCACTAAGGCACCTG         0
ATF2g1-TTCATTTCTCAGCAGGGTG          0
SMAD4g3-GTATCCATCAACAGTAACA         0
POU2F2g1-GGGCAAGCTCTACGGCAAC        0
SMAD4g4-CACCTTTACATTCCAACTG         0
POU2F2g2-TTCTTGCGTCTCCGGCCGG        0
PDCD1LG2g2-ACGTGAGTATTCCAGAACA      0
PDL1g3-AGTTCTGCGCAGCTTCCCG          0
PDL1g1-ATATAGGTCCTTGGGAACC          0
PDCD1LG2g4-CTGCTAATGTTGAGCCTGG      0
PDCD1LG2g3-TTTGTTGTGGTGACAGGTC      0
ATF2g2-AAGAAGCTGTTTCAGCTGT          0
PDCD1LG2g1-CTGGCAGAAACTTCAGCTG      0
NFKBIAg4-CCAGGGCTATTCTCCCTAC        0
SPI1g1-GTCAT

In [187]:
adata.obs.query('treatment == "ctrl" & replicate != "rep4"').gene.value_counts()

NT          233
ATF2        133
CD86        130
IRF1        125
IFNGR1      114
TNFRSF14    112
IFNGR2      106
MARCH8       96
CMTM6        95
JAK2         91
ETV7         85
CAV1         78
NFKBIA       74
STAT5A       72
PDCD1LG2     65
STAT2        60
POU2F2       59
SMAD4        53
IRF7         53
STAT1        51
STAT3        48
UBE2L6       41
BRD4         28
CUL3         27
MYC          10
SPI1          5
PDL1          2
Name: gene, dtype: int64