In [1]:
import numpy as np
import pandas as pd
# import scanpy.api as sc
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import matplotlib as mpl
mpl.rcParams['figure.facecolor'] = (1,1,1,1)
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

In [2]:
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, color_map='viridis', 
                              transparent=False, frameon=False, fontsize=20)  # low dpi (dots per inch) yields small inline figures

import matplotlib as mpl
# 2 lines below solved the facecolor problem.
# mpl.rcParams['figure.facecolor'] = 'white'
mpl.rcParams['figure.facecolor'] = (1,1,1,1)
sc.settings.autosave = True
sc.logging.print_versions()

version = '210711_merged_thymoma_MG21.22.23.03_Tcell'

# file_mat = 'F1314_200323_222536_summary/10425963-2_3DE_genematrix.csv'


results_file_Tcell = './scanpy/{}/merge_Tcell.h5ad'.format(version)
results_file_CD4Tcell = './scanpy/{}/merge_CD4Tcell.h5ad'.format(version)
results_file_Treg = './scanpy/{}/merge_Treg.h5ad'.format(version)
results_file_Tcell_minor_cluster = './scanpy/{}/merge_Tcell_minor_cluster.h5ad'.format(version)
results_file_cg_Tcell_minor_cluster = './scanpy/{}/merge_cg_Tcell_minor_cluster.h5ad'.format(version)

results_file_master = './scanpy/210711_merged_thymoma_MG21.22.23.03/merge.h5ad'
results_raw_file_master = './scanpy/210711_merged_thymoma_MG21.22.23.03/merge.raw.h5ad'

# for cellphonedb
raw_matrix = './scanpy/{}/merge.raw.tsv'.format(version)

sc.settings.figdir = './scanpy/{}/graph'.format(version)
sc.settings.cachedir = './scanpy/{}/cache'.format(version)
%config InlineBackend.figure_format = 'retina' 



-----
anndata     0.7.6
scanpy      1.7.2
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.6
backcall            0.2.0
cffi                1.14.5
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.1
decorator           4.4.2
get_version         2.1
google              NA
h5py                2.10.0
igraph              0.9.1
ipykernel           5.3.4
ipython_genutils    0.2.0
ipywidgets          7.6.3
jedi                0.17.2
joblib              1.0.1
kiwisolver          1.3.1
legacy_api_wrap     1.2
leidenalg           0.8.4
llvmlite            0.34.0
matplotlib          3.4.1
mpl_toolkits        NA
natsort             7.1.1
numba               0.51.0
numexpr             2.7.3
numpy               1.20.2
packaging           20.9
pandas              1.2.4
parso               0.7.0
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
prompt_toolkit      3.0.8
psutil              5.8.0
ptyproce

In [3]:
adata_raw = sc.read(results_raw_file_master)
adata = sc.read(results_file_master)

In [4]:
sc.pl.umap(adata, color='major_cluster', legend_loc='on data', title='', frameon=False, save='major_cluster_all')



<Figure size 320x320 with 1 Axes>

In [5]:
adata

AnnData object with n_obs × n_vars = 65935 × 1506
    obs: 'sample', 'sample_type', 'site', 'fraction', 'n_genes', 'individual', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'leiden', 'leiden_R', 'major_cluster'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'
    uns: 'individual_colors', 'leiden', 'leiden_R_colors', 'leiden_colors', 'major_cluster_colors', 'neighbors', 'pca', 'rank_genes_groups', 'sample_colors', 'site_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [6]:
adata_cp = adata_raw[adata[adata.obs['major_cluster'] == 'T cell'].obs.index].copy()

# for c in ['score_DEGs', 'score_yellow', 'score_GWAS']:
#     adata_cp.obs[c] = adata.obs.loc[adata_cp.obs.index, c]
    
adata = adata_cp.copy()
adata_cp = None

In [7]:
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
adata.raw = sc.pp.log1p(adata, copy=True)

normalizing by total count per cell
    finished (0:00:09): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [8]:
plt.figure(figsize=(8,3))
pd.Series(adata.obs['sample']).value_counts().plot(kind='bar')
plt.grid(None)
plt.title('Run')
# plt.savefig(str(sc.settings.figdir) + '/run.pdf', bbox_inches='tight')

Text(0.5, 1.0, 'Run')

<Figure size 640x240 with 1 Axes>

In [9]:
# adata = adata[adata.obs['sample'].isin(['MG03_TE', 'MG21_TE', 'MG22_TE'])]

In [10]:
filter_result = sc.pp.filter_genes_dispersion(
    adata.X, min_mean=0.0125, max_mean=2.5, min_disp=0.7)
sc.pl.filter_genes_dispersion(filter_result)
print([sum([i[0] for i in filter_result]),len(filter_result)])

extracting highly variable genes
    finished (0:00:00)


<Figure size 640x320 with 2 Axes>

[1249, 36601]


In [11]:
adata = adata[:, filter_result.gene_subset]
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata)
adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
sc.pl.pca_variance_ratio(adata, log=True)

  view_to_actual(adata)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
computing PCA
    with n_comps=50
    finished (0:00:05)


<Figure size 320x320 with 1 Axes>

In [12]:
sc.external.pp.bbknn(adata, batch_key='sample', n_pcs=40)
sc.tl.umap(adata, spread=2)
# sc.tl.leiden(adata, resolution=10)

computing batch balanced neighbors
	finished: added to `.uns['neighbors']`
	`.obsp['distances']`, distances for each pair of neighbors
	`.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:14)


In [13]:
sc.pl.umap(adata, color=['n_counts', 'n_genes'], save='qc')



<Figure size 772.8x320 with 4 Axes>

In [14]:
sc.tl.leiden(adata, resolution=3)

running Leiden clustering
    finished: found 34 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:09)


In [15]:
sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False, save='leiden.pdf')



<Figure size 320x320 with 1 Axes>

In [16]:
with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    sc.pl.umap(adata, color=['CD4', 'CD8A', 'leiden'], s=20)



<Figure size 8019x2400 with 5 Axes>

In [17]:
sc.tl.leiden(adata, resolution=1, restrict_to=('leiden', ['8','13']))

running Leiden clustering
    finished: found 43 clusters and added
    'leiden_R', the cluster labels (adata.obs, categorical) (0:00:00)


In [18]:
sc.pl.umap(adata, color='leiden_R', legend_loc='on data', title='', frameon=False, save='leidenR.pdf')



<Figure size 320x320 with 1 Axes>

In [19]:
sc.tl.leiden(adata, resolution=0.6, restrict_to=('leiden_R', ['3']))

running Leiden clustering
    finished: found 48 clusters and added
    'leiden_R', the cluster labels (adata.obs, categorical) (0:00:00)


In [20]:
with plt.rc_context({"figure.figsize": (20, 20)}):
    sc.pl.umap(adata, color='leiden_R', legend_loc='on data', title='', frameon=False, save='leidenR.large.pdf', s=20)



<Figure size 1600x1600 with 1 Axes>

In [21]:
with plt.rc_context({"figure.figsize": (20, 20)}):
    sc.pl.umap(adata[adata.obs['leiden'].isin(['8', '13'])], color=['site', 'leiden_R'], legend_loc='on data', 
               title='', frameon=False, save='leidenR.large.pdf', s=20)

Trying to set attribute `.uns` of view, copying.


<Figure size 3384x1600 with 2 Axes>

In [22]:
pd.crosstab(adata.obs['site'], adata.obs['leiden_R']).T

site,Periphery,Thymus
leiden_R,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1834,34
1,1317,18
2,1077,28
30,284,15
31,148,115
32,213,19
33,130,70
34,1,89
35,19,0
4,1036,50


In [23]:
pd.crosstab(adata.obs['site'], adata.obs['leiden_R']).T.plot.bar(figsize=(18,3))

<AxesSubplot:xlabel='leiden_R'>

<Figure size 1440x240 with 1 Axes>

In [24]:
with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    sc.pl.umap(adata, color=['CD4', 'CD8A', 'leiden'], s=20)



<Figure size 8019x2400 with 5 Axes>

In [25]:
list_mait = ['KLRB1', 'SLC4A10', 'IL7R', 'DPP4', 'TRAJ33', 'PDCD1']
sc.pl.umap(adata, color=list_mait)



<Figure size 1545.6x640 with 12 Axes>

In [26]:
list_g = ['SPINK2', 'CSF1', 'CYP26A1']
sc.pl.umap(adata, color=list_g)



<Figure size 1159.2x320 with 6 Axes>

In [27]:
list_g = ['CD8A', 'CCR7', 'CD44', 'SELL', 'CD28', 'FAS']
sc.pl.umap(adata, color=list_g, ncols=2)



<Figure size 772.8x960 with 12 Axes>

In [28]:
sc.pl.dotplot(adata, list_g, groupby='leiden_R', dendrogram=True)

    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_leiden_R']`


<Figure size 361.6x1424 with 5 Axes>

In [29]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=['CD8A', 'NR4A1', 'IKZF2', 'NFKBID', 'BCL2L11', 'sample', 'site'],
          save=False, ncols=5)

<Figure size 1932x640 with 12 Axes>

In [30]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=['CD1D', 'KLRB1', 'NKG7', 'SELL'],
          save=False, ncols=5)

<Figure size 1545.6x320 with 8 Axes>

In [31]:
sc.pl.dotplot(adata, var_names=['CD3E', 'CD4', 'CD8A', 'CD8B', 'NKG7', 'KLRB1', 'ZBTB16', 'EOMES',
                                'GNG4', 'ZNF683', 'TRGC2', 'TRDC'], groupby='leiden_R',
             dendrogram=True)



<Figure size 539.2x1424 with 5 Axes>

In [32]:
# CD62L : SELL, CCR7 Tn, Tscm, Tcm
# CCR7 Tn, Tscm, Tcm
# IL2RB : Tscm, Tcm, Tem, Teff
# CD95 : FAS : Tscm, Tcm, Tem, Teff

sc.pl.dotplot(adata, var_names=['CD3E', 'CD4', 'CD8A', 'SELL', 'CCR7', 'TCF7', 'IL7R', 'IL2RB',
                               'FCGR3A', 'CXCR4', 'PRF1', 'GZMK', 'XCL2', 'HAVCR2', 'CCL5',
                               'CXCR6', 'ITGA1'], 
              groupby='leiden_R', dendrogram=True)



<Figure size 687.2x1424 with 5 Axes>

In [33]:
adata.obs['CD4'] = np.array(adata.raw[:,'CD4'].X.todense()).flatten()
adata.obs['CD8A'] = np.array(adata.raw[:,'CD8A'].X.todense()).flatten()

In [34]:
adata.obs['CD4/CD8'] = 'DN'
adata.obs.loc[(adata.obs['CD8A'] > 0.5) & (adata.obs['CD4'] > 0.5), 'CD4/CD8'] = 'DP'
adata.obs.loc[(adata.obs['CD8A'] < 0.5) & (adata.obs['CD4'] > 0.5), 'CD4/CD8'] = 'CD4'
adata.obs.loc[(adata.obs['CD8A'] > 0.5) & (adata.obs['CD4'] < 0.5), 'CD4/CD8'] = 'CD8'

In [35]:
sc.pl.umap(adata, color=['CD4/CD8'])

... storing 'CD4/CD8' as categorical


<Figure size 320x320 with 1 Axes>

In [36]:
adata.obs['CD4'].hist()

<AxesSubplot:>

<Figure size 320x320 with 1 Axes>

In [37]:
adata.obs['CD8A'].hist()

<AxesSubplot:>

<Figure size 320x320 with 1 Axes>

In [38]:
plt.scatter(np.array(adata.raw[:,'CD4'].X.todense()).flatten(), 
           np.array(adata.raw[:,'CD8A'].X.todense()).flatten())

<matplotlib.collections.PathCollection at 0x7fd5d8648410>

<Figure size 320x320 with 1 Axes>

In [39]:
adata.obs = adata.obs.drop('CD4', axis=1)
adata.obs = adata.obs.drop('CD8A', axis=1)

In [40]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=['CD3E', 'CD4', 'CD8A', 'CCR7', 'IL7R', 'KLRB1', 'ZBTB16', 'CD200R1', 'FOXP3', 'SATB1', 
                         'EPCAM', 'PECAM1', 'MS4A1', 'CD27', 'BCL6',
                         'NKG7', 'TRDC', 'TRGC1', 'TRGC2', 'CD14', 'FCGR3A', 'FCER1A', 'ITGAX', 'HLA-DQB1', 'MKI67', 'sample', 'site'],
          save='panel', ncols=5)



<Figure size 1932x1920 with 52 Axes>

In [41]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=['CD3E', 'CD4', 'CD8A', 'FOXP3', 'CTLA4', 'IL2RA', 'SATB1', 'LEF1',
                         'IKZF2', 'IKZF4', 'IL7R', 'GATA3', 'TBX21', 'RUNX3',
                         'CXCR5', 'sample', 'site'],
          save='panel_Treg', ncols=5)



<Figure size 1932x1280 with 32 Axes>

In [42]:
sc.pl.umap(adata, color=['CD3E', 'CD4', 'CD8A', 'CCR7', 'CD28', 'FAS', 'SELL', 'sample', 'site'],
          save='panel_memory', ncols=5)



<Figure size 1932x640 with 16 Axes>

In [43]:
sc.tl.rank_genes_groups(adata, 'leiden_R', method='t-test_overestim_var')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:12)


<Figure size 1280x3840 with 48 Axes>

In [44]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=list(adata.uns['rank_genes_groups']['names']['8-13,10'])[:40],
          save=False, ncols=5)

<Figure size 1932x2560 with 80 Axes>

In [45]:
sc.pl.dotplot(adata, var_names=list(adata.uns['rank_genes_groups']['names']['8-13,10'])[:40], 
              groupby='leiden_R')



<Figure size 1304x1424 with 4 Axes>

In [46]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=list(adata.uns['rank_genes_groups']['names']['18'])[:40],
          save=False, ncols=5)

<Figure size 1932x2560 with 80 Axes>

In [47]:
# ITGAX : CD11C
# CD27 : memory B
sc.pl.umap(adata, color=['GNLY', 'LYZ', 'CD69', 'CXCR6', 'CD1D', 'ITGA1'],
          save=False, ncols=5)

<Figure size 1932x640 with 12 Axes>

In [48]:
sc.pl.dotplot(adata, var_names=list(adata.uns['rank_genes_groups']['names']['33'])[:10], groupby='leiden')



<Figure size 416x1032 with 4 Axes>

In [49]:
sc.pl.umap(adata,color=["sample", "PTPRC"])



<Figure size 772.8x320 with 3 Axes>

In [50]:
df_clusters =  pd.read_csv('210430_clusters/Tcell_clusters.csv', index_col=0)
df_clusters.index = df_clusters.index.astype(str)
adata.obs['Tcell_cluster'] = [df_clusters.loc[x, 'cluster'] for x in adata.obs['leiden_R']]
df_clusters

Unnamed: 0_level_0,cluster
leiden,Unnamed: 1_level_1
0,CD8 Tnaive
1,CD4 T cell
2,CD4 T cell
30,aa CD8 T cell (II)
31,aa CD8 T cell (II)
32,aa CD8 T cell (II)
33,aa CD8 T cell (II)
34,NKT cell (thymus)
35,aa CD8 T cell (II)
4,CD4 T cell


In [51]:
with plt.rc_context({"figure.figsize": (5, 5)}):
    sc.pl.umap(adata, color='Tcell_cluster', add_outline=True,
           title='', frameon=False, save='Tcell_cluster.pdf', s=30)

... storing 'Tcell_cluster' as categorical


<Figure size 400x400 with 1 Axes>

In [52]:
with plt.rc_context({"figure.figsize": (20, 20)}):
    sc.pl.umap(adata, color='Tcell_cluster', add_outline=True,
           title='', legend_loc='on data', save='Tcell_cluster.pdf', s=30)



<Figure size 1600x1600 with 1 Axes>

In [53]:
sc.pl.umap(adata, color='Tcell_cluster', title='', add_outline=True, 
           frameon=False, save='Tcell_cluster_out.pdf')



<Figure size 320x320 with 1 Axes>

In [54]:
pd.crosstab(adata.obs['site'], adata.obs['Tcell_cluster']).T.plot.bar(figsize=(18,3))
plt.yscale('log')

<Figure size 1440x240 with 1 Axes>

In [55]:
sc.tl.rank_genes_groups(adata, 'Tcell_cluster', method='t-test_overestim_var')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:04)


<Figure size 1280x1280 with 15 Axes>

In [56]:
sc.tl.dendrogram(adata, groupby="Tcell_cluster")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, groupby='Tcell_cluster', save='Tcell.pdf')

    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_Tcell_cluster']`


<Figure size 2404x500 with 6 Axes>

In [58]:
adata.write(results_file_Tcell)

In [59]:
adata

AnnData object with n_obs × n_vars = 26072 × 1249
    obs: 'sample', 'sample_type', 'site', 'fraction', 'n_genes', 'individual', 'n_counts', 'leiden', 'leiden_R', 'CD4/CD8', 'Tcell_cluster'
    var: 'mean', 'std'
    uns: 'log1p', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'leiden_R_colors', 'dendrogram_leiden_R', 'sample_colors', 'site_colors', 'CD4/CD8_colors', 'rank_genes_groups', 'Tcell_cluster_colors', 'dendrogram_Tcell_cluster'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

## summarize clusters

In [60]:
adata_tcell = sc.read(results_file_Tcell)
adata_cd4tcell = sc.read(results_file_CD4Tcell)
# adata_treg = sc.read(results_file_Treg)

In [61]:
df_cluster = adata_tcell.obs[['Tcell_cluster']]
df_cluster.columns = ['minor_cluster']
df_cluster['minor_cluster'] = df_cluster['minor_cluster'].astype(str)
df_cluster.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,minor_cluster
AACCATGGTCTCAGAT,CD4 T cell
AACGTCAGTACATTGC,Thymic CD8 T cell
AAGAACAAGCGGCTCT,CD4 T cell
AAGCATCCATGACGTT,CD4 T cell
AAGCGAGCAGACAAAT,CD4 T cell


In [62]:
df_cluster_cd4tcell = adata_cd4tcell.obs[['CD4T_cluster']]
df_cluster_cd4tcell.columns = ['minor_cluster']
df_cluster_cd4tcell['minor_cluster'] = df_cluster_cd4tcell['minor_cluster'].astype(str)
df_cluster_cd4tcell.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,minor_cluster
AACCATGGTCTCAGAT,Naive Treg
AAGAACAAGCGGCTCT,Thymic CD4 T cell (II)
AAGCATCCATGACGTT,CD4 Tcm (Th17)
AAGCGAGCAGACAAAT,CD4 Tnaive
ACAAGCTAGAGAGGGC,CD4 Tnaive


In [63]:
df_cluster.loc[df_cluster_cd4tcell.index, 'minor_cluster'] = list(df_cluster_cd4tcell['minor_cluster'])
# df_cluster.loc[df_cluster_treg.index, 'minor_cluster'] = list(df_cluster_treg['minor_cluster'])
adata_tcell.obs['minor_cluster'] = list(df_cluster['minor_cluster'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [64]:
sc.pl.umap(adata_tcell, 
           color=['minor_cluster'], 
           save='minor_cluster')

... storing 'minor_cluster' as categorical


<Figure size 320x320 with 1 Axes>

In [65]:
sc.pl.umap(adata_tcell, color='minor_cluster', title='', add_outline=True, 
           frameon=False, save='minor_cluster_out.pdf')



<Figure size 320x320 with 1 Axes>

In [66]:
with plt.rc_context({"figure.figsize": (8, 8)}):
    sc.pl.umap(adata_tcell, color='minor_cluster', add_outline=True,
           title='', frameon=False, save='minor_cluster_middle.pdf', s=20)



<Figure size 640x640 with 1 Axes>

In [67]:
with plt.rc_context({"figure.figsize": (20, 20)}):
    sc.pl.umap(adata_tcell, color='minor_cluster', add_outline=True,
           title='', frameon=False, save='minor_cluster_large.pdf', s=100)



<Figure size 1600x1600 with 1 Axes>

In [68]:
with plt.rc_context({"figure.figsize": (20, 20)}):
    sc.pl.umap(adata_tcell, color='minor_cluster', add_outline=False,
           title='', frameon=False, legend_loc='on data', save='minor_cluster_ondata.pdf', s=100)



<Figure size 1600x1600 with 1 Axes>

In [69]:
sc.tl.rank_genes_groups(adata_tcell, 'minor_cluster', method='t-test_overestim_var')
sc.pl.rank_genes_groups(adata_tcell, n_genes=25, sharey=False)

ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:36)


<Figure size 1280x2240 with 27 Axes>

In [70]:
sc.pl.rank_genes_groups_dotplot(adata_tcell, n_genes=5, groupby='minor_cluster', save='Tcell_minorcluster.pdf')

    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_minor_cluster']`


<Figure size 4180x836 with 6 Axes>

In [71]:
print('\n'.join([x for x in list(adata_tcell.uns['rank_genes_groups']['names']['CD8 Tnaive'])[:100] if not x.startswith('RP')]))

MT-ND3
CD8B
MT-CO3
MT-CO2
MT-ND4
LINC02446
MT-ATP6
SNHG29
KLRK1
MT-CYB
EEF1B2
TPT1
MT-ND1
PRKCQ-AS1
PASK
NACA
MT-CO1
CCR7
NPM1
LINC01871
UBA52
MT-ND2
NOSIP
TXNIP
S100B
TMSB10
DSEL
LRRN3
AIF1
CD8A
FAU
SNHG5
PCED1B-AS1
EEF1G
PRMT2


In [72]:
sc.pl.umap(adata_tcell,
       color=[x for x in list(adata_tcell.uns['rank_genes_groups']['names']['CD4 Tnaive'])[:10] if not x.startswith('RP')])



<Figure size 1545.6x640 with 12 Axes>

In [73]:
df_p = adata_tcell.obs[['minor_cluster', 'site']].copy()
df_p['RGS1'] = np.array(adata_tcell.raw[:,'RGS1'].X.todense()).flatten()

In [74]:
plt.figure(figsize=(10,4))
sns.violinplot(x='minor_cluster', y='RGS1', hue='site', data=df_p, split=True)
plt.xticks(rotation=90)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26]),
 [Text(0, 0, 'Activated Treg'),
  Text(1, 0, 'CD4 Tcm (Tfh)'),
  Text(2, 0, 'CD4 Tcm (Th0)'),
  Text(3, 0, 'CD4 Tcm (Th2)'),
  Text(4, 0, 'CD4 Tcm (Th17)'),
  Text(5, 0, 'CD4 Tem (Th1)'),
  Text(6, 0, 'CD4 Tem (Th1/17)'),
  Text(7, 0, 'CD4 Temra (Th1)'),
  Text(8, 0, 'CD4 Tnaive'),
  Text(9, 0, 'CD8 Tem'),
  Text(10, 0, 'CD8 Temra'),
  Text(11, 0, 'CD8 Tnaive'),
  Text(12, 0, 'CD8 Trm'),
  Text(13, 0, 'DP T cell'),
  Text(14, 0, 'Doublet'),
  Text(15, 0, 'ILC'),
  Text(16, 0, 'NK cell'),
  Text(17, 0, 'NKT cell (periphery)'),
  Text(18, 0, 'NKT cell (thymus)'),
  Text(19, 0, 'Naive Treg'),
  Text(20, 0, 'T agonist'),
  Text(21, 0, 'Thymic CD4 T cell (I)'),
  Text(22, 0, 'Thymic CD4 T cell (II)'),
  Text(23, 0, 'Thymic CD8 T cell'),
  Text(24, 0, 'aa CD8 T cell (I)'),
  Text(25, 0, 'aa CD8 T cell (II)'),
  Text(26, 0, 'gd T cell')])

<Figure size 800x320 with 1 Axes>

In [75]:
sc.pl.violin(adata_tcell, 'RGS1', groupby='site', )



<Figure size 372.24x320 with 1 Axes>

In [76]:
sc.pl.rank_genes_groups_dotplot(adata_tcell, n_genes=15, groupby='minor_cluster', save='Tcell_minorcluster_15.pdf')



<Figure size 12172x836 with 6 Axes>

In [77]:
sc.tl.rank_genes_groups(adata_tcell, 'minor_cluster', method='t-test_overestim_var', reference='aa CD8 T cell (I)', 
                        groups=['aa CD8 T cell (II)'])
sc.pl.rank_genes_groups(adata_tcell, n_genes=25, sharey=False)

ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)


<Figure size 320x320 with 1 Axes>

In [78]:
adata_tcell.write(results_file_Tcell_minor_cluster)

In [79]:
adata_raw = sc.read(results_raw_file_master)

adata_cg = adata_raw[adata_tcell.obs.index].copy()
adata_cg.obs = adata_tcell.obs
adata_cg.obsm = adata_tcell.obsm
adata_cg.uns = adata_tcell.uns

sc.pp.normalize_per_cell(adata_cg, counts_per_cell_after=1e4)
sc.pp.log1p(adata_cg)

adata_cg.write(results_file_cg_Tcell_minor_cluster)

normalizing by total count per cell
    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


## Velocyto

In [254]:
import scvelo as scv
mpl.rcParams['axes.grid'] = False

In [255]:
adata = sc.read('./data/velocyto.210430.merged.loom')
adata.var_names_make_unique()
adata.obs.index = [x.split(':')[1][:-1] for x in adata.obs.index]
adata.obs_names_make_unique()

Only considering the two last: ['.merged', '.loom'].
Only considering the two last: ['.merged', '.loom'].
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [256]:
adata

AnnData object with n_obs × n_vars = 68442 × 36601
    obs: 'Clusters', '_X', '_Y'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'matrix', 'ambiguous', 'spliced', 'unspliced'

In [182]:
list_cluster_retain = ['CD4 T agonist', 'Th1', 'Th2', 'Th17', 'Treg', 'naive Th',
       'thymic CD4 T cell']

In [49]:
mdata = sc.read(results_file_abTcell)
mdata = mdata[mdata.obs['Tcell_cluster'].isin(list_cluster_retain)]
list_overlap_cells = list(set(adata.obs.index) & set(mdata.obs.index))
adata = adata[list_overlap_cells,:]
mdata = mdata[list_overlap_cells,:]
adata.obs['abTcell_cluster'] = mdata.obs['abTcell_cluster']
adata.uns['abTcell_cluster_colors'] = mdata.uns['abTcell_cluster_colors']
adata.obsm['X_umap'] = mdata.obsm['X_umap']

Trying to set attribute `.obs` of view, copying.


In [50]:
scv.pl.proportions(adata, groupby="abTcell_cluster")
# plt.grid(False)

<Figure size 1000x200 with 2 Axes>

In [51]:
scv.pl.proportions(adata, groupby="sample")
# plt.grid(False)

<Figure size 1000x200 with 1 Axes>

In [52]:
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

Filtered out 30531 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Exctracted 2000 highly variable genes.
Logarithmized X.
computing PCA
    on highly variable genes
    with n_comps=30
    finished (0:00:01)
computing neighbors
    finished (0:00:08) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


In [53]:
scv.tl.recover_dynamics(adata)

recovering dynamics
    finished (0:03:00) --> added 
    'fit_pars', fitted parameters for splicing dynamics (adata.var)


In [54]:
scv.tl.velocity(adata, mode='deterministic', filter_genes=True)
scv.tl.velocity_graph(adata, n_recurse_neighbors=2)

computing velocities
    finished (0:00:01) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph
    finished (0:00:08) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)


In [66]:
scv.tl.velocity_embedding(adata, basis='umap', direct_pca_projection=False)
# scv.tl.velocity_embedding(adata, basis='draw_graph_fr')

computing velocity embedding
    finished (0:00:03) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)


In [67]:
scv.pl.velocity_embedding(adata, basis='umap', color='abTcell_cluster', 
                          legend_loc='on data', scale=.5, figsize=(10,10))
# scv.pl.velocity_embedding(adata, basis='draw_graph_fr', color='louvain', scale=3, legend_loc='on data')

<Figure size 800x800 with 1 Axes>

In [68]:
scv.pl.velocity_embedding_grid(adata, basis='umap', color='abTcell_cluster',
                               density=1, scale=.3, figsize=(10,10))
# scv.pl.velocity_embedding_grid(adata, basis='draw_graph_fr', color='louvain', density=1, scale=2)

<Figure size 800x800 with 1 Axes>

In [69]:
scv.pl.velocity_embedding_stream(adata, basis='umap', color='abTcell_cluster', 
                                 legend_loc='right mergin')

<Figure size 320x320 with 1 Axes>

In [72]:
scv.tl.paga(adata, groups='abTcell_cluster')
scv.pl.paga(adata, basis='umap', size=50, alpha=.1,
            min_edge_width=2, node_size_scale=1.5)

running PAGA using priors: ['velocity_pseudotime']
    finished (0:00:03) --> added
    'paga/connectivities', connectivities adjacency (adata.uns)
    'paga/connectivities_tree', connectivities subtree (adata.uns)
    'paga/transitions_confidence', velocity transitions (adata.uns)


<Figure size 320x320 with 1 Axes>

In [70]:
scv.tl.terminal_states(adata)
scv.pl.scatter(adata, color=['root_cells', 'end_points'])

computing terminal states
    identified 5 regions of root cells and 2 regions of end points .
    finished (0:00:04) --> added
    'root_cells', root cells of Markov diffusion process (adata.obs)
    'end_points', end points of Markov diffusion process (adata.obs)


<Figure size 640x320 with 4 Axes>

In [60]:
scv.tl.velocity_confidence(adata)
scv.pl.scatter(adata, color='velocity_confidence', perc=[2,98])

--> added 'velocity_length' (adata.obs)
--> added 'velocity_confidence' (adata.obs)
--> added 'velocity_confidence_transition' (adata.obs)


<Figure size 320x320 with 2 Axes>

In [61]:
scv.tl.rank_velocity_genes(adata, groupby='abTcell_cluster', min_corr=.3)

df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()

ranking velocity genes
    finished (0:00:00) --> added 
    'rank_velocity_genes', sorted scores by group ids (adata.uns) 
    'spearmans_score', spearmans correlation scores (adata.var)


Unnamed: 0,CD4 T agonist,Th1,Th2,Th17,Treg,naive Th,thymic CD4 T cell
0,RPS21,SRGN,RPS3A,SRGN,RPS3A,TMSB4X,TOX
1,ITM2A,C1orf54,SRGN,RPS3A,RPL4,TMSB10,RPS21
2,CCR9,AL138963.4,RPS7,RPS7,HSP90AA1,ARHGEF1,SATB1
3,TOX,RPL4,RPL13,RPL30,RPL23,FMNL1,TOX2
4,CD99,RPS3A,FAU,FAU,FOS,CD74,ITM2A


In [62]:
scv.pl.velocity(adata, var_names=df.head(5)['Treg'],
                color='abTcell_cluster', basis='umap', figsize=(10,10))

<Figure size 1200x2000 with 25 Axes>

In [63]:
scv.tl.latent_time(adata)
scv.pl.scatter(adata, color='latent_time', color_map='gnuplot', size=80)

computing latent time using root_cells as prior
    finished (0:00:03) --> added 
    'latent_time', shared time (adata.obs)


<Figure size 320x320 with 2 Axes>

In [64]:
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', 
               col_color='abTcell_cluster', 
               n_convolve=100)

<Figure size 640x320 with 4 Axes>

## TCR

In [3]:
import scirpy as ir

In [4]:
adata = sc.read(results_file_CD4Tcell)
adata = adata[adata.obs['sample'].isin(['MG03_PT', 'MG03_TL'])]

In [5]:
df_ir = pd.concat([pd.read_csv('data/cellranger/MG03_PT_TCR/filtered_contig_annotations.csv'),
pd.read_csv('data/cellranger/MG03_TL_TCR/filtered_contig_annotations.csv')])
df_ir['barcode'] = df_ir['barcode'].str.split('-').str.get(0)
df_ir.to_csv('data/cellranger/MG03_TCR.csv', index=None)

In [6]:
mg03_ir = ir.io.read_10x_vdj('data/cellranger/MG03_TCR.csv')
ir.pp.merge_with_ir(adata, mg03_ir)

In [7]:
adata[adata.obs['has_ir'] == 'True'].obs['sample'].value_counts()

MG03_PT    9900
MG03_TL     404
Name: sample, dtype: int64

In [8]:
adata.obs

Unnamed: 0,sample,sample_type,site,fraction,n_genes,individual,score_DEGs,score_yellow,score_GWAS,n_counts,...,IR_VDJ_2_sequence_id,IR_VJ_1_v_call,IR_VJ_2_v_call,IR_VDJ_1_v_call,IR_VDJ_2_v_call,IR_VJ_1_v_cigar,IR_VJ_2_v_cigar,IR_VDJ_1_v_cigar,IR_VDJ_2_v_cigar,has_ir
AAACCTGGTCGACTAT,MG03_TL,TL,Thymus,Lymphocytes,1495,MG03_TL,-0.012844,-0.015708,0.028718,2634.0,...,,TRAV8-3,,TRBV6-2,,,,,,True
AAAGATGGTCTCCACT,MG03_TL,TL,Thymus,Lymphocytes,2070,MG03_TL,-0.021947,-0.021999,-0.068362,4768.0,...,,TRAV25,,TRBV10-2,,,,,,True
AAAGCAAAGCGTCTAT,MG03_TL,TL,Thymus,Lymphocytes,1555,MG03_TL,-0.021947,-0.021999,0.088329,2987.0,...,,TRAV41,,TRBV9,,,,,,True
AACACGTAGACATAAC,MG03_TL,TL,Thymus,Lymphocytes,1457,MG03_TL,-0.001561,-0.004398,0.118344,2704.0,...,,TRAV9-2,,TRBV7-8,,,,,,True
AACACGTCAGACGTAG,MG03_TL,TL,Thymus,Lymphocytes,1553,MG03_TL,-0.006078,-0.007100,0.201709,2750.0,...,,TRAV21,,TRBV9,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCGAAGAG,MG03_PT,PT,Periphery,Stroma,1876,MG03_PT,-0.013117,-0.015981,0.114369,5211.0,...,,TRAV21,,TRBV20-1,,,,,,True
TTTGTCATCCTAAGTG,MG03_PT,PT,Periphery,Stroma,4154,MG03_PT,-0.044635,-0.047574,0.174934,14176.0,...,,,,,,,,,,
TTTGTCATCTCAAACG,MG03_PT,PT,Periphery,Stroma,2841,MG03_PT,-0.027689,-0.027755,0.155677,9485.0,...,,TRAV4,TRAV29/DV5,TRBV11-1,TRBV10-2,,,,,True
TTTGTCATCTGGGCCA,MG03_PT,PT,Periphery,Stroma,2121,MG03_PT,-0.018526,-0.018570,0.138510,5629.0,...,,TRAV23/DV6,,TRBV6-1,,,,,,True


In [9]:
adata[adata.obs['has_ir'] == 'True'].obs['CD4T_cluster'].value_counts()

CD4 Tnaive                4571
CD4 Tem (Th1/17)          1546
CD4 Tcm (Tfh)              763
CD4 Tcm (Th17)             651
CD4 Tcm (Th0)              644
Naive Treg                 440
CD4 Temra (Th1)            313
T agonist                  273
CD4 Tcm (Th2)              258
Thymic CD4 T cell (II)     239
Activated Treg             209
Thymic CD4 T cell (I)      179
CD4 Tem (Th1)              165
CD8 Tnaive                  53
Name: CD4T_cluster, dtype: int64

In [10]:
ir.tl.chain_qc(adata)

In [11]:
ax = ir.pl.group_abundance(adata, groupby="receptor_subtype", target_col="sample")

... storing 'receptor_type' as categorical
... storing 'receptor_subtype' as categorical
... storing 'chain_pairing' as categorical


<Figure size 412.8x309.6 with 1 Axes>

In [12]:
ir.pp.ir_dist(
    adata,
    metric="alignment",
    sequence="aa",
    cutoff=15,
)

Computing sequence x sequence distance matrix for VJ sequences.


  0%|          | 0/18721 [00:00<?, ?it/s]

Computing sequence x sequence distance matrix for VDJ sequences.


  0%|          | 0/18721 [00:00<?, ?it/s]

In [13]:
ir.tl.define_clonotype_clusters(
    adata, sequence="aa", metric="alignment", receptor_arms="all", dual_ir="any"
)

Initializing lookup tables. 
--> Done initializing lookup tables. (0:00:01)
Computing clonotype x clonotype distances.
NB: Computation happens in chunks. The progressbar only advances when a chunk has finished. 


  0%|          | 0/9423 [00:00<?, ?it/s]

--> Done computing clonotype x clonotype distances.  (0:00:13)
Stored clonal assignments in `adata.obs["cc_aa_alignment"]`.


In [14]:
ir.tl.clonotype_network(adata, min_cells=3, sequence="aa", metric="alignment")

In [15]:
ir.pl.clonotype_network(
    adata, color="CD4T_cluster", label_fontsize=9, panel_size=(7, 7), base_size=20
)

... storing 'cc_aa_alignment' as categorical


<AxesSubplot:>

<Figure size 760x560 with 4 Axes>

In [16]:
adata

AnnData object with n_obs × n_vars = 10897 × 1053
    obs: 'sample', 'sample_type', 'site', 'fraction', 'n_genes', 'individual', 'score_DEGs', 'score_yellow', 'score_GWAS', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'leiden_R', 'CD4T_cluster', 'multi_chain', 'extra_chains', 'is_cell', 'high_confidence', 'IR_VJ_1_c_call', 'IR_VJ_2_c_call', 'IR_VDJ_1_c_call', 'IR_VDJ_2_c_call', 'IR_VJ_1_consensus_count', 'IR_VJ_2_consensus_count', 'IR_VDJ_1_consensus_count', 'IR_VDJ_2_consensus_count', 'IR_VJ_1_d_call', 'IR_VJ_2_d_call', 'IR_VDJ_1_d_call', 'IR_VDJ_2_d_call', 'IR_VJ_1_d_cigar', 'IR_VJ_2_d_cigar', 'IR_VDJ_1_d_cigar', 'IR_VDJ_2_d_cigar', 'IR_VJ_1_duplicate_count', 'IR_VJ_2_duplicate_count', 'IR_VDJ_1_duplicate_count', 'IR_VDJ_2_duplicate_count', 'IR_VJ_1_germline_alignment', 'IR_VJ_2_germline_alignment', 'IR_VDJ_1_germline_alignment', 'IR_VDJ_2_germline_alignment', 'IR_VJ_1_j_call', 'IR_VJ_2_j_call', 'IR_VDJ_1_j_call', 'IR_VDJ_2_j_call', '

In [17]:
ir.tl.define_clonotypes(adata)

ir_dist for sequence='nt' and metric='identity' not found. Computing with default parameters.
Computing sequence x sequence distance matrix for VJ sequences.
Computing sequence x sequence distance matrix for VDJ sequences.
Initializing lookup tables. 
--> Done initializing lookup tables. (0:00:01)
Computing clonotype x clonotype distances.
NB: Computation happens in chunks. The progressbar only advances when a chunk has finished. 


  0%|          | 0/9425 [00:00<?, ?it/s]

--> Done computing clonotype x clonotype distances.  (0:00:08)
Stored clonal assignments in `adata.obs["clone_id"]`.


In [18]:
ir.tl.clonal_expansion(adata)
sc.pl.umap(adata, color=["clonal_expansion"], save='clonal_expansion')

... storing 'clone_id' as categorical
... storing 'clonal_expansion' as categorical


<Figure size 320x320 with 1 Axes>

In [19]:
ir.pl.clonal_expansion(adata, "CD4T_cluster",figsize=(6,3))
plt.savefig(str(sc.settings.figdir) + '/clonotypesize.pdf', bbox_inches='tight')

<Figure size 720x360 with 1 Axes>

In [20]:
adata.obs.columns

Index(['sample', 'sample_type', 'site', 'fraction', 'n_genes', 'individual',
       'score_DEGs', 'score_yellow', 'score_GWAS', 'n_counts',
       ...
       'IR_VDJ_2_v_cigar', 'has_ir', 'receptor_type', 'receptor_subtype',
       'chain_pairing', 'cc_aa_alignment', 'cc_aa_alignment_size', 'clone_id',
       'clone_id_size', 'clonal_expansion'],
      dtype='object', length=102)

In [21]:
adata.obs['chain_pairing']

AAACCTGGTCGACTAT        single pair
AAAGATGGTCTCCACT        single pair
AAAGCAAAGCGTCTAT        single pair
AACACGTAGACATAAC        single pair
AACACGTCAGACGTAG        single pair
                         ...       
TTTGTCATCCGAAGAG        single pair
TTTGTCATCCTAAGTG              no IR
TTTGTCATCTCAAACG    two full chains
TTTGTCATCTGGGCCA        single pair
TTTGTCATCTGTTTGT        single pair
Name: chain_pairing, Length: 10897, dtype: category
Categories (7, object): ['extra VDJ', 'extra VJ', 'no IR', 'orphan VDJ', 'orphan VJ', 'single pair', 'two full chains']

In [22]:
adata.obs['CD4T_cluster'].value_counts()

CD4 Tnaive                4787
CD4 Tem (Th1/17)          1632
CD4 Tcm (Tfh)              801
CD4 Tcm (Th17)             687
CD4 Tcm (Th0)              677
Naive Treg                 469
CD4 Temra (Th1)            369
T agonist                  281
CD4 Tcm (Th2)              277
Thymic CD4 T cell (II)     249
Activated Treg             233
CD4 Tem (Th1)              198
Thymic CD4 T cell (I)      181
CD8 Tnaive                  56
Name: CD4T_cluster, dtype: int64

In [23]:
adata = adata[~adata.obs['clone_id'].isna()]

In [24]:
ir.tl.repertoire_overlap(adata, "CD4T_cluster", inplace=True)
ir.pl.repertoire_overlap(adata, "CD4T_cluster")
plt.savefig(str(sc.settings.figdir) + '/clonotypesize.pdf', bbox_inches='tight')

Trying to set attribute `.uns` of view, copying.


<Figure size 800x800 with 4 Axes>

In [25]:
adata.obs['sample_cluster'] = adata.obs['sample'].astype(str) + ' ' + adata.obs['CD4T_cluster'].astype(str)
ir.tl.repertoire_overlap(adata, "sample_cluster", inplace=True)
ir.pl.repertoire_overlap(adata, "sample_cluster", heatmap_cats=['site', 'CD4T_cluster'])

... storing 'sample_cluster' as categorical


<seaborn.matrix.ClusterGrid at 0x7f5e98839750>

<Figure size 800x800 with 5 Axes>

In [33]:
from scipy.spatial import distance as sc_distance
import networkx as nx

adata.obs['site_cluster'] = adata.obs['site'].astype(str) + ' ' + adata.obs['CD4T_cluster'].astype(str)
sc.pl.umap(adata, color=['site_cluster'])
a = adata.copy()

df, dst, lk = ir.tl.repertoire_overlap(a, "site_cluster", inplace=False)

... storing 'site_cluster' as categorical


<Figure size 320x320 with 1 Axes>

In [34]:
distM = sc_distance.squareform(dst)
np.fill_diagonal(distM, 1)
scaling_factor = distM.min()
np.fill_diagonal(distM, scaling_factor)
distM = pd.DataFrame(distM, index=a.obs['site_cluster'].cat.categories, 
                     columns=a.obs['site_cluster'].cat.categories)

In [36]:
tidy_dist = (1-distM).melt(ignore_index=False)
tidy_dist_auto = tidy_dist[tidy_dist.index == tidy_dist.variable]
tidy_dist = tidy_dist.loc[[x for x in tidy_dist.index if 'Thymus' in x]]
tidy_dist = tidy_dist.loc[['Periphery' in x for x in tidy_dist.variable]]


In [40]:
tidy_dist['value'].hist(bins=20)
plt.yscale('log')

<Figure size 320x320 with 1 Axes>

In [41]:
tidy_dist = tidy_dist.loc[tidy_dist['value'] > 0]

In [42]:
tidy_dist.columns = ['target', 'weight']
tidy_dist['source'] = tidy_dist.index
tidy_dist = tidy_dist.reset_index(drop=True)
tidy_dist = tidy_dist[['source', 'target', 'weight']]
tidy_dist

Unnamed: 0,source,target,weight
0,Thymus Activated Treg,Periphery Naive Treg,0.002299
1,Thymus CD4 Tcm (Th17),Periphery CD4 Tcm (Th17),0.001613
2,Thymus Naive Treg,Periphery Activated Treg,0.005051
3,Thymus Thymic CD4 T cell (I),Periphery CD4 Tem (Th1/17),0.000776
4,Thymus Activated Treg,Periphery Naive Treg,0.002299
...,...,...,...
95,Thymus Thymic CD4 T cell (I),Periphery CD4 Tem (Th1/17),0.000776
96,Thymus Activated Treg,Periphery Naive Treg,0.002299
97,Thymus CD4 Tcm (Th17),Periphery CD4 Tcm (Th17),0.001613
98,Thymus Naive Treg,Periphery Activated Treg,0.005051


In [67]:
tidy_dist

Unnamed: 0,source,target,weight
0,Thymus Activated Treg,Periphery Naive Treg,0.002299
1,Thymus CD4 Tcm (Th17),Periphery CD4 Tcm (Th17),0.001613
2,Thymus Naive Treg,Periphery Activated Treg,0.005051
3,Thymus Thymic CD4 T cell (I),Periphery CD4 Tem (Th1/17),0.000776
4,Thymus Activated Treg,Periphery Naive Treg,0.002299
...,...,...,...
95,Thymus Thymic CD4 T cell (I),Periphery CD4 Tem (Th1/17),0.000776
96,Thymus Activated Treg,Periphery Naive Treg,0.002299
97,Thymus CD4 Tcm (Th17),Periphery CD4 Tcm (Th17),0.001613
98,Thymus Naive Treg,Periphery Activated Treg,0.005051


In [65]:
G = nx.from_pandas_edgelist(tidy_dist, edge_attr=True)

n_label = a.obs['site_cluster'].cat.categories.shape[0]
pos_nodes = {x: (int(2*i/n_label),int(i%(n_label/2))) 
       for i,x in enumerate(a.obs['site_cluster'].cat.categories)}
pos_label = {x: ((int(2*i/n_label)-0.5)*2+0.5,int(i%(n_label/2))) 
       for i,x in enumerate(a.obs['site_cluster'].cat.categories)}

c_nodes = [a.uns['CD4T_cluster_colors'][list(a.obs['CD4T_cluster'].cat.categories).index(
    x.replace('Thymus ', "").replace('Periphery ', ""))]
 for x in G.nodes()] 

edge_colors = [c['weight'] for u, v, c in G.edges(data=True)]

In [66]:
plt.figure(figsize=(7,7))
nodes = nx.draw_networkx_nodes(G, pos_nodes, node_color=c_nodes)
nx.draw_networkx_labels(G, pos_label)
edges = nx.draw_networkx_edges(
    G,
    pos_nodes,
    arrowstyle="->",
    arrowsize=10,
    edge_color=edge_colors,
    edge_cmap=plt.cm.Greys,
    width=2,
)
plt.axis("off")
plt.xlim(-1,2)

(-1.0, 2.0)

<Figure size 560x560 with 1 Axes>

In [69]:
nx.write_gml(G, str(sc.settings.figdir) + '/../TCR_cluster.gml')

In [70]:
c_nodes

['#1f77b4',
 '#ffbb78',
 '#aa40fc',
 '#aa40fc',
 '#ffbb78',
 '#1f77b4',
 '#ff9896',
 '#e377c2']

In [64]:
edge_colors

[3.004597701149425, 3.0032258064516126, 3.0101010101010104, 3.001552795031056]

In [148]:
adata[adata.obs['IR_VJ_1_j_call']=='TRAJ24'].obs['CD4T_cluster'].value_counts()

CD4 Tnaive                43
CD4 Tem (Th1/17)          15
CD4 Tcm (Th17)             7
CD4 Tcm (Tfh)              5
CD4 Tcm (Th0)              5
Naive Treg                 5
Thymic CD4 T cell (I)      5
CD4 Tcm (Th2)              4
T agonist                  2
Activated Treg             1
CD8 Tnaive                 1
Thymic CD4 T cell (II)     1
Name: CD4T_cluster, dtype: int64

In [151]:
adata[(adata.obs['IR_VJ_1_j_call']=='TRAJ24') & (adata.obs['IR_VJ_1_v_call']=='TRAV13-2')].obs['CD4T_cluster']

GGGAATGGTGCAGTAG    CD4 Tem (Th1/17)
TGTGGTAAGGGCACTA          CD4 Tnaive
Name: CD4T_cluster, dtype: category
Categories (2, object): ['CD4 Tem (Th1/17)', 'CD4 Tnaive']

In [150]:
sc.pl.umap(adata[adata.obs['IR_VJ_1_j_call']=='TRAJ24'])



<Figure size 320x320 with 1 Axes>