# Get cell cycle label

2021-06-15

In [None]:
# Import Packages

%load_ext autoreload
%autoreload 2
 
import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from anndata import AnnData

# Customized packages
import starmap.sc_util as su

# test()

## Input

In [None]:
# Alternative dataset (RIBOmap); swap with the STARmap block below to use it.
# # Set path
# base_path = 'Z:/Data/Analyzed/2021-07-09-Hu-HelaRIBOmap'
# # Load adata
# adata = sc.read_h5ad('Z:/Data/Analyzed/2021-07-09-Hu-HelaRIBOmap/output/2021-10-03-Hu-HelaRIBOmap-after-pp-300.h5ad')

# Set path: every other location is derived from base_path so the dataset
# root is defined in exactly one place.
base_path = 'Z:/Data/Analyzed/2021-08-25-Hu-HelaSTARmap'
out_path = os.path.join(base_path, 'output')
fig_path = os.path.join(base_path, 'figures')

# Load adata (the input file lives in the dataset's output folder)
adata = sc.read_h5ad(os.path.join(out_path, '2021-10-03-Hu-HelaSTARmap-after-pp-300.h5ad'))

# Create the figure folder if needed; exist_ok avoids the separate
# existence check and is safe to re-run.
os.makedirs(fig_path, exist_ok=True)

adata

In [None]:
# Create both label columns up front, filled with the placeholder 'NA',
# so each exists even if only one gene list is scored in this session.
for label_column in ('phase_ref', 'phase_new'):
    adata.obs[label_column] = 'NA'

In [None]:
# sc.pl.violin(adata, 'total_counts', groupby='sample', rotation=45)

## Subset

In [None]:
# Plot top 20 most expressed genes 
# (runs before the normalization cell, i.e. on the values as loaded)
sc.pl.highest_expr_genes(adata, n_top=20)

## Preprocessing

In [None]:
# Normalization scaling: normalize total counts per cell, then log1p-transform
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# adata.layers['norm'] = adata.X
# Keep the normalized, log-transformed values in .raw before X is scaled below
adata.raw = adata

# Scale data to unit variance and zero mean
sc.pp.scale(adata)
# adata.layers['scaled'] = adata.X

# Regress out total_counts
# NOTE(review): regression is applied after scaling here and X is not
# re-scaled afterwards - confirm this order is intended.
sc.pp.regress_out(adata, 'total_counts')

## Cell cycle scoring

In [None]:
# Toggle between the reference gene list (True) and the self-defined
# marker list (False). The choice also decides which obs column
# ('phase_ref' or 'phase_new') receives the resulting phase labels later.
use_ref = True

if use_ref:
    # Load reference gene list: one gene per line. The context manager
    # closes the file handle once the list has been read.
    ref_gene_file = os.path.join(base_path, 'gene_modules/regev_lab_cell_cycle_genes.txt')
    with open(ref_gene_file) as gene_file:
        cell_cycle_genes = [line.strip() for line in gene_file]
    # The first 43 entries are used as S-phase genes, the remainder as G2/M genes
    n_s_genes = 43
    s_genes = cell_cycle_genes[:n_s_genes]
    g2m_genes = cell_cycle_genes[n_s_genes:]
else:
    # Load self defined markers: first column of the 'G1_S' and 'G2_M' sheets
    marker_file = os.path.join(base_path, 'gene_modules/cell_cycle_markers.xlsx')
    s_genes = pd.read_excel(marker_file, sheet_name='G1_S', header=None)[0].to_list()
    g2m_genes = pd.read_excel(marker_file, sheet_name='G2_M', header=None)[0].to_list()
    cell_cycle_genes = s_genes + g2m_genes

In [None]:
# Split the cell cycle genes into those measured in this dataset and those
# that are missing; from here on cell_cycle_genes holds only measured genes.
var_names = adata.var_names
missed_genes = []
detected_genes = []
for gene in cell_cycle_genes:
    if gene in var_names:
        detected_genes.append(gene)
    else:
        missed_genes.append(gene)
cell_cycle_genes = detected_genes

# Same restriction applied to each phase-specific list
s_genes_true = [gene for gene in s_genes if gene in var_names]
g2m_genes_true = [gene for gene in g2m_genes if gene in var_names]

# Report how many genes were found / missed, overall and per phase list
n_found, n_missed = len(cell_cycle_genes), len(missed_genes)
print(n_found, n_missed)
print(f"G1/S genes: {len(s_genes)} - G2/M genes: {len(g2m_genes)}")
print(f"G1/S genes in our dataest: {len(s_genes_true)} - G2/M genes in our dataest: {len(g2m_genes_true)}")

In [None]:
# Compute gene set score: adds the per-cell scores and the 'phase' call
# to adata.obs (both are used by the plots further down).
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

# Restrict to the cell cycle genes found in the dataset. Take an explicit
# copy: PCA stores its results on the object, and writing into a view of
# adata would otherwise rely on an implicit copy (whose warning is
# silenced by the warnings filter set at the top of the notebook).
adata_cc_genes = adata[:, cell_cycle_genes].copy()
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase')

In [None]:
# Number of cells assigned to each cell cycle phase
adata.obs['phase'].value_counts()

## Clustering

In [None]:
# PCA on the full adata (not only the cell cycle gene subset), then the
# scanpy PCA overview plots
sc.tl.pca(adata)
sc.pl.pca_overview(adata)

In [None]:
# PCA embedding of all cells, colored by the assigned cell cycle phase
sc.pl.pca(adata, color='phase')

In [None]:
# t-SNE with perplexity 30 and cosine distance, colored by cell cycle phase
sc.tl.tsne(adata, perplexity=30, metric='cosine')
sc.pl.tsne(adata, color='phase')

In [None]:
# Neighbor graph (30 neighbors, 30 PCs) -> Leiden clustering at resolution 0.7,
# shown on the t-SNE embedding computed in the previous cell
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
sc.tl.leiden(adata, resolution=.7)
sc.pl.tsne(adata, color='leiden')

In [None]:
# default UMAP (computed while the 30-neighbor graph from the previous
# cell is the current one), colored by cell cycle phase
sc.tl.umap(adata)
sc.pl.umap(adata, color='phase')

In [None]:
# Recompute the neighbor graph with 10 neighbors / 10 PCs and re-run Leiden at
# resolution 0.5; the clusters are shown on the UMAP computed in the previous cell.
# NOTE(review): this replaces the earlier 30-neighbor graph and the clustering
# plotted under 'leiden' - confirm that overwriting is intended.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=10)
sc.tl.leiden(adata, resolution=.5)
sc.pl.umap(adata, color='leiden')

In [None]:
# Display the G2/M genes that are present in this dataset
g2m_genes_true

In [None]:
# Scatter of MCM6 against Leiden cluster, colored by cluster
sc.pl.scatter(adata, x='leiden', y='MCM6', color='leiden', title='MCM6')

In [None]:
# Violin plot of MCM6, grouped by Leiden cluster
sc.pl.violin(adata, keys='MCM6', groupby='leiden')

In [None]:
# UMAP colored by the S-phase score column 'S_score'
sc.pl.umap(adata, color='S_score')

In [None]:
# t-SNE on the cell cycle gene subset, restricted to 2 PCs (n_pcs=2);
# plotted without coloring
sc.tl.tsne(adata_cc_genes, n_pcs=2)
sc.pl.tsne(adata_cc_genes)

In [None]:
# t-SNE on the cell cycle gene subset computed from the 'X_pca'
# representation, colored by cell cycle phase
sc.tl.tsne(adata_cc_genes, use_rep='X_pca')
sc.pl.tsne(adata_cc_genes, color='phase')

In [None]:
# t-SNE on the cell cycle gene subset with cosine distance, computed
# directly on X (use_rep='X') rather than on the PCA representation
sc.tl.tsne(adata_cc_genes, metric='cosine', use_rep='X')
sc.pl.tsne(adata_cc_genes, color='phase')

### Assign back to adata

In [None]:
# Store the phase calls under the column that matches the gene list used
# for scoring: 'phase_ref' for the reference list, 'phase_new' otherwise.
current_column = 'phase_ref' if use_ref else 'phase_new'
adata.obs.loc[:, current_column] = adata.obs['phase'].values

In [None]:
# Cross-tabulate the phase labels from the reference gene list (rows)
# against those from the self-defined list (columns) and show the counts
# as an annotated heatmap.
ref_labels = adata.obs.loc[:, 'phase_ref']
new_labels = adata.obs.loc[:, 'phase_new']
confusion_matrix = pd.crosstab(
    ref_labels,
    new_labels,
    rownames=['Use reference list'],
    colnames=['Use our list'],
)
sns.heatmap(confusion_matrix, annot=True, fmt='')
plt.show()

## Output

In [None]:
# Remove the temporary scoring / clustering columns before saving, keeping
# the 'phase_ref' / 'phase_new' labels. Dropping by name instead of by
# position (the previous `iloc[:, :-3]`) makes the result independent of
# which exploratory cells were run and safe to execute more than once.
temp_columns = ['S_score', 'G2M_score', 'phase', 'leiden']
adata.obs = adata.obs.drop(columns=temp_columns, errors='ignore')

In [None]:
from datetime import datetime

# Prefix the output file name with today's date (YYYY-MM-DD)
date = datetime.today().strftime('%Y-%m-%d')

# adata.write_h5ad(f"{base_path}/output/{date}-Hu-HelaRIBOmap-cc.h5ad")
output_file = f"{base_path}/output/{date}-Hu-HelaSTARmap-cc.h5ad"
adata.write_h5ad(output_file)