# Get cell cycle label

2023-04-04

In [None]:
# Import Packages

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from anndata import AnnData
from tifffile import imread
from skimage.measure import regionprops

# Customized packages
import starmap.sc_util as su

# test()

## Input

In [None]:
# Set path
# All input/output locations are derived from base_path.
base_path = './'
out_path = os.path.join(base_path, 'output')
fig_path = os.path.join(base_path, 'figures')

# Create the figure directory if needed. exist_ok avoids the check-then-create
# race, and makedirs also creates missing parent directories.
os.makedirs(fig_path, exist_ok=True)

In [None]:
# Read the preprocessed AnnData object from disk and display its summary
adata_file = './output/2021-09-24-Rena-EU-starmap-after-pp-300.h5ad'
adata = sc.read_h5ad(adata_file)
adata

In [None]:
# Initialise both phase columns with the placeholder 'NA'; the real labels
# are written into them per sample further down.
for phase_column in ('phase_ref', 'phase_new'):
    adata.obs[phase_column] = 'NA'

## Subset

In [None]:
# Drop the leading variables (the KD genes) and keep everything after them
n_leading_genes = 7
cdata = adata[:, slice(n_leading_genes, None)]
cdata

In [None]:
# Keep only the cells that belong to the sample being analysed
current_sample = '20h_labeling'
in_current_sample = cdata.obs['sample'] == current_sample
cdata = cdata[in_current_sample, :]
cdata

In [None]:
# Show the genes with the highest expression in the current subset
n_top_genes = 20
sc.pl.highest_expr_genes(cdata, n_top=n_top_genes)

## Preprocessing

In [None]:
# Normalization scaling
# Normalize the counts of each cell, then apply a log(1 + x) transform.
# NOTE(review): cdata is a view of adata at this point; scanpy presumably
# converts it to an actual copy when modifying it in place - confirm.
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)

# adata.layers['norm'] = adata.X
# Snapshot the log-normalized (not yet scaled) data into `.raw`. This must
# happen before sc.pp.scale so that `.raw` keeps the unscaled values.
cdata.raw = cdata

# Scale data to unit variance and zero mean
sc.pp.scale(cdata)
# adata.layers['scaled'] = adata.X

## Cell cycle scoring

In [None]:
# Choose which marker set to use:
#   use_ref = True  -> reference list from a plain-text file (one gene per line)
#   use_ref = False -> self-defined markers from an Excel workbook
use_ref = False

if use_ref:
    # Load reference gene list
    # The first N_REF_S_GENES entries of the file are taken as S-phase genes,
    # the remainder as G2/M genes.
    N_REF_S_GENES = 43
    # Use a context manager so the file handle is closed after reading.
    with open('./gene_modules/regev_lab_cell_cycle_genes.txt') as gene_file:
        cell_cycle_genes = [line.strip() for line in gene_file]
    s_genes = cell_cycle_genes[:N_REF_S_GENES]
    g2m_genes = cell_cycle_genes[N_REF_S_GENES:]
else:
    # Load self defined markers
    # Read both sheets in a single pass over the workbook; with a list of
    # sheet names read_excel returns a dict keyed by sheet name.
    marker_sheets = pd.read_excel(
        './gene_modules/cell_cycle_markers.xlsx',
        sheet_name=['G1_S', 'G2_M'],
        header=None,
    )
    # Gene names are in the first (unnamed) column of each sheet.
    s_genes = marker_sheets['G1_S'][0].to_list()
    g2m_genes = marker_sheets['G2_M'][0].to_list()
    cell_cycle_genes = s_genes + g2m_genes

In [None]:
# Split the marker list into genes that are present in / absent from the dataset
dataset_genes = cdata.var_names
missed_genes = [gene for gene in cell_cycle_genes if gene not in dataset_genes]
cell_cycle_genes = [gene for gene in cell_cycle_genes if gene in dataset_genes]

# Count how many markers of each phase list were found in the dataset
n_s_found = sum(gene in dataset_genes for gene in s_genes)
n_g2m_found = sum(gene in dataset_genes for gene in g2m_genes)

print(len(cell_cycle_genes), len(missed_genes))
print(f"G1/S genes: {len(s_genes)} - G2/M genes: {len(g2m_genes)}")
print(f"G1/S genes in our dataest: {n_s_found} - G2/M genes in our dataest: {n_g2m_found}")

In [None]:
# Compute gene set score
# Scores every cell against the S and G2/M gene lists; the resulting 'phase'
# label in cdata.obs is used for colouring below and copied back to adata later.
sc.tl.score_genes_cell_cycle(cdata, s_genes=s_genes, g2m_genes=g2m_genes)

# Restrict to the cell cycle genes found in the dataset. Take an explicit copy
# so that the PCA results are written to a real object instead of relying on
# implicit view-to-copy conversion.
cdata_cc_genes = cdata[:, cell_cycle_genes].copy()
sc.tl.pca(cdata_cc_genes)

# sc.pl.pca is the documented plotting entry point (pca_scatter is a legacy name).
sc.pl.pca(cdata_cc_genes, color='phase')

### Assign back to adata

In [None]:
# Pick the destination column from the marker set that was used, then copy the
# phase labels of the current sample back into the full object.
current_column = 'phase_ref' if use_ref else 'phase_new'
sample_rows = adata.obs['sample'] == current_sample
adata.obs.loc[sample_rows, current_column] = cdata.obs['phase'].values

In [None]:
# Cross-tabulate the two phase labellings for the cells of the current sample
sample_obs = adata.obs.loc[adata.obs['sample'] == current_sample]
confusion_matrix = pd.crosstab(
    sample_obs['phase_ref'],
    sample_obs['phase_new'],
    rownames=['Use reference list'],
    colnames=['Use our list'],
)

# Show the counts as an annotated heatmap
sns.heatmap(confusion_matrix, annot=True, fmt='')
plt.show()

## Output

In [None]:
from datetime import datetime

# Save the annotated object under a file name prefixed with today's date
date = format(datetime.today(), '%Y-%m-%d')
output_file = f"./output/{date}-Rena-EU-starmap-cc.h5ad"
adata.write_h5ad(output_file)

## Generate phase_ref label for all conditions

In [None]:
# Compute the reference-list cell cycle phase ('phase_ref') for every sample
# separately and save the annotated object.

# Set path
base_path = './'
out_path = os.path.join(base_path, 'output')
fig_path = os.path.join(base_path, 'figures')
os.makedirs(fig_path, exist_ok=True)

# Load adata
adata = sc.read_h5ad('./output/2021-09-24-Rena-EU-starmap-after-pp-300.h5ad')

# Assign dummy label
adata.obs['phase_ref'] = 'NA'

# Load reference gene list
# Read once, outside the loop (the list does not depend on the sample), and
# close the file handle after reading. The first 43 entries are taken as
# S-phase genes, the remainder as G2/M genes.
with open('./gene_modules/regev_lab_cell_cycle_genes.txt') as gene_file:
    cell_cycle_genes = [line.strip() for line in gene_file]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]

# Get cell cycle genes
# Every per-sample subset keeps the same variables (all but the first 7, the
# KD genes), so the presence check is done once against that variable set.
kept_genes = adata.var_names[7:]
missed_genes = [x for x in cell_cycle_genes if x not in kept_genes]
cell_cycle_genes = [x for x in cell_cycle_genes if x in kept_genes]

for current_sample in adata.obs['sample'].unique():

    # Remove KD genes and keep only the cells of this sample. Copy explicitly
    # so the in-place preprocessing below never operates on a view of adata.
    sample_rows = adata.obs['sample'] == current_sample
    cdata = adata[sample_rows, 7:].copy()

    # Normalization scaling
    sc.pp.normalize_total(cdata)
    sc.pp.log1p(cdata)
    # Snapshot the log-normalized values before scaling.
    cdata.raw = cdata

    # Scale data to unit variance and zero mean
    sc.pp.scale(cdata)

    # Compute gene set score
    sc.tl.score_genes_cell_cycle(cdata, s_genes=s_genes, g2m_genes=g2m_genes)

    # cdata holds exactly the rows selected by sample_rows, in the same order,
    # so the labels can be assigned positionally.
    adata.obs.loc[sample_rows, 'phase_ref'] = cdata.obs['phase'].values

# output
# NOTE: this is the same file name the earlier "Output" cell writes to, so
# running both on the same day overwrites that file.
from datetime import datetime
date = datetime.today().strftime('%Y-%m-%d')
adata.write_h5ad(f"./output/{date}-Rena-EU-starmap-cc.h5ad")

In [None]:
# Turn the phase labels into an ordered set of categories so the bars appear
# in cell cycle order, then plot the number of cells per phase for each sample.
phase_order = ['G1', 'S', 'G2M']
obs_table = adata.obs
obs_table['phase_ref'] = obs_table['phase_ref'].astype('category')
obs_table['phase_ref'] = obs_table['phase_ref'].cat.reorder_categories(phase_order)

sns.countplot(data=obs_table, x="sample", hue="phase_ref", palette='plasma')
plt.xticks(rotation=45)
plt.show()