# Generate Knockdown (KD) Form

2021-06-25

In [1]:
# Import Packages

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from anndata import AnnData

# Customized packages
import starmap.sc_util as su

# test()

## Input

In [2]:
# Set path
# All locations are derived from base_path so the notebook can be relocated
# by changing a single value.
base_path = './'
out_path = os.path.join(base_path, 'output')
form_path = os.path.join(base_path, 'forms')
# exist_ok=True makes this a no-op when the folder is already there (safe to
# re-run, no check-then-create race) and also creates missing parent folders.
os.makedirs(form_path, exist_ok=True)

In [3]:
# Read the combined raw KD AnnData file and show its summary as the cell output
adata_file = './output/2021-09-24-Rena-EU-starmap-raw-KD-combined.h5ad'
adata = sc.read_h5ad(adata_file)
adata

AnnData object with n_obs × n_vars = 19603 × 998
    obs: 'orig_index', 'sample', 'volume', 'x', 'y', 'z', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'condition', 'KD_label_combined'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'max_counts', 'n_cells'
    layers: 'cytoplasm', 'er', 'nucleus', 'outer_cytoplasm'

In [4]:
def pseudobulk_by_sample(cdata, matrix):
    """Average a per-cell expression matrix within each sample.

    Parameters
    ----------
    cdata
        AnnData (or view) supplying ``var.index`` for the column labels,
        ``obs.index`` for the row labels and ``obs['sample']`` for grouping.
    matrix
        Per-cell values aligned with ``cdata`` (rows follow ``cdata.obs``,
        columns follow ``cdata.var``), e.g. ``cdata.X`` or one of its layers.

    Returns
    -------
    pandas.DataFrame
        Transposed group means: one row per entry of ``cdata.var.index`` and
        one column per value of ``cdata.obs['sample']``.
    """
    per_cell = pd.DataFrame(matrix, columns=cdata.var.index, index=cdata.obs.index)
    per_cell['sample'] = cdata.obs['sample'].values
    return per_cell.groupby(by='sample').mean().T


# Generate table for each cluster
# One sheet per KD label; each sheet holds the per-sample means of four
# matrices side by side, in this order: whole cell (X), nucleus, cytoplasm, er.
# NOTE(review): the four column groups reuse the same sample names, so the
# compartment is identifiable only by column position in the sheet.
with pd.ExcelWriter(os.path.join(form_path, '2021-09-24-TEMPOmap-pseudobulk.xlsx')) as writer:

    for current_kd_cluster in adata.obs['KD_label_combined'].cat.categories:
        # Subset to the cells carrying the current KD label
        cdata = adata[adata.obs['KD_label_combined'] == current_kd_cluster, ]
        compartment_matrices = [
            cdata.X,                    # whole cell
            cdata.layers['nucleus'],
            cdata.layers['cytoplasm'],
            cdata.layers['er'],
        ]
        # output
        output_df = pd.concat(
            [pseudobulk_by_sample(cdata, matrix) for matrix in compartment_matrices],
            axis=1,
        )
        output_df.to_excel(writer, sheet_name=f"{current_kd_cluster}-raw")

## Test

In [5]:
# Keep only the cells whose combined KD label is siYTHDC1
is_dc1 = adata.obs['KD_label_combined'] == 'siYTHDC1'
sdata = adata[is_dc1, :]
sdata

View of AnnData object with n_obs × n_vars = 8617 × 998
    obs: 'orig_index', 'sample', 'volume', 'x', 'y', 'z', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'condition', 'KD_label_combined'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'max_counts', 'n_cells'
    layers: 'cytoplasm', 'er', 'nucleus', 'outer_cytoplasm'

In [6]:
# Select the siYTHDC1 cells with the lowest YTHDC1 values in X.
N_LOWEST_CELLS = 2000  # how many of the lowest-expressing cells to keep

# Per-cell YTHDC1 values, flattened to 1-D
expr = sdata[:, 'YTHDC1'].X.flatten()
# Row positions of sdata ordered by ascending YTHDC1 value
expr_order = np.argsort(expr)

# Index by position and take an independent copy, leaving sdata untouched.
# obs is deliberately not reset in place: reset_index() adds a column on every
# call, so re-running the cell piles up index columns and eventually raises a
# ValueError on the column-name collision.
test = sdata[expr_order[:N_LOWEST_CELLS], :].copy()

In [7]:
# Write per-sample mean tables for the low-YTHDC1 subset `test`, one sheet per KD label
dc1_xlsx = './forms/2021-09-24-TEMPOmap-pseudobulk-DC1-2000.xlsx'
with pd.ExcelWriter(dc1_xlsx) as writer:

    for current_kd_cluster in test.obs['KD_label_combined'].cat.categories:
        # Cells carrying the current KD label
        label_mask = test.obs['KD_label_combined'] == current_kd_cluster
        cdata = test[label_mask, ]
        sample_labels = cdata.obs['sample'].values

        # Same recipe for each matrix, in sheet column order:
        # whole cell (X), nucleus, cytoplasm, er
        per_compartment = []
        for matrix in (cdata.X,
                       cdata.layers['nucleus'],
                       cdata.layers['cytoplasm'],
                       cdata.layers['er']):
            cell_df = pd.DataFrame(matrix, columns=cdata.var.index, index=cdata.obs.index)
            cell_df['sample'] = sample_labels
            # Mean per sample, transposed so that rows follow cdata.var.index
            per_compartment.append(cell_df.groupby(by='sample').mean().T)

        # Place the four tables side by side and write the sheet
        output_df = pd.concat(per_compartment, axis=1)
        output_df.to_excel(writer, sheet_name=f"{current_kd_cluster}-raw")

### Gene annotation

In [17]:
from pybiomart import Dataset

# Open the 'hsapiens_gene_ensembl' BioMart dataset on the Ensembl host
dataset = Dataset(host='http://www.ensembl.org', name='hsapiens_gene_ensembl')
# Preview the first ten attributes the dataset exposes
attribute_table = dataset.list_attributes()
attribute_table.head(10)

Unnamed: 0,name,display_name,description
0,ensembl_gene_id,Gene stable ID,Stable ID of the Gene
1,ensembl_gene_id_version,Gene stable ID version,Versionned stable ID of the Gene
2,ensembl_transcript_id,Transcript stable ID,Stable ID of the Transcript
3,ensembl_transcript_id_version,Transcript stable ID version,Versionned stable ID of the Transcript
4,ensembl_peptide_id,Protein stable ID,
5,ensembl_peptide_id_version,Protein stable ID version,
6,ensembl_exon_id,Exon stable ID,
7,description,Gene description,
8,chromosome_name,Chromosome/scaffold name,Chromosome/scaffold name
9,start_position,Gene start (bp),Start Coordinate of the gene in chromosomal co...


In [18]:
# Query BioMart for "hgnc_symbol" and "description" ("hsapiens"), indexed by symbol
annot_fields = ["hgnc_symbol", "description"]
annot = sc.queries.biomart_annotations("hsapiens", annot_fields)
annot = annot.set_index("hgnc_symbol")
# Attaching the annotation to adata.var is left disabled:
# adata.var[annot.columns] = annot
annot

Unnamed: 0_level_0,description
hgnc_symbol,Unnamed: 1_level_1
MT-TF,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...
MT-RNR1,mitochondrially encoded 12S rRNA [Source:HGNC ...
MT-TV,mitochondrially encoded tRNA-Val (GUN) [Source...
MT-RNR2,mitochondrially encoded 16S rRNA [Source:HGNC ...
MT-TL1,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...
...,...
OSCP1,organic solute carrier partner 1 [Source:HGNC ...
SNORA63C,"small nucleolar RNA, H/ACA box 63C [Source:HGN..."
MRPS15,mitochondrial ribosomal protein S15 [Source:HG...
CSF3R,colony stimulating factor 3 receptor [Source:H...
