# Determine Abundance Percentiles of each Gene for each Cell Type 

## <br> 1. Import Required Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## <br> 2. Import Pseudobulk Count Data

In [2]:
# Load the pseudobulk count matrix: a tab-separated table whose first row holds
# the column labels and whose first column is used as the row index.
pseudobulk_path = './Results/03a_Pseudobulk_Sums/pseudobulk_sums.txt'
Pseudobulk_MASTER_Counts = pd.read_csv(pseudobulk_path, sep='\t', header=0, index_col=0)

In [3]:
# Preview the loaded count matrix (pandas abbreviates the middle rows and columns with "...")
Pseudobulk_MASTER_Counts

Unnamed: 0,L28_B Cells,L28_Centrilobular Hepatocytes,L28_Cholangiocytes,L28_HSCs,L28_LSECs,L28_Macrophages,L28_Neutrophils,L28_PFs,L28_Periportal Hepatocytes,L28_T Cells,...,L54_Centrilobular Hepatocytes,L54_Cholangiocytes,L54_HSCs,L54_LSECs,L54_Macrophages,L54_Neutrophils,L54_PFs,L54_Periportal Hepatocytes,L54_T Cells,L54_pDCs
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,15,0,2,2,6,0,0,45,0,...,39,1,4,7,24,0,0,27,5,0
0610010F05Rik,2,219,5,56,64,52,3,2,586,3,...,834,40,48,238,372,12,17,441,62,13
0610010K14Rik,0,12,0,1,7,6,0,0,45,0,...,23,0,0,12,21,1,0,21,3,1
0610012D04Rik,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,2,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,0,11,0,0,5,5,1,1,28,0,...,14,1,1,10,16,2,1,6,4,0
mt-Nd4,1,30,1,7,14,17,1,1,79,2,...,66,7,10,30,88,9,0,42,17,2
mt-Nd4l,0,3,0,1,1,0,0,0,8,0,...,5,0,1,5,9,1,2,3,4,1
mt-Nd5,0,18,0,3,5,7,0,2,52,1,...,37,5,3,16,52,2,2,16,10,1


In [4]:
# Spot-check a single gene: the 'Alb' row, i.e. its count in every pseudobulk column
Pseudobulk_MASTER_Counts.loc['Alb']

L28_B Cells                         40
L28_Centrilobular Hepatocytes     4140
L28_Cholangiocytes                  57
L28_HSCs                           508
L28_LSECs                          813
                                 ...  
L54_Neutrophils                    426
L54_PFs                            129
L54_Periportal Hepatocytes       11123
L54_T Cells                        857
L54_pDCs                           210
Name: Alb, Length: 242, dtype: int64

## <br> 3. Subset Data by Cell Type

In [5]:
# Subset pseudobulk counts by cell type.
# Columns are named "<sample>_<cell type>" (e.g. "L28_B Cells"), so each pattern is
# anchored to the "_<cell type>" suffix instead of being matched as a loose
# substring. That way a cell-type label that happens to contain another label
# cannot leak into the wrong subset. Column order within each subset follows the
# master table.
B_Cell_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_B Cells$")
Cholangiocyte_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_Cholangiocytes$")
Centrilobular_Hepatocyte_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_Centrilobular Hepatocytes$")
Periportal_Hepatocyte_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_Periportal Hepatocytes$")
HSC_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_HSCs$")
LSEC_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_LSECs$")
Macrophage_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_Macrophages$")
Neutrophil_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_Neutrophils$")
pDC_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_pDCs$")
PF_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_PFs$")
T_Cell_Counts = Pseudobulk_MASTER_Counts.filter(regex=r"_T Cells$")

In [6]:
# Preview one subset to confirm that only "T Cells" columns were selected
T_Cell_Counts

Unnamed: 0,L28_T Cells,L29_T Cells,L30_T Cells,L32_T Cells,L33_T Cells,L35_T Cells,L36_T Cells,L37_T Cells,L38_T Cells,L39_T Cells,...,L44_T Cells,L45_T Cells,L46_T Cells,L48_T Cells,L49_T Cells,L50_T Cells,L51_T Cells,L52_T Cells,L53_T Cells,L54_T Cells
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,1,0,1,1,3,1,0,0,1,...,0,0,1,1,1,2,1,1,6,5
0610010F05Rik,3,6,4,5,2,13,9,10,9,13,...,4,10,11,13,44,20,11,26,119,62
0610010K14Rik,0,1,1,0,0,1,0,0,0,0,...,0,0,1,2,1,0,2,2,7,3
0610012D04Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,0,0,0,2,0,0,0,1,2,0,...,0,0,0,0,2,0,0,1,4,4
mt-Nd4,2,1,0,2,0,3,1,8,9,8,...,2,0,4,9,14,4,0,4,25,17
mt-Nd4l,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,0,1,0,4,4
mt-Nd5,1,1,1,4,0,0,5,1,2,1,...,0,1,1,3,6,5,1,2,15,10


In [7]:
# Create the output directory if it doesn't exist
import os

directory = 'Results/03c_Gene_Counts_and_Percentiles'

# The presence check is used only to choose the status message. Creation itself
# goes through exist_ok=True, so there is no gap between "check" and "create",
# and a regular file sitting at this path raises FileExistsError right here
# rather than surfacing later as a confusing to_csv() failure.
directory_was_present = os.path.isdir(directory)
os.makedirs(directory, exist_ok=True)

if directory_was_present:
    print(f"Directory '{directory}' already exists.")
else:
    print(f"Directory '{directory}' created successfully.")

# Output path prefix (with trailing slash) that is prepended to output file names;
# derived from `directory` so the two values cannot drift apart.
output_dir = directory + '/'

Directory 'Results/03c_Gene_Counts_and_Percentiles' already exists.


In [8]:
# Single source of truth pairing each cell-type label with its count subset.
# Keeping label and dataframe side by side means they cannot be mis-paired by
# editing one of two separately maintained lists, and nothing can be silently
# dropped by zip() when the lists differ in length.
counts_by_cell_type = {
    "B Cells": B_Cell_Counts,
    "Cholangiocytes": Cholangiocyte_Counts,
    "Centrilobular Hepatocytes": Centrilobular_Hepatocyte_Counts,
    "Periportal Hepatocytes": Periportal_Hepatocyte_Counts,
    "HSCs": HSC_Counts,
    "LSECs": LSEC_Counts,
    "Macrophages": Macrophage_Counts,
    "Neutrophils": Neutrophil_Counts,
    "pDCs": pDC_Counts,
    "PFs": PF_Counts,
    "T Cells": T_Cell_Counts,
}

# List of dataframes and list of cell names, both in the mapping's insertion order
dataframes = list(counts_by_cell_type.values())
cell_names = list(counts_by_cell_type.keys())

# Create a dictionary to store the maximum values of each row for each cell type
max_values_dict = {}

for cell_name, cell_df in counts_by_cell_type.items():
    # Row-wise maximum (axis=1): the largest count of each row across this cell type's columns
    max_values_dict[cell_name] = cell_df.max(axis=1)

# One column per cell type; rows keep the index of the count subsets
max_values_df = pd.DataFrame(max_values_dict)

max_values_df.to_csv(output_dir + "Max_Count_Per_Celltype.txt", index=True, sep='\t')

# Display the frame as the cell's last expression (rendered as a table) rather than
# print(), which wraps the wide frame into several text chunks.
max_values_df

               B Cells  Cholangiocytes  Centrilobular Hepatocytes  \
0610006L08Rik        0               0                          1   
0610009B22Rik        5              14                         64   
0610010F05Rik       60             199                       1099   
0610010K14Rik        6               9                         46   
0610012D04Rik        0               1                          5   
...                ...             ...                        ...   
mt-Nd3               3               2                         20   
mt-Nd4              38              34                         66   
mt-Nd4l              3               2                          9   
mt-Nd5              18               9                         37   
mt-Nd6              14               8                         26   

               Periportal Hepatocytes  HSCs  LSECs  Macrophages  Neutrophils  \
0610006L08Rik                       3     0      2            1            0   
0610009B22R

In [9]:
# Rank every row within each column (one column per cell type) as a fraction,
# using rank()'s default handling of ties, then scale the fractions to percent.
gene_rank_fraction = max_values_df.rank(axis=0, pct=True)
percentile_df = gene_rank_fraction.mul(100)

# Persist the percentiles as a tab-separated table, row index included
percentile_df.to_csv(output_dir + "Max_Count_Gene_Percentile_Per_Celltype.txt", sep='\t', index=True)

# Display the resulting DataFrame
percentile_df

Unnamed: 0,B Cells,Cholangiocytes,Centrilobular Hepatocytes,Periportal Hepatocytes,HSCs,LSECs,Macrophages,Neutrophils,pDCs,PFs,T Cells
0610006L08Rik,15.277134,15.511857,15.364777,23.154957,10.634054,38.390788,19.090121,24.627768,22.716741,23.524671,14.393649
0610009B22Rik,68.659964,74.080753,74.915883,73.358451,62.302299,63.016541,69.653255,70.872202,55.047045,56.956057,69.195898
0610010F05Rik,89.269236,94.159128,94.238712,91.947898,90.915318,95.716560,96.643362,88.649689,91.598332,92.025467,92.587593
0610010K14Rik,70.551851,69.985695,72.106260,70.865150,62.302299,71.054540,71.544134,58.409727,73.240586,70.574013,70.705982
0610012D04Rik,15.277134,39.561381,43.594987,42.598674,58.857010,8.123627,42.991558,58.409727,68.063587,23.524671,39.319606
...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,62.394979,51.826406,63.792235,64.921524,68.519936,65.353696,66.877884,70.872202,68.063587,56.956057,64.723067
mt-Nd4,86.046582,81.737957,75.130458,76.445107,78.295690,82.670804,82.213447,89.809200,83.383031,85.009973,81.563677
mt-Nd4l,62.394979,51.826406,53.428163,60.136401,53.689078,66.292588,65.624685,79.074406,55.047045,70.574013,64.723067
mt-Nd5,80.172466,69.985695,70.063264,70.336268,70.986038,78.375275,75.769146,88.649689,78.231217,76.187214,77.494812


In [10]:
# Spot check: look up one row label in both the max-count and percentile tables
gene_number = '8962'
lnc = f'lnc{gene_number}'

max_count = max_values_df.loc[lnc]
percentile = percentile_df.loc[lnc]

# Print both rows, separated by one blank line
print(max_count, percentile, sep='\n\n')

B Cells                        6
Cholangiocytes                 4
Centrilobular Hepatocytes    215
Periportal Hepatocytes       713
HSCs                          13
LSECs                         31
Macrophages                   52
Neutrophils                    1
pDCs                           2
PFs                            1
T Cells                        5
Name: lnc8962, dtype: int64

B Cells                      70.551851
Cholangiocytes               61.390607
Centrilobular Hepatocytes    84.247376
Periportal Hepatocytes       88.514698
HSCs                         73.584107
LSECs                        74.822195
Macrophages                  73.534745
Neutrophils                  58.409727
pDCs                         68.063587
PFs                          56.956057
T Cells                      67.296960
Name: lnc8962, dtype: float64
