# Determine Abundance Percentiles of each Gene for each Cell Type 

### 1. Import Required Packages
### 2. Import Data
### 3. Subset Data by Celltype



## <br> 1. Import Required Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## <br> 2. Import Data (Pseudobulk Counts)

In [4]:
# Load the tab-separated pseudobulk count sums.
# The first row supplies the column labels and the first column becomes the row index.
Pseudobulk_MASTER_Counts = pd.read_csv(
    './Results/pseudobulk_sums.txt',
    sep='\t',
    header=0,
    index_col=0,
)

In [5]:
# Preview the loaded count table: rows are genes, columns are "<sample>_<cell type>" labels
Pseudobulk_MASTER_Counts

Unnamed: 0,L001_B Cells,L001_Cholangiocytes,L001_ECs,L001_HSCs,L001_Hepatocytes,L001_Macrophages,L001_Neutrophils,L001_PFs,L001_T Cells,L001_pDCs,...,L138_B Cells,L138_Cholangiocytes,L138_ECs,L138_HSCs,L138_Hepatocytes,L138_Macrophages,L138_Neutrophils,L138_PFs,L138_T Cells,L138_pDCs
0610005C13Rik,23,1,211,52,12982,54,1,4,17,3,...,40,2,129,26,6830,45,5,1,36,4
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,1,3,3,74,0,0,0,0,0,...,1,0,3,0,93,1,0,0,0,0
0610009E02Rik,0,0,14,5,240,1,0,0,0,0,...,4,0,16,7,325,5,1,0,1,0
0610009L18Rik,0,0,1,1,9,0,0,0,0,1,...,0,0,2,2,8,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,0,0,2,4,57,0,0,0,0,0,...,2,0,9,3,96,5,0,0,2,0
mt-Nd4,0,0,6,2,129,1,0,0,2,0,...,6,3,40,7,210,5,1,0,2,1
mt-Nd4l,0,0,1,0,16,0,0,0,0,0,...,1,0,2,2,47,2,0,0,2,0
mt-Nd5,0,0,1,2,65,1,0,0,1,0,...,2,0,15,3,145,2,0,0,6,0


## <br> 3. Subset Data by Celltype

In [6]:
def _counts_for_celltype(counts, cell_type):
    """Return the columns of ``counts`` that belong to one cell type.

    Column labels are expected to look like ``<sample>_<cell type>`` (for
    example ``L001_T Cells``). The cell type is matched as an exact suffix
    after the underscore rather than as a substring of the whole label, so a
    short label such as "ECs" cannot pick up columns of another cell type
    whose name merely contains it.

    Parameters
    ----------
    counts : pandas.DataFrame
        Count table with one column per sample / cell-type combination.
    cell_type : str
        Cell type label exactly as it appears after the underscore.

    Returns
    -------
    pandas.DataFrame
        The matching columns, in their original order.

    Raises
    ------
    ValueError
        If no column label ends with ``_<cell_type>``.
    """
    is_celltype = counts.columns.str.endswith(f"_{cell_type}")
    # Fail loudly instead of silently carrying an empty table downstream.
    if not is_celltype.any():
        raise ValueError(f"No columns found for cell type {cell_type!r}")
    return counts.loc[:, is_celltype]


# One count table per cell type; each keeps every sample's column for that cell type.
B_Cell_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "B Cells")
Cholangiocyte_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "Cholangiocytes")
Hepatocyte_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "Hepatocytes")
HSC_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "HSCs")
LSEC_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "ECs")
Macrophage_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "Macrophages")
Neutrophil_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "Neutrophils")
pDC_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "pDCs")
PF_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "PFs")
T_Cell_Counts = _counts_for_celltype(Pseudobulk_MASTER_Counts, "T Cells")

In [7]:
# Preview one subset to confirm that only the "T Cells" columns were kept
T_Cell_Counts

Unnamed: 0,L001_T Cells,L002_T Cells,L003_T Cells,L016_T Cells,L018_T Cells,L019_T Cells,L021_T Cells,L022_T Cells,L023_T Cells,L036_T Cells,...,L103_T Cells,L116_T Cells,L117_T Cells,L118_T Cells,L121_T Cells,L122_T Cells,L123_T Cells,L136_T Cells,L137_T Cells,L138_T Cells
0610005C13Rik,17,31,13,30,28,51,19,39,50,21,...,43,95,28,29,25,16,25,37,37,36
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,1,0,0,0,0,0,0,4,1,...,1,4,0,0,0,1,1,1,0,0
0610009E02Rik,0,0,0,0,0,3,0,1,2,2,...,0,2,3,0,1,1,0,0,1,1
0610009L18Rik,0,0,0,0,0,0,1,0,1,1,...,1,1,1,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd3,0,0,3,0,5,2,0,1,4,0,...,10,5,0,8,1,1,3,1,3,2
mt-Nd4,2,1,6,3,11,15,1,10,12,0,...,57,6,3,27,1,8,11,1,11,2
mt-Nd4l,0,0,0,0,2,4,0,0,1,0,...,3,2,1,5,0,0,2,0,1,2
mt-Nd5,1,1,1,0,0,4,0,0,4,0,...,16,5,1,7,1,1,3,1,8,6


In [8]:
# Per-cell-type count tables, listed in the same order as their labels below
dataframes = [
    B_Cell_Counts, Cholangiocyte_Counts, Hepatocyte_Counts, HSC_Counts, LSEC_Counts,
    Macrophage_Counts, Neutrophil_Counts, pDC_Counts, PF_Counts, T_Cell_Counts,
]

# Column label used for each table in the summary
cell_names = [
    "B Cells", "Cholangiocytes", "Hepatocytes", "HSCs", "ECs",
    "Macrophages", "Neutrophils", "pDCs", "PFs", "T Cells",
]

# For every cell type, take each gene's highest count across that cell type's columns
max_values_dict = {
    label: table.max(axis=1)
    for table, label in zip(dataframes, cell_names)
}

# Assemble the per-cell-type maxima into one table (genes x cell types)
max_values_df = pd.DataFrame(max_values_dict)

# Save the summary as a tab-separated file, keeping the gene index
max_values_df.to_csv("./Results/Max_Count_Per_Celltype.txt", sep='\t', index=True)

# Show the table of per-gene maxima, one column per cell type
print(max_values_df)

               B Cells  Cholangiocytes  Hepatocytes  HSCs  ECs  Macrophages   
0610005C13Rik       65              18        20978   144  408          196  \
0610006L08Rik        0               0            4     1    0            1   
0610009B22Rik        3               1          388     6   10            4   
0610009E02Rik        4               5          738    37   52           15   
0610009L18Rik        3               2           46     6   11            3   
...                ...             ...          ...   ...  ...          ...   
mt-Nd3               5               2          325    12   43           12   
mt-Nd4              35               4         1712    51  212           72   
mt-Nd4l              8               1          232     7   37           15   
mt-Nd5              16               3          568    18   66           21   
mt-Nd6               4               2          181    10   14            8   

               Neutrophils  pDCs  PFs  T Cells  
06

In [9]:
# Rank every gene within each cell-type column and express the rank as a percentage;
# genes with equal maximum counts in a column receive the same percentile.
percentile_df = max_values_df.rank(axis=0, pct=True).mul(100)

# Save the percentiles as a tab-separated file, keeping the gene index
percentile_df.to_csv(
    "./Results/Max_Count_Gene_Percentile_Per_Celltype.txt",
    sep='\t',
    index=True,
)

# Show the percentile table
print(percentile_df)

                 B Cells  Cholangiocytes  Hepatocytes       HSCs        ECs   
0610005C13Rik  91.445215       91.824846    97.477855  84.814766  90.873587  \
0610006L08Rik   9.362046       13.354715     9.809312  15.981586   3.525767   
0610009B22Rik  46.190601       35.584937    56.303181  40.932496  42.136842   
0610009E02Rik  50.693808       71.141511    63.553257  65.835406  63.197626   
0610009L18Rik  46.190601       49.845093    37.258367  40.932496  43.256098   
...                  ...             ...          ...        ...        ...   
mt-Nd3         54.021032       49.845093    54.653750  49.546188  60.568574   
mt-Nd4         84.642405       65.933586    75.083999  70.637518  83.636602   
mt-Nd4l        61.578741       35.584937    51.400707  42.867740  58.284243   
mt-Nd5         73.074573       59.091504    60.455121  55.186106  66.965571   
mt-Nd6         50.693808       49.845093    49.109831  47.281494  46.024785   

               Macrophages  Neutrophils       pDCs 

In [10]:
# Spot check: look up one gene in both tables to compare its raw maxima with its percentiles
spot_check_gene = "Xlr4a"
max_count = max_values_df.loc[spot_check_gene]
percentile = percentile_df.loc[spot_check_gene]

# Print the two rows separated by a blank line
print(max_count, percentile, sep="\n\n")

B Cells             3
Cholangiocytes      1
Hepatocytes       258
HSCs                3
ECs                 4
Macrophages         4
Neutrophils         0
pDCs                2
PFs                 0
T Cells             7
Name: Xlr4a, dtype: int64

B Cells           46.190601
Cholangiocytes    35.584937
Hepatocytes       52.454510
HSCs              31.629358
ECs               30.929005
Macrophages       40.840860
Neutrophils       21.071694
pDCs              62.477637
PFs               12.281276
T Cells           58.971506
Name: Xlr4a, dtype: float64
