# Make Tables of DEseq2 Data for DEGs that are unique to each celltype.

### 1. Import Required Packages
### 2. Import Deseq2 Data
### 3. Split DESeq2 Data By Cell Type
### 4. Merge DEG gene list of interest with DESeq2 data
### 5. Export All Data


## <br> 1. Import Required Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## <br> 2. Import Deseq2 Data

In [2]:
Deseq2_Master = pd.read_csv('./Results/Deseq2_Master_Wald-EC_CELLS_ONLY.txt', 
                            delimiter = '\t',
                            index_col=0)

Deseq2_Master['Time'] = Deseq2_Master['Time'].astype('category')
Deseq2_Master['Time'] = Deseq2_Master['Time'].cat.reorder_categories([2,4,8,12,18,24,72])

In [3]:
Deseq2_Master

Unnamed: 0,Gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Celltype,Time,Fold-Change
0,Gm42418,4648.083478,0.364905,0.483332,0.754978,0.450262,0.999125,LyECs,2,1.287797
1,Malat1,2629.622767,-0.212570,0.354544,-0.599557,0.548801,0.999125,LyECs,2,0.862999
2,Cmss1,774.181280,0.902527,0.577271,1.563438,0.117950,0.999125,LyECs,2,1.869337
3,Neat1,598.588051,0.155406,0.404867,0.383844,0.701094,0.999125,LyECs,2,1.113735
4,Dpyd,533.282076,-0.335522,0.366885,-0.914513,0.360447,0.999125,LyECs,2,0.792498
...,...,...,...,...,...,...,...,...,...,...
421647,Ubxn2a,1.071625,0.379140,1.356764,0.279444,0.779904,0.997212,Unknown,72,1.300566
421648,Utp11,0.892990,-3.091406,1.708405,-1.809527,0.070369,0.997212,Unknown,72,0.117326
421649,Vdac1,0.842426,1.551787,1.599392,0.970236,0.331929,0.997212,Unknown,72,2.931801
421650,Wdfy1,0.964786,0.870753,1.503533,0.579138,0.562496,0.997212,Unknown,72,1.828617


## Important previous exported DEG lists

In [5]:
import pandas as pd
import os

cell_types = ['Pericentral_LSECs', 'Midzonal_LSECs', 'Periportal_LSECs', 'PECs', 'LyECs', 'Periportal_VECs', 'Unknown', 'Pericentral_VECs']

cell_type_data = {}

for cell_type in cell_types:
    filename = f"./Results/04d_Unique_to_Celltypes_DEGs/Unique_to_{cell_type}.txt"
    
    # Check if the file is not empty before reading
    if os.path.getsize(filename) > 0:
        df = pd.read_csv(filename, sep='\t', header=None)
        cell_type_data[cell_type] = df
    else:
        print(f"Warning: The file {filename} is empty.")

FileNotFoundError: [Errno 2] No such file or directory: './Results/04d_Unique_to_Celltypes_DEGs/Unique_to_Pericentral_LSECs.txt'

In [6]:
B_Cell_data = cell_type_data.get('B_Cell')
Cholangiocyte_data = cell_type_data.get('Cholangiocyte')
Hepatocyte_data = cell_type_data.get('Hepatocyte')
HSC_data = cell_type_data.get('HSC')
LSEC_data = cell_type_data.get('EC')
Macrophage_data = cell_type_data.get('Macrophage')
PF_data = cell_type_data.get('PF')
T_Cell_data = cell_type_data.get('T_Cell')

In [7]:
B_Cell_data.rename(columns={0: 'Gene'}, inplace=True)
Cholangiocyte_data.rename(columns={0: 'Gene'}, inplace=True)
Hepatocyte_data.rename(columns={0: 'Gene'}, inplace=True)
HSC_data.rename(columns={0: 'Gene'}, inplace=True)
LSEC_data.rename(columns={0: 'Gene'}, inplace=True)
Macrophage_data.rename(columns={0: 'Gene'}, inplace=True)
PF_data.rename(columns={0: 'Gene'}, inplace=True)
T_Cell_data.rename(columns={0: 'Gene'}, inplace=True)

In [8]:
B_Cell_data

Unnamed: 0,Gene
0,Shroom2
1,St3gal5
2,Esr1
3,Asl
4,Scp2


## <br> 3. Split DESeq2 Data By Cell Type

In [9]:
# Step 1: Extract the necessary columns
reconfigured_dfs = {}

# Group the data by 'Celltype'
grouped = Deseq2_Master.groupby('Celltype')

# Iterate through each 'Celltype' group
for celltype, group_data in grouped:
    # Initialize a new DataFrame to store the reconfigured data
    new_df = pd.DataFrame(columns=['Gene'])
    
    # Group the data by 'Time'
    time_grouped = group_data.groupby('Time')
    
    # Iterate through each 'Time' group
    for time_point, time_group in time_grouped:
        # Extract 'Fold-Change' and 'padj' columns and store in the new DataFrame
        time_group_df = time_group[['Gene', 'Fold-Change', 'padj']].copy()
        
        # Rename 'Fold-Change' and 'padj' columns based on the 'Time' point
        time_group_df.rename(
            columns={
                'Fold-Change': f'{celltype}_deseq2_FoldChange_{time_point}',
                'padj': f'{celltype}_deseq2_padj_{time_point}'
            },
            inplace=True
        )
        
        # Merge the 'Fold-Change' and 'padj' columns with the main DataFrame
        new_df = pd.merge(new_df, time_group_df, on='Gene', how='outer')
    
    # Store the DataFrame in the dictionary with the key as the Celltype
    reconfigured_dfs[celltype] = new_df

# Step 2: Access the DataFrames for each cell type with specific names
deseq2_B_Cell_data = reconfigured_dfs.get('B_Cell')
deseq2_Cholangiocyte_data = reconfigured_dfs.get('Cholangiocyte')
deseq2_Hepatocyte_data = reconfigured_dfs.get('Hepatocyte')
deseq2_HSC_data = reconfigured_dfs.get('HSC')
deseq2_LSEC_data = reconfigured_dfs.get('EC')
deseq2_Macrophage_data = reconfigured_dfs.get('Macrophage')
deseq2_PF_data = reconfigured_dfs.get('PF')
deseq2_T_Cell_data = reconfigured_dfs.get('T_Cell')


In [10]:
deseq2_B_Cell_data

Unnamed: 0,Gene,B_Cell_deseq2_FoldChange_2,B_Cell_deseq2_padj_2,B_Cell_deseq2_FoldChange_4,B_Cell_deseq2_padj_4,B_Cell_deseq2_FoldChange_8,B_Cell_deseq2_padj_8,B_Cell_deseq2_FoldChange_12,B_Cell_deseq2_padj_12,B_Cell_deseq2_FoldChange_18,B_Cell_deseq2_padj_18,B_Cell_deseq2_FoldChange_24,B_Cell_deseq2_padj_24,B_Cell_deseq2_FoldChange_72,B_Cell_deseq2_padj_72
0,Gm42418,1.064661,0.998335,2.532164,0.208561,0.629278,0.994735,0.574046,0.508614,1.030092,0.995419,0.814515,0.998802,0.804558,0.999896
1,Malat1,0.931875,0.998335,0.898307,0.985893,1.057680,0.994735,1.062653,0.922744,1.317282,0.335904,1.140146,0.998802,1.164763,0.999896
2,Cmss1,1.239071,0.998335,1.757142,0.842251,0.764794,0.994735,0.571906,0.493395,1.013964,0.997034,0.984522,0.998802,0.935644,0.999896
3,Foxp1,0.971811,0.998335,1.001156,0.998620,0.877335,0.994735,1.086641,0.897345,0.731725,0.316888,0.967729,0.998802,1.027890,0.999896
4,Camk1d,1.089294,0.998335,1.526074,0.780960,0.829152,0.994735,0.633920,0.402068,0.929654,0.964016,1.025166,0.998802,0.944144,0.999896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9123,Tubb4b,0.175632,,1.376400,,2.858580,,0.456754,,0.931297,,0.728940,,3.532064,
9124,Ubl5,0.273010,,2.519195,,1.768847,,4.597153,,0.416581,,6.010819,,4.010734,0.999896
9125,Zfp512b,1.845320,,1.405810,,1.086337,,0.371952,,1.004666,,3.293320,,1.151767,
9126,Zfp566,0.741167,,1.034031,,0.112050,,1.201611,,0.525194,,1.157933,,2.635755,


## <br> 4. Merge DEG gene list of interest with DESeq2 data

In [11]:
# Merge each cell type DataFrame with its respective deseq2 DataFrame using a left join
merged_B_Cell_data_with_deseq2 = pd.merge(B_Cell_data, deseq2_B_Cell_data, on='Gene', how='left')
merged_Cholangiocyte_data_with_deseq2 = pd.merge(Cholangiocyte_data, deseq2_Cholangiocyte_data, on='Gene', how='left')
merged_Hepatocyte_data_with_deseq2 = pd.merge(Hepatocyte_data, deseq2_Hepatocyte_data, on='Gene', how='left')
merged_HSC_data_with_deseq2 = pd.merge(HSC_data, deseq2_HSC_data, on='Gene', how='left')
merged_LSEC_data_with_deseq2 = pd.merge(LSEC_data, deseq2_LSEC_data, on='Gene', how='left')
merged_Macrophage_data_with_deseq2 = pd.merge(Macrophage_data, deseq2_Macrophage_data, on='Gene', how='left')
merged_PF_data_with_deseq2 = pd.merge(PF_data, deseq2_PF_data, on='Gene', how='left')
merged_T_Cell_data_with_deseq2 = pd.merge(T_Cell_data, deseq2_T_Cell_data, on='Gene', how='left')

In [12]:
merged_B_Cell_data_with_deseq2

Unnamed: 0,Gene,B_Cell_deseq2_FoldChange_2,B_Cell_deseq2_padj_2,B_Cell_deseq2_FoldChange_4,B_Cell_deseq2_padj_4,B_Cell_deseq2_FoldChange_8,B_Cell_deseq2_padj_8,B_Cell_deseq2_FoldChange_12,B_Cell_deseq2_padj_12,B_Cell_deseq2_FoldChange_18,B_Cell_deseq2_padj_18,B_Cell_deseq2_FoldChange_24,B_Cell_deseq2_padj_24,B_Cell_deseq2_FoldChange_72,B_Cell_deseq2_padj_72
0,Shroom2,2.043325,0.991955,0.906657,0.989522,1.885411,0.994735,0.717282,0.837339,2.641892,0.046462,0.837564,0.998802,1.486853,0.999896
1,St3gal5,2.127931,0.739812,0.774852,0.985893,0.519396,0.866287,0.353633,0.034003,1.160932,0.929591,0.605787,0.998802,0.718509,0.999896
2,Esr1,1.127967,0.998335,0.889386,0.989522,1.007876,0.996657,0.335872,0.029042,0.933801,0.978463,1.038343,0.998802,0.601205,0.999896
3,Asl,1.515292,0.998335,0.899849,0.989522,0.583252,0.939158,0.760674,0.732111,1.156236,0.91703,0.30201,0.004601,0.585294,0.999896
4,Scp2,0.464372,0.896513,0.26,0.035589,0.730597,0.994735,1.0792,0.979888,2.184647,0.495291,0.639807,0.998802,0.778241,0.999896


### Clean up DFs

In [13]:
merged_B_Cell_data_with_deseq2.columns = merged_B_Cell_data_with_deseq2.columns.str.replace(f"B_Cell_deseq2_", "")
merged_Cholangiocyte_data_with_deseq2.columns = merged_Cholangiocyte_data_with_deseq2.columns.str.replace(f"Cholangiocyte_deseq2_", "")
merged_Hepatocyte_data_with_deseq2.columns = merged_Hepatocyte_data_with_deseq2.columns.str.replace(f"Hepatocyte_deseq2_", "")
merged_HSC_data_with_deseq2.columns = merged_HSC_data_with_deseq2.columns.str.replace(f"HSC_deseq2_", "")
merged_LSEC_data_with_deseq2.columns = merged_LSEC_data_with_deseq2.columns.str.replace(f"EC_deseq2_", "")
merged_Macrophage_data_with_deseq2.columns = merged_Macrophage_data_with_deseq2.columns.str.replace(f"Macrophage_deseq2_", "")
merged_PF_data_with_deseq2.columns = merged_PF_data_with_deseq2.columns.str.replace(f"PF_deseq2_", "")
merged_T_Cell_data_with_deseq2.columns = merged_T_Cell_data_with_deseq2.columns.str.replace(f"T_Cell_deseq2_", "")


In [16]:
merged_LSEC_data_with_deseq2

Unnamed: 0,Gene,FoldChange_2,padj_2,FoldChange_4,padj_4,FoldChange_8,padj_8,FoldChange_12,padj_12,FoldChange_18,padj_18,FoldChange_24,padj_24,FoldChange_72,padj_72
0,Fgf23,121.097403,,177.080397,,14.119877,,77.122690,0.000785,30.454523,0.061768,11.828587,0.530278,24.543592,0.044642
1,Abcb1b,0.785098,,2.935619,,0.576770,,0.892551,,2.293360,0.872570,3.265903,,40.340433,0.008695
2,Rassf9,3.501963,,1.771129,,1.449123,0.967736,5.279393,0.269241,2.339365,0.870042,13.973945,0.022940,6.925543,0.181992
3,Fam129b,12.105126,2.435725e-25,5.322089,1.479891e-10,2.008897,0.160873,2.943079,0.000393,2.689726,0.005931,2.185157,0.100142,1.652948,0.327928
4,Chst11,6.933244,8.412258e-06,10.275086,8.247047e-08,1.293754,0.967736,2.235320,0.458252,1.079963,0.987669,2.108772,0.802404,1.958123,0.572694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,Zfp503,0.242021,1.236383e-04,0.832882,8.972905e-01,0.707503,0.923011,0.744579,0.786458,0.628654,0.774285,0.721929,0.924057,0.993856,0.998273
222,Dnm3os,0.195227,4.443334e-04,0.423286,2.194749e-01,0.738624,0.963298,1.274772,0.879043,0.781085,0.951523,0.651931,0.924057,1.123501,0.948556
223,Pdgfrl,0.967354,,0.152600,,0.336768,0.804198,0.145999,0.045350,0.859454,0.987425,0.187670,0.805968,0.371470,0.694027
224,Xylt1,1.111076,9.890288e-01,2.917518,2.493785e-01,0.619122,0.949187,0.121664,0.000899,0.457057,0.793676,1.178709,0.987683,1.549009,0.809413


## 5. Export All Data

In [17]:
# List of all the merged DataFrames with their respective cell types and deseq2 DataFrames
merged_dfs = [
    merged_B_Cell_data_with_deseq2,
    merged_Cholangiocyte_data_with_deseq2,
    merged_Hepatocyte_data_with_deseq2,
    merged_HSC_data_with_deseq2,
    merged_LSEC_data_with_deseq2,
    merged_Macrophage_data_with_deseq2,
    merged_PF_data_with_deseq2,
    merged_T_Cell_data_with_deseq2
]

# List of corresponding cell types
cell_types = ['B_Cell', 'Cholangiocyte', 'Hepatocyte', 'HSC', 'EC', 'Macrophage', 'PF', 'T_Cell']

# Create the directory if it doesn't exist
output_directory = "./Results/04e_Unique_to_Celltypes_DEGs_with_Deseq2_Gene_Expression/"
os.makedirs(output_directory, exist_ok=True)

# Export each DataFrame to a separate file with the specified name
for idx, merged_df in enumerate(merged_dfs):
    celltype = cell_types[idx]
    
    # Fill empty cells with 'NA'
    merged_df.fillna('NA', inplace=True)
    
    # Remove "{celltype}_deseq2_" from the column names
    merged_df.columns = merged_df.columns.str.replace(f"{celltype}_deseq2_", "")
    
    # Set "Gene" as the index
    merged_df.set_index("Gene", inplace=True)
    
    # Export the DataFrame to a file
    file_name = f"{celltype}_Unique_DEGs_DEseq2_Data.txt"
    file_path = os.path.join(output_directory, file_name)
    merged_df.to_csv(file_path, sep='\t', index=True, header=True)