In [None]:
# Import the libraries configured below so this cell does not depend on
# imports that are only executed in later cells.
import matplotlib.pyplot as plt
import pandas as pd

# Default figure resolution and size for every matplotlib figure
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = (4, 4)

# Limit how much of a DataFrame pandas renders in cell output
pd.set_option('display.max_columns', 20)  # Maximum number of columns to display
pd.set_option('display.max_colwidth', 100)  # Maximum column width for text data
pd.set_option('display.max_rows', 100)  # Maximum number of rows to display

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Gene set whose per-sample R2 values are shown in the heatmap
specific_gene_set = "Cellular Response To Interleukin-1 (GO:0071347)"

# Input locations and the cell types to include (one heatmap column each)
cell_types = ["Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal", "Immune", "PEC", "PT", "IC", "DTL_ATL"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_unfiltered/"
pval_folder = "Padj_GO_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# P-values above this threshold are treated as non-significant
p_value_threshold = 0.05

# Read metadata and keep every sample whose Disease_level2 is not "Control"
metadat = pd.read_csv(metadat_path)
unique_samples = metadat[metadat['Disease_level2'] != "Control"]['Sample'].unique()

# One row per selected sample; one column per cell type is joined on below
combined_r2 = pd.DataFrame(index=unique_samples)
combined_pval = pd.DataFrame(index=unique_samples)

# Cell types whose R2 and p-value files were both found and loaded
loaded_cell_types = []

for cell_type in cell_types:
    r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
    pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

    # Report missing inputs instead of skipping them silently; otherwise the
    # column selection further down fails with an unrelated-looking KeyError.
    if not (os.path.exists(r2_path) and os.path.exists(pval_path)):
        print(f"Skipping {cell_type}: R2 or p-value file not found")
        continue

    # Keep only the column of the chosen gene set, prefixed with the cell type
    column_name = f"{cell_type}_{specific_gene_set}"
    r2_df = pd.read_csv(r2_path, index_col=0)[[specific_gene_set]]
    pval_df = pd.read_csv(pval_path, index_col=0)[[specific_gene_set]]
    r2_df.columns = [column_name]
    pval_df.columns = [column_name]

    combined_r2 = combined_r2.join(r2_df, how='left')
    combined_pval = combined_pval.join(pval_df, how='left')
    loaded_cell_types.append(cell_type)

# Samples without a value get 1 for both R2 and p-value
# (1 is the upper end of the colour scale used below)
combined_r2 = combined_r2.fillna(1)
combined_pval = combined_pval.fillna(1)

# Non-significant R2 values are replaced by 1 as well
combined_r2 = combined_r2.mask(combined_pval > p_value_threshold, 1)

# Select the gene-set column of every loaded cell type and label it by cell type
plotting_matrix = combined_r2[[f"{ct}_{specific_gene_set}" for ct in loaded_cell_types]].copy()
plotting_matrix.columns = loaded_cell_types

# Clustered heatmap (rows and columns are clustered). sns.clustermap creates
# its own figure, so no plt.figure() call is needed beforehand. The zero-size
# cbar_pos leaves no visible colour bar.
sns.clustermap(plotting_matrix, cmap="rocket", figsize=(8, 8),
               linewidths=.5, linecolor='black', row_cluster=True, col_cluster=True,
               cbar_pos=(0.00, 0.00, 0.00, 0.00), vmin=0, vmax=1)

plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Gene set whose per-sample R2 values are shown in the heatmap
specific_gene_set = "hsa04150 mTOR signaling pathway - Homo sapiens (human)"

# Input locations and the cell types to include (one heatmap column each)
cell_types = ["Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal", "Immune", "PEC", "PT", "IC", "DTL_ATL"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# P-values above this threshold are treated as non-significant
p_value_threshold = 0.05

# Read metadata and keep every sample whose Disease_level2 is not "CKD".
# NOTE(review): the sibling heatmap cells exclude "Control" here instead; this
# filter keeps controls and drops CKD samples - confirm that this is intended.
metadat = pd.read_csv(metadat_path)
unique_samples = metadat[metadat['Disease_level2'] != "CKD"]['Sample'].unique()

# One row per selected sample; one column per cell type is joined on below
combined_r2 = pd.DataFrame(index=unique_samples)
combined_pval = pd.DataFrame(index=unique_samples)

# Cell types whose R2 and p-value files were both found and loaded
loaded_cell_types = []

for cell_type in cell_types:
    r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
    pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

    # Report missing inputs instead of skipping them silently; otherwise the
    # column selection further down fails with an unrelated-looking KeyError.
    if not (os.path.exists(r2_path) and os.path.exists(pval_path)):
        print(f"Skipping {cell_type}: R2 or p-value file not found")
        continue

    # Keep only the column of the chosen gene set, prefixed with the cell type
    column_name = f"{cell_type}_{specific_gene_set}"
    r2_df = pd.read_csv(r2_path, index_col=0)[[specific_gene_set]]
    pval_df = pd.read_csv(pval_path, index_col=0)[[specific_gene_set]]
    r2_df.columns = [column_name]
    pval_df.columns = [column_name]

    combined_r2 = combined_r2.join(r2_df, how='left')
    combined_pval = combined_pval.join(pval_df, how='left')
    loaded_cell_types.append(cell_type)

# Samples without a value get 1 for both R2 and p-value
# (1 is the upper end of the colour scale used below)
combined_r2 = combined_r2.fillna(1)
combined_pval = combined_pval.fillna(1)

# Non-significant R2 values are replaced by 1 as well
combined_r2 = combined_r2.mask(combined_pval > p_value_threshold, 1)

# Select the gene-set column of every loaded cell type and label it by cell type
plotting_matrix = combined_r2[[f"{ct}_{specific_gene_set}" for ct in loaded_cell_types]].copy()
plotting_matrix.columns = loaded_cell_types

# Heatmap with clustered rows and fixed column order. sns.clustermap creates
# its own figure, so no plt.figure() call is needed beforehand. The zero-size
# cbar_pos leaves no visible colour bar.
sns.clustermap(plotting_matrix, cmap="rocket", figsize=(8, 8),
               linewidths=.5, linecolor='black', row_cluster=True, col_cluster=False,
               cbar_pos=(0.00, 0.00, 0.00, 0.00), vmin=0, vmax=1)

plt.show()

In [None]:
# Drop samples (rows) whose value is 1 in every cell type. 1 is the value the
# preceding heatmap cell assigns to missing and non-significant entries, so
# such rows carry no significant R2 in any column.
has_value_below_one = plotting_matrix.ne(1).any(axis=1)
plotting_matrix_filtered = plotting_matrix[has_value_below_one]

# Heatmap with clustered rows and fixed column order. sns.clustermap creates
# its own figure, so no plt.figure() call is needed beforehand.
sns.clustermap(plotting_matrix_filtered, cmap="rocket", figsize=(8, 8),
               linewidths=.5, linecolor='black', row_cluster=True, col_cluster=False,
               cbar_pos=(0.00, 0.00, 0.00, 0.00), vmin=0, vmax=1)

plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Gene set whose per-sample R2 values are shown in the heatmap
specific_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

# Input locations and the cell types to include (one heatmap column each)
cell_types = ["Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal", "Immune", "PEC", "PT", "IC", "DTL_ATL"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# P-values above this threshold are treated as non-significant
p_value_threshold = 0.05

# Read metadata and keep every sample whose Disease_level2 is not "Control"
metadat = pd.read_csv(metadat_path)
unique_samples = metadat[metadat['Disease_level2'] != "Control"]['Sample'].unique()

# One row per selected sample; one column per cell type is joined on below
combined_r2 = pd.DataFrame(index=unique_samples)
combined_pval = pd.DataFrame(index=unique_samples)

# Cell types whose R2 and p-value files were both found and loaded
loaded_cell_types = []

for cell_type in cell_types:
    r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
    pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

    # Report missing inputs instead of skipping them silently; otherwise the
    # column selection further down fails with an unrelated-looking KeyError.
    if not (os.path.exists(r2_path) and os.path.exists(pval_path)):
        print(f"Skipping {cell_type}: R2 or p-value file not found")
        continue

    # Keep only the column of the chosen gene set, prefixed with the cell type
    column_name = f"{cell_type}_{specific_gene_set}"
    r2_df = pd.read_csv(r2_path, index_col=0)[[specific_gene_set]]
    pval_df = pd.read_csv(pval_path, index_col=0)[[specific_gene_set]]
    r2_df.columns = [column_name]
    pval_df.columns = [column_name]

    combined_r2 = combined_r2.join(r2_df, how='left')
    combined_pval = combined_pval.join(pval_df, how='left')
    loaded_cell_types.append(cell_type)

# Samples without a value get 1 for both R2 and p-value
# (1 is the upper end of the colour scale used below)
combined_r2 = combined_r2.fillna(1)
combined_pval = combined_pval.fillna(1)

# Non-significant R2 values are replaced by 1 as well
combined_r2 = combined_r2.mask(combined_pval > p_value_threshold, 1)

# Select the gene-set column of every loaded cell type and label it by cell type
plotting_matrix = combined_r2[[f"{ct}_{specific_gene_set}" for ct in loaded_cell_types]].copy()
plotting_matrix.columns = loaded_cell_types

# Heatmap with clustered rows and fixed column order. sns.clustermap creates
# its own figure, so no plt.figure() call is needed beforehand. The zero-size
# cbar_pos leaves no visible colour bar.
sns.clustermap(plotting_matrix, cmap="rocket", figsize=(8, 8),
               linewidths=.5, linecolor='black', row_cluster=True, col_cluster=False,
               cbar_pos=(0.00, 0.00, 0.00, 0.00), vmin=0, vmax=1)

plt.show()

In [None]:
# Drop samples (rows) whose value is 1 in every cell type. 1 is the value the
# preceding heatmap cell assigns to missing and non-significant entries, so
# such rows carry no significant R2 in any column.
has_value_below_one = plotting_matrix.ne(1).any(axis=1)
plotting_matrix_filtered = plotting_matrix[has_value_below_one]

# Heatmap with clustered rows and fixed column order. sns.clustermap creates
# its own figure, so no plt.figure() call is needed beforehand.
sns.clustermap(plotting_matrix_filtered, cmap="rocket", figsize=(8, 8),
               linewidths=.5, linecolor='black', row_cluster=True, col_cluster=False,
               cbar_pos=(0.00, 0.00, 0.00, 0.00), vmin=0, vmax=1)

plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Gene set whose per-sample R2 values are shown in the heatmap
specific_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

# Input locations; only the PT cell type is plotted in this cell
cell_types = ["PT"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_0/"
pval_folder = "Padj_KEGG_0/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# P-values above this threshold are treated as non-significant
p_value_threshold = 0.05

# Read metadata and keep every sample whose Disease_level2 is not "Control"
metadat = pd.read_csv(metadat_path)
unique_samples = metadat[metadat['Disease_level2'] != "Control"]['Sample'].unique()

# One row per selected sample; one column per cell type is joined on below
combined_r2 = pd.DataFrame(index=unique_samples)
combined_pval = pd.DataFrame(index=unique_samples)

# Cell types whose R2 and p-value files were both found and loaded
loaded_cell_types = []

for cell_type in cell_types:
    r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
    pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

    # Report missing inputs instead of skipping them silently; otherwise the
    # column selection further down fails with an unrelated-looking KeyError.
    if not (os.path.exists(r2_path) and os.path.exists(pval_path)):
        print(f"Skipping {cell_type}: R2 or p-value file not found")
        continue

    # Keep only the column of the chosen gene set, prefixed with the cell type
    column_name = f"{cell_type}_{specific_gene_set}"
    r2_df = pd.read_csv(r2_path, index_col=0)[[specific_gene_set]]
    pval_df = pd.read_csv(pval_path, index_col=0)[[specific_gene_set]]
    r2_df.columns = [column_name]
    pval_df.columns = [column_name]

    combined_r2 = combined_r2.join(r2_df, how='left')
    combined_pval = combined_pval.join(pval_df, how='left')
    loaded_cell_types.append(cell_type)

# Samples without a value get 1 for both R2 and p-value
# (1 is the upper end of the colour scale used below)
combined_r2 = combined_r2.fillna(1)
combined_pval = combined_pval.fillna(1)

# Non-significant R2 values are replaced by 1 as well
combined_r2 = combined_r2.mask(combined_pval > p_value_threshold, 1)

# Select the gene-set column of every loaded cell type and label it by cell type
plotting_matrix = combined_r2[[f"{ct}_{specific_gene_set}" for ct in loaded_cell_types]].copy()
plotting_matrix.columns = loaded_cell_types

# Heatmap with clustered rows and fixed column order. sns.clustermap creates
# its own figure, so no plt.figure() call is needed beforehand. The zero-size
# cbar_pos leaves no visible colour bar.
sns.clustermap(plotting_matrix, cmap="rocket", figsize=(8, 8),
               linewidths=.5, linecolor='black', row_cluster=True, col_cluster=False,
               cbar_pos=(0.00, 0.00, 0.00, 0.00), vmin=0, vmax=1)

plt.show()

In [None]:
import seaborn as sns

# Cell type and gene set whose per-sample R2 values are compared between groups
cell_type = "PT"
target_gene_set = "hsa04150 mTOR signaling pathway - Homo sapiens (human)"

# Samples with a p-value below this threshold are drawn in red
significance_threshold = 0.05

# NOTE: base_path, r2_folder, pval_folder and metadat are not defined in this
# cell; they must already exist in the kernel (the heatmap cells define them).
metadata = metadat

# Load the R2 and p-value tables of the chosen cell type (samples as index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Samples excluded from both groups
samples_to_remove = ['HK2851', 'HK2886', '32-10346', '29-10406', '29-10393', 'PKD8_humphreys_ADPKD']

# Disease groups: group1 = non-control samples that are not PKD, group2 = PKD
group1 = metadata[(metadata['Disease_level2'] != "PKD") & (metadata['Disease_level1'] != "Control")].Sample.unique().tolist()
group2 = metadata[metadata['Disease_level2'] == "PKD"].Sample.unique().tolist()
group1 = [sample for sample in group1 if sample not in samples_to_remove]
group2 = [sample for sample in group2 if sample not in samples_to_remove]

groups = [group1, group2]
group_labels = ["nonPKD", "PKD"]

# Samples whose p-value for the gene set is below the threshold
significant_samples = pval_df.index[pval_df[target_gene_set] < significance_threshold]

# R2 values of the gene set per group, and their means
group_r2 = [r2_df.loc[group, target_gene_set] for group in groups]
mean_r2_values = [values.mean() for values in group_r2]

# --- Plot 1: one dot per sample at a fixed x-position per group -------------
group_positions = [0.5, 0.7]

fig, ax = plt.subplots(figsize=(2.5, 6))
for x_position, values in zip(group_positions, group_r2):
    for sample, r2_value in values.items():
        color = 'red' if sample in significant_samples else 'blue'
        ax.plot(x_position, r2_value, marker='o', markersize=5, color=color)

# Group means as horizontal dashes
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
ax.set_xlim(0.3, 0.9)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
# Tick labels name the plotted groups (same labels as the swarm plot below)
ax.set_xticks(group_positions)
ax.set_xticklabels(group_labels)
ax.grid(False)
plt.show()

# --- Plot 2: beeswarm of the same values -------------------------------------
# Build the long-format table in one concat instead of growing it in a loop
plot_data = pd.concat([
    pd.DataFrame({'R2 Value': values, 'Group': i, 'Sample': values.index})
    for i, values in enumerate(group_r2)
])
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(2.5, 6))

# A dict palette ties each hue level to its colour. A plain list is assigned
# in order of appearance and would swap the colours whenever the first sample
# is significant.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, ax=ax)

# Group means, drawn once on top of the points (x = 0, 1 by list position)
ax.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
        markeredgewidth=2)

ax.set_ylim(0.0, 1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1])
ax.set_xticklabels(group_labels)

ax.legend().remove()  # The legend is not needed
ax.grid(False)

plt.show()

In [None]:
import seaborn as sns

# Input locations.
# cell_types is not used in this cell; it is kept because it is a notebook-level
# variable that other cells may read.
cell_types = ["PT"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set whose per-sample R2 values are compared between groups
cell_type = "PT"
target_gene_set = "hsa04150 mTOR signaling pathway - Homo sapiens (human)"

# Samples with a p-value below this threshold are drawn in red
significance_threshold = 0.05

# Read the metadata from metadat_path so this cell does not depend on a
# `metadat` variable left behind by an earlier cell
metadata = pd.read_csv(metadat_path)

# Load the R2 and p-value tables of the chosen cell type (samples as index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Samples excluded from all groups
samples_to_remove = ['32-10346', '29-10406', '29-10393', 'PKD8_humphreys_ADPKD']

# Disease groups, selected by Disease_level2.
# NOTE(review): a later cell of this notebook selects AKI/CKD via
# Disease_level1 instead - confirm which column carries these labels.
group1 = metadata[(metadata['Disease_level2'] == "AKI") & (metadata['Disease_level1'] != "Control")].Sample.unique().tolist()
group2 = metadata[metadata['Disease_level2'] == "CKD"].Sample.unique().tolist()
group3 = metadata[metadata['Disease_level2'] == "PKD"].Sample.unique().tolist()
group1 = [sample for sample in group1 if sample not in samples_to_remove]
group2 = [sample for sample in group2 if sample not in samples_to_remove]
group3 = [sample for sample in group3 if sample not in samples_to_remove]

groups = [group1, group2, group3]
group_labels = ["AKI", "CKD", "PKD"]

# Samples whose p-value for the gene set is below the threshold
significant_samples = pval_df.index[pval_df[target_gene_set] < significance_threshold]

# R2 values of the gene set per group, and their means
group_r2 = [r2_df.loc[group, target_gene_set] for group in groups]
mean_r2_values = [values.mean() for values in group_r2]

# --- Plot 1: one dot per sample at a fixed x-position per group -------------
group_positions = [0.5, 0.7, 0.9]

fig, ax = plt.subplots(figsize=(2.5, 6))
for x_position, values in zip(group_positions, group_r2):
    for sample, r2_value in values.items():
        color = 'red' if sample in significant_samples else 'blue'
        ax.plot(x_position, r2_value, marker='o', markersize=5, color=color)

# Group means as horizontal dashes
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
# Leave room to the right of the last group (x = 0.9) so it is not drawn on
# the axis boundary
ax.set_xlim(0.3, 1.1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
# Tick labels name the plotted groups (same labels as the swarm plot below)
ax.set_xticks(group_positions)
ax.set_xticklabels(group_labels)
ax.grid(False)
plt.show()

# --- Plot 2: beeswarm of the same values -------------------------------------
# Build the long-format table in one concat instead of growing it in a loop
plot_data = pd.concat([
    pd.DataFrame({'R2 Value': values, 'Group': i, 'Sample': values.index})
    for i, values in enumerate(group_r2)
])
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(3, 6))

# A dict palette ties each hue level to its colour. A plain list is assigned
# in order of appearance and would swap the colours whenever the first sample
# is significant.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3, ax=ax)

# Group means, drawn once on top of the points (x = 0, 1, 2 by list position)
ax.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
        markeredgewidth=2)

ax.set_ylim(0.0, 1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(group_labels)
ax.set_xlim(-0.5, 2.5)

ax.legend().remove()  # The legend is not needed
ax.grid(False)

plt.show()

# Show which samples were counted as significant
significant_samples

In [None]:
import seaborn as sns

# Input locations.
# cell_types is not used in this cell (the plots below use cell_type); it is
# kept because it is a notebook-level variable that other cells may read.
cell_types = ["PT"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set whose per-sample R2 values are compared between groups
cell_type = "Podo"
target_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

# Samples with a p-value below this threshold are drawn in red
significance_threshold = 0.05

# Read the metadata from metadat_path so this cell does not depend on a
# `metadat` variable left behind by an earlier cell
metadata = pd.read_csv(metadat_path)

# Load the R2 and p-value tables of the chosen cell type (samples as index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Samples excluded from all groups
samples_to_remove = ['32-2', '32-10346', '32-10419', '34-10331', '34-10240', '30-10125', '33-10376', '34-10184', '34-10050',
                    'HK2414', '27-10094', '27-10066', '31-10090', '28-12546', '28-12372', '28-12558', '29-10406', '28-12178', '29-10277', '29-10393', '28-12510',
                    'PKD8_humphreys_ADPKD', 'PKD7_humphreys_ADPKD', 'PKD6_humphreys_ADPKD', 'PKD4_humphreys_ADPKD']

# Disease groups, selected by Disease_level2.
# NOTE(review): a later cell of this notebook selects AKI/CKD via
# Disease_level1 instead - confirm which column carries these labels.
group1 = metadata[(metadata['Disease_level2'] == "AKI") & (metadata['Disease_level1'] != "Control")].Sample.unique().tolist()
group2 = metadata[metadata['Disease_level2'] == "CKD"].Sample.unique().tolist()
group3 = metadata[metadata['Disease_level2'] == "PKD"].Sample.unique().tolist()
group1 = [sample for sample in group1 if sample not in samples_to_remove]
group2 = [sample for sample in group2 if sample not in samples_to_remove]
group3 = [sample for sample in group3 if sample not in samples_to_remove]

groups = [group1, group2, group3]
group_labels = ["AKI", "CKD", "PKD"]

# Samples whose p-value for the gene set is below the threshold
significant_samples = pval_df.index[pval_df[target_gene_set] < significance_threshold]

# R2 values of the gene set per group, and their means
group_r2 = [r2_df.loc[group, target_gene_set] for group in groups]
mean_r2_values = [values.mean() for values in group_r2]

# --- Plot 1: one dot per sample at a fixed x-position per group -------------
group_positions = [0.5, 0.7, 0.9]

fig, ax = plt.subplots(figsize=(2.5, 6))
for x_position, values in zip(group_positions, group_r2):
    for sample, r2_value in values.items():
        color = 'red' if sample in significant_samples else 'blue'
        ax.plot(x_position, r2_value, marker='o', markersize=5, color=color)

# Group means as horizontal dashes
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
# Leave room to the right of the last group (x = 0.9) so it is not drawn on
# the axis boundary
ax.set_xlim(0.3, 1.1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
# Tick labels name the plotted groups (same labels as the swarm plot below)
ax.set_xticks(group_positions)
ax.set_xticklabels(group_labels)
ax.grid(False)
plt.show()

# --- Plot 2: beeswarm of the same values -------------------------------------
# Build the long-format table in one concat instead of growing it in a loop
plot_data = pd.concat([
    pd.DataFrame({'R2 Value': values, 'Group': i, 'Sample': values.index})
    for i, values in enumerate(group_r2)
])
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(3, 6))

# A dict palette ties each hue level to its colour. A plain list is assigned
# in order of appearance and would swap the colours whenever the first sample
# is significant.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3, ax=ax)

# Group means, drawn once on top of the points (x = 0, 1, 2 by list position)
ax.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
        markeredgewidth=2)

ax.set_ylim(0.0, 1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(group_labels)
ax.set_xlim(-0.5, 2.5)

ax.legend().remove()  # The legend is not needed
ax.grid(False)

plt.show()

# Show which samples were counted as significant
significant_samples

In [None]:
import seaborn as sns

# Input locations.
# cell_types is not used in this cell; it is kept because it is a notebook-level
# variable that other cells may read.
cell_types = ["PT"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set whose per-sample R2 values are compared between groups
cell_type = "PT"
target_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

# Samples with a p-value below this threshold are drawn in red
significance_threshold = 0.05

# Read the metadata from metadat_path so this cell does not depend on a
# `metadat` variable left behind by an earlier cell
metadata = pd.read_csv(metadat_path)

# Load the R2 and p-value tables of the chosen cell type (samples as index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Samples excluded from all groups
samples_to_remove = ['32-10346', '29-10406', '29-10393', 'PKD8_humphreys_ADPKD', 'HK2851', 'HK2886']

# Disease groups: AKI and CKD by Disease_level1 (CKD without PKD), PKD by Disease_level2
group1 = metadata[(metadata['Disease_level1'] == "AKI")].Sample.unique().tolist()
group2 = metadata[(metadata['Disease_level1'] == "CKD") & (metadata['Disease_level2'] != "PKD")].Sample.unique().tolist()
group3 = metadata[(metadata['Disease_level2'] == "PKD")].Sample.unique().tolist()
group1 = [sample for sample in group1 if sample not in samples_to_remove]
group2 = [sample for sample in group2 if sample not in samples_to_remove]
group3 = [sample for sample in group3 if sample not in samples_to_remove]

groups = [group1, group2, group3]
group_labels = ["AKI", "CKD", "PKD"]

# Samples whose p-value for the gene set is below the threshold
significant_samples = pval_df.index[pval_df[target_gene_set] < significance_threshold]

# R2 values of the gene set per group, and their means
group_r2 = [r2_df.loc[group, target_gene_set] for group in groups]
mean_r2_values = [values.mean() for values in group_r2]

# --- Plot 1: one dot per sample at a fixed x-position per group -------------
group_positions = [0.5, 0.7, 0.9]

fig, ax = plt.subplots(figsize=(2.5, 6))
for x_position, values in zip(group_positions, group_r2):
    for sample, r2_value in values.items():
        color = 'red' if sample in significant_samples else 'blue'
        ax.plot(x_position, r2_value, marker='o', markersize=5, color=color)

# Group means as horizontal dashes
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
# Leave room to the right of the last group (x = 0.9) so it is not drawn on
# the axis boundary
ax.set_xlim(0.3, 1.1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
# Tick labels name the plotted groups (same labels as the swarm plot below)
ax.set_xticks(group_positions)
ax.set_xticklabels(group_labels)
ax.grid(False)
plt.show()

# --- Plot 2: beeswarm of the same values -------------------------------------
# Build the long-format table in one concat instead of growing it in a loop
plot_data = pd.concat([
    pd.DataFrame({'R2 Value': values, 'Group': i, 'Sample': values.index})
    for i, values in enumerate(group_r2)
])
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(3, 6))

# A dict palette ties each hue level to its colour. A plain list is assigned
# in order of appearance and would swap the colours whenever the first sample
# is significant.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3, ax=ax)

# Group means, drawn once on top of the points (x = 0, 1, 2 by list position)
ax.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
        markeredgewidth=2)

ax.set_ylim(0.0, 1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(group_labels)
ax.set_xlim(-0.5, 2.5)

ax.legend().remove()  # The legend is not needed
ax.grid(False)

plt.show()

# Show which samples were counted as significant
significant_samples

In [None]:
# Display the sample list currently bound to group1 (set by whichever plotting cell ran last)
group1

In [None]:
import seaborn as sns

# Cell type and gene set whose per-sample R2 values are plotted in this cell
cell_type = "PT"
target_gene_set = "hsa00190 Oxidative phosphorylation - Homo sapiens (human)"
# Alternative gene set:
#target_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

metadata = metadat

# R2 and p-value tables of the chosen cell type (first CSV column becomes the index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Sample IDs that are left out of every group
samples_to_remove = ['32-10346', '29-10406', '29-10393', 'PKD8_humphreys_ADPKD', 'HK2851', 'HK2886']

# Group 1: AKI samples
group1 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level1'] == "AKI", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 2: CKD samples whose Disease_level2 is not PKD
group2 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] == "CKD") & (metadata['Disease_level2'] != "PKD"), 'Sample'
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 3: PKD samples
group3 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level2'] == "PKD", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Samples whose p-value for the target gene set is below 0.05
significant_samples = pval_df.index[pval_df[target_gene_set].lt(0.05)]


# Strip plot: one marker per sample, all samples of a group at the same x position.

# x-coordinate of each group, kept close together for a narrow figure
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Mean R2 of each group; the swarm plot further down reuses this list
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red marker: sample is in significant_samples; blue marker: it is not
    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        plt.plot(group_positions[i], group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as black horizontal dashes
plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
plt.ylim(0.5, 1)  # Only the 0.5-1 range is shown; lower R2 values fall outside the axes
# Equal padding on both sides of the outermost groups, so that markers of the
# last group are not clipped at the right axes edge.
plt.xlim(group_positions[0] - 0.2, group_positions[-1] + 0.2)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])  # Set custom x-axis ticks
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once instead of growing the frame inside the loop
plot_data = pd.concat(group_frames)

# Colour label per sample: 'red' if it is in significant_samples, otherwise 'blue'
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

# Create a figure for the plot
plt.figure(figsize=(3, 6))

# Beeswarm plot. The palette is a dict so that each 'Color' label is always drawn
# in the colour it names (a plain list is assigned in order of first appearance).
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty, keeping
# them aligned with the mean markers and tick labels below.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3)
# Group means as black dashes drawn above the swarm (x defaults to 0, 1, 2)
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
         markeredgewidth=2)

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["AKI", "CKD", "PKD"])  # Set custom x-axis ticks
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # The colour / mean legend is not shown
plt.grid(False)

plt.show()

significant_samples

In [None]:
import seaborn as sns

# Result folders (KEGG gene sets) and metadata file location
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set whose per-sample R2 values are plotted in this cell
cell_type = "PT"
target_gene_set = "hsa04150 mTOR signaling pathway - Homo sapiens (human)"
# Alternative gene set:
#target_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

metadata = metadat

# R2 and p-value tables of the chosen cell type (first CSV column becomes the index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Sample IDs that are left out of every group
samples_to_remove = ['32-10346', '29-10406', '29-10393', 'PKD8_humphreys_ADPKD', 'HK2851', 'HK2886']

# Group 1: AKI samples
group1 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level1'] == "AKI", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 2: CKD samples whose Disease_level2 is not PKD
group2 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] == "CKD") & (metadata['Disease_level2'] != "PKD"), 'Sample'
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 3: PKD samples
group3 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level2'] == "PKD", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Samples whose p-value for the target gene set is below 0.05
significant_samples = pval_df.index[pval_df[target_gene_set].lt(0.05)]


# Strip plot: one marker per sample, all samples of a group at the same x position.

# x-coordinate of each group, kept close together for a narrow figure
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Mean R2 of each group; the swarm plot further down reuses this list
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red marker: sample is in significant_samples; blue marker: it is not
    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        plt.plot(group_positions[i], group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as black horizontal dashes
plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
plt.ylim(0.5, 1)  # Only the 0.5-1 range is shown; lower R2 values fall outside the axes
# Equal padding on both sides of the outermost groups, so that markers of the
# last group are not clipped at the right axes edge.
plt.xlim(group_positions[0] - 0.2, group_positions[-1] + 0.2)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])  # Set custom x-axis ticks
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once instead of growing the frame inside the loop
plot_data = pd.concat(group_frames)

# Colour label per sample: 'red' if it is in significant_samples, otherwise 'blue'
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

# Create a figure for the plot
plt.figure(figsize=(3, 6))

# Beeswarm plot. The palette is a dict so that each 'Color' label is always drawn
# in the colour it names (a plain list is assigned in order of first appearance).
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty, keeping
# them aligned with the mean markers and tick labels below.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3)
# Group means as black dashes drawn above the swarm (x defaults to 0, 1, 2)
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
         markeredgewidth=2)

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["AKI", "CKD", "PKD"])  # Set custom x-axis ticks
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # The colour / mean legend is not shown
plt.grid(False)

plt.show()

significant_samples

In [None]:
# Members of group1 that are also in significant_samples.
set(group1).intersection(set(significant_samples))

In [None]:
# Display the current contents of group1.
group1

In [None]:
# Number of entries in group1.
len(group1)

In [None]:
# Number of entries in group2.
len(group2)

In [None]:
# Number of entries in group3.
len(group3)

In [None]:
import seaborn as sns

# Cell type and gene set whose per-sample R2 values are plotted in this cell
cell_type = "TAL"
target_gene_set = "hsa04978 Mineral absorption - Homo sapiens (human)"
# Alternative gene set:
#target_gene_set = "hsa04668 TNF signaling pathway - Homo sapiens (human)"

metadata = metadat

# R2 and p-value tables of the chosen cell type (first CSV column becomes the index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Sample IDs that are left out of every group
samples_to_remove = ['34-10240', 'HK2886']

# Group 1: AKI samples
group1 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level1'] == "AKI", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 2: CKD samples whose Disease_level2 is not PKD
group2 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] == "CKD") & (metadata['Disease_level2'] != "PKD"), 'Sample'
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 3: PKD samples
group3 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level2'] == "PKD", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Samples whose p-value for the target gene set is below 0.05
significant_samples = pval_df.index[pval_df[target_gene_set].lt(0.05)]


# Strip plot: one marker per sample, all samples of a group at the same x position.

# x-coordinate of each group, kept close together for a narrow figure
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Mean R2 of each group; the swarm plot further down reuses this list
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red marker: sample is in significant_samples; blue marker: it is not
    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        plt.plot(group_positions[i], group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as black horizontal dashes
plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
plt.ylim(0.5, 1)  # Only the 0.5-1 range is shown; lower R2 values fall outside the axes
# Equal padding on both sides of the outermost groups, so that markers of the
# last group are not clipped at the right axes edge.
plt.xlim(group_positions[0] - 0.2, group_positions[-1] + 0.2)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])  # Set custom x-axis ticks
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once instead of growing the frame inside the loop
plot_data = pd.concat(group_frames)

# Colour label per sample: 'red' if it is in significant_samples, otherwise 'blue'
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

# Create a figure for the plot
plt.figure(figsize=(3, 6))

# Beeswarm plot. The palette is a dict so that each 'Color' label is always drawn
# in the colour it names (a plain list is assigned in order of first appearance).
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty, keeping
# them aligned with the mean markers and tick labels below.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3)
# Group means as black dashes drawn above the swarm (x defaults to 0, 1, 2)
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
         markeredgewidth=2)

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["AKI", "CKD", "PKD"])  # Set custom x-axis ticks
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # The colour / mean legend is not shown
plt.grid(False)

plt.show()

significant_samples

In [None]:
import seaborn as sns

# Cell type and gene set whose per-sample R2 values are plotted in this cell
cell_type = "PT"
target_gene_set = "hsa00982 Drug metabolism - cytochrome P450 - Homo sapiens (human)"
# Alternative gene set:
#target_gene_set = "hsa04066 HIF-1 signaling pathway - Homo sapiens (human)"

metadata = metadat

# R2 and p-value tables of the chosen cell type (first CSV column becomes the index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Sample IDs that are left out of every group (use an empty list to keep all samples)
samples_to_remove = ['32-10346', '29-10406', '29-10393', 'PKD8_humphreys_ADPKD', 'HK2851', 'HK2886']

# Group 1: AKI samples
group1 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level1'] == "AKI", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 2: CKD samples whose Disease_level2 is not PKD
group2 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] == "CKD") & (metadata['Disease_level2'] != "PKD"), 'Sample'
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 3: PKD samples
group3 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level2'] == "PKD", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Samples whose p-value for the target gene set is below 0.05
significant_samples = pval_df.index[pval_df[target_gene_set].lt(0.05)]


# Strip plot: one marker per sample, all samples of a group at the same x position.

# x-coordinate of each group, kept close together for a narrow figure
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Mean R2 of each group; the swarm plot further down reuses this list
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red marker: sample is in significant_samples; blue marker: it is not
    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        plt.plot(group_positions[i], group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as black horizontal dashes
plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
# Equal padding on both sides of the outermost groups, so that markers of the
# last group are not clipped at the right axes edge.
plt.xlim(group_positions[0] - 0.2, group_positions[-1] + 0.2)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])  # Set custom x-axis ticks
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once instead of growing the frame inside the loop
plot_data = pd.concat(group_frames)

# Colour label per sample: 'red' if it is in significant_samples, otherwise 'blue'
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

# Create a figure for the plot
plt.figure(figsize=(3, 6))

# Beeswarm plot. The palette is a dict so that each 'Color' label is always drawn
# in the colour it names (a plain list is assigned in order of first appearance).
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty, keeping
# them aligned with the mean markers and tick labels below.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3)
# Group means as black dashes drawn above the swarm (x defaults to 0, 1, 2)
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
         markeredgewidth=2)

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["AKI", "CKD", "PKD"])  # Set custom x-axis ticks
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # The colour / mean legend is not shown
plt.grid(False)

plt.show()

In [None]:
# Display the metadata table together with the distinct values of its 'Project/Dataset' column.
metadata, metadata["Project/Dataset"].unique()

In [None]:
import pandas as pd
import os

# Inputs for the significance comparison in this cell (GO gene sets, "_0" result folders)
cell_types = ["Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal", "Immune", "PEC", "PT", "IC", "DTL_ATL"]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_0/"
pval_folder = "Padj_GO_0/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Sample metadata table read in an earlier cell
metadata = metadat

# No R2 / p-value table is loaded here: process_condition reads the p-value file
# of every cell type itself, so this cell does not depend on whichever
# `cell_type` an earlier plotting cell left behind.

# Samples with more NaN p-values than this are dropped before counting.
# NOTE(review): 100000 presumably exceeds the number of columns, which would make
# the filter a no-op - confirm the intended value.
nan_threshold = 100000

# Define the cell types and the base paths
#cell_types = ["Cancer", "Stromal", "Endothelial"]
#cell_types = ["PT"]


# Collect the p-values of all cell types for one set of samples
def process_condition(metadat, unique_samples):
    """Build a samples x (cell type, gene set) table of p-values.

    For every entry of the module-level ``cell_types`` list, the file
    ``Pval_<cell_type>.csv`` under ``base_path``/``pval_folder`` is read, its
    columns are prefixed with ``"<cell_type>_"`` and it is left-joined onto a
    frame indexed by the requested samples.

    Parameters
    ----------
    metadat
        Not used by this function; kept so existing calls keep working.
    unique_samples
        Iterable of sample IDs. Repeated IDs are collapsed to one row each;
        otherwise the join would emit the same sample several times and inflate
        every count derived from the result.

    Returns
    -------
    pandas.DataFrame
        One row per distinct sample, in order of first appearance. Samples that
        are missing from a p-value file get NaN in that file's columns.
    """
    sample_index = pd.Index(unique_samples).unique()

    combined_pval = pd.DataFrame(index=sample_index)
    for cell_type in cell_types:
        pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
        pval_df = pd.read_csv(pval_path, index_col=0)
        # Keep only the requested samples before joining
        pval_df = pval_df[pval_df.index.isin(sample_index)]
        pval_df.columns = [f"{cell_type}_{col}" for col in pval_df.columns]
        combined_pval = combined_pval.join(pval_df, how='left')
    return combined_pval

# Per-column tally of p-values below / not below a threshold
def count_significance(pval_df, threshold=0.05):
    """Count significant and non-significant entries in every column.

    An entry is significant when it is strictly below ``threshold`` and
    non-significant when it is greater than or equal to it. NaN entries satisfy
    neither comparison, so they are left out of both counts.

    Returns a tuple ``(sig_count, nonsig_count)``; each element holds one count
    per column of ``pval_df``.
    """
    below = pval_df.lt(threshold)
    at_or_above = pval_df.ge(threshold)
    return below.sum(axis=0, skipna=True), at_or_above.sum(axis=0, skipna=True)

# Sample sets to compare:
#   condition 1 - CKD samples with Diabetes == "No"
#   condition 2 - CKD samples with Diabetes == "Yes"
# .unique() keeps each sample once, so a sample that appears on several metadata
# rows is not counted repeatedly in the contingency tables.
#
# Alternative comparison (non-control samples of the KPMP datasets only):
#combined_pval_cond1 = process_condition(metadata, metadata[(metadata['Disease_level1'] != "Control") & (metadata['Project/Dataset'].isin(["h_KPMP", "KPMP_new"]) & (metadata['Diabetes'] == "No"))]["Sample"].tolist())
#combined_pval_cond2 = process_condition(metadata, metadata[(metadata['Disease_level1'] != "Control") & (metadata['Project/Dataset'].isin(["h_KPMP", "KPMP_new"]) & (metadata['Diabetes'] == "Yes"))]["Sample"].tolist())
cond1_samples = metadata[(metadata['Disease_level1'] == "CKD") & (metadata['Diabetes'] == "No")]["Sample"].unique().tolist()
cond2_samples = metadata[(metadata['Disease_level1'] == "CKD") & (metadata['Diabetes'] == "Yes")]["Sample"].unique().tolist()

combined_pval_cond1 = process_condition(metadata, cond1_samples)
combined_pval_cond2 = process_condition(metadata, cond2_samples)


# Filter out samples with too many NaN p-values
# Count the number of NaN values per sample (row)
nan_counts_per_sample1 = combined_pval_cond1.isna().sum(axis=1)
nan_counts_per_sample2 = combined_pval_cond2.isna().sum(axis=1)

# Exclude samples with more than nan_threshold NaNs
samples_to_exclude1 = nan_counts_per_sample1[nan_counts_per_sample1 > nan_threshold].index
samples_to_exclude2 = nan_counts_per_sample2[nan_counts_per_sample2 > nan_threshold].index

combined_pval_cond1 = combined_pval_cond1.drop(samples_to_exclude1, axis=0)
combined_pval_cond2 = combined_pval_cond2.drop(samples_to_exclude2, axis=0)


# Count significant / non-significant samples per feature for each condition
sig_count_cond1, nonsig_count_cond1 = count_significance(combined_pval_cond1)
sig_count_cond2, nonsig_count_cond2 = count_significance(combined_pval_cond2)

# Comparison table: one column per feature, one row per (condition, significance) count
comparison_df = pd.DataFrame(columns=combined_pval_cond1.columns, index=['cond1_sig', 'cond1_nonsig', 'cond2_sig', 'cond2_nonsig'])
comparison_df.loc['cond1_sig'] = sig_count_cond1
comparison_df.loc['cond1_nonsig'] = nonsig_count_cond1
comparison_df.loc['cond2_sig'] = sig_count_cond2
comparison_df.loc['cond2_nonsig'] = nonsig_count_cond2


import pandas as pd
import numpy as np
from scipy.stats import fisher_exact

# Fisher's exact test per feature (column of comparison_df): does the share of
# significant samples differ between condition 1 and condition 2?
# NOTE(review): the p-values are reported as returned by fisher_exact, without
# multiple-testing correction across features - confirm this is intended.
fisher_columns = ['Feature', 'Test_statistic', 'p_value', "odds_ratio", 'cond1_sig_count', 'cond2_sig_count']

# Collect one record per feature and build the result frame once, instead of
# concatenating onto a growing frame inside the loop.
fisher_records = []
for feature in comparison_df.columns:
    # 2x2 table: rows = condition 1 / condition 2, columns = significant / non-significant
    contingency_table = comparison_df[[feature]].values.reshape(2, 2).astype(int)

    # fisher_exact returns the odds ratio as its statistic
    odds_ratio, p = fisher_exact(contingency_table)

    fisher_records.append({
        'Feature': feature,
        'Test_statistic': odds_ratio,  # same value as odds_ratio; column kept for compatibility
        'p_value': p,
        "odds_ratio": odds_ratio,
        'cond1_sig_count': comparison_df.loc['cond1_sig', feature],
        'cond2_sig_count': comparison_df.loc['cond2_sig', feature],
    })

fisher_results = pd.DataFrame(fisher_records, columns=fisher_columns)

# Sort the results by p-value in ascending order
sorted_results = fisher_results.sort_values(by='p_value')

# Display the most significant features
print(sorted_results.head())  # Adjust the number inside head() as needed

sorted_results.head(50)



In [None]:
# First 50 rows of sorted_results.
sorted_results.head(50)

In [None]:
# Alternative filter (rows where condition 2 has more significant samples than condition 1):
#sorted_results[sorted_results["cond2_sig_count"] > sorted_results["cond1_sig_count"]].head(50)

# First 50 rows of sorted_results among those with cond1_sig_count <= 2.
sorted_results[sorted_results["cond1_sig_count"] <= 2].head(50)

In [None]:
import seaborn as sns

# Result folders (KEGG gene sets) and metadata file location
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set whose per-sample R2 values are plotted in this cell
cell_type = "IC"
target_gene_set = "hsa04911 Insulin secretion - Homo sapiens (human)"
# Alternative gene set:
#target_gene_set = "hsa04066 HIF-1 signaling pathway - Homo sapiens (human)"

metadata = metadat

# R2 and p-value tables of the chosen cell type (first CSV column becomes the index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Sample IDs that are left out of every group (use an empty list to keep all samples)
samples_to_remove = ['34-10240', 'HK2886', '32-10346', '28-12546', '34-10050']

# Group 1: non-control samples of the KPMP datasets with Diabetes == "No"
group1 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] != "Control")
        & metadata['Project/Dataset'].isin(["h_KPMP", "KPMP_new"])
        & (metadata['Diabetes'] == "No"),
        'Sample',
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 2: non-control samples of the KPMP datasets with Diabetes == "Yes"
group2 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] != "Control")
        & metadata['Project/Dataset'].isin(["h_KPMP", "KPMP_new"])
        & (metadata['Diabetes'] == "Yes"),
        'Sample',
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 3: non-control samples of the h_Susztak dataset
group3 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Disease_level1'] != "Control") & metadata['Project/Dataset'].isin(["h_Susztak"]),
        'Sample',
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Samples whose p-value for the target gene set is below 0.05
significant_samples = pval_df.index[pval_df[target_gene_set].lt(0.05)]


# Strip plot: one marker per sample, all samples of a group at the same x position.

# x-coordinate of each group, kept close together for a narrow figure
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Mean R2 of each group; the swarm plot further down reuses this list
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red marker: sample is in significant_samples; blue marker: it is not
    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        plt.plot(group_positions[i], group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as black horizontal dashes
plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
# Equal padding on both sides of the outermost groups, so that markers of the
# last group are not clipped at the right axes edge.
plt.xlim(group_positions[0] - 0.2, group_positions[-1] + 0.2)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])  # Set custom x-axis ticks
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once instead of growing the frame inside the loop
plot_data = pd.concat(group_frames)

# Colour label per sample: 'red' if it is in significant_samples, otherwise 'blue'
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

# Create a figure for the plot
plt.figure(figsize=(3, 6))

# Beeswarm plot. The palette is a dict so that each 'Color' label is always drawn
# in the colour it names (a plain list is assigned in order of first appearance).
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty, keeping
# them aligned with the mean markers and tick labels below.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3)
# Group means as black dashes drawn above the swarm (x defaults to 0, 1, 2)
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10, markersize=12,
         markeredgewidth=2)

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["1", "2", "3"])  # Set custom x-axis ticks
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # The colour / mean legend is not shown
plt.grid(False)

plt.show()

significant_samples

In [None]:
import seaborn as sns

# Result folders (KEGG gene sets) and metadata file location
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set whose per-sample R2 values are plotted in this cell
cell_type = "TAL"
target_gene_set = "hsa00592 alpha-Linolenic acid metabolism - Homo sapiens (human)"
# Alternative gene set:
#target_gene_set = "hsa04066 HIF-1 signaling pathway - Homo sapiens (human)"

metadata = metadat

# R2 and p-value tables of the chosen cell type (first CSV column becomes the index)
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")
r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Sample IDs that are left out of every group (use an empty list to keep all samples)
samples_to_remove = ['34-10240', 'HK2886']

# Group 1: CKD samples with Diabetes == "No"
group1 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Diabetes'] == "No") & (metadata['Disease_level1'] == "CKD"), 'Sample'
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 2: CKD samples with Diabetes == "Yes"
group2 = [
    sample_id
    for sample_id in metadata.loc[
        (metadata['Diabetes'] == "Yes") & (metadata['Disease_level1'] == "CKD"), 'Sample'
    ].unique().tolist()
    if sample_id not in samples_to_remove
]

# Group 3: AKI samples
group3 = [
    sample_id
    for sample_id in metadata.loc[metadata['Disease_level1'] == "AKI", 'Sample'].unique().tolist()
    if sample_id not in samples_to_remove
]

# Samples whose p-value for the target gene set is below 0.05
significant_samples = pval_df.index[pval_df[target_gene_set].lt(0.05)]


# Strip plot: one marker per sample, all samples of a group at the same x position.

# x-coordinate of each group, kept close together for a narrow figure
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Mean R2 of each group; the swarm plot further down reuses this list
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red marker: sample is in significant_samples; blue marker: it is not
    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        plt.plot(group_positions[i], group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as black horizontal dashes
plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
plt.ylim(0.0, 1)  # Set the y-axis range from 0 to 1
# Equal padding on both sides of the outermost groups, so that markers of the
# last group are not clipped at the right axes edge.
plt.xlim(group_positions[0] - 0.2, group_positions[-1] + 0.2)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])  # Set custom x-axis ticks
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number (0, 1, 2).
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once rather than growing the frame inside the loop from an empty DataFrame.
plot_data = pd.concat(group_frames)

# 'red' for samples in significant_samples, 'blue' for all others.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

plt.figure(figsize=(3, 6))

# The dict palette pins each hue label to its colour; a plain list would be assigned in order
# of appearance and could swap red and blue. `order` keeps all three groups on the x-axis even
# if one is empty, so the tick labels and mean markers below stay aligned.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, order=[0, 1, 2], hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3)
# Group means (computed by the strip plot above) at x = 0, 1, 2, drawn on top of the points.
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10,
         markersize=12, markeredgewidth=2)

plt.ylim(0.0, 1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["no Diabetes", "Diabetes", "AKI"])
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # the colour coding is not shown as a legend
plt.grid(False)

plt.show()

# Cell output: index of the samples flagged as significant.
significant_samples

In [None]:
import seaborn as sns

# Locations of the per-cell-type R2 and p-value tables (KEGG gene sets).
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
# Not read in this cell: the metadata table itself comes from `metadat`, loaded earlier.
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set shown in this cell's plots.
cell_type = "PEC"
target_gene_set = "hsa04024 cAMP signaling pathway - Homo sapiens (human)"

metadata = metadat

# Load both tables with their first column as the index (sample IDs, given the lookups below).
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Row masks on the metadata table. "Yes" and "No" are matched explicitly, so rows holding any
# other value in these columns fall into neither mask.
is_ckd = metadata['Disease_level1'] == "CKD"
diabetes_yes = metadata['Diabetes'] == "Yes"
diabetes_no = metadata['Diabetes'] == "No"
hypertension_yes = metadata['Hypertension'] == "Yes"
hypertension_no = metadata['Hypertension'] == "No"

# group1: CKD without diabetes; group2: CKD with diabetes, no hypertension;
# group3: CKD with diabetes and hypertension.
group1 = metadata.loc[diabetes_no & is_ckd, 'Sample'].unique().tolist()
group2 = metadata.loc[diabetes_yes & is_ckd & hypertension_no, 'Sample'].unique().tolist()
group3 = metadata.loc[diabetes_yes & hypertension_yes & is_ckd, 'Sample'].unique().tolist()

# Samples excluded from every group.
# NOTE(review): the reason for excluding these is not recorded here. Any group member missing
# from r2_df's index would make the `.loc` lookups below raise KeyError — presumably this list
# covers such samples; verify.
samples_to_remove = [
    'HK2851', '31-10000', '31-10006', 'HK2558', '29-10393', '28-12510', 'KRAD46', 'KRAD70',
    'HK2888', '29-10010', '29-10006', '31-10013', 'HK2886', 'HK2596', '27-10094', '27-10066',
    '28-12546', '31-10105', '29-10406', '28-12178',
]

group1, group2, group3 = (
    [sample for sample in group if sample not in samples_to_remove]
    for group in (group1, group2, group3)
)

# Samples whose value for the target gene set is below 0.05 (the folder name suggests these
# are adjusted p-values — confirm).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# Overview plot: one column of markers per group, with the group mean marked by a dash.
# x positions of the three groups (closer together than integer spacing).
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Per-group mean R2, in group order; reused by the swarm plot further down.
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in significant_samples, blue = it is not. Each colour is drawn with a
    # single call (red last, so it stays visible where markers overlap).
    is_significant = group_r2_values.index.isin(significant_samples)
    for mask, color in ((~is_significant, 'blue'), (is_significant, 'red')):
        values = group_r2_values[mask]
        plt.plot([group_positions[i]] * len(values), values, marker='o', markersize=5,
                 linestyle='', color=color)

plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

plt.ylim(0.0, 1)
# Equal padding on both sides; an upper limit of 0.9 would place the third group exactly on
# the axes edge, where its markers get clipped.
plt.xlim(0.3, 1.1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number (0, 1, 2).
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once rather than growing the frame inside the loop from an empty DataFrame.
plot_data = pd.concat(group_frames)

# 'red' for samples in significant_samples, 'blue' for all others.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

plt.figure(figsize=(3, 6))

# The dict palette pins each hue label to its colour; a plain list would be assigned in order
# of appearance and could swap red and blue. `order` keeps all three groups on the x-axis even
# if one is empty, so the tick labels and mean markers below stay aligned.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, order=[0, 1, 2], hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3)
# Group means (computed by the strip plot above) at x = 0, 1, 2, drawn on top of the points.
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10,
         markersize=12, markeredgewidth=2)

plt.ylim(0.0, 1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["- D", "D - HT", "D + HT"])
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # the colour coding is not shown as a legend
plt.grid(False)

plt.show()

# Cell output: index of the samples flagged as significant.
significant_samples

In [None]:
import seaborn as sns

# Locations of the per-cell-type R2 and p-value tables (KEGG gene sets).
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
# Not read in this cell: the metadata table itself comes from `metadat`, loaded earlier.
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set shown in this cell's plots.
cell_type = "PEC"
target_gene_set = "hsa04062 Chemokine signaling pathway - Homo sapiens (human)"

metadata = metadat

# Load both tables with their first column as the index (sample IDs, given the lookups below).
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Row masks on the metadata table. "Yes" and "No" are matched explicitly, so rows holding any
# other value in these columns fall into neither mask.
is_ckd = metadata['Disease_level1'] == "CKD"
diabetes_yes = metadata['Diabetes'] == "Yes"
diabetes_no = metadata['Diabetes'] == "No"
hypertension_yes = metadata['Hypertension'] == "Yes"
hypertension_no = metadata['Hypertension'] == "No"

# group1: CKD without diabetes; group2: CKD with diabetes, no hypertension;
# group3: CKD with diabetes and hypertension.
group1 = metadata.loc[diabetes_no & is_ckd, 'Sample'].unique().tolist()
group2 = metadata.loc[diabetes_yes & is_ckd & hypertension_no, 'Sample'].unique().tolist()
group3 = metadata.loc[diabetes_yes & hypertension_yes & is_ckd, 'Sample'].unique().tolist()

# Samples excluded from every group.
# NOTE(review): the reason for excluding these is not recorded here. Any group member missing
# from r2_df's index would make the `.loc` lookups below raise KeyError — presumably this list
# covers such samples; verify.
samples_to_remove = [
    'HK2851', '31-10000', '31-10006', 'HK2558', '29-10393', '28-12510', 'KRAD46', 'KRAD70',
    'HK2888', '29-10010', '29-10006', '31-10013', 'HK2886', 'HK2596', '27-10094', '27-10066',
    '28-12546', '31-10105', '29-10406', '28-12178',
]

group1, group2, group3 = (
    [sample for sample in group if sample not in samples_to_remove]
    for group in (group1, group2, group3)
)

# Samples whose value for the target gene set is below 0.05 (the folder name suggests these
# are adjusted p-values — confirm).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# Overview plot: one column of markers per group, with the group mean marked by a dash.
# x positions of the three groups (closer together than integer spacing).
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Per-group mean R2, in group order; reused by the swarm plot further down.
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in significant_samples, blue = it is not. Each colour is drawn with a
    # single call (red last, so it stays visible where markers overlap).
    is_significant = group_r2_values.index.isin(significant_samples)
    for mask, color in ((~is_significant, 'blue'), (is_significant, 'red')):
        values = group_r2_values[mask]
        plt.plot([group_positions[i]] * len(values), values, marker='o', markersize=5,
                 linestyle='', color=color)

plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

plt.ylim(0.0, 1)
# Equal padding on both sides; an upper limit of 0.9 would place the third group exactly on
# the axes edge, where its markers get clipped.
plt.xlim(0.3, 1.1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number (0, 1, 2).
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once rather than growing the frame inside the loop from an empty DataFrame.
plot_data = pd.concat(group_frames)

# 'red' for samples in significant_samples, 'blue' for all others.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

plt.figure(figsize=(3, 6))

# The dict palette pins each hue label to its colour; a plain list would be assigned in order
# of appearance and could swap red and blue. `order` keeps all three groups on the x-axis even
# if one is empty, so the tick labels and mean markers below stay aligned.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, order=[0, 1, 2], hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3)
# Group means (computed by the strip plot above) at x = 0, 1, 2, drawn on top of the points.
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10,
         markersize=12, markeredgewidth=2)

plt.ylim(0.0, 1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["- D", "D - HT", "D + HT"])
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # the colour coding is not shown as a legend
plt.grid(False)

plt.show()

# Cell output: index of the samples flagged as significant.
significant_samples

In [None]:
# Number of samples in group1. NOTE(review): group1 is whatever the most recently executed
# cell assigned, so this output depends on execution order.
len(group1)

In [None]:
import seaborn as sns

# Locations of the per-cell-type R2 and p-value tables (KEGG gene sets).
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_KEGG_unfiltered/"
pval_folder = "Padj_KEGG_unfiltered/"
# Not read in this cell: the metadata table itself comes from `metadat`, loaded earlier.
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set shown in this cell's plots.
cell_type = "PEC"
target_gene_set = "hsa04062 Chemokine signaling pathway - Homo sapiens (human)"

metadata = metadat

# Load both tables with their first column as the index (sample IDs, given the lookups below).
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# group1: no diabetes, CKD by Disease_level1; group2: diabetes, CKD by Disease_level2;
# group3: AKI by Disease_level2.
# NOTE(review): group1 filters on Disease_level1 while group2 and group3 filter on
# Disease_level2 — confirm that mixing the two columns is intended.
group1 = metadata.loc[(metadata['Diabetes'] == "No") & (metadata['Disease_level1'] == "CKD"), 'Sample'].unique().tolist()
group2 = metadata.loc[(metadata['Diabetes'] == "Yes") & (metadata['Disease_level2'] == "CKD"), 'Sample'].unique().tolist()
group3 = metadata.loc[metadata['Disease_level2'] == "AKI", 'Sample'].unique().tolist()

# Samples excluded from every group.
# NOTE(review): the reason for excluding these is not recorded here. Any group member missing
# from r2_df's index would make the `.loc` lookups below raise KeyError — presumably this list
# covers such samples; verify.
samples_to_remove = [
    'HK2851', '31-10000', '31-10006', 'HK2558', '29-10393', '28-12510', 'KRAD46', 'KRAD70',
    '27-10094', '27-10066', '28-12546', '31-10105', '29-10406', '28-12178',
    '32-2', '32-10346', '34-10331', '34-10240', '34-10184', '34-10050',
]

group1, group2, group3 = (
    [sample for sample in group if sample not in samples_to_remove]
    for group in (group1, group2, group3)
)

# Samples whose value for the target gene set is below 0.05 (the folder name suggests these
# are adjusted p-values — confirm).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# Overview plot: one column of markers per group, with the group mean marked by a dash.
# x positions of the three groups (closer together than integer spacing).
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Per-group mean R2, in group order; reused by the swarm plot further down.
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in significant_samples, blue = it is not. Each colour is drawn with a
    # single call (red last, so it stays visible where markers overlap).
    is_significant = group_r2_values.index.isin(significant_samples)
    for mask, color in ((~is_significant, 'blue'), (is_significant, 'red')):
        values = group_r2_values[mask]
        plt.plot([group_positions[i]] * len(values), values, marker='o', markersize=5,
                 linestyle='', color=color)

plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

plt.ylim(0.0, 1)
# Equal padding on both sides; an upper limit of 0.9 would place the third group exactly on
# the axes edge, where its markers get clipped.
plt.xlim(0.3, 1.1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number (0, 1, 2).
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once rather than growing the frame inside the loop from an empty DataFrame.
plot_data = pd.concat(group_frames)

# 'red' for samples in significant_samples, 'blue' for all others.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

plt.figure(figsize=(3, 6))

# The dict palette pins each hue label to its colour; a plain list would be assigned in order
# of appearance and could swap red and blue. `order` keeps all three groups on the x-axis even
# if one is empty, so the tick labels and mean markers below stay aligned.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, order=[0, 1, 2], hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3)
# Group means (computed by the strip plot above) at x = 0, 1, 2, drawn on top of the points.
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10,
         markersize=12, markeredgewidth=2)

plt.ylim(0.0, 1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["no Diabetes", "Diabetes", "AKI"])
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # the colour coding is not shown as a legend
plt.grid(False)

plt.show()

# Cell output: index of the samples flagged as significant.
significant_samples

In [None]:


import seaborn as sns

# Locations of the per-cell-type R2 and p-value tables (GO gene sets).
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_unfiltered/"
pval_folder = "Padj_GO_unfiltered/"
# Not read in this cell: the metadata table itself comes from `metadat`, loaded earlier.
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set shown in this cell's plots.
cell_type = "DCT_CNT_CD"
target_gene_set = "Sodium Ion Transport (GO:0006814)"

metadata = metadat

# Load both tables with their first column as the index (sample IDs, given the lookups below).
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Row masks on the metadata table. "Yes" and "No" are matched explicitly, so rows holding any
# other value in these columns fall into neither mask.
is_ckd = metadata['Disease_level1'] == "CKD"
diabetes_yes = metadata['Diabetes'] == "Yes"
diabetes_no = metadata['Diabetes'] == "No"
hypertension_yes = metadata['Hypertension'] == "Yes"
hypertension_no = metadata['Hypertension'] == "No"

# group1: CKD with diabetes, no hypertension; group2: CKD with hypertension, no diabetes;
# group3: CKD with both.
group1 = metadata.loc[diabetes_yes & is_ckd & hypertension_no, 'Sample'].unique().tolist()
group2 = metadata.loc[diabetes_no & is_ckd & hypertension_yes, 'Sample'].unique().tolist()
group3 = metadata.loc[diabetes_yes & hypertension_yes & is_ckd, 'Sample'].unique().tolist()

# Samples excluded from every group.
# NOTE(review): the reason for excluding this sample is not recorded here. Any group member
# missing from r2_df's index would make the `.loc` lookups below raise KeyError — presumably
# that is why it is listed; verify.
samples_to_remove = ['HK2886']

group1, group2, group3 = (
    [sample for sample in group if sample not in samples_to_remove]
    for group in (group1, group2, group3)
)

# Samples whose value for the target gene set is below 0.05 (the folder name suggests these
# are adjusted p-values — confirm).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# Overview plot: one column of markers per group, with the group mean marked by a dash.
# x positions of the three groups (closer together than integer spacing).
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Per-group mean R2, in group order; reused by the swarm plot further down.
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in significant_samples, blue = it is not. Each colour is drawn with a
    # single call (red last, so it stays visible where markers overlap).
    is_significant = group_r2_values.index.isin(significant_samples)
    for mask, color in ((~is_significant, 'blue'), (is_significant, 'red')):
        values = group_r2_values[mask]
        plt.plot([group_positions[i]] * len(values), values, marker='o', markersize=5,
                 linestyle='', color=color)

plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

plt.ylim(0.0, 1)
# Equal padding on both sides; an upper limit of 0.9 would place the third group exactly on
# the axes edge, where its markers get clipped.
plt.xlim(0.3, 1.1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number (0, 1, 2).
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once rather than growing the frame inside the loop from an empty DataFrame.
plot_data = pd.concat(group_frames)

# 'red' for samples in significant_samples, 'blue' for all others.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

plt.figure(figsize=(3, 6))

# The dict palette pins each hue label to its colour; a plain list would be assigned in order
# of appearance and could swap red and blue. `order` keeps all three groups on the x-axis even
# if one is empty, so the tick labels and mean markers below stay aligned.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, order=[0, 1, 2], hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3)
# Group means (computed by the strip plot above) at x = 0, 1, 2, drawn on top of the points.
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10,
         markersize=12, markeredgewidth=2)

plt.ylim(0.0, 1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["D+ HT-", "D- HT+", "D+ HT+"])
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # the colour coding is not shown as a legend
plt.grid(False)

plt.show()

# Cell output: index of the samples flagged as significant.
significant_samples

In [None]:
import seaborn as sns

# Locations of the per-cell-type R2 and p-value tables (GO gene sets).
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_unfiltered/"
pval_folder = "Padj_GO_unfiltered/"
# Not read in this cell: the metadata table itself comes from `metadat`, loaded earlier.
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set shown in this cell's plots.
cell_type = "DCT_CNT_CD"
target_gene_set = "Sodium Ion Transport (GO:0006814)"

metadata = metadat

# Load both tables with their first column as the index (sample IDs, given the lookups below).
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Row masks on the metadata table. "Yes" and "No" are matched explicitly, so rows holding any
# other value in these columns fall into neither mask.
is_ckd = metadata['Disease_level1'] == "CKD"
diabetes_yes = metadata['Diabetes'] == "Yes"
diabetes_no = metadata['Diabetes'] == "No"
hypertension_yes = metadata['Hypertension'] == "Yes"
hypertension_no = metadata['Hypertension'] == "No"

# group1: CKD without hypertension (any diabetes status); group2: CKD with hypertension,
# no diabetes; group3: CKD with diabetes and hypertension.
group1 = metadata.loc[is_ckd & hypertension_no, 'Sample'].unique().tolist()
group2 = metadata.loc[diabetes_no & is_ckd & hypertension_yes, 'Sample'].unique().tolist()
group3 = metadata.loc[diabetes_yes & hypertension_yes & is_ckd, 'Sample'].unique().tolist()

# Samples excluded from every group.
# NOTE(review): the reason for excluding this sample is not recorded here. Any group member
# missing from r2_df's index would make the `.loc` lookups below raise KeyError — presumably
# that is why it is listed; verify.
samples_to_remove = ['HK2886']

group1, group2, group3 = (
    [sample for sample in group if sample not in samples_to_remove]
    for group in (group1, group2, group3)
)

# Samples whose value for the target gene set is below 0.05 (the folder name suggests these
# are adjusted p-values — confirm).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# Overview plot: one column of markers per group, with the group mean marked by a dash.
# x positions of the three groups (closer together than integer spacing).
group_positions = [0.5, 0.7, 0.9]

plt.figure(figsize=(2.5, 6))

# Per-group mean R2, in group order; reused by the swarm plot further down.
mean_r2_values = []

for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in significant_samples, blue = it is not. Each colour is drawn with a
    # single call (red last, so it stays visible where markers overlap).
    is_significant = group_r2_values.index.isin(significant_samples)
    for mask, color in ((~is_significant, 'blue'), (is_significant, 'red')):
        values = group_r2_values[mask]
        plt.plot([group_positions[i]] * len(values), values, marker='o', markersize=5,
                 linestyle='', color=color)

plt.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

plt.ylim(0.0, 1)
# Equal padding on both sides; an upper limit of 0.9 would place the third group exactly on
# the axes edge, where its markers get clipped.
plt.xlim(0.3, 1.1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks(group_positions, ["1", "2", "3"])
plt.grid(False)
plt.show()



# Long-format table for seaborn: one row per sample with its R2 value and group number (0, 1, 2).
group_frames = []
for i, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
# Concatenate once rather than growing the frame inside the loop from an empty DataFrame.
plot_data = pd.concat(group_frames)

# 'red' for samples in significant_samples, 'blue' for all others.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

plt.figure(figsize=(3, 6))

# The dict palette pins each hue label to its colour; a plain list would be assigned in order
# of appearance and could swap red and blue. `order` keeps all three groups on the x-axis even
# if one is empty, so the tick labels and mean markers below stay aligned.
sns.swarmplot(x='Group', y='R2 Value', data=plot_data, order=[0, 1, 2], hue='Color',
              palette={'blue': 'blue', 'red': 'red'}, linewidth=0.3)
# Group means (computed by the strip plot above) at x = 0, 1, 2, drawn on top of the points.
plt.plot(mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2', zorder=10,
         markersize=12, markeredgewidth=2)

plt.ylim(0.0, 1)
plt.xlabel(cell_type)
plt.ylabel("R2 Value")
plt.title(target_gene_set)
plt.xticks([0, 1, 2], ["HT-", "D- HT+", "D+ HT+"])
plt.xlim(-0.5, 2.5)

plt.legend().remove()  # the colour coding is not shown as a legend
plt.grid(False)

plt.show()

# Cell output: index of the samples flagged as significant.
significant_samples

In [None]:


import seaborn as sns

# Locations of the per-cell-type R2 and p-value tables (GO gene sets).
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_unfiltered/"
pval_folder = "Padj_GO_unfiltered/"
# Not read in the visible part of this cell: the metadata table itself comes from `metadat`,
# loaded earlier.
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type and gene set shown in this cell's plots.
cell_type = "DCT_CNT_CD"
target_gene_set = "Regulation Of Monoatomic Ion Transmembrane Transport (GO:0034765)"

metadata = metadat

# Load both tables with their first column as the index (sample IDs, given the lookups below).
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Row masks on the metadata table. "Yes" and "No" are matched explicitly, so rows holding any
# other value in these columns fall into neither mask.
is_ckd = metadata['Disease_level1'] == "CKD"
diabetes_yes = metadata['Diabetes'] == "Yes"
diabetes_no = metadata['Diabetes'] == "No"
hypertension_yes = metadata['Hypertension'] == "Yes"
hypertension_no = metadata['Hypertension'] == "No"

# group1: CKD without hypertension (any diabetes status); group2: CKD with hypertension,
# no diabetes; group3: CKD with diabetes and hypertension.
group1 = metadata.loc[is_ckd & hypertension_no, 'Sample'].unique().tolist()
group2 = metadata.loc[diabetes_no & is_ckd & hypertension_yes, 'Sample'].unique().tolist()
group3 = metadata.loc[diabetes_yes & hypertension_yes & is_ckd, 'Sample'].unique().tolist()

# Samples excluded from every group.
# NOTE(review): the reason for excluding this sample is not recorded here. Any group member
# missing from r2_df's index would make the `.loc` lookups below raise KeyError — presumably
# that is why it is listed; verify.
samples_to_remove = ['HK2886']

group1, group2, group3 = (
    [sample for sample in group if sample not in samples_to_remove]
    for group in (group1, group2, group3)
)

# Samples whose value for the target gene set is below 0.05 (the folder name suggests these
# are adjusted p-values — confirm).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# --- Strip plot: one column of points per group ---

# x-coordinate of each group's column of points.
group_positions = [0.5, 0.7, 0.9]

fig, ax = plt.subplots(figsize=(2.5, 6))

# Mean R2 per group, in group order; reused by the swarm plot further down.
mean_r2_values = []

for x_position, group in zip(group_positions, [group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in `significant_samples`, blue = every other sample.
    is_significant = group_r2_values.index.isin(significant_samples)
    for values, color in (
        (group_r2_values[~is_significant], 'blue'),
        (group_r2_values[is_significant], 'red'),
    ):
        ax.plot([x_position] * len(values), values, marker='o', markersize=5, linestyle='', color=color)

# Group means as short horizontal dashes.
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
# Keep a margin on both sides: an upper x-limit of 0.9 would place the third
# group exactly on the axes edge, where its markers are cut in half.
ax.set_xlim(0.3, 1.1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks(group_positions)
ax.set_xticklabels(["1", "2", "3"])
ax.grid(False)
plt.show()



# --- Beeswarm plot of the same groups ---

# Long-format table: one row per sample with its R2 value and group number.
# The per-group frames are collected first and concatenated once.
group_frames = []
for group_number, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': group_number, 'Sample': group_r2_values.index})
    )
plot_data = pd.concat(group_frames)

# 'red' for samples in `significant_samples`, 'blue' for the rest.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(3, 6))

# The palette maps each hue level to its colour explicitly. A plain list is
# assigned to the levels in order of first appearance, so the colours would be
# swapped whenever the first row happens to be a 'red' sample.
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty.
sns.swarmplot(
    x='Group', y='R2 Value', data=plot_data, hue='Color',
    palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3, ax=ax,
)

# Group means (computed by the strip-plot code above), drawn once on top of the points.
ax.plot(range(len(mean_r2_values)), mean_r2_values, marker='_', color='black', linestyle='',
        label='Mean R2', zorder=10, markersize=12, markeredgewidth=2)

ax.set_ylim(0.5, 1)  # zoomed in: samples with R2 below 0.5 fall outside the view
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["HT-", "D- HT+", "D+ HT+"])
ax.set_xlim(-0.5, 2.5)
ax.grid(False)

# The hue legend only repeats the colour names; remove it if seaborn drew one.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

plt.show()

significant_samples

In [None]:
# Number of sample IDs currently in group3.
len(group3)

In [None]:
# Show the sample IDs currently in group3.
group3

In [None]:
# Show group3 again (same expression as the previous cell).
group3

In [None]:
import seaborn as sns

base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_unfiltered/"
pval_folder = "Padj_GO_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type whose R2 / p-value tables are plotted.
cell_type = "DCT_CNT_CD"

# Gene set (table column) to plot; earlier choices are kept for reference.
#target_gene_set = "Sodium Ion Import Across Plasma Membrane (GO:0098719)"
#target_gene_set = "hsa04066 HIF-1 signaling pathway - Homo sapiens (human)"
target_gene_set = "Regulation Of Monoatomic Ion Transmembrane Transport (GO:0034765)"

# `metadat` is the sample metadata table read earlier in the notebook;
# the rest of this cell refers to it as `metadata`.
metadata = metadat

# Per-sample R2 and p-value tables for `cell_type`: rows are indexed by the
# first CSV column (sample IDs), columns are gene sets.
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Sample groups compared in the plots below (all restricted to non-control samples):
#   group1: no hypertension (any diabetes status)
#   group2: hypertension, no diabetes
#   group3: hypertension and diabetes
is_diseased = metadata['Disease_level1'] != "Control"
has_hypertension = metadata['Hypertension'] == "Yes"

group1 = metadata[is_diseased & (metadata['Hypertension'] == "No")].Sample.unique().tolist()
group2 = metadata[is_diseased & has_hypertension & (metadata['Diabetes'] == "No")].Sample.unique().tolist()
group3 = metadata[is_diseased & has_hypertension & (metadata['Diabetes'] == "Yes")].Sample.unique().tolist()

# Samples excluded by hand from every group.
samples_to_remove = ['HK2886', '32-10346']

# Samples that are in a group but have no row in the R2 table for this cell
# type. `r2_df.loc[group, ...]` raises KeyError on a missing label, so they
# are dropped here (and reported) instead of crashing the plotting code.
candidate_groups = (group1, group2, group3)
samples_without_r2 = [
    sample
    for group in candidate_groups
    for sample in group
    if sample not in samples_to_remove and sample not in r2_df.index
]
if samples_without_r2:
    print(f"Skipping samples with no R2 entry for {cell_type}: {samples_without_r2}")

group1, group2, group3 = (
    [
        sample
        for sample in group
        if sample not in samples_to_remove and sample not in samples_without_r2
    ]
    for group in candidate_groups
)

# Samples whose p-value for the target gene set is below 0.05.
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# --- Strip plot: one column of points per group ---

# x-coordinate of each group's column of points.
group_positions = [0.5, 0.7, 0.9]

fig, ax = plt.subplots(figsize=(2.5, 6))

# Mean R2 per group, in group order; reused by the swarm plot further down.
mean_r2_values = []

for x_position, group in zip(group_positions, [group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in `significant_samples`, blue = every other sample.
    is_significant = group_r2_values.index.isin(significant_samples)
    for values, color in (
        (group_r2_values[~is_significant], 'blue'),
        (group_r2_values[is_significant], 'red'),
    ):
        ax.plot([x_position] * len(values), values, marker='o', markersize=5, linestyle='', color=color)

# Group means as short horizontal dashes.
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
# Keep a margin on both sides: an upper x-limit of 0.9 would place the third
# group exactly on the axes edge, where its markers are cut in half.
ax.set_xlim(0.3, 1.1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks(group_positions)
ax.set_xticklabels(["1", "2", "3"])
ax.grid(False)
plt.show()



# --- Beeswarm plot of the same groups ---

# Long-format table: one row per sample with its R2 value and group number.
# The per-group frames are collected first and concatenated once.
group_frames = []
for group_number, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': group_number, 'Sample': group_r2_values.index})
    )
plot_data = pd.concat(group_frames)

# 'red' for samples in `significant_samples`, 'blue' for the rest.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(3, 6))

# The palette maps each hue level to its colour explicitly. A plain list is
# assigned to the levels in order of first appearance, so the colours would be
# swapped whenever the first row happens to be a 'red' sample.
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty.
sns.swarmplot(
    x='Group', y='R2 Value', data=plot_data, hue='Color',
    palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3, ax=ax,
)

# Group means (computed by the strip-plot code above), drawn once on top of the points.
ax.plot(range(len(mean_r2_values)), mean_r2_values, marker='_', color='black', linestyle='',
        label='Mean R2', zorder=10, markersize=12, markeredgewidth=2)

ax.set_ylim(0.5, 1)  # zoomed in: samples with R2 below 0.5 fall outside the view
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["HT-", "D- HT+", "D+ HT+"])
ax.set_xlim(-0.5, 2.5)
ax.grid(False)

# The hue legend only repeats the colour names; remove it if seaborn drew one.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

plt.show()

significant_samples

In [None]:
import pandas as pd
import os

# Inputs for the CKD-vs-AKI significance comparison carried out in this cell.
cell_types = [
    "Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal",
    "Immune", "PEC", "PT", "IC", "DTL_ATL",
]
base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_0/"
pval_folder = "Padj_GO_0/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# `metadata` and `metadat` both name the metadata table loaded earlier in the notebook.
metadata = metadat

# NOTE(review): `cell_type` is not assigned in this cell; whatever value an
# earlier cell left behind decides which tables are read here, and nothing
# later in this cell uses the module-level `r2_df` / `pval_df`.
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df, pval_df = (pd.read_csv(csv_path, index_col=0) for csv_path in (r2_path, pval_path))

# Samples (rows) with more missing p-values than this are dropped further down.
nan_threshold = 100000


# Build one wide p-value table (samples x "<cell type>_<gene set>") for a set of samples
def process_condition(metadat, unique_samples, cell_type_names=None, data_root=None, pval_subdir=None):
    """Combine the per-cell-type p-value tables for the given samples.

    For every cell type the file ``Pval_<cell_type>.csv`` is read with its
    first column as index, restricted to the requested samples, its columns
    are prefixed with ``<cell_type>_``, and the result is left-joined onto the
    sample index. A sample without a row in a given file gets NaN in that
    file's columns.

    Parameters
    ----------
    metadat
        Not used by the function; kept so existing calls keep working.
    unique_samples
        Iterable of sample IDs. Repeated IDs are collapsed (first occurrence
        wins) so that each sample contributes exactly one row.
    cell_type_names
        Cell types to read. Defaults to the module-level ``cell_types``.
    data_root
        Directory containing the p-value folder. Defaults to ``base_path``.
    pval_subdir
        Sub-folder holding the ``Pval_*.csv`` files. Defaults to ``pval_folder``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct sample, one column per (cell type, gene set).

    Raises
    ------
    FileNotFoundError
        If a ``Pval_<cell_type>.csv`` file does not exist.
    """
    if cell_type_names is None:
        cell_type_names = cell_types
    if data_root is None:
        data_root = base_path
    if pval_subdir is None:
        pval_subdir = pval_folder

    # A repeated ID in the index would be repeated in every join below and the
    # sample would then be counted more than once downstream.
    sample_index = pd.Index(unique_samples).unique()

    combined_pval = pd.DataFrame(index=sample_index)
    for cell_type in cell_type_names:
        pval_path = os.path.join(data_root, pval_subdir, f"Pval_{cell_type}.csv")
        pval_df = pd.read_csv(pval_path, index_col=0)
        pval_df = pval_df[pval_df.index.isin(sample_index)]
        pval_df.columns = [f"{cell_type}_{col}" for col in pval_df.columns]
        combined_pval = combined_pval.join(pval_df, how='left')
    return combined_pval

# Per-column tally of p-values below / not below a cut-off
def count_significance(pval_df, threshold=0.05):
    """Count, for every column, the entries below and not below ``threshold``.

    Missing values satisfy neither comparison, so they are left out of both
    counts.

    Returns a tuple ``(sig_count, nonsig_count)`` of Series indexed by the
    columns of ``pval_df``.
    """
    is_below = pval_df.lt(threshold)
    is_at_or_above = pval_df.ge(threshold)
    return is_below.sum(axis=0, skipna=True), is_at_or_above.sum(axis=0, skipna=True)

# Sample sets for the two conditions being compared. `.unique()` keeps each
# sample to a single entry even if the metadata lists it on several rows, so
# no sample is counted more than once in the contingency tables.
cond1_samples = metadata.loc[metadata['Disease_level1'] == "CKD", "Sample"].unique().tolist()
cond2_samples = metadata.loc[metadata['Disease_level1'] == "AKI", "Sample"].unique().tolist()

combined_pval_cond1 = process_condition(metadata, cond1_samples)  # condition 1: CKD
combined_pval_cond2 = process_condition(metadata, cond2_samples)  # condition 2: AKI

# Number of missing p-values per sample (row).
nan_counts_per_sample1 = combined_pval_cond1.isna().sum(axis=1)
nan_counts_per_sample2 = combined_pval_cond2.isna().sum(axis=1)

# Drop samples with more than `nan_threshold` missing p-values.
samples_to_exclude1 = nan_counts_per_sample1[nan_counts_per_sample1 > nan_threshold].index
samples_to_exclude2 = nan_counts_per_sample2[nan_counts_per_sample2 > nan_threshold].index

combined_pval_cond1 = combined_pval_cond1.drop(samples_to_exclude1, axis=0)
combined_pval_cond2 = combined_pval_cond2.drop(samples_to_exclude2, axis=0)

# Per feature: how many samples of each condition are below / not below 0.05.
sig_count_cond1, nonsig_count_cond1 = count_significance(combined_pval_cond1)
sig_count_cond2, nonsig_count_cond2 = count_significance(combined_pval_cond2)

# Four rows of counts (one per condition x significance), one column per
# feature, using the feature columns of condition 1.
comparison_df = pd.DataFrame(
    [sig_count_cond1, nonsig_count_cond1, sig_count_cond2, nonsig_count_cond2],
    index=['cond1_sig', 'cond1_nonsig', 'cond2_sig', 'cond2_nonsig'],
    columns=combined_pval_cond1.columns,
)


import pandas as pd
import numpy as np
from scipy.stats import fisher_exact

# Fisher's exact test per feature (column of `comparison_df`): does the split
# between significant and non-significant samples differ between condition 1
# and condition 2?
# NOTE: the p-values computed here are not corrected for the number of
# features tested.

count_rows = ['cond1_sig', 'cond1_nonsig', 'cond2_sig', 'cond2_nonsig']
count_matrix = comparison_df.loc[count_rows].to_numpy()  # shape: (4, number of features)

# One record per feature, turned into a DataFrame once at the end.
# Concatenating onto a growing DataFrame inside the loop copies the whole
# frame on every iteration.
fisher_records = []
for feature, counts in zip(comparison_df.columns, count_matrix.T):
    # 2x2 table: rows = condition 1 / condition 2,
    # columns = significant / not significant.
    contingency_table = counts.reshape(2, 2)

    odds_ratio, p = fisher_exact(contingency_table)

    fisher_records.append({
        'Feature': feature,
        'Test_statistic': odds_ratio,  # same value as "odds_ratio"; kept so the output columns are unchanged
        'p_value': p,
        "odds_ratio": odds_ratio,
        'cond1_sig_count': counts[0],
        'cond2_sig_count': counts[2],
    })

fisher_results = pd.DataFrame(
    fisher_records,
    columns=['Feature', 'Test_statistic', 'p_value', "odds_ratio", 'cond1_sig_count', 'cond2_sig_count'],
)

# Smallest p-values first.
sorted_results = fisher_results.sort_values(by='p_value')

# Plain-text preview of the top rows, followed by the rich table as the cell output.
print(sorted_results.head())

sorted_results.head(50)

In [None]:
# Sample IDs of all metadata rows with Disease_level1 == "AKI" (one entry per
# row, so an ID repeats if the sample appears on several rows).
metadata[(metadata['Disease_level1'] == "AKI")]["Sample"].tolist()

In [None]:
import seaborn as sns

base_path = "/home/isilon/users/o_kloetzer/Atlas/Revision/human_extended/output_human_extended/"
r2_folder = "R2_GO_unfiltered/"
pval_folder = "Padj_GO_unfiltered/"
metadat_path = "/home/isilon/users/o_kloetzer/Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"

# Cell type whose R2 / p-value tables are plotted.
cell_type = "Stromal"

# Gene set (table column) to plot; earlier choices are kept for reference.
#target_gene_set = "Sodium Ion Import Across Plasma Membrane (GO:0098719)"
#target_gene_set = "hsa04066 HIF-1 signaling pathway - Homo sapiens (human)"
target_gene_set = "Positive Regulation Of Pathway-Restricted SMAD Protein Phosphorylation (GO:0010862)"

# `metadat` is the sample metadata table read earlier in the notebook;
# the rest of this cell refers to it as `metadata`.
metadata = metadat

# Per-sample R2 and p-value tables for `cell_type`: rows are indexed by the
# first CSV column (sample IDs), columns are gene sets.
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Sample groups compared in the plots below (all restricted to non-control samples):
#   group1: no hypertension, no diabetes
#   group2: hypertension, no diabetes
#   group3: hypertension and diabetes
is_diseased = metadata['Disease_level1'] != "Control"
has_hypertension = metadata['Hypertension'] == "Yes"
has_diabetes = metadata['Diabetes'] == "Yes"
no_diabetes = metadata['Diabetes'] == "No"

group1 = metadata[is_diseased & (metadata['Hypertension'] == "No") & no_diabetes].Sample.unique().tolist()
group2 = metadata[is_diseased & has_hypertension & no_diabetes].Sample.unique().tolist()
group3 = metadata[is_diseased & has_hypertension & has_diabetes].Sample.unique().tolist()

# Samples excluded by hand from every group (none for this plot).
samples_to_remove = []

# Samples that are in a group but have no row in the R2 table for this cell
# type. `r2_df.loc[group, ...]` raises KeyError on a missing label, so they
# are dropped here (and reported) instead of crashing the plotting code.
candidate_groups = (group1, group2, group3)
samples_without_r2 = [
    sample
    for group in candidate_groups
    for sample in group
    if sample not in samples_to_remove and sample not in r2_df.index
]
if samples_without_r2:
    print(f"Skipping samples with no R2 entry for {cell_type}: {samples_without_r2}")

group1, group2, group3 = (
    [
        sample
        for sample in group
        if sample not in samples_to_remove and sample not in samples_without_r2
    ]
    for group in candidate_groups
)

# Samples whose p-value for the target gene set is below 0.05.
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.05]


# --- Strip plot: one column of points per group ---

# x-coordinate of each group's column of points.
group_positions = [0.5, 0.7, 0.9]

fig, ax = plt.subplots(figsize=(2.5, 6))

# Mean R2 per group, in group order; reused by the swarm plot further down.
mean_r2_values = []

for x_position, group in zip(group_positions, [group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    # Red = sample is in `significant_samples`, blue = every other sample.
    is_significant = group_r2_values.index.isin(significant_samples)
    for values, color in (
        (group_r2_values[~is_significant], 'blue'),
        (group_r2_values[is_significant], 'red'),
    ):
        ax.plot([x_position] * len(values), values, marker='o', markersize=5, linestyle='', color=color)

# Group means as short horizontal dashes.
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

ax.set_ylim(0.0, 1)
# Keep a margin on both sides: an upper x-limit of 0.9 would place the third
# group exactly on the axes edge, where its markers are cut in half.
ax.set_xlim(0.3, 1.1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks(group_positions)
ax.set_xticklabels(["1", "2", "3"])
ax.grid(False)
plt.show()



# --- Beeswarm plot of the same groups ---

# Long-format table: one row per sample with its R2 value and group number.
# The per-group frames are collected first and concatenated once.
group_frames = []
for group_number, group in enumerate([group1, group2, group3]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': group_number, 'Sample': group_r2_values.index})
    )
plot_data = pd.concat(group_frames)

# 'red' for samples in `significant_samples`, 'blue' for the rest.
plot_data['Color'] = plot_data['Sample'].isin(significant_samples).map({True: 'red', False: 'blue'})

fig, ax = plt.subplots(figsize=(3, 6))

# The palette maps each hue level to its colour explicitly. A plain list is
# assigned to the levels in order of first appearance, so the colours would be
# swapped whenever the first row happens to be a 'red' sample.
# `order` pins the groups to x = 0, 1, 2 even if one of them is empty.
sns.swarmplot(
    x='Group', y='R2 Value', data=plot_data, hue='Color',
    palette={'blue': 'blue', 'red': 'red'}, order=[0, 1, 2], linewidth=0.3, ax=ax,
)

# Group means (computed by the strip-plot code above), drawn once on top of the points.
ax.plot(range(len(mean_r2_values)), mean_r2_values, marker='_', color='black', linestyle='',
        label='Mean R2', zorder=10, markersize=12, markeredgewidth=2)

ax.set_ylim(0.0, 1)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["-/-", "-/+", "+/+"])
ax.set_xlim(-0.5, 2.5)
ax.grid(False)

# The hue legend only repeats the colour names; remove it if seaborn drew one.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

plt.show()

significant_samples

In [None]:
# Show the sample IDs currently in group1.
group1