In [None]:
import os

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Figure defaults used by every plot that does not pass its own figsize.
plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (4, 4),
})

# pandas display limits for the DataFrames rendered in this notebook.
for option_name, option_value in (
    ('display.max_columns', 20),    # maximum number of columns to display
    ('display.max_colwidth', 100),  # maximum column width for text data
    ('display.max_rows', 100),      # maximum number of rows to display
):
    pd.set_option(option_name, option_value)

In [None]:

# Cell types analysed below; the loading loop reads one R2_<type>.csv and one
# Pval_<type>.csv per entry.
cell_types = [
    "Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal",
    "Immune", "PEC", "PT", "IC", "DTL_ATL",
]  # etc.

# Input locations (placeholders: replace with your actual paths).
base_path = ".../Atlas/scSPECTRA/onthefly/onthefly_V2/"
r2_folder = "R2/"
pval_folder = "Pval/"
metadat_path = ".../Atlas/scSPECTRA/R2_pval/Atlas_Extended_II_Albuminuria_gpt.csv"


def extract_cell_type(gene_set):
    """Return the first entry of ``cell_types`` that ``gene_set`` starts with.

    Entries are tried in list order; ``"Unknown"`` is returned when none of
    them is a prefix of ``gene_set``.
    """
    return next(
        (known for known in cell_types if gene_set.startswith(known)),
        "Unknown",
    )

# Load the sample metadata and keep the ids of all non-control samples.
metadat = pd.read_csv(metadat_path)
unique_samples = metadat.loc[metadat['Disease_level2'] != "Control", 'Sample'].unique()
#unique_samples = metadat[(metadat['Disease_level2'] != "Control") & (metadat['Project/Dataset'] == "cal_CKD")]['Sample'].unique()

# Sample-indexed accumulators; each cell type contributes one batch of columns.
combined_r2 = pd.DataFrame(index=unique_samples)
combined_pval = pd.DataFrame(index=unique_samples)
sample_counts_per_cell_type = {}

for cell_type in cell_types:
    r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
    pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

    # Prefix every gene-set column with its cell type so names stay unique
    # once the tables of all cell types sit side by side.
    r2_df = pd.read_csv(r2_path, index_col=0).add_prefix(f"{cell_type}_")
    pval_df = pd.read_csv(pval_path, index_col=0).add_prefix(f"{cell_type}_")

    # Left joins keep exactly the selected samples; samples absent from a
    # table get NaN in that table's columns.
    combined_r2 = combined_r2.join(r2_df, how='left')
    combined_pval = combined_pval.join(pval_df, how='left')

    # Number of selected samples that have a row in this cell type's R2 table.
    r2_df = r2_df.loc[r2_df.index.isin(unique_samples)]
    sample_counts_per_cell_type[cell_type] = len(r2_df)

# A missing p-value (sample absent from a cell type's table) is replaced by 1,
# so it can never fall below the significance threshold.
combined_pval = combined_pval.fillna(1)

print(combined_r2)

# Min-max scale combined_r2; MinMaxScaler rescales each column (gene set)
# independently using that column's minimum and maximum.
min_max_scaler = MinMaxScaler()

# fit_transform expects a 2D array, so hand it the raw values of the frame.
min_max_scaled_r2_values = min_max_scaler.fit_transform(combined_r2.to_numpy())

# Re-wrap the scaled array with the original sample index and column names.
# From here on `combined_r2` holds the scaled values, not the raw R2.
combined_r2 = pd.DataFrame(
    data=min_max_scaled_r2_values,
    index=combined_r2.index,
    columns=combined_r2.columns,
)

# Significance cut-off applied to every sample x gene-set p-value.
p_value_threshold = 0.01

# Boolean sample x gene-set matrix, then the number of significant samples
# per gene set (column-wise sum).
significant_matrix = combined_pval.lt(p_value_threshold)
significant_counts = significant_matrix.sum()

# Long-form table: one row per gene-set column.
significant_counts_df = pd.DataFrame({
    'GeneSet': significant_counts.index,
    'Count': significant_counts.values,
})

# Recover the cell type from the column-name prefix.
significant_counts_df['CellType'] = significant_counts_df['GeneSet'].map(extract_cell_type)


def _normalized_percentage(row):
    """Share (in %) of a cell type's samples in which the gene set is significant."""
    # Cell types without a recorded sample count fall back to a divisor of 1.
    n_samples = sample_counts_per_cell_type.get(row['CellType'], 1)
    return (row['Count'] / n_samples) * 100


significant_counts_df['NormalizedPercentage'] = significant_counts_df.apply(
    _normalized_percentage, axis=1
)

# Quick preview, then the full table sorted from most to least recurrent.
print(significant_counts_df.head())

sorted_df = significant_counts_df.sort_values(by='NormalizedPercentage', ascending=False)
display(sorted_df)

In [None]:

# Gene sets that are significant in at least 25% of their cell type's samples.
# .copy() makes this an independent frame: adding a column below would
# otherwise write to a slice of sorted_df and raise SettingWithCopyWarning.
repetitive_features = sorted_df[sorted_df["NormalizedPercentage"] >= 25].copy()

# Strip only the leading "<CellType>_" prefix. A plain str.replace would also
# delete the same text if it happened to occur again inside the gene-set name.
repetitive_features['GeneSetOnly'] = [
    gene_set[len(cell_type) + 1:] if gene_set.startswith(cell_type + '_') else gene_set
    for gene_set, cell_type in zip(
        repetitive_features['GeneSet'], repetitive_features['CellType']
    )
]

repetitive_features

In [None]:
# Only the last expression of a cell is rendered, so two bare len(...) lines
# would silently discard the first value. Show both numbers in one output:
# distinct gene-set names vs. total rows (a name repeats when it recurs in
# several cell types).
{
    "unique_gene_sets": len(repetitive_features["GeneSetOnly"].unique()),
    "total_rows": len(repetitive_features["GeneSetOnly"]),
}

In [None]:



import re

def auto_split_label(label, max_length=40):
    """Wrap a gene-set label over several lines and drop GO identifiers.

    Parameters
    ----------
    label : str
        Label text; it is split on whitespace.
    max_length : int, default 40
        Maximum number of characters per line. A single word longer than this
        is kept whole on its own line.

    Returns
    -------
    str
        The words joined by single spaces, with ``'\\n'`` inserted wherever
        adding the next word would exceed ``max_length``. A trailing
        ``(GO:1234567)`` is removed from every word. Returns ``''`` when no
        words remain (empty label, or a label made only of GO identifiers).
    """
    go_suffix = re.compile(r'\(GO:\d{7}\)$')

    # Strip the GO suffix first, then discard words that became empty (a
    # standalone "(GO:0034765)" token). Keeping them would leave a trailing
    # space or a trailing blank line in the label.
    words = [go_suffix.sub('', word) for word in label.split()]
    words = [word for word in words if word]
    if not words:
        return ''

    lines = [words[0]]
    for word in words[1:]:
        # +1 accounts for the space that joins the word to the current line.
        if len(lines[-1]) + len(word) + 1 <= max_length:
            lines[-1] += ' ' + word
        else:
            lines.append(word)

    return '\n'.join(lines)


# For each cell type keep the gene set with the highest NormalizedPercentage.
top_features_per_celltype = repetitive_features.loc[
    repetitive_features.groupby('CellType')['NormalizedPercentage'].idxmax()
]

# Ascending order, so the largest bar ends up at the top of the horizontal plot.
top_features_per_celltype_sorted = top_features_per_celltype.sort_values(by='NormalizedPercentage', ascending=True)

# Colour assigned to each cell type.
cell_colors = {
    "DCT_CNT_CD": "#3182bd",
    "DTL_ATL": "#fdd0a2",
    "EC": "seagreen",
    "IC": "orange",
    "Immune": "#c7e9c0",
    "Podo": "#555555",
    "Stromal": "limegreen",
    "PEC": "#fde725",
    "PT": "darkorchid",
    "TAL": "lightcoral",
}

# One horizontal bar per cell type. Bars are positioned by cell type, but the
# tick labels show the (wrapped) name of that cell type's top gene set.
fig, ax = plt.subplots(figsize=(12, 8))
barplot = ax.barh(
    top_features_per_celltype_sorted['CellType'],
    top_features_per_celltype_sorted['NormalizedPercentage'],
    # .get with a grey fallback: extract_cell_type can return "Unknown", which
    # has no entry in cell_colors and would otherwise raise KeyError.
    color=[cell_colors.get(ct, 'grey') for ct in top_features_per_celltype_sorted['CellType']],
)

ax.set_yticks(top_features_per_celltype_sorted['CellType'])
ax.set_yticklabels(
    [auto_split_label(gs) for gs in top_features_per_celltype_sorted['GeneSetOnly']],
    fontsize=18,
)

ax.set_xlabel('Normalized Percentage')
ax.set_title('Top Common Features per Cell Type (Sorted by Normalized Percentage)')
fig.tight_layout()
plt.show()



In [None]:
# Colour assigned to each cell type.
cell_colors = {
    "DCT_CNT_CD": "#3182bd",
    "DTL_ATL": "#fdd0a2",
    "EC": "seagreen",
    "IC": "orange",
    "Immune": "#c7e9c0",
    "Podo": "#555555",
    "Stromal": "limegreen",
    "PEC": "#fde725",
    "PT": "darkorchid",
    "TAL": "lightcoral",
}

# Gene sets that are significant in at least 25% of their cell type's samples.
top_features = sorted_df[sorted_df["NormalizedPercentage"] >= 25]

# Number of those gene sets contributed by each cell type.
cell_type_counts = top_features['CellType'].value_counts()

# Exactly one colour per wedge, in wedge order. Unmapped cell types (e.g.
# "Unknown") get grey; skipping them would shift every later colour onto the
# wrong wedge.
pie_colors = [cell_colors.get(cell_type, 'grey') for cell_type in cell_type_counts.index]

# Plotting the pie chart
fig, ax = plt.subplots(figsize=(5, 5))  # Adjust the size as needed
cell_type_counts.plot(kind='pie', colors=pie_colors, autopct='', ax=ax)
ax.set_title('25% of patients cut-off')
# The y-label slot shows the total number of gene sets above the cut-off.
# (The previous code referenced an undefined `top_100_features` here.)
ax.set_ylabel(str(len(top_features)))
plt.show()

In [None]:
# Sample whose gene-set results are visualised in this cell.
sample_of_interest = 'PKD4_humphreys_ADPKD'

# Cell types shown in the three per-cell-type dot plots further down,
# in plotting order.
ct_plot1, ct_plot2, ct_plot3 = "PT", "TAL", "EC"

def auto_split_label(label, max_length=50):
    """Wrap a label over several lines of at most ``max_length`` characters.

    NOTE: this redefinition replaces the earlier ``auto_split_label`` of this
    notebook for all code that runs after it. Unlike the earlier version it
    keeps ``(GO:...)`` identifiers and wraps at 50 instead of 40 characters.

    Parameters
    ----------
    label : str
        Label text; it is split on whitespace.
    max_length : int, default 50
        Maximum number of characters per line. A single word longer than this
        is kept whole on its own line.

    Returns
    -------
    str
        The words joined by single spaces, with ``'\\n'`` inserted wherever
        adding the next word would exceed ``max_length``. Returns ``''`` for
        an empty or whitespace-only label.
    """
    words = label.split()
    if not words:
        # Nothing to wrap; indexing words[0] would raise IndexError.
        return ''

    lines = [words[0]]
    for word in words[1:]:
        # +1 accounts for the space that joins the word to the current line.
        if len(lines[-1]) + len(word) + 1 <= max_length:
            lines[-1] += ' ' + word
        else:
            lines.append(word)

    return '\n'.join(lines)

# Colour assigned to each cell type in the per-sample plots of this cell.
# NOTE(review): "Podo" is "#000004" here but "#555555" in the cell_colors
# definitions of the earlier cells; confirm which one is intended.
cell_colors = dict(
    DCT_CNT_CD="#3182bd",
    DTL_ATL="#fdd0a2",
    EC="seagreen",
    IC="orange",
    Immune="#c7e9c0",
    Podo="#000004",
    Stromal="limegreen",
    PEC="#fde725",
    PT="darkorchid",
    TAL="lightcoral",
)


def extract_cell_type(feature_name):
    """Return the first key of ``cell_colors`` that ``feature_name`` starts with.

    Keys are tried in the dictionary's insertion order; ``"Unknown"`` is
    returned when no key is a prefix of ``feature_name``. This redefinition
    replaces the earlier ``extract_cell_type`` (which scanned ``cell_types``)
    for all code that runs after it.
    """
    for known_type in cell_colors:
        if not feature_name.startswith(known_type):
            continue
        return known_type
    return "Unknown"

# The 25 gene sets with the smallest p-values for the sample of interest.
# (The variable is called top_20_features but holds 25 entries.)
top_20_features = combined_pval.loc[sample_of_interest].nsmallest(25)
neg_log_pvals = -np.log10(top_20_features)
r2_values = combined_r2.loc[sample_of_interest, top_20_features.index]

# Reversed red colormap on a fixed 0..1 scale for the (min-max scaled) R2 values.
cmap = plt.cm.Reds_r
norm = mcolors.Normalize(vmin=0, vmax=1)

# Dot area = NormalizedPercentage * scale_factor + base_size.
scale_factor = 4
base_size = 20

# Gene set -> share of samples (in %) in which it is significant.
feature_to_percentage = dict(zip(significant_counts_df['GeneSet'], significant_counts_df['NormalizedPercentage']))

# Lollipop plot: one row per gene set, stem coloured by cell type, dot coloured
# by scaled R2 and sized by how often the gene set is significant.
fig, ax = plt.subplots(figsize=(16, 5))
for i, (feature, neg_log_pval) in enumerate(neg_log_pvals.items()):
    cell_type = extract_cell_type(feature)

    # The stem stops at 90% of the value so it does not run under the dot.
    ax.plot(
        [0, neg_log_pval - (neg_log_pval / 10)],
        [i, i],
        color=cell_colors.get(cell_type, 'grey'),
        linewidth=3,
    )
    ax.scatter(
        neg_log_pval,
        i,
        s=feature_to_percentage.get(feature, 0) * scale_factor + base_size,
        color=cmap(norm(r2_values[feature])),
        edgecolor='black',
        alpha=0.7,
        linewidth=0.5,
    )

ax.set_yticks(range(len(top_20_features.index)))
ax.set_yticklabels(top_20_features.index, fontsize=10)
ax.set_xlabel('-log10(p-value)', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title(f'Top Features for Sample {sample_of_interest}', fontsize=14)

# Row 0 holds the smallest p-value; invert the axis so it is drawn at the top.
ax.invert_yaxis()

# Colorbar for the dot colours.
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, label='Scaled R2 Values', orientation='vertical')

# First legend: stem colour -> cell type.
cell_legend_elements = [
    plt.Line2D([0], [0], color=color, lw=4, label=cell_type)
    for cell_type, color in cell_colors.items()
]
legend1 = ax.legend(handles=cell_legend_elements, title='Cell Types', bbox_to_anchor=(1.5, 1), loc='upper left')

# Second legend: dot size -> frequency, built from empty scatter artists.
example_percentages = [20, 40, 60, 80]
legend_dot_sizes = [p * scale_factor + base_size for p in example_percentages]
for p, size in zip(example_percentages, legend_dot_sizes):
    ax.scatter([], [], s=size, color='gray', edgecolor='black', alpha=0.7, label=f'{p}%')

# Creating the second legend replaces the first on the axes, so the first one
# is re-attached explicitly afterwards.
legend2 = ax.legend(title='Frequency', bbox_to_anchor=(1.5, 0.2), loc='center left')
ax.add_artist(legend1)

plt.tight_layout()
plt.show()


# Order in which the cell types are placed around the radar chart.
desired_order = [
    "PT",
    "TAL",
    "DCT_CNT_CD",
    "IC",
    "EC",
    "Stromal",
    "Immune",
    "DTL_ATL",
    "PEC",
    "Podo",
]

# All gene sets that are significant (p < p_value_threshold) for the sample.
# (The name top_20_features is reused; here it is not limited to 20 entries.)
sample_features = combined_pval.loc[sample_of_interest]
top_20_features = sample_features[sample_features < p_value_threshold]
top_gene_sets = top_20_features.index

# Number of significant gene sets per cell type.
grouped_data = pd.Series(top_gene_sets).map(extract_cell_type).value_counts()
categories = list(grouped_data.index)

# Counts in display order (0 for cell types without any significant gene set);
# the first value is repeated at the end to close the polygon.
ordered_values = [grouped_data.get(ct, 0) for ct in desired_order]
ordered_values.append(ordered_values[0])

# One evenly spaced angle per cell type, closed the same way.
N = len(desired_order)
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles.append(angles[0])

# Polar axes with one labelled spoke per cell type.
fig, ax = plt.subplots(figsize=(4, 4), subplot_kw=dict(polar=True))
plt.xticks(angles[:-1], desired_order)

# Radial tick labels: drawn along angle 0, small and grey.
ax.set_rlabel_position(0)
plt.yticks(color="grey", size=5)

# Neutral fill for the polygon, plus a thin outline connecting the points.
shade_color = 'lightgrey'
ax.fill(angles, ordered_values, shade_color, alpha=0.9)
ax.plot(angles, ordered_values, color='grey', linewidth=1, linestyle='-', alpha=0.8)

# Line from each polygon corner back to the origin, in the cell type's colour.
for spoke_angle, spoke_value, spoke_type in zip(angles, ordered_values, desired_order):
    ax.plot(
        [spoke_angle, 0],
        [spoke_value, 0],
        color=cell_colors.get(spoke_type, "grey"),
        linewidth=2,
    )

# Cell types without any significant gene set get a red tick label.
for tick_label, tick_type in zip(ax.get_xticklabels(), desired_order):
    if tick_type not in categories:
        tick_label.set_color('red')

plt.title('Cell Type Distribution in Top Gene Sets for Sample ' + sample_of_interest, size=11, y=1.1)

plt.show()


def plot_top_features_for_cell_type(desired_cell_type, sample=None, n_top=5):
    """Draw the lollipop plot of one cell type's top gene sets for one sample.

    Replaces three copy-pasted plotting sections that differed only in the
    cell type.

    Parameters
    ----------
    desired_cell_type : str
        Cell type whose gene sets are shown (matched via ``extract_cell_type``).
    sample : str, optional
        Row label in ``combined_pval`` / ``combined_r2``. Defaults to the
        notebook-level ``sample_of_interest``.
    n_top : int, default 5
        Number of gene sets with the smallest p-values to display.

    Notes
    -----
    Reads these notebook-level names: ``combined_pval``, ``combined_r2``,
    ``cell_colors``, ``cmap``, ``norm``, ``feature_to_percentage``,
    ``scale_factor``, ``base_size``, ``extract_cell_type`` and
    ``auto_split_label``. Shows the figure and returns nothing.
    """
    if sample is None:
        sample = sample_of_interest

    # p-values and scaled R2 of the chosen sample (one entry per gene set).
    sample_pvals = combined_pval.loc[sample]
    sample_r2 = combined_r2.loc[sample]

    # Keep this cell type's gene sets only, then the n_top smallest p-values.
    is_desired = sample_pvals.index.map(extract_cell_type) == desired_cell_type
    filtered_features = sample_pvals[is_desired].nsmallest(n_top)
    filtered_neg_log_pvals = -np.log10(filtered_features)
    filtered_r2_values = sample_r2[filtered_features.index]

    fig, ax = plt.subplots(figsize=(16, 5))
    for i, (feature, neg_log_pval) in enumerate(filtered_neg_log_pvals.items()):
        bar_color = cell_colors.get(extract_cell_type(feature), 'grey')
        dot_color = cmap(norm(filtered_r2_values[feature]))

        # Dot area encodes how often the gene set is significant across samples.
        dot_size = feature_to_percentage.get(feature, 0) * scale_factor + base_size

        # The stem stops at 90% of the value so it does not run under the dot.
        ax.plot([0, neg_log_pval - (neg_log_pval / 10)], [i, i], color=bar_color, linewidth=3)
        ax.scatter(neg_log_pval, i, s=dot_size, color=dot_color, edgecolor='black', alpha=0.7, linewidth=0.5)

    ax.set_yticks(range(len(filtered_features.index)))

    # Tick labels: drop the leading "<cell type>_" and wrap long names.
    prefix_to_remove = f"{desired_cell_type}_"
    cleaned_labels = [
        auto_split_label(label.replace(prefix_to_remove, '', 1))
        for label in filtered_features.index
    ]
    ax.set_yticklabels(cleaned_labels, fontsize=22)
    ax.set_xlabel('-log10(p-value)', fontsize=15)

    # Row 0 holds the smallest p-value; invert the axis so it is drawn at the top.
    ax.invert_yaxis()

    # Colorbar for the dot colours.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    fig.colorbar(sm, ax=ax, label='Scaled R2 Values', orientation='vertical')

    # Dot-size legend built from empty scatter artists.
    for p in (20, 40, 60, 80):
        ax.scatter([], [], s=p * scale_factor + base_size, color='gray',
                   edgecolor='black', alpha=0.7, label=f'{p}%')
    ax.legend(title='Frequency', bbox_to_anchor=(1.5, 0.2), loc='center left')

    fig.tight_layout()
    plt.show()


# One figure per selected cell type, in the original order.
for desired_cell_type in (ct_plot1, ct_plot2, ct_plot3):
    plot_top_features_for_cell_type(desired_cell_type)

In [None]:
import pandas as pd
import os

# Cell types whose p-value tables are combined by process_condition.
# (Same ten entries as the earlier cell_types list, in a different order.)
cell_types = ["PT", "Podo", "TAL", "DCT_CNT_CD", "EC", "Stromal", "IC", "Immune", "PEC", "DTL_ATL"]


def process_condition(metadat, condition_value):
    """Collect the gene-set p-values of all samples with a given hypertension status.

    Parameters
    ----------
    metadat : pandas.DataFrame
        Sample metadata; the 'Hypertension' and 'Sample' columns are read.
    condition_value
        Value of 'Hypertension' that selects the samples.

    Returns
    -------
    pandas.DataFrame
        One row per selected sample and one column per
        "<cell type>_<gene set>", left-joined from the Pval_<cell type>.csv
        files under the notebook-level ``base_path`` / ``pval_folder``.
        Samples missing from a file get NaN in that file's columns.
    """
    in_condition = metadat['Hypertension'] == condition_value
    unique_samples = metadat.loc[in_condition, 'Sample'].unique()

    combined_pval = pd.DataFrame(index=unique_samples)
    for cell_type in cell_types:
        pval_df = pd.read_csv(
            os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv"),
            index_col=0,
        )
        # Keep the selected samples only and make the column names unique
        # across cell types before joining.
        pval_df = pval_df.loc[pval_df.index.isin(unique_samples)].add_prefix(f"{cell_type}_")
        combined_pval = combined_pval.join(pval_df, how='left')

    return combined_pval

def count_significance(pval_df, threshold=0.01):
    """Count, per column, the p-values below and at-or-above ``threshold``.

    Missing values compare False in both tests, so they are counted in
    neither result.

    Returns
    -------
    tuple of pandas.Series
        ``(sig_count, nonsig_count)``, both indexed by the columns of
        ``pval_df``.
    """
    below_threshold = pval_df.lt(threshold)
    at_or_above_threshold = pval_df.ge(threshold)
    return (
        below_threshold.sum(axis=0, skipna=True),
        at_or_above_threshold.sum(axis=0, skipna=True),
    )

# Uses `metadat` loaded earlier in the notebook.

# Gather the gene-set p-values of the two hypertension groups.
combined_pval_cond1 = process_condition(metadat, "Yes")  # cond1: Hypertension == "Yes"
combined_pval_cond2 = process_condition(metadat, "No")   # cond2: Hypertension == "No"

# Per gene set: number of significant / non-significant samples in each group.
sig_count_cond1, nonsig_count_cond1 = count_significance(combined_pval_cond1)
sig_count_cond2, nonsig_count_cond2 = count_significance(combined_pval_cond2)

# Comparison table: four count rows x one column per gene set.
# Built in one step from the count Series so the values keep their integer
# dtype; creating an empty frame and filling rows through .loc leaves
# object-dtype columns. The reindex pins the column order to cond1's columns.
comparison_df = pd.DataFrame({
    'cond1_sig': sig_count_cond1,
    'cond1_nonsig': nonsig_count_cond1,
    'cond2_sig': sig_count_cond2,
    'cond2_nonsig': nonsig_count_cond2,
}).T.reindex(columns=combined_pval_cond1.columns)

# Display the DataFrame
print(comparison_df)


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, fisher_exact
from statsmodels.stats.multitest import multipletests

# Any 2x2 table with a cell count below this value is tested with Fisher's
# exact test instead of the chi-square test. With the value 1 that means
# "any cell equal to 0".
minimum_count_threshold = 1

# Rows of comparison_df in the order needed to reshape one column into
# [[cond1_sig, cond1_nonsig], [cond2_sig, cond2_nonsig]].
contingency_rows = ['cond1_sig', 'cond1_nonsig', 'cond2_sig', 'cond2_nonsig']

# Collect one record per gene set and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic and, in
# recent pandas, warns when concatenating with the initial empty frame.
test_records = []
for feature in comparison_df.columns:
    # Select the rows by label so the layout does not depend on row order.
    contingency_table = (
        comparison_df.loc[contingency_rows, feature]
        .to_numpy(dtype=np.int64)
        .reshape(2, 2)
    )

    # Significant-sample counts of both conditions, kept for the report.
    cond1_sig_count = int(contingency_table[0, 0])
    cond2_sig_count = int(contingency_table[1, 0])

    if np.any(contingency_table < minimum_count_threshold):
        # Sparse table: Fisher's exact test; the odds ratio is stored as the
        # test statistic and there are no degrees of freedom to report.
        odds_ratio, p = fisher_exact(contingency_table)
        test_statistic, dof, test_type = odds_ratio, np.nan, 'Fisher'
    else:
        chi_square_stat, p, dof, _expected = chi2_contingency(contingency_table)
        test_statistic, test_type = chi_square_stat, 'Chi-square'

    test_records.append({
        'Feature': feature,
        'Test_statistic': test_statistic,
        'p_value': p,
        'degrees_of_freedom': dof,
        'Test_type': test_type,
        'cond1_sig_count': cond1_sig_count,
        'cond2_sig_count': cond2_sig_count,
    })

chi_square_results = pd.DataFrame(
    test_records,
    columns=['Feature', 'Test_statistic', 'p_value', 'degrees_of_freedom', 'Test_type', 'cond1_sig_count', 'cond2_sig_count'],
)

# Sort the results by p-value in ascending order
sorted_results = chi_square_results.sort_values(by='p_value')

# Display the most significant gene sets
print(sorted_results.head())  # Adjust the number inside head() as needed

In [None]:
# Raw per-gene-set p-values, in the (sorted) row order of sorted_results.
p_values = sorted_results['p_value'].to_numpy()

# Multiple-testing correction with the 'fdr_bh' method at alpha = 0.05.
correction_method = 'fdr_bh'
rejected, corrected_p_values, _alpha_sidak, _alpha_bonf = multipletests(
    p_values, alpha=0.05, method=correction_method
)

# Attach the corrected p-values and the reject / keep decision to the results.
sorted_results = sorted_results.assign(
    corrected_p_value=corrected_p_values,
    rejected_h0=rejected,
)

print(sorted_results.head(10))


In [None]:

# Cell type and gene set whose R2 values are compared between groups.
cell_type = "DCT_CNT_CD"  # Replace with your desired cell type
target_gene_set = "Regulation Of Monoatomic Ion Transmembrane Transport (GO:0034765)"

metadata = metadat

# Load the unscaled R2 and p-value tables of the chosen cell type.
r2_path = os.path.join(base_path, r2_folder, f"R2_{cell_type}.csv")
pval_path = os.path.join(base_path, pval_folder, f"Pval_{cell_type}.csv")

r2_df = pd.read_csv(r2_path, index_col=0)
pval_df = pd.read_csv(pval_path, index_col=0)

# Sample groups: non-control samples split by hypertension status.
is_disease = metadata['Disease_level1'] != "Control"
group1 = metadata[(metadata['Hypertension'] == "No") & is_disease].Sample.unique().tolist()
group2 = metadata[(metadata['Hypertension'] == "Yes") & is_disease].Sample.unique().tolist()

samples_to_remove = ['HK2886', '32-10346']  # missing cell types

# Drop the listed samples and any sample without a row in this cell type's R2
# table: .loc with a list of labels raises KeyError if a label is missing.
group1 = [s for s in group1 if s not in samples_to_remove and s in r2_df.index]
group2 = [s for s in group2 if s not in samples_to_remove and s in r2_df.index]

# Samples in which the target gene set is significant (p < 0.01, the same
# cut-off value as p_value_threshold used earlier in the notebook).
significant_samples = pval_df.index[pval_df[target_gene_set] < 0.01]

# x positions of the two groups on the plot.
group_positions = [0.5, 0.7]  # Adjust these values as needed

fig, ax = plt.subplots(figsize=(2.5, 6))

# Mean R2 of each group, in the order [group1, group2].
mean_r2_values = []

# One marker per sample: red if the gene set is significant in it, else blue.
for x_position, group in zip(group_positions, [group1, group2]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    mean_r2_values.append(group_r2_values.mean())

    for sample in group:
        color = 'red' if sample in significant_samples else 'blue'
        ax.plot(x_position, group_r2_values.loc[sample], marker='o', markersize=5, color=color)

# Group means as horizontal dashes.
ax.plot(group_positions, mean_r2_values, marker='_', color='black', linestyle='', label='Mean R2')

# Customize the plot
ax.set_ylim(0.5, 1)  # y-axis from 0.5 to 1; samples with R2 below 0.5 are not visible
ax.set_xlim(0.3, 0.9)
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks(group_positions)
ax.set_xticklabels(["No", "Yes"])  # group1 = Hypertension "No", group2 = "Yes"
ax.grid(False)
plt.show()

In [None]:

# Long-form table: one row per sample with its R2 for the target gene set,
# its group (0 = group1 / "No", 1 = group2 / "Yes") and its sample id.
# The per-group frames are collected in a list and concatenated once.
group_frames = []
for i, group in enumerate([group1, group2]):
    group_r2_values = r2_df.loc[group, target_gene_set]
    group_frames.append(
        pd.DataFrame({'R2 Value': group_r2_values, 'Group': i, 'Sample': group_r2_values.index})
    )
plot_data = pd.concat(group_frames)

# Marker colour: red when the gene set is significant in the sample, else blue.
plot_data['Color'] = np.where(plot_data['Sample'].isin(significant_samples), 'red', 'blue')

fig, ax = plt.subplots(figsize=(2, 6))

# The palette is a dict keyed by hue level. A plain list is assigned in order
# of appearance, so 'red' rows could be drawn in blue whenever the first
# sample happens to be significant.
sns.swarmplot(
    x='Group', y='R2 Value', data=plot_data, hue='Color',
    palette={'blue': 'blue', 'red': 'red'}, order=[0, 1], ax=ax,
)

# Group means, recomputed from plot_data so this cell does not depend on a
# list left behind by the previous cell. Drawn once, above the points.
group_means = plot_data.groupby('Group')['R2 Value'].mean().reindex([0, 1])
ax.plot([0, 1], group_means.to_numpy(), marker='_', color='black', linestyle='',
        label='Mean R2', zorder=10, markersize=12, markeredgewidth=2)

# Customize the plot
ax.set_ylim(0.5, 1)  # y-axis from 0.5 to 1; samples with R2 below 0.5 are not visible
ax.set_xlabel(cell_type)
ax.set_ylabel("R2 Value")
ax.set_title(target_gene_set)
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"])

# Remove the hue legend that seaborn adds automatically, if there is one.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
ax.grid(False)

plt.show()
