In [1]:
### Purpose:
# Purpose of this code is to generate plots that visualize the age distribution
# of each genotype for sex-determination RNA-seq samples.
# Date: 1/31/2025
# Requires: statistic files exported from Imaris, with nuclei volume per cell

In [13]:
# Import necessary libraries
import os
from pathlib import Path
import pandas as pd

# Directory searched (recursively) for the nuclei statistics files.
base_dir = Path("/Data1/git/RNA_seq_sex_determ/aging_data")
# Glob pattern (shell-style wildcards, not a regex) selecting the statistics files to load.
pattern = "*_Nucleus_Volume.*"
# Directory the figures are written to.
output_dir = Path("/Data1/git/RNA_seq_sex_determ/output/")

# Recursively collect every file matching the pattern. rglob yields entries in
# filesystem order, so sort them to keep the row order of combined_df
# reproducible from one run (or machine) to the next.
files = sorted(base_dir.rglob(pattern))
print(f"Found {len(files)} matching files.")

dataframes = []  # one DataFrame per file that could be parsed
for file_path in files:
    try:
        # Skip the first 3 rows so that the 4th row is used as the header.
        df = pd.read_csv(file_path, skiprows=3)
        # Keep the full path of the originating file; the genotype is
        # recovered from this path further down.
        df['source_file'] = str(file_path)
        dataframes.append(df)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error reading file {file_path}: {e}")

# Stack all loaded tables into one frame; fall back to an empty frame when
# nothing could be loaded.
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
else:
    combined_df = pd.DataFrame()


# Replacement labels for the cleaned genotype folder names; names that are
# not listed here are kept unchanged.
lookup = {
    "N2_young": "N2_young_2025",
    "sex1_ceh39_young": "sex1ceh39_degron_young_2025",
    "sex1_fox1_young": "sex1fox1_degron_young_2025",
    "sex1_young": "sex1_degron_young_2025"
}

# Reduce each file path to its first path component below base_dir (the
# genotype folder). The prefix is derived from base_dir instead of a second
# hard-coded copy of the path: if the two ever disagree, relative_to raises
# ValueError rather than silently producing empty labels.
genotype = combined_df['source_file'].map(
    lambda path: Path(path).relative_to(base_dir).parts[0]
)
# Remove the literal "_stats" marker from the folder name.
genotype = genotype.str.replace("_stats", "", regex=False)

# Apply the lookup table, leaving unmatched values unchanged.
combined_df['source_file'] = genotype.map(lookup).fillna(genotype)

# Count the rows that share the same source_file, image name and CellID.
# Each row is taken to be one nucleus, so the group size is stored as the
# number of nuclei for that cell.
grouped = (
    combined_df
    .groupby(['source_file', 'Original Image Name', 'CellID'])
    .size()
    .to_frame('nuclei_count')
    .reset_index()
)



# Load the supplementary summary table stored in base_dir and reshape it to
# the layout of `grouped`. The path is built from base_dir so the data
# location is defined in exactly one place.
additional_df = (
    pd.read_csv(base_dir / "summary_age_bin_ym3_feb2025_update.csv")
    # Rename columns to match grouped.
    .rename(columns={"genotype": "source_file", "count": "nuclei_count"})
    # Fill the columns the summary table does not provide with placeholders.
    .assign(**{'CellID': 0, 'Original Image Name': pd.NA})
    # Same column order as grouped.
    .loc[:, ['source_file', 'Original Image Name', 'CellID', 'nuclei_count']]
)

# Append the supplementary rows to the per-cell counts.
grouped = pd.concat([grouped, additional_df], ignore_index=True)

# Discard every row whose source_file contains "mid", then every row whose
# source_file contains "2023"; finally order the remaining rows by
# source_file and renumber the index from zero.
grouped = (
    grouped
    .loc[lambda frame: ~frame['source_file'].str.contains("mid")]
    .loc[lambda frame: ~frame['source_file'].str.contains("2023")]
    .sort_values(by='source_file')
    .reset_index(drop=True)
)

Found 12 matching files.


In [14]:
import plotly.graph_objects as go

fig = go.Figure()

# Add one cumulative-distribution trace of nuclei counts per source_file.
for src, df_src in grouped.groupby('source_file'):
    cell_count = len(df_src)
    # Replace "_stats" with " (n= X)" if found; otherwise, append the cell count.
    if "_stats" in src:
        label = src.replace("_stats", f" (n= {cell_count})")
    else:
        label = f"{src} (n= {cell_count})"

    # Frequency of each nuclei_count value, ordered by nuclei_count.
    count_freq = df_src['nuclei_count'].value_counts().sort_index()

    # Cumulative percentage of cells having <= X nuclei.
    cumulative_percentage = count_freq.cumsum() / count_freq.sum() * 100

    fig.add_trace(go.Scatter(
        x=cumulative_percentage.index,
        y=cumulative_percentage.values,
        mode='lines+markers',
        name=label
    ))

# Layout: 800 px wide, transparent background, white axis text and faint
# white grid lines. Axis titles use the nested title=dict(text=..., font=...)
# form because the flat `titlefont` attribute is deprecated and rejected by
# newer Plotly releases.
fig.update_layout(
    width=800,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        title=dict(text="Number of Nuclei", font=dict(color='white')),
        tickfont=dict(color='white'),
        gridcolor='rgba(255,255,255,0.2)'
    ),
    yaxis=dict(
        title=dict(text="Cumulative % of Cells", font=dict(color='white')),
        tickfont=dict(color='white'),
        gridcolor='rgba(255,255,255,0.2)'
    ),
    legend=dict(font=dict(color='white'))
)

fig.show()

# Save the figure as PNG and SVG; create the output directory first so
# write_image does not fail on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)
fig.write_image(output_dir / "aging_01_31_2025_CDF_plot.png")
fig.write_image(output_dir / "aging_01_31_2025_CDF_plot.svg")


In [15]:
fig3 = go.Figure()

# One violin trace per source_file, each annotated with its mean value.
# sort=False keeps the groups in order of first appearance in `grouped`.
for src, df_sub in grouped.groupby('source_file', sort=False):
    cell_count = len(df_sub)
    # Replace "_stats" with " (n= X)" if found; otherwise, append the cell count.
    if "_stats" in src:
        label = src.replace("_stats", f" (n= {cell_count})")
    else:
        label = f"{src} (n= {cell_count})"

    # Use the label for the x-values so the category axis shows it.
    fig3.add_trace(go.Violin(
        x=[label] * len(df_sub),
        y=df_sub['nuclei_count'],
        name=label,
        box_visible=True,
        meanline_visible=True
    ))

    # Print the mean (matching the mean line drawn in the violin) at its height.
    mean_value = df_sub['nuclei_count'].mean()
    fig3.add_annotation(
        x=label,
        y=mean_value,
        text=f"{mean_value:.1f}",
        showarrow=False,
        font=dict(color="white")
    )

# Layout: 800 px wide, transparent background, white axis text. Axis titles
# use the nested title=dict(text=..., font=...) form because the flat
# `titlefont` attribute is deprecated and rejected by newer Plotly releases.
fig3.update_layout(
    width=800,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        title=dict(text="Source File", font=dict(color='white')),
        tickfont=dict(color='white')
    ),
    yaxis=dict(
        title=dict(text="Nuclei Count per Cell", font=dict(color='white')),
        tickfont=dict(color='white')
    ),
    legend=dict(font=dict(color='white'))
)

fig3.show()

# Save the figure as PNG and SVG; create the output directory first so
# write_image does not fail on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)
fig3.write_image(output_dir / "aging_01_31_2025_violin_plot.png")
fig3.write_image(output_dir / "aging_01_31_2025_violin_plot.svg")
