# Calculate Weighted Pathway Similarity
Define and use the `calculate_weighted_pathway_similarity` function to compute various similarity metrics between gene programs and their initialization pathways. This section will include the function definition and its application to the `gene_program_matrix_df` and `cached_pathways` data.

In [None]:
# Define the function to calculate weighted pathway similarity
def calculate_weighted_pathway_similarity(gene_program_df, pathway_data, current_gene_columns, top_n=50):
    """
    Calculate multiple similarity measures between gene programs and their initialization pathways
    that account for gene weights.

    Programs and pathways are paired by position: the i-th row of
    `gene_program_df` is compared with the i-th key of `pathway_data`.
    Rows beyond the number of pathways are ignored, and pathways with no
    genes are skipped.

    Parameters:
    -----------
    gene_program_df : DataFrame
        Contains gene weights for each program (one row per program). An
        optional 'upsilon' column is copied into the result.
    pathway_data : dict
        Contains original pathway gene sets (pathway key -> iterable of genes)
    current_gene_columns : list
        Column names in gene_program_df that represent genes
    top_n : int, default 50
        Number of top-ranked genes (by absolute weight) inspected for the
        'Top50_In_Pathway' metric. The column keeps its historical name
        regardless of the value used.

    Returns:
    --------
    DataFrame with one row per scored program and the columns
    'Program', 'Original_Pathway', 'Original_Pathway_Size',
    'Weight_Concentration', 'Top50_In_Pathway', 'Enrichment_Score',
    'Pathway_Rank_Percentile', 'Weight_Ratio' and 'Upsilon'. The columns are
    present even when no program could be scored.

    Notes:
    ------
    * 'Pathway_Rank_Percentile' is NaN when none of the pathway genes appear
      in `current_gene_columns`.
    * 'Weight_Ratio' is inf when the mean absolute weight of the non-pathway
      genes is not positive (including when there are no non-pathway genes);
      filter non-finite values before plotting or aggregating.
    """
    result_columns = [
        'Program', 'Original_Pathway', 'Original_Pathway_Size',
        'Weight_Concentration', 'Top50_In_Pathway', 'Enrichment_Score',
        'Pathway_Rank_Percentile', 'Weight_Ratio', 'Upsilon',
    ]
    results = []
    pathway_keys_list = list(pathway_data.keys())

    # Slice the gene columns once instead of once per program.
    gene_weights_df = gene_program_df.loc[:, list(current_gene_columns)]
    has_upsilon = 'upsilon' in gene_program_df.columns

    # NOTE(review): pairing is purely positional and relies on the dict order
    # matching the row order of gene_program_df -- confirm against the caller.
    # zip() stops at the shorter of the two, so surplus programs are ignored.
    for i, (program_name, original_pathway_key) in enumerate(
            zip(gene_program_df.index, pathway_keys_list)):
        original_genes = set(pathway_data[original_pathway_key] or ())

        # Skip if no pathway genes defined
        if not original_genes:
            continue

        # Absolute gene weights for this program, in column order.
        # Positional access keeps this a Series even if index labels repeat.
        abs_weights = gene_weights_df.iloc[i].abs()
        abs_values = abs_weights.to_numpy(dtype=float)
        n_genes = len(abs_weights)

        # Boolean mask (column order) marking genes that belong to the pathway
        in_pathway = abs_weights.index.isin(original_genes)

        # 1. Weight concentration ratio: sum of weights of pathway genes / sum of all weights
        total_weight = abs_weights.sum()
        pathway_weight = abs_weights[in_pathway].sum()
        weight_concentration = (pathway_weight / total_weight) * 100 if total_weight > 0 else 0

        # Genes ranked by decreasing absolute weight (shared by metrics 2-4)
        ranked = abs_weights.sort_values(ascending=False)
        ranked_values = ranked.to_numpy(dtype=float)
        ranked_in_pathway = ranked.index.isin(original_genes)

        # 2. Top weights in pathway: % of top N genes by weight that are in pathway
        n_top = max(0, min(top_n, n_genes))
        top_in_pathway = ranked_in_pathway[:n_top].sum() / n_top * 100 if n_top else 0

        # 3. Enrichment score: similar to GSEA, how enriched are pathway genes in the ranked list
        # A hit adds the gene's normalized weight scaled by n_genes / pathway
        # size; a miss subtracts 1 / (number of non-pathway genes ranked), so
        # the misses always sum to exactly -1 and the divisor cannot be zero
        # or negative when pathway genes are missing from the gene columns.
        n_misses = n_genes - int(ranked_in_pathway.sum())
        if total_weight > 0:
            norm_weights = ranked_values / total_weight
        else:
            # All-zero weights: hits contribute nothing instead of NaN.
            norm_weights = np.zeros(n_genes)
        hit_steps = norm_weights * n_genes / len(original_genes)
        miss_step = 1 / n_misses if n_misses else 0.0
        running_sum = np.cumsum(np.where(ranked_in_pathway, hit_steps, -miss_step))

        # fmax/fmin skip NaN entries; initial=0.0 makes 0 the baseline and
        # keeps the reduction defined for an empty ranking.
        max_score = float(np.fmax.reduce(running_sum, initial=0.0))
        min_score = float(np.fmin.reduce(running_sum, initial=0.0))

        # Final enrichment score - take the max deviation from 0
        enrichment_score = max_score if abs(max_score) > abs(min_score) else min_score

        # 4. Average rank of pathway genes (rank 1 = largest absolute weight)
        pathway_ranks = np.flatnonzero(ranked_in_pathway) + 1
        if pathway_ranks.size:
            rank_percentile = (1 - (pathway_ranks.mean() / n_genes)) * 100  # Higher is better
        else:
            rank_percentile = np.nan  # undefined without any pathway gene present

        # 5. Weight distribution comparison
        # Compare mean weight of pathway genes vs non-pathway genes
        pathway_abs = abs_values[in_pathway]
        non_pathway_abs = abs_values[~in_pathway]
        mean_pathway_weight = pathway_abs.mean() if pathway_abs.size else 0
        mean_non_pathway_weight = non_pathway_abs.mean() if non_pathway_abs.size else 0

        weight_ratio = mean_pathway_weight / mean_non_pathway_weight if mean_non_pathway_weight > 0 else float('inf')

        # Store results
        results.append({
            'Program': program_name,
            'Original_Pathway': original_pathway_key,
            'Original_Pathway_Size': len(original_genes),
            'Weight_Concentration': weight_concentration,
            'Top50_In_Pathway': top_in_pathway,
            'Enrichment_Score': enrichment_score,
            'Pathway_Rank_Percentile': rank_percentile,
            'Weight_Ratio': weight_ratio,
            'Upsilon': gene_program_df['upsilon'].iloc[i] if has_upsilon else np.nan
        })

    # Explicit columns keep the schema stable when nothing was scored.
    return pd.DataFrame(results, columns=result_columns)

# Score every gene program against the pathway it was initialized from
similarity_df = calculate_weighted_pathway_similarity(
    gene_program_df=gene_program_matrix_df,
    pathway_data=cached_pathways,
    current_gene_columns=gene_columns,
)

# Visualize Distributions of Similarity Metrics
Generate and display histograms to visualize the distributions of calculated similarity metrics such as 'Weight_Concentration', 'Top50_In_Pathway', 'Enrichment_Score', and 'Weight_Ratio'.

In [None]:
# Visualize distributions of similarity metrics
# Each entry: (column in similarity_df, panel title, x-axis label)
histogram_panels = [
    # 1. Weight concentration distribution
    ('Weight_Concentration',
     'Distribution of Weight Concentration in Pathway Genes',
     '% of Total Weight in Pathway Genes'),
    # 2. Top genes in pathway distribution
    ('Top50_In_Pathway',
     'Percentage of Top 50 Genes in Original Pathway',
     '% of Top 50 Genes in Pathway'),
    # 3. Enrichment score distribution
    ('Enrichment_Score',
     'Distribution of Pathway Enrichment Scores',
     'Enrichment Score'),
    # 4. Weight ratio distribution
    ('Weight_Ratio',
     'Ratio of Mean Pathway Gene Weight to Non-Pathway Gene Weight',
     'Weight Ratio (higher = more pathway focused)'),
]

fig, axes = plt.subplots(2, 2, figsize=(18, 14))

# axes.flat walks the grid row by row, matching the panel order above
for ax, (metric, title, xlabel) in zip(axes.flat, histogram_panels):
    # Weight_Ratio is inf when the non-pathway mean weight is zero; histogram
    # bins and the KDE need a finite range, so non-finite values are dropped.
    finite_values = similarity_df[metric].replace([np.inf, -np.inf], np.nan).dropna()
    sns.histplot(finite_values, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Explore Relationship Between Metrics and Upsilon
Create scatter plots to investigate the relationship between each calculated similarity metric ('Weight_Concentration', 'Top50_In_Pathway', 'Enrichment_Score', 'Weight_Ratio') and the 'Upsilon' value. In every panel, points are colored by 'Original_Pathway_Size' and sized by a second similarity metric.

In [None]:
# Scatter plots to explore relationships between metrics and Upsilon
# Each entry: (x-axis metric, metric encoded as point size, panel title, x-axis label)
scatter_panels = [
    # 1. Weight Concentration vs Upsilon
    ('Weight_Concentration', 'Enrichment_Score',
     'Weight Concentration vs Upsilon', '% of Weight in Pathway Genes'),
    # 2. Top 50 Genes in Pathway vs Upsilon
    ('Top50_In_Pathway', 'Enrichment_Score',
     'Top 50 Genes in Pathway vs Upsilon', '% of Top 50 Genes in Pathway'),
    # 3. Enrichment Score vs Upsilon
    ('Enrichment_Score', 'Weight_Concentration',
     'Enrichment Score vs Upsilon', 'Enrichment Score'),
    # 4. Weight Ratio vs Upsilon
    ('Weight_Ratio', 'Top50_In_Pathway',
     'Weight Ratio vs Upsilon', 'Pathway to Non-Pathway Weight Ratio'),
]

fig, axes = plt.subplots(2, 2, figsize=(18, 14))

# ravel() yields the panels row by row: top-left, top-right, bottom-left, bottom-right
for panel_ax, (x_metric, size_metric, panel_title, x_label) in zip(axes.ravel(), scatter_panels):
    sns.scatterplot(
        data=similarity_df,
        x=x_metric,
        y='Upsilon',
        hue='Original_Pathway_Size',
        size=size_metric,
        sizes=(20, 200),
        alpha=0.7,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Upsilon Value')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Display Summary Statistics of Similarity Measures
Print descriptive summary statistics for the key similarity measures calculated, such as 'Weight_Concentration', 'Top50_In_Pathway', 'Enrichment_Score', 'Pathway_Rank_Percentile', and 'Weight_Ratio'.

In [None]:
# Print descriptive statistics (count, mean, std, quartiles, ...) for each similarity measure
summary_metrics = [
    'Weight_Concentration',
    'Top50_In_Pathway',
    'Enrichment_Score',
    'Pathway_Rank_Percentile',
    'Weight_Ratio',
]
print("Summary Statistics for Similarity Measures:")
summary_table = similarity_df[summary_metrics].describe()
print(summary_table)

# Identify High-Fidelity Gene Programs
Filter `similarity_df` to identify gene programs that show high fidelity to their initialization pathways by selecting programs whose 'Weight_Concentration' lies above the 75th percentile. The ten programs with the highest weight concentration are displayed along with their relevant columns.

In [None]:
# High-fidelity programs: weight concentration strictly above the 75th percentile
concentration = similarity_df['Weight_Concentration']
upper_quartile = concentration.quantile(0.75)
high_fidelity = similarity_df.loc[concentration > upper_quartile]

# Columns shown for each reported program
display_cols = [
    'Program',
    'Original_Pathway',
    'Weight_Concentration',
    'Top50_In_Pathway',
    'Enrichment_Score',
    'Upsilon',
]
print("\nTop Gene Programs with Highest Weight Concentration in Pathway Genes:")
# Report the ten most concentrated programs, highest first
top_high_fidelity = high_fidelity[display_cols].sort_values(
    by='Weight_Concentration', ascending=False
).head(10)
print(top_high_fidelity)

# Identify Low-Fidelity Gene Programs
Filter `similarity_df` to identify gene programs that have diverged significantly from their initialization pathways by selecting programs whose 'Weight_Concentration' lies below the 25th percentile. The ten programs with the lowest weight concentration are displayed along with their relevant columns.

In [None]:
# Low-fidelity programs: weight concentration strictly below the 25th percentile
concentration = similarity_df['Weight_Concentration']
lower_quartile = concentration.quantile(0.25)
low_fidelity = similarity_df.loc[concentration < lower_quartile]

# Columns shown for each reported program
display_cols = [
    'Program',
    'Original_Pathway',
    'Weight_Concentration',
    'Top50_In_Pathway',
    'Enrichment_Score',
    'Upsilon',
]
print("\nTop Gene Programs with Lowest Weight Concentration in Pathway Genes:")
# Report the ten least concentrated programs, lowest first
bottom_low_fidelity = low_fidelity[display_cols].sort_values(
    by='Weight_Concentration'
).head(10)
print(bottom_low_fidelity)