In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

ModuleNotFoundError: No module named 'dask'

In [None]:
def count_goslim(df, file_name: str):
    """Count unique human-rice gene pairs per GOSlim term and save the tally.

    Rows of ``df`` are de-duplicated on the GOSlim accession together with the
    human and rice gene stable IDs, then counted per (accession, description)
    pair. The tally is ordered by count (largest first), written as a
    tab-separated file under ``../data/goslim_correspondence_counts`` and
    returned.

    Args:
        df (dataframe): table with the columns 'GOSlim GOA Accession(s)',
            'GOSlim GOA Description', 'Gene stable ID_human' and
            'Gene stable ID_rice'.
        file_name (str): name of the TSV file created in the results directory.

    Returns:
        dataframe: one row per GOSlim term with the columns
        'GOSlim GOA Accession(s)', 'GOSlim GOA Description' and 'counts'.
    """
    pair_key = ['GOSlim GOA Accession(s)', 'Gene stable ID_human', 'Gene stable ID_rice']
    term_key = ['GOSlim GOA Accession(s)', 'GOSlim GOA Description']

    # Each (term, human gene, rice gene) combination contributes one count.
    deduplicated = df.drop_duplicates(subset=pair_key)
    per_term = deduplicated.groupby(term_key).size().reset_index(name='counts')
    per_term = per_term.sort_values(by='counts', ascending=False).copy()

    out_dir = '../data/goslim_correspondence_counts'
    os.makedirs(out_dir, exist_ok=True)
    per_term.to_csv(f'{out_dir}/{file_name}', sep='\t', index=False)
    return per_term

In [None]:
# Read the two tab-separated GOSlim correspondence tables used by the rest of
# the notebook: the full table and the UP-only table.
goslim_all_path = '../data/GOslim_merge/GOslim_merge_common_goslim_correspondence_all_up.tsv'
goslim_up_path = '../data/circos_result_UP_2311/merged_goslim_sorted_UP.tsv'

merged_df_goslim_pd = pd.read_csv(goslim_all_path, sep='\t', low_memory=False)
merged_df_up = pd.read_csv(goslim_up_path, sep='\t')

display(merged_df_goslim_pd)
display(merged_df_up)

In [None]:
# Per-term pair counts for the UP table and for the full table; each call also
# writes its tally to a TSV file.
# NOTE(review): 'gosilm' looks like a typo of 'goslim'; the name is kept as-is
# because other cells refer to it.
gosilm_correspondence_up_counts = count_goslim(
    df=merged_df_up,
    file_name='goslim_correspondence_counts_UP.tsv',
)
goslim_correspondence_all_counts = count_goslim(
    df=merged_df_goslim_pd,
    file_name='goslim_correspondence_counts_all.tsv',
)

display(gosilm_correspondence_up_counts, goslim_correspondence_all_counts)

In [None]:
# Sanity check: the per-term counts should add up to the number of unique
# (GOSlim accession, human gene, rice gene) rows in each source table.
# unique_rows_up / unique_rows_all are reused later as the totals for the
# enrichment ratios and the Fisher tests.
pair_key = ['GOSlim GOA Accession(s)', 'Gene stable ID_human', 'Gene stable ID_rice']

total_counts_up = gosilm_correspondence_up_counts['counts'].sum()
total_counts_all = goslim_correspondence_all_counts['counts'].sum()

unique_rows_up = merged_df_up.drop_duplicates(subset=pair_key).shape[0]
unique_rows_all = merged_df_goslim_pd.drop_duplicates(subset=pair_key).shape[0]

print(f"Total counts: {unique_rows_all}")
print(f"up counts: {unique_rows_up}")
print(f"Counts match rows: {unique_rows_up == total_counts_up}")
# The full table was previously summed but never compared; check it as well.
print(f"Counts match rows (all): {unique_rows_all == total_counts_all}")


In [None]:

def fold_enrichment(df_goslim_up_counts, df_goslim_all_counts, unique_genes_up, unique_genes_all, file_name: str):
    """Compute the per-term fold enrichment of the UP counts over the full counts.

    The two count tables are joined on the GOSlim accession and description,
    keeping every term of the full table. For each term the function derives
    ``up_ratio = counts_up / unique_genes_up``,
    ``all_ratio = counts_all / unique_genes_all`` and
    ``fold_enrichment = up_ratio / all_ratio``. Infinite values are replaced by
    NaN. The table is written as a tab-separated file under
    ``../data/goslim_fold_enrichment_correspondence`` and returned.

    Args:
        df_goslim_up_counts (dataframe): per-term counts for the UP set, with the
            columns 'GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'counts'.
        df_goslim_all_counts (dataframe): per-term counts for the full set, same
            columns.
        unique_genes_up (int): denominator used for ``up_ratio``.
        unique_genes_all (int): denominator used for ``all_ratio``.
        file_name (str): name of the TSV file created in the results directory.

    Returns:
        dataframe: the joined table with the columns 'counts_up', 'counts_all',
        'up_ratio', 'all_ratio' and 'fold_enrichment' added.
    """
    term_key = ['GOSlim GOA Accession(s)', 'GOSlim GOA Description']

    # Right join: terms missing from the UP table are kept and get a count of 0.
    combined = df_goslim_up_counts.merge(
        df_goslim_all_counts,
        on=term_key,
        how='right',
        suffixes=('_up', '_all'),
    )
    for count_col in ('counts_up', 'counts_all'):
        combined[count_col] = combined[count_col].fillna(0)

    combined['up_ratio'] = combined['counts_up'] / unique_genes_up
    combined['all_ratio'] = combined['counts_all'] / unique_genes_all
    combined['fold_enrichment'] = combined['up_ratio'] / combined['all_ratio']

    # A zero denominator yields +/-inf; store those as NaN instead.
    combined = combined.replace([np.inf, -np.inf], np.nan)

    out_dir = '../data/goslim_fold_enrichment_correspondence'
    os.makedirs(out_dir, exist_ok=True)
    combined.to_csv(f'{out_dir}/{file_name}', sep='\t', index=False)

    return combined


# Fold enrichment of every GOSlim term, using the unique-row totals computed in
# the sanity-check cell as denominators.
goslim_correspondence_enrichment = fold_enrichment(
    df_goslim_up_counts=gosilm_correspondence_up_counts,
    df_goslim_all_counts=goslim_correspondence_all_counts,
    unique_genes_up=unique_rows_up,
    unique_genes_all=unique_rows_all,
    file_name='goslim_correspondence_fold_enrichment.tsv',
)
display(goslim_correspondence_enrichment)

## Q-value

In [None]:

def calculate_p_q_values(df_enrichment, unique_correspondence_up, unique_correspondence_all, file_name: str):
    """Add Fisher's exact p-values and Benjamini/Hochberg q-values per GOSlim term.

    For every row a 2x2 contingency table is built from the term's UP count,
    its overall count and the two totals, and tested with a one-sided
    (``alternative='greater'``) Fisher's exact test. Rows whose UP count is 0
    get a p-value of 1.0 without running the test. All p-values are then
    corrected together with the 'fdr_bh' method.

    The accession and description columns are merged into a single leading
    'GOSlim' column ("<accession>: <description>"), rows with an UP count of 0
    are removed, and the remainder is sorted by 'fold_enrichment' (largest
    first), written as a tab-separated file under
    ``../data/goslim_correspondence_q_values`` and returned.

    The input frame is not modified; all work happens on a copy.

    Args:
        df_enrichment (dataframe): table with the columns
            'GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'counts_up',
            'counts_all' and 'fold_enrichment'.
        unique_correspondence_up (int): total used for the UP row of the
            contingency table.
        unique_correspondence_all (int): overall total; the non-UP total is
            ``unique_correspondence_all - unique_correspondence_up``.
            NOTE(review): this assumes the UP set is contained in the full set,
            so that ``counts_all - counts_up`` is never negative -- confirm.
        file_name (str): name of the TSV file created in the results directory.

    Returns:
        dataframe: rows with a non-zero UP count, with 'GOSlim' as first column
        and 'p_value' / 'q_value' added.
    """
    # Copy first: the caller's frame may already have been displayed or reused.
    result = df_enrichment.copy()
    result['counts_up'] = result['counts_up'].astype(int)  # float(0.0) to int(0)

    # Same for every term, so compute it once outside the loop.
    total_not_up = unique_correspondence_all - unique_correspondence_up

    p_values = []
    for counts_up, counts_all in zip(result['counts_up'], result['counts_all']):
        if counts_up == 0:
            p_values.append(1.0)
            continue
        not_up = counts_all - counts_up
        # Rows: UP / not UP; columns: in this term / not in this term.
        table = [
            [counts_up, unique_correspondence_up - counts_up],
            [not_up, total_not_up - not_up],
        ]
        _, p_value = fisher_exact(table, alternative='greater')  # Fisher's exact test
        p_values.append(p_value)

    if p_values:
        _, q_values, _, _ = multipletests(p_values,
                                          alpha=0.05,
                                          method='fdr_bh')  # Benjamini/Hochberg method
    else:
        # Nothing to correct; skip the call for an empty table.
        q_values = []

    result['p_value'] = p_values
    result['q_value'] = q_values
    result['GOSlim'] = result['GOSlim GOA Accession(s)'] + ": " + result['GOSlim GOA Description']

    # 'GOSlim' goes first and replaces the two columns it was built from.
    replaced_cols = ('GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim')
    ordered_cols = ['GOSlim'] + [col for col in result.columns if col not in replaced_cols]
    result = result[ordered_cols]

    result = result[result['counts_up'] != 0]
    result = result.sort_values(by='fold_enrichment', ascending=False)

    results_directory = '../data/goslim_correspondence_q_values'
    os.makedirs(results_directory, exist_ok=True)
    result.to_csv(f'{results_directory}/{file_name}', sep='\t', index=False)

    return result

# Fisher's exact test + Benjamini/Hochberg correction for every GOSlim term,
# using the same unique-row totals as the fold-enrichment step.
goslim_correspondence_with_p_q = calculate_p_q_values(
    df_enrichment=goslim_correspondence_enrichment,
    unique_correspondence_up=unique_rows_up,
    unique_correspondence_all=unique_rows_all,
    file_name='goslim_correspondence_fold_enrichment_p_q.tsv',
)

display(goslim_correspondence_with_p_q)

In [None]:
# Dot plot of the enriched GOSlim terms: x = fold enrichment, dot size = UP
# count, dot colour = -log10(q-value).
color = sns.color_palette("flare", as_cmap=True)

# NOTE(review): a q-value of exactly 0 would give an infinite colour value here.
goslim_correspondence_with_p_q['-log10(q-value)'] = -np.log10(goslim_correspondence_with_p_q['q_value'])

fig, ax = plt.subplots(figsize=(6, 9))

dotplot = sns.scatterplot(
    data=goslim_correspondence_with_p_q,
    x='fold_enrichment',
    y='GOSlim',
    size='counts_up',
    hue='-log10(q-value)',
    palette=color,
    legend='brief',
    ax=ax,
)

ax.grid(color='b', linestyle=':', linewidth=0.1)

ax.set_xlabel('Fold Enrichment')
ax.set_ylabel('Common GOSlim Terms')

plt.show()