In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata as ad
import scanpy.external as sce
from sklearn import preprocessing
import pickle5 as pickle
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
import sklearn
from sklearn.metrics import accuracy_score
import pandas as pd
from scipy.spatial.distance import pdist 
from scipy.stats import mannwhitneyu
from utils import *
import requests
import seaborn as sns

# Very small positive constant. It is not referenced by any cell visible in
# this notebook. NOTE(review): presumably a guard against log(0) or division
# by zero in helper code -- confirm it is still needed.
eps = 1e-100


In [None]:
# Read the precomputed embedding from disk. Later cells take the gene names
# from `.obs.index` and the UMAP coordinates from `.obsm['X_umap']`.
embedding_path = "../source_data/ad_embed.h5ad"
ad_gene_embedding = sc.read_h5ad(embedding_path)


In [None]:
# Fetch the PANTHER 18.0 mouse sequence-classification table and store it
# under ../source_data/. The file is downloaded only when no non-empty local
# copy exists, so re-running the notebook does not depend on the network.
from pathlib import Path

# URL of the file to be downloaded
url = 'http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/PTHR18.0_mouse'
panther_path = Path('../source_data/PTHR18.0_mouse.txt')

if panther_path.exists() and panther_path.stat().st_size > 0:
    print(f"Using cached file: {panther_path}")
else:
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the cell never finishes.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        with open(panther_path, 'wb') as file:
            file.write(response.content)
        print("File downloaded successfully.")
    else:
        # Best effort, as before: report the failure and let the reader decide.
        print(f"Failed to download the file. Status code: {response.status_code}")


In [None]:
# Parse the PANTHER classification table (tab-separated text, no header row)
# into a DataFrame of strings and give the columns used by later cells
# readable names.
file_path = '../source_data/PTHR18.0_mouse.txt'

with open(file_path, 'r') as file:
    content = file.read()

# Split on newlines without stripping the whole text first: a global strip()
# would also remove trailing tabs on the last row, and those tabs are empty
# trailing fields. Blank lines are skipped.
lines = [line for line in content.split('\n') if line.strip()]
data = [line.split('\t') for line in lines]
df = pd.DataFrame(data)

# Name the columns from the frame's actual width. If rows differ in length,
# pandas pads to the widest row, so len(data[0]) may not match.
num_columns = df.shape[1]
column_names = [f'Column{i+1}' for i in range(num_columns)]
df.columns = column_names

# Cells padded by pandas come back as None; store '' instead so they are
# treated like any other empty annotation (later cells test `focus != ''`).
df = df.fillna('')

# The rename below addresses columns 1, 3 and 5-9; fail early with a clear
# message if the file does not have that layout.
assert num_columns >= 9, f"expected at least 9 tab-separated columns, found {num_columns}"

df = df.rename(columns={'Column1':'MGI_UniProtKB_ID',
                  'Column3':'gene_symbol',
                  'Column5':'protein_family_main',
                   'Column6':'protein_family_sub',
                   'Column7':'MF_term',
                   'Column8':'BP_term',
                   'Column9':'CC_term',})

In [None]:
# Map every gene name to its UMAP coordinates, stored as a (1, n_dims) array.
# The coordinate matrix is read once and sliced by row position; indexing the
# AnnData object itself once per gene is far slower for the same result.
# NOTE(review): assumes the names in `.obs.index` are unique -- confirm.
umap_matrix = np.asarray(ad_gene_embedding.obsm['X_umap'])
umap_coordinates = {
    gene: umap_matrix[[row]]  # list index -> independent copy that keeps the leading axis
    for row, gene in enumerate(ad_gene_embedding.obs.index)
}

In [None]:
# For every PANTHER annotation (protein family, subfamily, MF/BP/CC term),
# test whether the genes carrying it lie closer together on the UMAP than
# randomly drawn gene sets of the same size. One p-value is stored per tested
# annotation, in the order of `df[lookcolumn].unique()`.
import random

N_RANDOM_SETS = 1000  # number of random gene sets drawn per annotation

# random.sample() needs a sequence; passing a dict view raises TypeError on
# Python >= 3.11. Building the list once also avoids redoing it on every draw.
gene_pool = list(umap_coordinates)

# Dedicated, seeded generator so the p-values are reproducible across runs
# without touching the global random state.
rng = random.Random(0)

p_values_all={}
for lookcolumn in ['protein_family_main','protein_family_sub','MF_term','BP_term','CC_term']:
    p_values_all[lookcolumn]=[]
    allfocus=df[lookcolumn].unique()
    for focus in tqdm(allfocus):
        # '' means "no annotation in this column"
        if focus == '':
            continue

        samplelist = [i.upper() for i in df.loc[df[lookcolumn]==focus,'gene_symbol'].unique()]
        # keep only the genes that are present in the embedding
        go_genes = [gene for gene in samplelist if gene in ad_gene_embedding.obs.index]

        # more than three genes in this GO term
        if len(go_genes) <= 3:
            continue

        go_distances = pdist(np.array([umap_coordinates[gene] for gene in go_genes]).squeeze())

        # mean pairwise distance of N_RANDOM_SETS random gene sets of equal size
        random_distances = []
        for _ in range(N_RANDOM_SETS):
            random_genes = rng.sample(gene_pool, len(go_genes))
            distances = pdist(np.array([umap_coordinates[gene] for gene in random_genes]).squeeze())
            random_distances.append(np.mean(distances))  # Using mean as an example

        # Statistical test. The first sample holds a single value (the observed
        # mean), so this ranks that value against the random means; the
        # smallest attainable p-value is bounded by the number of random sets.
        u_stat, p_value = mannwhitneyu([np.mean(go_distances)], random_distances, alternative='less')

        p_values_all[lookcolumn].append(p_value)

In [None]:
import json

# The p-values used from here on are read from a JSON file, replacing whatever
# `p_values_all` held before this cell. The disabled snippet below is
# presumably how that file was produced -- confirm before relying on it.
# filename = '../source_data/PANTHER/p_values_all.json'
# with open(filename, 'w') as f:
#     json.dump(p_values_all, f, indent=4)

p_values_path = '../source_data/PANTHER/p_values_all.json'
with open(p_values_path, 'r') as cache_file:
    p_values_all = json.loads(cache_file.read())

In [None]:
from statsmodels.stats.multitest import multipletests

# Benjamini-Hochberg correction, applied separately within each annotation
# column; the corrected p-values are kept per column in `adj_p_values_all`.
annotation_columns = ['protein_family_main','protein_family_sub','MF_term','BP_term','CC_term']

adj_p_values_all = {}
for lookcolumn in annotation_columns:
    correction = multipletests(p_values_all[lookcolumn], method='fdr_bh')
    adjusted_p_values = correction[1]  # second element holds the corrected p-values
    adj_p_values_all[lookcolumn] = adjusted_p_values

    # count the annotations that pass the 5% FDR threshold
    significant_results = adjusted_p_values < 0.05
    print(f'{sum(significant_results)} {lookcolumn} are significant close.')
    

#### plot UMAP

In [None]:
# Collect, for the main protein-family column, every family with more than
# three genes present in the embedding: names go to `names_all`, the matching
# gene lists to `genes_all` (same order).
lookcolumn='protein_family_main'
embedded_genes = ad_gene_embedding.obs.index

names_all = {lookcolumn: []}
genes_all = {lookcolumn: []}

allfocus = df[lookcolumn].unique()
for focus in tqdm(allfocus):
    # '' means "no annotation in this column"
    if focus == '':
        continue

    symbols = df.loc[df[lookcolumn]==focus,'gene_symbol'].unique()
    # upper-case the symbols and keep those found in the embedding
    all_list = [symbol.upper() for symbol in symbols if symbol.upper() in embedded_genes]

    if len(all_list) > 3:
        names_all[lookcolumn].append(focus)
        genes_all[lookcolumn].append(all_list)

flattened_list = [gene for family_genes in genes_all[lookcolumn] for gene in family_genes]

In [None]:
# Keep the families whose corrected p-value is below 0.05 and write the
# family name of each member gene into `.obs['category_<lookcolumn>']`.
significant_mask = np.asarray(adj_p_values_all[lookcolumn]) < 0.05

# Names, gene lists and p-values are matched purely by position, and the
# p-values may come from a file rather than from this session. Check the
# lengths before trusting the alignment.
assert len(names_all[lookcolumn]) == len(genes_all[lookcolumn]) == len(significant_mask), (
    f"{len(names_all[lookcolumn])} tested categories but {len(significant_mask)} "
    f"adjusted p-values for '{lookcolumn}'; recompute or reload the p-values"
)

category_all=[]
all_genes = []
# genes_all already holds the embedded genes of every tested family, so the
# lists are reused instead of being rebuilt from `df`.
for focus, family_genes, is_significant in zip(names_all[lookcolumn],
                                                genes_all[lookcolumn],
                                                significant_mask):
    if is_significant:
        all_genes.append(family_genes)
        category_all.append(focus)

flattened_list = [item for sublist in all_genes for item in sublist]

# '' marks genes that belong to no significant family
ad_gene_embedding.obs[f'category_{lookcolumn}']=''
for family_genes, categ in zip(all_genes,category_all):
    # assignments happen in order, so a gene listed under several families
    # ends up with the last one
    ad_gene_embedding.obs.loc[family_genes,f'category_{lookcolumn}']=categ

In [None]:
# Expose the main-family assignment under the generic name 'category', then
# keep only the genes that received one ('' means unassigned).
ad_gene_embedding.obs['category'] = ad_gene_embedding.obs['category_protein_family_main']

has_category = ad_gene_embedding.obs['category'] != ''
ad_gene_embedding_sub = ad_gene_embedding[has_category, :]


In [None]:
# One random RGB colour per category; the fixed seed keeps the colours
# identical between runs.
np.random.seed(40)
categories = ad_gene_embedding_sub.obs['category'].unique()
color_palette = np.random.rand(categories.shape[0], 3)

# category name -> its row of the palette
color_dic = dict(zip(categories, color_palette))
    

In [None]:
# Overview figure: every gene in light grey, plus one open circle per category
# at the median UMAP position of its genes, with marker area proportional to
# log10 of the number of genes.
plt.figure(figsize=(10,10))
all_xy = ad_gene_embedding.obsm['X_umap']
plt.scatter(all_xy[:,0], all_xy[:,1], s=1, c='lightgrey')

for category in ad_gene_embedding_sub.obs['category'].unique():
    members = ad_gene_embedding_sub[ad_gene_embedding_sub.obs['category']==category,:]
    member_xy = members.obsm['X_umap']
    plt.scatter(np.median(member_xy[:,0]),
                np.median(member_xy[:,1]),
                s=100*np.log10(members.shape[0]),
                lw=2, facecolors='white', edgecolors=color_dic[category])

# Size key: reference circles for 1000, 100 and 10 genes, drawn at x = 16
for key_y, key_count in [(-1, 1000), (0, 100), (1, 10)]:
    plt.scatter(16, key_y, s=100*np.log10(key_count), lw=2,
                facecolors='white', edgecolors='k')


#### plot examples

In [None]:
# Protein families for which an individual example figure is drawn
plot_name = [
    'LIPOCALIN',
    'LUPUS LA PROTEIN-RELATED',
    'CHARGED MULTIVESICULAR BODY PROTEIN',
    'RNA RECOGNITION MOTIF  RRM  DOMAIN CONTAINING PROTEIN',
    'G PROTEIN-COUPLED RECEPTOR',
    'MICROTUBULE-ASSOCIATED PROTEINS 1A/1B LIGHT CHAIN 3-RELATED',
    'CHYMOTRYPSIN-RELATED',
    'NEUROPEPTIDES RECEPTOR',
]

In [None]:

from adjustText import adjust_text

# One figure per family in `plot_name`: all genes in light grey, the family's
# genes in the family colour, each labelled with its capitalised gene name.
background_xy = ad_gene_embedding.obsm['X_umap']

for family_name in plot_name:
    # plot_name is hard-coded, while color_dic only contains the categories
    # that passed the FDR threshold. Report and skip a missing family instead
    # of stopping the whole loop with a KeyError.
    if family_name not in color_dic:
        print(f"Skipping '{family_name}': not among the significant categories.")
        continue

    focus = ad_gene_embedding_sub[ad_gene_embedding_sub.obs['category']==family_name,:]
    focus_xy = focus.obsm['X_umap']
    x = list(focus_xy[:,0])
    y = list(focus_xy[:,1])
    texts = [gene.capitalize() for gene in focus.obs.index]

    fig, ax = plt.subplots(figsize=(8,8))

    ax.scatter(background_xy[:,0], background_xy[:,1], s=1, c='lightgrey')
    ax.scatter(focus_xy[:,0], focus_xy[:,1], s=10, color=color_dic[family_name])

    texts_to_adjust = [
        ax.text(label_x, label_y, label, fontname='Arial')
        for label_x, label_y, label in zip(x, y, texts)
    ]

    # move the labels apart and connect each one to its point with an arrow
    adjust_text(texts_to_adjust, x=x, y=y, arrowprops=dict(arrowstyle='->', color='red'))
    ax.axis('off')

    # Only the part of the name before the first '/' is used as the file name
    # in the (currently disabled) export below.
    file_stem = family_name.split('/')[0]
#     plt.savefig(f'figures_refine/{file_stem}.png',dpi=300,transparent=True)


#### plot pairwise distance

In [None]:
# For every main protein family with more than three embedded genes, record
# the mean pairwise UMAP distance of its genes ('GO'), the average of the same
# statistic over random gene sets of equal size ('Random'), and the p-value of
# the comparison ('corr'). The result is collected in `df_for_plotting`.
import random

N_RANDOM_SETS = 1000  # number of random gene sets drawn per family

# random.sample() needs a sequence; passing a dict view raises TypeError on
# Python >= 3.11. Building the list once also avoids redoing it on every draw.
gene_pool = list(umap_coordinates)

# Dedicated, seeded generator so the figure is reproducible across runs.
rng = random.Random(0)

# `p_values_all` is intentionally not reassigned here: this cell never stored
# anything in it, and resetting it would discard the p-values that the
# multiple-testing cell reads.
distance_summaries = {}
for lookcolumn in ['protein_family_main']:
    distance_summaries[lookcolumn] = {'GO': [], 'Random': [],'corr':[]}
    allfocus=df[lookcolumn].unique()
    for focus in tqdm(allfocus):
        # '' means "no annotation in this column"
        if focus == '':
            continue

        samplelist = [i.upper() for i in df.loc[df[lookcolumn]==focus,'gene_symbol'].unique()]
        # keep only the genes that are present in the embedding
        go_genes = [gene for gene in samplelist if gene in ad_gene_embedding.obs.index]

        if len(go_genes) <= 3:
            continue

        go_distances = pdist(np.array([umap_coordinates[gene] for gene in go_genes]).squeeze())

        # Calculate distances for random groups
        random_distances = []
        for _ in range(N_RANDOM_SETS):
            random_genes = rng.sample(gene_pool, len(go_genes))
            distances = pdist(np.array([umap_coordinates[gene] for gene in random_genes]).squeeze())
            random_distances.append(np.mean(distances))  # Using mean as an example

        distance_summaries[lookcolumn]['GO'].append(np.mean(go_distances))
        distance_summaries[lookcolumn]['Random'].append(np.mean(random_distances))

        # Statistical test: the observed mean (a single value) is ranked
        # against the means of the random sets.
        u_stat, p_value = mannwhitneyu([np.mean(go_distances)], random_distances, alternative='less')
        distance_summaries[lookcolumn]['corr'].append(p_value)

df_for_plotting = pd.DataFrame(distance_summaries['protein_family_main'])

In [None]:
# Reshape to long form for seaborn: each family contributes two rows, its own
# mean distance labelled 'GO' followed by the random-set mean labelled 'Random'.
long_rows = []
for go_mean, random_mean in zip(df_for_plotting['GO'], df_for_plotting['Random']):
    long_rows.append({'dis': go_mean, 'identity': 'GO'})
    long_rows.append({'dis': random_mean, 'identity': 'Random'})

pd_for_plotting = pd.DataFrame(long_rows, columns=['dis', 'identity'])

In [None]:
import seaborn as sns
from statannotations.Annotator import Annotator

# Box plot with the individual points overlaid, comparing the per-family mean
# distance ('GO') with the random-set mean ('Random'), annotated with a
# Mann-Whitney test between the two groups.
plt.figure(figsize=(3, 8))

my_pal = {"GO": '#bec2bf',  "Random":"#8e80ad"}
shared_args = dict(x='identity', y='dis', data=pd_for_plotting)

g = sns.boxplot(fliersize=0, palette=my_pal, zorder=1, **shared_args)
# NOTE(review): `colors` is not defined anywhere in this notebook; presumably
# it comes from `from utils import *` -- confirm. This loop only has an effect
# if the box patches are listed in `g.artists`, which may not be the case on
# newer Matplotlib versions -- verify.
for i, artist in enumerate(g.artists):
    artist.set_facecolor(colors[i % len(colors)])

g = sns.swarmplot(s=1.5, color='k', zorder=2, alpha=1, **shared_args)

pairs = [("GO", "Random")]

annotator = Annotator(g, pairs, order=['GO','Random'], **shared_args)
annotator.configure(test='Mann-Whitney', text_format='star', loc='outside')
annotator.apply_and_annotate()

# For a violin plot, use sns.violinplot instead
plt.title("Distribution of Pairwise Distances")
plt.xlabel("Group")
plt.ylabel("Pairwise Distance")
plt.ylim([0,11])
plt.show()
