In [2]:
import os
import time
import datetime
import json
import requests
import requests.exceptions
import pandas as pd
import polars as pl
from pathlib import Path
from upsetplot import UpSet, from_memberships
from matplotlib import pyplot as plt
from matplotlib.patches import Patch

# (1) Create output directory

In [None]:
# Output directory: ../out/upsetplot_<direction>_<YYMM>_<species>
direction = "up"
species = "rice"
now = datetime.datetime.now()
year_month = now.strftime("%y%m")  # two-digit year followed by two-digit month
result = Path("../out") / f"upsetplot_{direction}_{year_month}_{species}"
# Create the directory (and any missing parents); do nothing if it already exists.
result.mkdir(parents=True, exist_ok=True)

## (2) Gene Ontology information

- GOSlimのタームであるGO:0006950 response to stressがアノテーションされているか
- アノテーションされている場合、マッピングされたGOのエビデンスコードがIEAのみかを確認

In [None]:
# Load the QuickGO GO-slim mapping table and drop the four columns listed below.
# When "response to stress (only IEA)" has a value, "response to stress" is set
# to null, so that in the UpSet plot such a gene is counted under the
# "(only IEA)" category rather than under both.
goslim_mapping = (
    pl.read_csv(
        "../Data/Data_quickgo/240628/QuickGO_rice_all_goslim_mapping_up_upsetplot_v2.tsv",
        separator="\t",
    )
    .drop(["Entry", "GOSlim", "SLIMMED FROM", "GO EVIDENCE CODE"])
    .with_columns(
        pl.when(pl.col("response to stress (only IEA)").is_null())
        .then(pl.col("response to stress"))
        .otherwise(None)
        .alias("response to stress")
    )
)

display(goslim_mapping)

In [None]:
# Convert the Polars frame to pandas for upsetplot.
goslim_mapping_pd = goslim_mapping.to_pandas()

pd.set_option('mode.chained_assignment', None)
pd.set_option('future.no_silent_downcasting', True)

# Map each gene ID ("From") to the list of category columns in which it has a
# value. pd.notna() is used instead of `is not None` so that NaN / pd.NA
# missing markers (not only None) are treated as "not a member".
category_columns = goslim_mapping_pd.columns[1:]
membership_dict = {}
for _, row in goslim_mapping_pd.iterrows():
    membership_dict[row["From"]] = [
        col for col in category_columns if pd.notna(row[col])
    ]

# Build the upsetplot input from the per-gene membership lists.
upset_data = from_memberships(membership_dict.values())

# Configure the UpSet plot.
upset_plot = UpSet(upset_data, 
                   orientation='horizontal',
                   show_counts="{:d}",
                   subset_size='count', 
                   include_empty_subsets = False)

# Highlight every subset that contains a "response to stress" category.
upset_plot.style_subsets(present="response to stress", 
                         facecolor="red"
                         )

upset_plot.style_subsets(present="response to stress (only IEA)", 
                         facecolor="red"
                         )

fig = plt.figure(figsize=(14, 8), dpi=500)
upset_plot.plot(fig=fig)
plt.show()

## (3) InterPro and Ortholog information

In [None]:
# Load the UniProt ID-mapping result, sort by gene ID and remove the first ";"
# from each AlphaFoldDB value.
uniprot_id = pl.read_csv("../Data/Data_uniprot/id_mapping_result/202404/HN5_rice_up_idmapping_complete.tsv", separator="\t").sort("From").with_columns(pl.col("AlphaFoldDB").str.replace(";", ""))

# Remove all "null" columns https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html
uniprot_id = uniprot_id[[s.name for s in uniprot_id if not (s.null_count() == uniprot_id.height)]]

# If any ortholog database has a value, fill the "Ortholog DB" column with
# "Ortholog DB"; otherwise leave it null.
# The step above may have removed an ortholog column that was entirely null,
# so only the columns still present are referenced.
ortholog_columns = [
    name
    for name in ("OrthoDB", "OMA", "eggNOG", "InParanoid", "HOGENOM")
    if name in uniprot_id.columns
]
if ortholog_columns:
    has_ortholog = pl.col(ortholog_columns[0]).is_not_null()
    for name in ortholog_columns[1:]:
        has_ortholog = has_ortholog | pl.col(name).is_not_null()
    ortholog_flag = pl.when(has_ortholog).then(pl.lit("Ortholog DB")).otherwise(None)
else:
    # No ortholog column survived: no gene has ortholog information.
    ortholog_flag = pl.lit(None, dtype=pl.Utf8)

uniprot_id = uniprot_id.with_columns(ortholog_flag.alias("Ortholog DB"))

# Set the Reviewed column to null where the value is "unreviewed".
uniprot_id = uniprot_id.with_columns(
    pl.when(pl.col("Reviewed") == "unreviewed")
    .then(None)
    .otherwise(pl.col("Reviewed"))
    .alias("Reviewed")
)

# This dataframe is used for the UpSet plot ("Reviewed" is currently excluded).
uniprot_id_select = uniprot_id.select([
    "From",
    "Entry",
    #"Reviewed",
    "InterPro",
    "Ortholog DB"
])

print(uniprot_id.group_by("Entry").n_unique())
display(uniprot_id_select.head())

In [None]:
def select_row_with_values(df):
    """Return a single representative row of ``df`` as a one-row frame.

    Rows whose "Ortholog DB" value is not null are preferred: if at least one
    such row exists, the first of them is returned; otherwise the first row of
    ``df`` is returned. (A similar preference for non-null "Reviewed" rows was
    disabled in the original notebook and is not applied.)
    """
    with_ortholog = df.filter(pl.col("Ortholog DB").is_not_null())
    candidates = df if with_ortholog.is_empty() else with_ortholog
    return candidates.head(1)

# Distinct gene IDs present in uniprot_id_select.
unique_from_values = uniprot_id_select.select("From").unique().to_series()

# For every gene ID, keep one representative row chosen by select_row_with_values.
selected_rows = [
    select_row_with_values(uniprot_id_select.filter(pl.col("From") == from_value))
    for from_value in unique_from_values
]

# Stack the representative rows, order by gene ID and drop the UniProt entry column.
uniprot_id_select_filtered = pl.concat(selected_rows).sort("From").drop("Entry")

# Attach the GO-slim categories (left join keeps genes without a GO-slim row).
uniprot_id_select_filtered_go = uniprot_id_select_filtered.join(
    goslim_mapping, on="From", how="left", coalesce=True
)

display(uniprot_id_select_filtered_go)

## Upsetplot visualization (2)

In [None]:
uniprot_data_pd = uniprot_id_select_filtered_go.to_pandas()

# Map each gene ID ("From") to the list of category columns in which it has a
# value. pd.notna() is used instead of `is not None` so that NaN / pd.NA
# missing markers (not only None) are treated as "not a member".
category_columns = uniprot_data_pd.columns[1:]
membership_dict = {}
for _, row in uniprot_data_pd.iterrows():
    membership_dict[row["From"]] = [
        col for col in category_columns if pd.notna(row[col])
    ]

upset_data = from_memberships(membership_dict.values())

upset_plot = UpSet(upset_data, 
                   orientation='horizontal',
                   show_counts="{:d}",
                   subset_size='count', 
                   include_empty_subsets = False)

# Highlight every subset that contains a "response to stress" category.
upset_plot.style_subsets(present="response to stress",
                         facecolor="red"
                         )

upset_plot.style_subsets(present="response to stress (only IEA)",
                         facecolor="red"
                         )

fig = plt.figure(figsize=(14, 8), dpi=500)
upset_plot.plot(fig=fig)
plt.show()

## (4) Get sequence similarity information from Ensembl pan-homology

In [None]:
def get_pan_homology(dataframe, target_taxon: int, json_file: str):
    """Add a ``pan_homology_<target_taxon>`` column with Ensembl pan-homology gene IDs.

    For every row, the value in the first column is used as an Ensembl gene ID
    and looked up through the Ensembl REST ``homology/id/oryza_sativa``
    endpoint with ``compara=pan_homology``. IDs of homologs whose taxon equals
    ``target_taxon`` are joined with "," into one string.

    Relies on names defined elsewhere in the notebook: ``load_cache`` /
    ``save_cache`` (JSON cache helpers) and ``result`` (output directory).

    Args:
        dataframe: Polars DataFrame whose first column holds the gene IDs.
        target_taxon: NCBI taxonomy ID of the species to search homologs in.
        json_file: Cache file passed to ``load_cache`` / ``save_cache``.

    Returns:
        ``dataframe`` with the extra string column; the value is null when no
        homolog was found or the request failed.

    Side effects:
        Writes each raw response that contains homologies to
        ``<result>/pan_homology_<target_taxon>/pan_homology_<gene_id>.json``,
        updates the cache file, and sleeps 5 s after every network request.
    """
    cache = load_cache(json_file)
    search_result = Path(f'{result}/pan_homology_{target_taxon}')
    search_result.mkdir(parents=True, exist_ok=True)

    def get_id(row):
        gene_id = row[0]
        if gene_id in cache:
            # The cache may hold "" for "response had no homolog in the target
            # taxon"; report that as missing, exactly like a fresh lookup does.
            return cache[gene_id] or None

        request_url = f"https://rest.ensembl.org/homology/id/oryza_sativa/{gene_id}?compara=pan_homology&content-type=application/json;target_taxon={target_taxon}"

        try:
            response = requests.get(request_url, headers={"Accept": "application/json"}, timeout=30)
            response.raise_for_status()

            if response.status_code != 200:
                print(f"Failed to fetch data for {gene_id}")
                return None

            data = json.loads(response.text)
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Request failed: {e}")
            return None
        finally:
            # Rate limiting: pause after every request, successful or not.
            time.sleep(5)

        entries = data.get("data") if isinstance(data, dict) else None
        if not entries or not any("homologies" in entry for entry in entries):
            # Nothing usable in the response: remember that to avoid re-querying.
            cache[gene_id] = None
            save_cache(cache, json_file)
            return None

        target_gene_ids = []
        for entry in entries:
            # Entries without a "homologies" key are skipped instead of raising KeyError.
            for homology in entry.get("homologies", []):
                if homology["target"]["taxon_id"] == target_taxon:
                    target_gene_ids.append(homology["target"]["id"])
                elif homology["source"]["taxon_id"] == target_taxon:
                    target_gene_ids.append(homology["source"]["id"])

        target_gene_ids_str = ",".join(target_gene_ids)
        cache[gene_id] = target_gene_ids_str

        # Keep the raw response next to the results.
        file_name = f'{search_result}/pan_homology_{gene_id}.json'
        with open(file_name, "w") as f:
            json.dump(data, f, indent=4)
        save_cache(cache, json_file)

        return target_gene_ids_str if target_gene_ids_str else None

    pan_homology = dataframe.map_rows(get_id, return_dtype = pl.String)
    dataframe = dataframe.with_columns(
        pl.Series(pan_homology).alias(
            f"pan_homology_{target_taxon}"
            )
        )
    return dataframe

In [None]:
# Human (taxon 9606) pan-homology lookup. Empty strings are turned into nulls
# and the column is renamed to "pan-homology human".
pan_homology_human = (
    get_pan_homology(uniprot_id_select_filtered_go, 9606, "cache_pan_homology_human_up.json")
    .with_columns(
        pl.when(pl.col("pan_homology_9606") != "")
        .then(pl.col("pan_homology_9606"))
        .otherwise(None)
        .alias("pan-homology human")
    )
    .drop("pan_homology_9606")
)

display(pan_homology_human)

In [None]:
# Mouse (taxon 10090) pan-homology lookup on top of the human result. Empty
# strings are turned into nulls and the column is renamed to "pan-homology mouse".
pan_homology_mouse = (
    get_pan_homology(pan_homology_human, 10090, "cache_pan_homology_mouse_up.json")
    .with_columns(
        pl.when(pl.col("pan_homology_10090") != "")
        .then(pl.col("pan_homology_10090"))
        .otherwise(None)
        .alias("pan-homology mouse")
    )
    .drop("pan_homology_10090")
)

display(pan_homology_mouse)

In [None]:
pan_homology_mouse_pd = pan_homology_mouse.to_pandas()

# Map each gene ID ("From") to the list of category columns in which it has a
# value. pd.notna() is used instead of `is not None` so that NaN / pd.NA
# missing markers (not only None) are treated as "not a member".
category_columns = pan_homology_mouse_pd.columns[1:]
membership_dict = {}
for _, row in pan_homology_mouse_pd.iterrows():
    membership_dict[row["From"]] = [
        col for col in category_columns if pd.notna(row[col])
    ]

upset_data = from_memberships(membership_dict.values())

upset_plot = UpSet(upset_data, 
                   orientation='horizontal',
                   show_counts="{:d}",
                   subset_size='count', 
                   include_empty_subsets = False)

# Highlighting selected subsets
# 1. all combination
upset_plot.style_subsets(present="response to stress",
                         facecolor="red"
                         )

upset_plot.style_subsets(present="response to stress (only IEA)",
                         facecolor="red"
                         )


# 2. response to stress (without human or mouse pan-homology)
upset_plot.style_subsets(
    present="response to stress",
    absent=["pan-homology human", "pan-homology mouse"],
    facecolor="coral"
)

upset_plot.style_subsets(
    present="response to stress (only IEA)",
    absent=["pan-homology human", "pan-homology mouse"],
    facecolor="coral"
)

# 3. pan-homology (without any "response to stress" category)
upset_plot.style_subsets(
    present=["pan-homology human", "pan-homology mouse"],
    absent=["response to stress", "response to stress (only IEA)"],
    facecolor="forestgreen"
)

# 4. InterPro and Ortholog DB (without pan-homology or "response to stress")
upset_plot.style_subsets(
    present=["InterPro", "Ortholog DB"],
    absent=["response to stress", "response to stress (only IEA)", "pan-homology human", "pan-homology mouse"],
    facecolor="navy"
)

fig_3 = plt.figure(figsize=(16, 10), dpi=700)
upset_plot.plot(fig=fig_3)
plt.show()

In [1]:
# Stand-alone legend figure for the highlight colours used in the UpSet plot.
# Patch and plt come from the import cell, which must be run before this cell.
legend_entries = [
    ("red", "response to stress"),
    ("coral", "response to stress (no pan-homology information)"),
    ("forestgreen", "pan-homology"),
    ("navy", "InterPro and Ortholog DB"),
]
legend_elements = [
    Patch(facecolor=color, label=label) for color, label in legend_entries
]

fig_leg = plt.figure(figsize=(1.5, 0.5), dpi=500)
ax_leg = fig_leg.add_subplot(111)
ax_leg.legend(handles=legend_elements, loc="center")
ax_leg.axis("off")  # show the legend only, without axes
plt.show()
# fig_leg.savefig('legend.png', bbox_inches='tight')

NameError: name 'Patch' is not defined

In [None]:
# Save the table behind the final UpSet plot as a tab-separated file.
upset_table_path = f"{result}/upsetplot_data_rice_up.tsv"
pan_homology_mouse.write_csv(upset_table_path, separator="\t")

## (5) Classification and download mmCIF files

In [None]:
def get_cif_afurl(dataframe, json_file: str,):
    """Add an ``mmCIFfile_AF_URL`` column with the AlphaFold mmCIF URL of each row.

    The value in the second column of each row is used as the accession for
    the AlphaFold prediction API. Found URLs are stored in a JSON cache via
    ``load_cache`` / ``save_cache``, which are defined elsewhere in the notebook.

    Args:
        dataframe: Polars DataFrame whose second column holds the accession
            (the callers in this notebook pass frames with "Entry" there).
        json_file: Cache file passed to ``load_cache`` / ``save_cache``.

    Returns:
        ``dataframe`` with the extra string column; the value is null when the
        accession is missing, nothing was found, or the request failed.

    Side effects:
        Prints a status line per lookup, updates the cache file, and sleeps
        5 s after every network request.
    """
    cache = load_cache(json_file)

    def get_url(row):
        # Some IDs have a structure even though the AlphaFoldDB cross-reference
        # is missing, so the Entry column is used for the lookup instead.
        alpha_fold_id = row[1]
        if alpha_fold_id is None or alpha_fold_id == "":
            return None

        if alpha_fold_id in cache:
            return cache[alpha_fold_id]
        
        request_url = f'https://alphafold.ebi.ac.uk/api/prediction/{alpha_fold_id}'

        try:
            response = requests.get(request_url, headers={"Accept": "application/json"}, timeout=30)
            response.raise_for_status()

            if not response.text:
                print(f"Empty response for AlphaFold ID {alpha_fold_id}")
                return None

            data = json.loads(response.text)
            if isinstance(data, list) and len(data) > 0:
                print(f"AlphaFold ID {alpha_fold_id} found in AlphaFold")
                cif_url = data[0].get('cifUrl', None)
                cache[alpha_fold_id] = cif_url
                save_cache(cache, json_file)
                return cif_url

            print(f"AlphaFold ID {alpha_fold_id} not found in AlphaFold")
            return None
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Request failed: {e}")
            return None
        finally:
            # Rate limiting: pause after every request. Previously the sleep
            # was only reached after a failed request, because every other
            # path returned before it.
            time.sleep(5)

    # return_dtype is given explicitly so the column is a string column even
    # when every lookup returns None.
    url = dataframe.map_rows(get_url, return_dtype = pl.String)
    dataframe = dataframe.with_columns(
        pl.Series(url).alias(
            "mmCIFfile_AF_URL"
            )
        )
    return dataframe

In [None]:
def download_mmCIF_files(dataframe, target_directory='../rice_up_mmCIFfile'):
    """Download every mmCIF file listed in the ``mmCIFfile_AF_URL`` column.

    Args:
        dataframe: Polars DataFrame with an ``mmCIFfile_AF_URL`` column.
        target_directory: Directory the files are saved into (created if
            missing). The file name is the last path segment of the URL.

    Files that already exist are not downloaded again. A failed request
    (network error, timeout or non-200 status) is reported and skipped so that
    the remaining URLs are still processed. Sleeps 10 s after each successful
    download.
    """
    os.makedirs(target_directory, exist_ok=True)
    URL_col = dataframe.select("mmCIFfile_AF_URL").get_columns()[0]

    for url in URL_col:
        if not url or url == 'URL not found':
            print(f'Skipping invalid or missing URL, URL: {url}')
            continue

        filename = url.split('/')[-1]  # last path segment of the URL
        save_path = os.path.join(target_directory, filename)

        if os.path.exists(save_path):
            print(f'File {filename} already exists')
            continue

        try:
            # A timeout prevents one stalled connection from hanging the loop.
            response = requests.get(url, timeout=60)
        except requests.exceptions.RequestException as e:
            print(f'Failed to download {filename}: {e}')
            continue

        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f'File {filename} downloaded successfully')
            time.sleep(10)
        else:
            print(f'Failed to download {filename}, HTTP Status Code: {response.status_code}')
            

def extract_filename(url):
    """Return the part of ``url`` after its last "/" (the whole string if it has none)."""
    _, _, tail = url.rpartition("/")
    return tail

In [None]:
# 1. response to stress
# Genes that have InterPro, Ortholog DB, human and mouse pan-homology
# information and at least one of the two "response to stress" categories.
all_combination = (
    pan_homology_mouse
    .filter(
        pl.col("InterPro").is_not_null(),
        pl.col("Ortholog DB").is_not_null(),
        pl.col("pan-homology human").is_not_null(),
        pl.col("pan-homology mouse").is_not_null(),
        pl.col("response to stress").is_not_null()
        | pl.col("response to stress (only IEA)").is_not_null(),
    )
    .join(uniprot_id, on="From", how="left", coalesce=True)
    .select([
        "From", "Entry", "Reviewed", "AlphaFoldDB", "Protein families",
        "InterPro", "Ortholog DB",
        "pan-homology human", "pan-homology mouse",
        "response to stress", "response to stress (only IEA)",
    ])
    .with_columns(pl.lit("response_to_stress").alias("tag"))
)

print(all_combination.group_by(["From"]).n_unique()) # Check gene count

# Look up the AlphaFold mmCIF URLs, then download the files.
all_combination = get_cif_afurl(all_combination, "cache_af_url.json")
download_mmCIF_files(all_combination)

display(all_combination.head())

In [None]:
# 2. response to stress (no pan-homology information)
# Genes with at least one "response to stress" category and neither human nor
# mouse pan-homology.
response_to_stress = (
    pan_homology_mouse
    .filter(
        pl.col("response to stress").is_not_null()
        | pl.col("response to stress (only IEA)").is_not_null(),
        pl.col("pan-homology human").is_null(),
        pl.col("pan-homology mouse").is_null(),
    )
    .join(uniprot_id, on="From", how="left", coalesce=True)
    .select([
        "From", "Entry", "Reviewed", "AlphaFoldDB", "Protein families",
        "InterPro", "Ortholog DB",
        "pan-homology human", "pan-homology mouse",
        "response to stress", "response to stress (only IEA)",
    ])
    .with_columns(pl.lit("response_to_stress_no_panhomology").alias("tag"))
)

print(response_to_stress.group_by(["From"]).n_unique()) # Check gene count

# Look up the AlphaFold mmCIF URLs, then download the files.
only_response_to_stress = get_cif_afurl(response_to_stress, "cache_af_url.json")
download_mmCIF_files(only_response_to_stress)

display(only_response_to_stress.head())

In [None]:
# 3. pan-homology
# Genes with both human and mouse pan-homology and no "response to stress" category.
pan_homology = (
    pan_homology_mouse
    .filter(
        pl.col("response to stress").is_null(),
        pl.col("response to stress (only IEA)").is_null(),
        pl.col("pan-homology human").is_not_null(),
        pl.col("pan-homology mouse").is_not_null(),
    )
    .join(uniprot_id, on="From", how="left", coalesce=True)
    .select([
        "From", "Entry", "Reviewed", "AlphaFoldDB", "Protein families",
        "InterPro", "Ortholog DB",
        "pan-homology human", "pan-homology mouse",
        "response to stress", "response to stress (only IEA)",
    ])
    .with_columns(pl.lit("panhomology").alias("tag"))
)

print(pan_homology.group_by(["From"]).n_unique()) # Check gene count

# Look up the AlphaFold mmCIF URLs, then download the files.
pan_homology = get_cif_afurl(pan_homology, "cache_af_url.json")
download_mmCIF_files(pan_homology)

display(pan_homology.head())

In [None]:
# 4. InterPro and Ortholog DB
# Genes with both InterPro and Ortholog DB information but neither
# pan-homology nor a "response to stress" category.
interpro_orthologdb = (
    pan_homology_mouse
    .filter(
        pl.col("InterPro").is_not_null(),
        pl.col("Ortholog DB").is_not_null(),
        pl.col("pan-homology human").is_null(),
        pl.col("pan-homology mouse").is_null(),
        pl.col("response to stress").is_null(),
        pl.col("response to stress (only IEA)").is_null(),
    )
    .join(uniprot_id, on="From", how="left", coalesce=True)
    .select([
        "From", "Entry", "Reviewed", "AlphaFoldDB", "Protein families",
        "InterPro", "Ortholog DB",
        "pan-homology human", "pan-homology mouse",
        "response to stress", "response to stress (only IEA)",
    ])
    .with_columns(pl.lit("interpro_orthologdb").alias("tag"))
)

print(interpro_orthologdb.group_by(["From"]).n_unique()) # Check gene count

# Look up the AlphaFold mmCIF URLs only.
# NOTE(review): unlike the cells above, download_mmCIF_files is not called
# here, so no files are downloaded for this category - confirm this is intended.
interpro_orthologdb = get_cif_afurl(interpro_orthologdb, "cache_af_url.json")
display(interpro_orthologdb.head())

In [None]:
# 5. Exactly one of InterPro / Ortholog DB, with neither pan-homology nor a
#    "response to stress" category.
only_information_columns = [
    "From", "Entry", "Reviewed", "AlphaFoldDB", "Protein families",
    "InterPro", "Ortholog DB",
    "pan-homology human", "pan-homology mouse",
    "response to stress", "response to stress (only IEA)",
]

# 5a. Only InterPro
only_interpro = (
    pan_homology_mouse
    .filter(
        pl.col("InterPro").is_not_null(),
        pl.col("Ortholog DB").is_null(),
        pl.col("pan-homology human").is_null(),
        pl.col("pan-homology mouse").is_null(),
        pl.col("response to stress").is_null(),
        pl.col("response to stress (only IEA)").is_null(),
    )
    .join(uniprot_id, on="From", how="left", coalesce=True)
    .select(only_information_columns)
)

# 5b. Only Ortholog DB
only_orthologdb = (
    pan_homology_mouse
    .filter(
        pl.col("InterPro").is_null(),
        pl.col("Ortholog DB").is_not_null(),
        pl.col("pan-homology human").is_null(),
        pl.col("pan-homology mouse").is_null(),
        pl.col("response to stress").is_null(),
        pl.col("response to stress (only IEA)").is_null(),
    )
    .join(uniprot_id, on="From", how="left", coalesce=True)
    .select(only_information_columns)
)

# Both groups share one tag.
only_information = pl.concat([only_interpro, only_orthologdb]).with_columns(
    pl.lit("only_information").alias("tag")
)

print(only_information.group_by(["From"]).n_unique()) # Check gene count
# Look up the AlphaFold mmCIF URLs only (download_mmCIF_files is not called here).
only_information = get_cif_afurl(only_information, "cache_af_url.json")
display(only_information.head())

In [None]:
# Stack the five tagged category tables into one frame ordered by gene ID.
category_frames = [
    all_combination,
    only_response_to_stress,
    pan_homology,
    interpro_orthologdb,
    only_information,
]
all_information = pl.concat(category_frames).sort("From")

print(all_information.group_by(["From"]).n_unique()) # Check gene count
display(all_information.head())

In [None]:
# Gene list read from the HN5 "up" TSV; joined below on the "From" column.
HNscore_up = pl.read_csv(
    "../Data/Data_HN5_genelist_rice_2402/HN5_genes_up_rice.tsv",
    separator="\t"
)

# Attach the columns of HNscore_up to every row and remove exact duplicate rows.
all_information_score = all_information.join(
    HNscore_up,
    on="From",
    how="left",
    coalesce=True
).unique()

# Gene IDs for which every row has a null AlphaFoldDB value.
# The frame is kept in `grouped` (previously it held the None returned by
# write_csv) and the check uses a native expression instead of a Python
# lambda inside the aggregation.
grouped = (
    all_information_score
    .group_by("From")
    .agg(pl.col("AlphaFoldDB").is_null().all().alias("all_null"))
    .filter(pl.col("all_null"))
    .select("From")
)
grouped.write_csv("./noaf.tsv", separator="\t")


print(all_information_score.group_by(["From"]).n_unique())
print(all_information_score.group_by(["Entry"]).n_unique())
display(all_information_score.head())

## Save results


In [None]:
# Write every category table, plus the combined scored table, as TSV files
# into the output directory. Keys are the file names without the extension.
tables_to_save = {
    "response_to_stress": all_combination,
    "response_to_stress_no_panhomology": only_response_to_stress,
    "panhomology": pan_homology,
    "interpro_orthologdb": interpro_orthologdb,
    "only_one_information": only_information,
    "all_information_score": all_information_score,
}
for file_stem, table in tables_to_save.items():
    table.write_csv(f"{result}/{file_stem}.tsv", separator="\t")