In [121]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [122]:
import jupyter_black

jupyter_black.load()

In [123]:
from pdb_get_structures import download_pdb_from_uniprot

# Paths handed to the project helper below; judging by the names, the first is
# the directory of UniProt entries and the second is where experimental PDB
# files are written (TODO confirm against pdb_get_structures).
uniprot_directory = "../data/uniprot_entries/"
pdb_output_dir = "../data/pdb/experimental/"

# Later cells read `pdb_data.data` as a mapping keyed by UniProt ID whose values
# expose `entry_id` and `sequence`.
pdb_data = download_pdb_from_uniprot(uniprot_directory, pdb_output_dir, verbose=False)

## Get predicted structures

In [124]:
import re
import pandas as pd

csv_file = "../data/online_version/submission_data.csv"

# UniProt IDs for which `pdb_data` holds an entry.
uniprot_ids = list(pdb_data.data.keys())

# Load the submission table, derive `uid` as the second "|"-separated field of
# `identifier`, and keep only the rows whose uid has a PDB entry.
df = (
    pd.read_csv(csv_file)
    .assign(uid=lambda frame: frame["identifier"].str.split("|").str[1])
    .loc[lambda frame: frame["uid"].isin(uniprot_ids)]
)

In [125]:
def find_all_occurrences(pdb_sequence, seq):
    """Locate every (possibly overlapping) occurrence of ``pdb_sequence`` in ``seq``.

    Args:
        pdb_sequence: Sub-sequence to search for. It is matched literally, so
            regex metacharacters in it have no special meaning.
        seq: String to search in.

    Returns:
        A list of ``(start, end)`` tuples with 0-based, inclusive positions.
        The list is empty when either argument is not a string or when
        ``pdb_sequence`` is empty.
    """
    if not isinstance(seq, str):
        return []
    # An empty or non-string needle has no meaningful occurrences; without this
    # guard an empty pattern would "match" at every position of `seq`.
    if not isinstance(pdb_sequence, str) or not pdb_sequence:
        return []

    # Zero-width lookahead so overlapping occurrences are all reported;
    # re.escape keeps the needle literal.
    pattern = re.compile(f"(?={re.escape(pdb_sequence)})")
    last_offset = len(pdb_sequence) - 1
    return [
        (match.start(), match.start() + last_offset)
        for match in pattern.finditer(seq)
    ]


# Column layout of the match table. It is fixed up front so that `matches_df`
# has the same schema even when no PDB sequence is found in any submission
# sequence (an empty list of dicts would otherwise yield a column-less frame).
match_columns = [
    "identifier",
    "sequence_type",
    "start_position",
    "end_position",
    "pdb_id",
]

matches = []

# For every PDB entry, look up the submission row with the same UniProt ID and
# record where the PDB sequence occurs inside its mature and full sequences.
for uniprot_id, pdb_entry in pdb_data.data.items():
    pdb_id = pdb_entry.entry_id
    pdb_sequence = pdb_entry.sequence

    matched_row = df.loc[df["uid"] == uniprot_id]
    if matched_row.empty:
        continue

    # Only the first row is used if several rows share the same uid.
    identifier = matched_row["identifier"].values[0]
    seq_to_check = {
        "mature_seq": matched_row["mature_seq"].values[0],
        "full_seq": matched_row["full_seq"].values[0],
    }

    for seq_name, seq in seq_to_check.items():
        for start_position, end_position in find_all_occurrences(pdb_sequence, seq):
            matches.append(
                {
                    "identifier": identifier,
                    "sequence_type": seq_name,
                    "start_position": start_position,
                    "end_position": end_position,
                    "pdb_id": pdb_id,
                }
            )

matches_df = pd.DataFrame(matches, columns=match_columns)

In [126]:
def custom_agg_func(group):
    """Collapse the match rows of one identifier into a single record.

    Args:
        group: DataFrame with the columns ``sequence_type``,
            ``start_position``, ``end_position`` and ``pdb_id``.

    Returns:
        A ``pd.Series`` with ``full_seq_info`` and ``mature_seq_info`` (each a
        dict with ``start_position``/``end_position``, or ``None`` when the
        group has no row of that type) and ``pdb_id``. When a sequence type or
        the PDB id appears in several rows, the last row wins.
    """
    spans = {"full_seq": None, "mature_seq": None}
    last_pdb_id = None

    for _, record in group.iterrows():
        kind = record["sequence_type"]
        if kind in spans:
            spans[kind] = {
                "start_position": record["start_position"],
                "end_position": record["end_position"],
            }
        last_pdb_id = record["pdb_id"]

    summary = {
        "full_seq_info": spans["full_seq"],
        "mature_seq_info": spans["mature_seq"],
        "pdb_id": last_pdb_id,
    }
    return pd.Series(summary)


# One output row per identifier: each group of match rows is reduced by
# custom_agg_func, then the identifier is moved from the index to a column.
grouped_matches = matches_df.groupby("identifier")
merged_matches_df = grouped_matches.apply(custom_agg_func)
merged_matches_df = merged_matches_df.reset_index()
merged_matches_df

Unnamed: 0,identifier,full_seq_info,mature_seq_info,pdb_id
0,SP|A0A4P1LYC9|Dendroaspis_polylepis_polylepis,,"{'start_position': 1, 'end_position': 56}",6R5M
1,SP|A0S864|Boiga_irregularis,"{'start_position': 35, 'end_position': 108}","{'start_position': 16, 'end_position': 89}",2H7Z
2,SP|A0S865|Boiga_irregularis,"{'start_position': 35, 'end_position': 110}","{'start_position': 16, 'end_position': 91}",2H7Z
3,SP|A8N286|Ophiophagus_hannah,"{'start_position': 22, 'end_position': 85}","{'start_position': 1, 'end_position': 64}",3HH7
4,SP|B3EWH9|Hemachatus_haemachatus,,"{'start_position': 1, 'end_position': 60}",3VTS
...,...,...,...,...
56,SP|Q8IV16|Homo_sapiens,"{'start_position': 21, 'end_position': 150}","{'start_position': 1, 'end_position': 130}",6OAU
57,SP|Q8QGR0|Dendroaspis_angusticeps,"{'start_position': 22, 'end_position': 85}","{'start_position': 1, 'end_position': 64}",2VLW
58,SP|Q90VW1|Laticauda_semifasciata,"{'start_position': 22, 'end_position': 82}","{'start_position': 1, 'end_position': 61}",1NXB
59,SP|Q9YGJ0|Bungarus_multicinctus,"{'start_position': 22, 'end_position': 88}","{'start_position': 1, 'end_position': 67}",1MR6


### Download predicted structures (`.pdb` files) from the `rostssh` server

In [127]:
import os
import subprocess

# SSH login (user@host) for the rsync downloads; adjust to your own account.
server_username = "senoner"
server_hostname = "rost"

# Identifiers whose PDB sequence was located in the full / mature sequence.
has_full_match = merged_matches_df["full_seq_info"].notna()
has_mature_match = merged_matches_df["mature_seq_info"].notna()

full_seq_identifiers = merged_matches_df.loc[has_full_match, "identifier"]
mature_seq_identifiers = merged_matches_df.loc[has_mature_match, "identifier"]


# Function to download files using rsync
def download_files(identifiers, src_dir, dest_dir, verbose=False):
    """Fetch ``<identifier>.pdb`` files from the remote server with rsync.

    Files that already exist locally are skipped. The remote login is taken
    from the notebook-level ``server_username`` / ``server_hostname``.

    Args:
        identifiers: Iterable of identifier strings (file names without ".pdb").
        src_dir: Sub-directory below ``/mnt/project/senoner/3FTx`` on the server.
        dest_dir: Local directory to download into; created if missing.
        verbose: Print one line per downloaded or skipped file.

    Raises:
        subprocess.CalledProcessError: If rsync exits with a non-zero status.
    """
    # rsync does not create missing parent directories of the destination.
    os.makedirs(dest_dir, exist_ok=True)
    remote_root = (
        f"{server_username}@{server_hostname}:/mnt/project/senoner/3FTx/{src_dir}"
    )

    for identifier in identifiers:
        dest_file = f"{dest_dir}/{identifier}.pdb"

        if os.path.exists(dest_file):
            if verbose:
                print(f"File {dest_file} already exists. Skipping download.")
            continue

        # The command is passed as an argument list (no local shell), so the
        # local path needs no quoting. The remote path keeps one backslash in
        # front of each "|", which is what rsync received before as well.
        # NOTE(review): presumably needed because the remote shell parses the
        # path; confirm with the rsync version in use.
        remote_identifier = identifier.replace("|", "\\|")
        src_file = f"{remote_root}/{remote_identifier}.pdb"

        if verbose:
            print(f"Downloading {src_file} to {dest_file}")
        subprocess.run(["rsync", "-avz", src_file, dest_file], check=True)


# (identifiers, remote sub-directory, local directory) per sequence type;
# the full_seq structures are fetched first, then the mature_seq ones.
download_jobs = [
    (full_seq_identifiers, "colabfold_full/pdb", "../data/pdb/predicted_full"),
    (mature_seq_identifiers, "colabfold/pdb", "../data/pdb/predicted_mature"),
]

for job_identifiers, remote_subdir, local_dir in download_jobs:
    download_files(job_identifiers, remote_subdir, local_dir)

### Compute the distance of predicted to experimental PDB using foldseek

In [128]:
from run_foldseek import FoldseekRunner

# FoldseekRunner settings per query set. Both compare against the experimental
# structures; switch `foldseek_query_set` to "full_seq" to run the full-length
# predictions instead of the mature ones.
foldseek_settings = {
    "mature_seq": dict(
        query_dir="../data/pdb/predicted_mature",
        target_dir="../data/pdb/experimental",
        query_db="../data/pdb/fs_pred_ms/pred_ms",
        target_db="../data/pdb/fs_exp/exp",
        alignment_dir="../data/pdb/out_ms/ms",
        alignment_file="../data/pdb/out_ms/ms_aln",
        tmp_dir="../data/pdb/tmp",
    ),
    "full_seq": dict(
        query_dir="../data/pdb/predicted_full",
        target_dir="../data/pdb/experimental",
        query_db="../data/pdb/fs_pred_fs/pred_fs",
        target_db="../data/pdb/fs_exp/exp",
        alignment_dir="../data/pdb/out_fs/fs",
        alignment_file="../data/pdb/out_fs/fs_aln",
        tmp_dir="../data/pdb/tmp",
    ),
}
foldseek_query_set = "mature_seq"

# Build the runner for the selected query set and execute it.
runner = FoldseekRunner(**foldseek_settings[foldseek_query_set])
runner.run()

In [129]:
import matplotlib.pyplot as plt

foldseek = pd.read_csv("../data/pdb/out_ms/ms_aln", sep="\t")

# Keep only the text before the first literal ".pdb" in the entry names.
# str.partition matches the separator literally; str.split would treat the
# multi-character pattern as a regex in which "." matches any character.
foldseek["query"] = foldseek["query"].str.partition(".pdb")[0]
foldseek["target"] = foldseek["target"].str.partition(".pdb")[0]

# For every mature_seq match, pick the first alignment row that pairs the
# predicted structure (query) with its experimental PDB entry (target).
mature_matches = matches_df.loc[matches_df["sequence_type"] == "mature_seq"]

indices = []
for _, row in mature_matches.iterrows():
    hits = foldseek.index[
        (foldseek["query"] == row["identifier"])
        & (foldseek["target"] == row["pdb_id"])
    ]
    if hits.empty:
        # Report and skip; appending here would reuse the index found for the
        # previous match (or fail if there was none yet).
        print(f"No foldseek alignment for {row['identifier']} vs {row['pdb_id']}")
        continue
    indices.append(hits[0])

# `indices` holds index labels, so select with .loc rather than positionally.
foldseek = foldseek.loc[indices].reset_index(drop=True)

cols = ["query", "target", "lddt", "alntmscore", "rmsd"]
foldseek = foldseek[cols].rename(
    columns={
        "query": "identifier",
        "target": "pdb_id",
        "lddt": "LDDT",
        "alntmscore": "TM_score",
        "rmsd": "RMSD",
    }
)

print(foldseek.describe())

            LDDT   TM_score       RMSD
count  59.000000  59.000000  59.000000
mean    0.851547   0.865725   1.922881
std     0.082508   0.075581   0.853297
min     0.604300   0.624400   0.646000
25%     0.817000   0.836100   1.307500
50%     0.871900   0.887500   1.724000
75%     0.910200   0.912500   2.472500
max     0.966800   0.971200   4.147000


In [119]:
from pathlib import Path

csv_file = Path(csv_file)
# Output sits next to the input, with "_foldseek" appended to the file stem.
foldseek_csv_file = csv_file.with_stem(f"{csv_file.stem}_foldseek")

# Re-read the complete submission table and attach the foldseek scores by
# identifier; the outer join keeps rows that have no counterpart on either side.
df = pd.read_csv(csv_file).merge(right=foldseek, on="identifier", how="outer")
df.to_csv(foldseek_csv_file, index=False)