In [1]:
import json
import os
import shutil
import numpy as np

import pandas as pd
# from B1_UTILS import download_map_model, map_normalizing, map_output
# from B2_UTILS_data2npy import data_to_npy

In [2]:
# Stage switches for the download / normalization pass.
#   DOWNLOAD      -> guards the download_map_model call
#   NORMALIZATION -> guards writing the "*_normalized.mrc" maps
#   MAP2NPY       -> guards the .npy sample generation
DOWNLOAD, NORMALIZATION, MAP2NPY = True, True, False

In [5]:
# Label groups handled for every model. The same strings are used as keys of
# the per-part tag counters and in the sample file paths read back later.
# NOTE(review): the spellings 'secondary_strctures' / 'Filtered_Dateset' are
# runtime strings and presumably match existing on-disk names -- kept verbatim.
MODEL_PARTS = ['secondary_strctures', 'key_atoms', 'residue_types']

MAIN_HOME_PATH = '/home/qiboxu/Database/U_NET/EMDB_PDB_for_U_Net'

# Every dataset entry is a three-item list:
#   [raw data directory, training directory, csv file listing the entries]
_dataset_root = os.path.join(MAIN_HOME_PATH, 'Filtered_Dateset')
_raw_dir = os.path.join(_dataset_root, 'Raw')
PATH_SETTINGS = {
    "Filtered_Dateset": [
        _raw_dir,
        os.path.join(_dataset_root, 'Training'),
        os.path.join(_raw_dir, 'final-20240212.csv'),
    ],
}

# Select the active dataset and unpack its three locations.
PATH_KEYS = "Filtered_Dateset"
DATA_PATH, HOME_PATH, csv_path = PATH_SETTINGS[PATH_KEYS]

# Staging directory for generated samples; created up front if it is missing.
temp_sample_path = os.path.join(HOME_PATH, "ready_to_train_and_val")
os.makedirs(temp_sample_path, exist_ok=True)

In [6]:
# Load the entry table, then derive per-row folder names and the locations of
# the raw map and the fitted model underneath DATA_PATH.
df = pd.read_csv(csv_path)
emdbs = df["emdb_id"]
pdbs = df["fitted_pdbs"]
resolutions = df["resolution"].astype(str)

# Second "-"-separated piece of every EMDB entry name.
emdb_ids = [entry_name.split("-")[1] for entry_name in emdbs]

# One folder per entry, named "<emdb>_re_<resolution>".
folders = [
    f"{entry_name}_re_{resolution_text}"
    for entry_name, resolution_text in zip(emdbs, resolutions)
]

# File names inside each folder.
raw_maps = [f"emd_{numeric_id}.map" for numeric_id in emdb_ids]
models = [f"{pdb_name}.cif" for pdb_name in pdbs]

# Full locations: "<DATA_PATH>/<folder>/<file name>".
entry_dirs = [f"{DATA_PATH}/{folder_name}" for folder_name in folders]
raw_map_paths = [
    f"{entry_dir}/{map_name}" for entry_dir, map_name in zip(entry_dirs, raw_maps)
]
model_paths = [
    f"{entry_dir}/{model_name}" for entry_dir, model_name in zip(entry_dirs, models)
]

In [18]:
# Target of every normalized map: the raw map path cut at '.map' with
# "_normalized.mrc" appended.
map_paths = []
for raw_map_path in raw_map_paths:
    path_stem = raw_map_path.split('.map')[0]
    map_paths.append(f"{path_stem}_normalized.mrc")

# NOTE(review): download_map_model, map_normalizing and map_output are not
# imported in the visible cells (the B1_UTILS import is commented out) -- they
# must already be defined in the kernel for this cell to run.
for emdb, pdb, resolution, raw_map_path, map_path in zip(
        emdbs, pdbs, resolutions, raw_map_paths, map_paths):
    if DOWNLOAD:
        download_map_model(emdb=emdb, pdb=pdb, resolution=resolution, directory=DATA_PATH)

    # Skip normalization when it is switched off or the output already exists.
    if not NORMALIZATION or os.path.exists(map_path):
        continue

    map_data = map_normalizing(raw_map_path)
    map_output(raw_map_path, map_data, map_path, is_model=False)


In [19]:
# Flip the stage switches: download and normalization off, .npy generation on.
DOWNLOAD, NORMALIZATION, MAP2NPY = False, False, True

In [20]:
if MAP2NPY:
    # Build the sample files from each (normalized map, model) pair.
    # NOTE(review): data_to_npy is not imported in the visible cells (the
    # B2_UTILS_data2npy import is commented out).

    # Running sample index: handed to data_to_npy and replaced by its return value.
    sample_num = 0

    # Per-part tag counts accumulated over all processed models.
    num_of_tag_in_each_part_for_all_models = dict.fromkeys(MODEL_PARTS, 0)

    # Only the first five entries are processed here.
    for emdb_id, map_path, model_path in zip(emdb_ids[0:5], map_paths, model_paths):
        print(f"Generating dataset from EMDB-{emdb_id}... ")
        sample_num, num_of_tag_in_each_part = data_to_npy(
            map_path, model_path, MODEL_PARTS, temp_sample_path, sample_num)

        # Fold the counts reported for this model into the running totals.
        for part, tag_counts in num_of_tag_in_each_part.items():
            num_of_tag_in_each_part_for_all_models[part] += tag_counts


Generating dataset from EMDB-11893... 
Sampling part: secondary_strctures
Sampling part: key_atoms
Sampling part: residue_types
Generating dataset from EMDB-22345... 
Sampling part: secondary_strctures
Sampling part: key_atoms
Sampling part: residue_types
Generating dataset from EMDB-12884... 
Sampling part: secondary_strctures
Sampling part: key_atoms
Sampling part: residue_types
Generating dataset from EMDB-10892... 
Sampling part: secondary_strctures
Sampling part: key_atoms
Sampling part: residue_types
Generating dataset from EMDB-10914... 
Sampling part: secondary_strctures
Sampling part: key_atoms
Sampling part: residue_types


In [27]:
# Strip leading/trailing zeros from every per-part tag-count list and derive
# one integer weight per remaining class: count of the first class floor-divided
# by the count of that class.
ratio_of_tag = {}
for key, value_list in num_of_tag_in_each_part_for_all_models.items():
    # atleast_1d lets a counter that is still the scalar 0 (nothing was
    # accumulated) pass through trim_zeros instead of raising.
    value_list = np.trim_zeros(np.atleast_1d(value_list))
    num_of_tag_in_each_part_for_all_models[key] = value_list
    if 0 in value_list:
        print(f'There is missing groups in part {key}, numbers of each class are {value_list}.')
    # Keep exactly one weight per class. A class with zero samples gets weight 0
    # rather than being dropped: dropping it would shift every following weight
    # onto the wrong class index.
    ratio_of_tag[key] = [int(value_list[0] // value_x) if value_x != 0 else 0
                         for value_x in value_list]

# Print statistical results
print(f"The number of .npy file: {sample_num}")
print("Num of each tag: ")
for key, value in num_of_tag_in_each_part_for_all_models.items():
    print(f'{key}: {value}')
print("\nRatio of tags: ")
for key, value in ratio_of_tag.items():
    print(f'{key}: {value}')

The number of .npy file: 710
Num of each tag: 
secondary_strctures: [174890136   1590260    706409   2079118   6856317]
key_atoms: [180078192   2201966    850162   2991920]
residue_types: [180928354    181975    184653     88626     96369     25327    137842
     83586    148467     53821    132074    182337    183722     51573
     72989     80491    120068    119334     17296     60128    181288
    814910    864319    644418    668273]

Ratio of tags: 
secondary_strctures: [1, 109, 247, 84, 25]
key_atoms: [1, 81, 211, 60]
residue_types: [1, 994, 979, 2041, 1877, 7143, 1312, 2164, 1218, 3361, 1369, 992, 984, 3508, 2478, 2247, 1506, 1516, 10460, 3009, 998, 222, 209, 280, 270]


In [23]:
# Distribute the staged samples over the split folders under "train_val_data".
import splitfolders

sample_path = os.path.join(HOME_PATH, "train_val_data")
os.makedirs(sample_path, exist_ok=True)

# 0.8 / 0.2 ratio with a fixed seed of 44.
split_options = dict(seed=44, ratio=(.8, .2), group_prefix=None, move=True)
splitfolders.ratio(input=temp_sample_path, output=sample_path, **split_options)

# Drop the staging directory once the split call has returned.
shutil.rmtree(temp_sample_path)

Copying files: 2840 files [00:00, 60501.43 files/s]


In [28]:
# Save the class weights (ratio_of_tag) as JSON next to the train/val data.
# The directory that actually receives the file is created here; creating only
# "<HOME_PATH>/TEST" would leave "TEST/train_val_data" missing and make open() fail.
# NOTE(review): the split cell writes to "<HOME_PATH>/train_val_data" (no "TEST"
# level) -- confirm which location the training code reads from.
class_weight_dir = os.path.join(HOME_PATH, "TEST", "train_val_data")
os.makedirs(class_weight_dir, exist_ok=True)
with open(os.path.join(class_weight_dir, "class_weight_for_training.txt"), "w") as file:
    json.dump(ratio_of_tag, file)

In [None]:
# Visual spot-check of the generated data: for samples 0..4 show slice 30 along
# the last axis of the map sample and of each label volume.
import numpy as np
import matplotlib.pyplot as plt


def _show_slice(npy_path, title, **imshow_kwargs):
    """Load the array at npy_path and display its [:, :, 30] slice with the given title."""
    volume = np.load(npy_path)
    plt.imshow(volume[:, :, 30], origin='lower', **imshow_kwargs)
    plt.title(title)
    plt.show()


for num in range(0, 5):
    # Density map sample, rendered in grayscale.
    model_type = 'map_sample'
    mapname = f'/home/qiboxu/Database/U_NET/EMDB_PDB_for_U_Net/Filtered_Dateset/Training/TEST/train_val_data/train/{model_type}/map.{num}.npy'
    _show_slice(mapname, f'{model_type}.{num}', cmap='gray')

    # Matching label volumes, one figure per part, default colormap.
    for model_type in ['secondary_strctures', 'key_atoms', 'residue_types']:
        filename = f'/home/qiboxu/Database/U_NET/EMDB_PDB_for_U_Net/Filtered_Dateset/Training/TEST/train_val_data/train/{model_type}/model_{model_type}.{num}.npy'
        _show_slice(filename, f'{model_type}.{num}')
    print("###")
