In [29]:
import json
import os
import pandas as pd
import numpy as np

from tqdm import contrib as tc

import random
from sklearn.model_selection import train_test_split

In [30]:
# Locate the project root relative to the current working directory: it is the
# part of the path that precedes the first occurrence of "chembl" (or the whole
# path when "chembl" does not occur in it).
code_path = os.getcwd()
project_file_path = code_path.partition("chembl")[0]

# All input files and generated datasets live under <project root>/training_files.
training_files_path = os.path.join(project_file_path, "training_files")
training_files_path

'/home/hayriye/DEEPScreen2.2/training_files'

In [31]:
# File name of the active/inactive compound table. The leading "/" is
# intentional: the name is appended to training_files_path by plain string
# concatenation when the table is loaded.
preprocessed_tsv_file = (
    "/chembl32_preprocessed_filtered_act_inact_comps_Q60.tsv"
)

In [32]:
# Load the tab-separated table without a header row, so the columns are
# labelled 0 and 1. In the preview below, column 0 holds keys such as
# "<target>_act" / "<target>_inact" and column 1 a comma-separated list of
# compound IDs.
act_inact_df = pd.read_csv(
    training_files_path + preprocessed_tsv_file,
    sep="\t",
    header=None,
)
print(len(act_inact_df))
act_inact_df.head()

1580


Unnamed: 0,0,1
0,CHEMBL3066_act,"CHEMBL210514,CHEMBL22055,CHEMBL11149,CHEMBL273..."
1,CHEMBL3066_inact,"CHEMBL2370208,CHEMBL157101,CHEMBL487627,CHEMBL..."
2,CHEMBL261_act,"CHEMBL6919,CHEMBL269122,CHEMBL35118,CHEMBL1418..."
3,CHEMBL261_inact,"CHEMBL7204,CHEMBL6705,CHEMBL21,CHEMBL6852,CHEM..."
4,CHEMBL281_act,"CHEMBL35118,CHEMBL18,CHEMBL19,CHEMBL71611,CHEM..."


In [33]:
# Read the training-target list without a header row and keep the first column
# of every row as a plain Python list.
target_table = pd.read_csv(
    training_files_path + "/chembl32_training_target_list.txt",
    header=None,
)
protein_list = [row[0] for row in target_table.to_numpy().tolist()]
len(protein_list)

790

In [34]:
# Spot check on a single target: take the first row keyed "CHEMBL4282_act" and
# split its comma-separated compound string into a list.
active_list = (
    act_inact_df.loc[act_inact_df[0] == "CHEMBL4282" + "_act", 1]
    .iloc[0]
    .split(",")
)
len(active_list)

1218

In [35]:
# Spot check on a single target: take the first row keyed "CHEMBL4282_inact"
# and split its comma-separated compound string into a list.
inactive_list = (
    act_inact_df.loc[act_inact_df[0] == "CHEMBL4282" + "_inact", 1]
    .iloc[0]
    .split(",")
)
len(inactive_list)

1830

In [36]:
# Output roots: the same split file is written under both directory trees.
ttpath = os.path.join(training_files_path, "target_training_datasets")
mediattpath = os.path.join("/media/ubuntu/8TB/hayriye/DEEPScreen2.2", "training_files", "target_training_datasets")

# Rotation angles used to name the augmented variants: 10, 20, ..., 350.
angle_list = list(range(10, 360, 10))

# Seed shared by the train/test/validation splits and the final shuffles.
SPLIT_SEED = 42


def _compound_ids(protein, suffix):
    """Return the compound IDs listed in ``act_inact_df`` for ``protein + suffix``.

    The first row whose column 0 equals ``protein + suffix`` is used and its
    comma-separated column 1 string is split into a list.

    Raises:
        KeyError: if no row with that key exists.
    """
    key = protein + suffix
    matches = act_inact_df.loc[act_inact_df[0] == key, 1]
    if matches.empty:
        raise KeyError("no row for {!r} in act_inact_df".format(key))
    return matches.iloc[0].split(",")


def _with_rotations(compound_ids, label):
    """Return ``[name, label]`` pairs for every compound and its rotated variants.

    The un-rotated compounds come first (in input order), followed by
    ``"<id>_<angle>"`` entries for each compound and each angle in
    ``angle_list``. The ordering matters because the seeded split below
    depends on it.
    """
    originals = [[compound_id, label] for compound_id in compound_ids]
    rotated = [
        [compound_id + "_" + str(angle), label]
        for compound_id in compound_ids
        for angle in angle_list
    ]
    return originals + rotated


def _split_train_test_val(tuples):
    """Split ``tuples`` 80/10/10 into (train, test, validation) with a fixed seed."""
    train, test_val = train_test_split(tuples, test_size=0.20, random_state=SPLIT_SEED)
    test, val = train_test_split(test_val, test_size=0.50, random_state=SPLIT_SEED)
    return train, test, val


def _write_split_file(protein_dir, json_text):
    """Create ``<protein_dir>/imgs`` if needed and write the split JSON next to it."""
    os.makedirs(os.path.join(protein_dir, "imgs"), exist_ok=True)
    # The context manager closes the file even if the write fails.
    with open(os.path.join(protein_dir, "train_val_test_dict.json"), "w") as output_file:
        output_file.write(json_text)


# A dedicated, seeded generator makes the shuffled order reproducible across
# runs without touching the global `random` state.
shuffle_rng = random.Random(SPLIT_SEED)

for i, protein in tc.tenumerate(protein_list):
    # Label 1 = active, label 0 = inactive.
    active_tuples = _with_rotations(_compound_ids(protein, "_act"), 1)
    inactive_tuples = _with_rotations(_compound_ids(protein, "_inact"), 0)

    # NOTE(review): rotations are generated before the split, so a molecule and
    # its rotated variants can land in different partitions. Confirm this is
    # intended; otherwise split on molecule IDs first and augment afterwards.
    active_train, active_test, active_val = _split_train_test_val(active_tuples)
    inactive_train, inactive_test, inactive_val = _split_train_test_val(inactive_tuples)

    tuples_train = active_train + inactive_train
    tuples_test = active_test + inactive_test
    tuples_val = active_val + inactive_val

    # Mix actives and inactives within each partition.
    shuffle_rng.shuffle(tuples_train)
    shuffle_rng.shuffle(tuples_test)
    shuffle_rng.shuffle(tuples_val)

    json_dict = {"training": tuples_train,
                 "test": tuples_test,
                 "validation": tuples_val}
    json_object = json.dumps(json_dict)

    # Write the identical split file under both output roots.
    _write_split_file(os.path.join(ttpath, protein), json_object)
    _write_split_file(os.path.join(mediattpath, protein), json_object)

100%|█████████████████████████████████████████████████████████████████████████████████████| 790/790 [01:45<00:00,  7.50it/s]


In [9]:
# Directory of one target's generated dataset, used for the inspection below.
# NOTE(review): this root is under DEEPScreen2.0, whereas the generation cell
# writes under DEEPScreen2.2 — confirm which tree is meant to be inspected.
# This also rebinds `ttpath`, replacing the value set by the generation cell.
ttpath = (
    "/media/ubuntu/8TB/hayriye/DEEPScreen2.0/training_files/target_training_datasets"
)
protein_path = os.path.join(
    ttpath,
    "CHEMBL4282",
)

In [10]:
import json

# Read-only sanity check of one target's split file: collect the first element
# of every entry (presumably the compound/image ID of an [id, label] pair —
# confirm against the writer) and print the running total after each partition.
# The file is opened in plain read mode because nothing is written back.
with open("{}/train_val_test_dict.json".format(protein_path), "r") as f:
    data = json.load(f)

chem_id_list = []

for split_name in ("training", "test", "validation"):
    # Empty entries contribute nothing, as before.
    chem_id_list.extend(entry[0] for entry in data[split_name] if entry)
    print(len(chem_id_list))


86514
97329
108144
