# Obtaining the dataset

## Import modules

In [3]:
import pandas as pd
import numpy as np
import os

## Data extraction

In [4]:
# Column names applied to the parsed map-file records (see `extract_and_save`).
# The order must match the order of the fields in each record.
headers = [
    # critical-point type and coordinates
    'type_of_crit_point', 'x', 'y', 'z',
    # densities and distances at the point
    'rol', 'roe', 'log_rolroe',
    'rl', 're', 'sumrlre', 'rlre',
    'rocntl', 'rocnte', 'log_rocntl_rocnte_nl_ne',
    'roinl', 'roine',
    # atomic numbers and complementarity factors
    'nl', 'ne',
    'cf1', 'cf2', 'cf3', 'cf4',
    # per-atom contributions and atom numbers
    'rol_l', 'roe_e',
    'numbl', 'numbe',
    # coordinates of the most contributing ligand / enzyme atoms
    'xl', 'yl', 'zl',
    'xe', 'ye', 'ze',
    # ligand atom annotation
    'number_of_lig_atom', 'name_of_lig_atom', 'name_of_lig', 'lig_chain',
    # residue atom annotation
    'number_of_res_atom', 'name_of_res_atom', 'name_of_res', 'res_chain',
    'number_of_res',
]

In [5]:
# Directory with the raw map files. The trailing slash matters: the
# conversion loop passes this value to `extract_and_save`, which builds the
# output name as `path + file[:-4] + '.csv'`.
map_txt_path = 'calculation/map_txt/'
# Keep only the *.txt maps and sort them: os.listdir returns entries in
# arbitrary order and would also list the *.csv files that
# `extract_and_save` writes into this same directory.
raw_files = sorted(
    name for name in os.listdir(map_txt_path) if name.endswith('.txt')
)

<b>File description</b>:


TYPEOFCRITPOINT - binding points (3,-1), cycle points (3,+1), pi-pi stacking (3,+3)  
X, Y, Z - point coordinates  
ROL - electron density of ligand, e/Angstrom^3  
ROE - electron density of enzyme, e/Angstrom^3  
LOG_ROLROE = LOG(ROL*ROE)  
RL - distance between the point and the most contributing ligand atom  
RE - distance between the point and the most contributing enzyme atom  
SUMRLRE = RL+RE  
RLRE = RL*RE  
LOG_ROCNTL_ROCNTE_NL_NE = LOG(ROCNTL*ROCNTE/(NL*NE))  
ROCNTL - electron density at the center of the ligand atom  
ROCNTE - electron density at the center of the enzyme atom  
ROINL - ligand electron density donated by the inner shells to the point  
ROINE - enzyme electron density donated by the inner shells to the point  
NL - atomic number of the ligand atom  
NE - atomic number of the enzyme atom  
ROL_L - contribution of the ligand atom to the total electron density at the critical point  
ROE_E - contribution of the enzyme atom to the total electron density at the critical point  
NUMBL - the ligand atom number that contributes the most to the critical point   
NUMBE - the enzyme atom number that contributes the most to the critical point 
  
XL, YL, ZL - coordinates of the ligand atom that contributes the most to the critical point  
XE, YE, ZE - coordinates of the enzyme atom that contributes the most to the critical point  


COMPLEMENTARITY FACTORS:    
CF1 = LOG((ROE+ROINE)*ROCNTE/NE)+ LOG(ROL*ROCNTL/NL)  
CF2 = LOG(ROE*ROCNTE/NEOUT)+ LOG(ROL*ROCNTL/NLOUT)  
CF3 = LOG(ROE*ROCNTE/NEOUT**2)+ LOG(ROL*ROCNTL/NLOUT**2)  
CF4 = LOG(ROE*ROCNTE/NEOUT**2)+ LOG(ROL*ROCNTL/NLOUT**2)+ROINL+ROINE  
  
CORRELATION:   
CF =a1+b1*SUMRLRE    
CF =a2+b2*RLRE 

In [6]:
# Show the first five entries of the raw file listing.
raw_files[:5]

['c_1a30map.txt',
 'c_1a94map.txt',
 'c_1a9mmap.txt',
 'c_1aaqmap.txt',
 'c_1aidmap.txt']

In [None]:
def read_file(path, file, skip_header=66, skip_footer=6):
    """Return the lines of a raw map file without its leading/trailing lines.

    Parameters
    ----------
    path : str
        Directory containing the file (with or without a trailing separator).
    file : str
        File name inside `path`.
    skip_header : int, default 66
        Number of lines dropped from the beginning of the file.
    skip_footer : int, default 6
        Number of lines dropped from the end of the file.

    Returns
    -------
    list of str
        The remaining lines, each still carrying its line terminator.
        Empty when the file has no more than `skip_header + skip_footer` lines.
    """
    with open(os.path.join(path, file), 'r') as f:
        lines = f.readlines()
    # Use an explicit end index: `lines[a:-0]` would be empty, and a negative
    # end would wrap around for files shorter than `skip_footer`.
    end = max(len(lines) - skip_footer, 0)
    return lines[skip_header:end]

def process_string(string):
    """Split a whitespace-separated record into a list of fields.

    When one of the fields is exactly 'HOH', an extra "A" field is placed
    immediately before the last field; otherwise the fields are returned
    unchanged.
    """
    fields = string.split()
    if 'HOH' not in fields:
        return fields
    # NOTE(review): presumably water records lack the chain field, and the
    # "A" fills the `res_chain` slot (second to last in `headers`) - confirm.
    return fields[:-1] + ["A"] + fields[-1:]

def extract_and_save(path, file):
    """Convert one raw map file into a CSV file in the same directory.

    The lines returned by `read_file` are split with `process_string`,
    loaded into a DataFrame whose columns are `headers`, and written
    (without the index) to `path` + the file name with its last four
    characters replaced by '.csv'.
    """
    rows = []
    for line in read_file(path, file):
        rows.append(process_string(line))
    table = pd.DataFrame(rows, columns=headers)
    csv_name = file[:-4] + '.csv'
    table.to_csv(path + csv_name, index=False)

In [None]:
# Convert every listed raw map file to CSV (written next to the raw file).
for raw_name in raw_files:
    extract_and_save(path=map_txt_path, file=raw_name)

In [None]:
# Move the generated CSV files from the map_txt directory to the csv directory.
# NOTE(review): absolute, machine-specific paths; presumably these are the same
# directories as 'calculation/map_txt/' and 'calculation/csv/' - confirm.
!mv /mnt/d/work/crit_points/calculation/map_txt/*.csv /mnt/d/work/crit_points/calculation/csv/

## Feature engineering

Path to the CSV files

In [4]:
# Directory with the per-complex CSV files. The trailing slash is required
# because later cells build file paths as `map_csv_path + name`.
map_csv_path = 'calculation/csv/'
# Keep only *.csv entries and sort them: os.listdir returns entries in
# arbitrary order, and every listed name is later passed to pd.read_csv.
csv_files = sorted(
    name for name in os.listdir(map_csv_path) if name.endswith('.csv')
)

Target extraction

In [5]:
# Load the 'PDB' and 'biol_act' columns of the biological-activity table.
biol_activity_data = pd.read_csv(
    '/mnt/d/work/crit_points/hiv_biol_activity_data.csv',
    usecols=['PDB', 'biol_act'],
    index_col=False,
)
biol_activity_data.head()

Unnamed: 0,PDB,biol_act
0,1a30,4.3
1,1a94,7.85
2,1a9m,6.92
3,1aaq,8.4
4,1aid,4.82


In [6]:
# Map every residue number to the distinct residue names observed at that
# number across all CSV files, in first-seen order.
particular_aa = {}

for csv_name in csv_files:
    df = pd.read_csv(map_csv_path + csv_name)
    # Walk the two columns directly instead of indexing row by row with iloc.
    for key, value in zip(df['number_of_res'], df['name_of_res']):
        # setdefault replaces the former bare `except:`, which created the
        # entry on a missing key but also hid every other error.
        names = particular_aa.setdefault(key, [])
        if value not in names:
            names.append(value)

particular_aa

{25: ['ASP', 'HOH', 'ASN'],
 27: ['GLY', 'HOH'],
 28: ['ALA', 'HOH'],
 29: ['ASP', 'HOH'],
 8: ['ARG', 'HOH'],
 30: ['ASP', 'HOH', 'ASN'],
 47: ['ILE', 'HOH', 'VAL'],
 48: ['GLY', 'HIS', 'HOH', 'VAL', 'ALA', 'THR'],
 49: ['GLY', 'HOH'],
 81: ['PRO', 'HOH'],
 50: ['ILE', 'HOH', 'VAL'],
 84: ['ILE', 'HOH', 'VAL'],
 23: ['LEU', 'HOH'],
 82: ['VAL', 'ILE', 'PHE', 'ASP', 'ASN', 'HOH', 'ALA', 'THR', 'LEU', 'SER'],
 85: ['HOH'],
 214: ['HOH'],
 215: ['HOH'],
 216: ['HOH'],
 16: ['HOH'],
 32: ['VAL', 'HOH', 'ILE'],
 53: ['PHE', 'HOH'],
 17: ['HOH'],
 20: ['HOH'],
 38: ['HOH'],
 3: ['HOH'],
 45: ['LYS', 'HOH', 'ARG'],
 34: ['HOH'],
 1: ['HOH'],
 54: ['ILE', 'HOH'],
 80: ['THR', 'HOH'],
 26: ['HOH'],
 64: ['HOH'],
 66: ['HOH'],
 12: ['HOH'],
 104: ['HOH'],
 108: ['ARG', 'HOH'],
 123: ['LEU', 'HOH'],
 125: ['ASP', 'HOH'],
 127: ['GLY', 'HOH'],
 128: ['ALA', 'HOH'],
 129: ['ASP', 'HOH'],
 130: ['ASP', 'HOH', 'ASN'],
 132: ['VAL', 'HOH', 'ILE'],
 147: ['ILE', 'HOH', 'VAL'],
 148: ['GLY', 'HOH'],
 1

In [7]:
# List the CSV files that contain 'HOH' rows with residue number 48;
# the file name is printed once per matching row.
for csv_name in csv_files:
    df = pd.read_csv(map_csv_path + csv_name)
    is_match = (df['name_of_res'] == 'HOH') & (df['number_of_res'] == 48)
    for _ in range(int(is_match.sum())):
        print(csv_name)

c_1aidmap.csv
c_1aidmap.csv
c_1bwamap.csv
c_1d4kmap.csv
c_1d4kmap.csv
c_1ec2map.csv
c_1ec2map.csv
c_1ec2map.csv
c_2i0amap.csv
c_2r5pmap.csv
c_3o9imap.csv
c_3o9imap.csv
c_3o9imap.csv
c_3o9imap.csv
c_3qaamap.csv
c_4djqmap.csv
c_4zipmap.csv
c_4zipmap.csv
c_5jfpmap.csv
c_5jfpmap.csv
c_5jfpmap.csv
c_5upzmap.csv


In [45]:
# FIXME(review): `asd` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. It looks like a leftover
# scratch cell - define `asd` in an earlier cell or remove this one.
for a in asd:
    print(a)

1
5
12


Creating baseline information

In [15]:
# Residue names for which per-residue descriptors are computed: the
# three-letter amino-acid codes plus 'HOH'. The order defines the order of
# the per-residue descriptor columns.
amino_acids = [
    'ALA', 'ARG', 'ASN', 'ASP',
    'CYS', 'GLU', 'GLN', 'GLY',
    'HIS', 'ILE', 'LEU', 'LYS',
    'MET', 'PHE', 'PRO', 'SER',
    'THR', 'TRP', 'TYR', 'VAL',
    'HOH',
]

# Atomic number -> lower-case element symbol. The keys are compared with the
# 'nl' column; the symbols are used as suffixes of the per-element columns.
elements = {
    1: 'h',
    6: 'c',
    7: 'n',
    8: 'o',
    9: 'f',
    15: 'p',
    16: 's',
    17: 'cl',
    35: 'br',
    53: 'i',
}

Creating functions to establish descriptors

In [10]:
# quantitative
## for whole system
def count_crit_points(df, crit_point_type):
    """Return how many rows of `df` have 'type_of_crit_point' equal to `crit_point_type`."""
    is_requested_type = df['type_of_crit_point'] == crit_point_type
    return int(is_requested_type.sum())

def whole_sys_descriptor_calculator(df):
    """Compute the quantitative whole-system descriptors of one complex.

    Returns a 7-tuple: the total number of rows, the number of rows of type
    '(3,-1)', '(3,+1)' and '(3,+3)', the sum of 'rol', the sum of 'roe', and
    the sum of those two sums.
    """
    counts_by_type = [
        count_crit_points(df, label)
        for label in ('(3,-1)', '(3,+1)', '(3,+3)')
    ]
    total_rol = df['rol'].sum()
    total_roe = df['roe'].sum()
    return (len(df), *counts_by_type, total_rol, total_roe, total_rol + total_roe)

## for amino acids
def sum_rolroe_aa(df, residue):
    """Return sum('rol') + sum('roe') over the rows whose 'name_of_res' equals `residue`."""
    rows = df['name_of_res'] == residue
    rol_total = df.loc[rows, 'rol'].sum()
    roe_total = df.loc[rows, 'roe'].sum()
    return rol_total + roe_total

def aa_descriptor_calculator(df):
    """Return `sum_rolroe_aa(df, name)` for every name in `amino_acids`, in that order."""
    totals = []
    for residue in amino_acids:
        totals.append(sum_rolroe_aa(df, residue))
    return totals

## for elements
def sum_rolroe_el(df, el):
    """Return sum('rol') + sum('roe') over the rows whose 'nl' equals `el`."""
    rows = df['nl'] == el
    rol_total = df.loc[rows, 'rol'].sum()
    roe_total = df.loc[rows, 'roe'].sum()
    return rol_total + roe_total

def el_descriptor_calculator(df):
    """Return `sum_rolroe_el(df, number)` for every key of `elements`, in that order."""
    totals = []
    for atomic_number in elements:
        totals.append(sum_rolroe_el(df, atomic_number))
    return totals

# qualitative
## for whole system
def yon_whole_sys_descriptor_calculator(df):
    """Return 1/0 flags telling whether `df` has any '(3,-1)', '(3,+1)', '(3,+3)' rows.

    The result is a 3-tuple in that order.
    """
    return tuple(
        1 if count_crit_points(df, label) > 0 else 0
        for label in ('(3,-1)', '(3,+1)', '(3,+3)')
    )

## for amino acids
def yon_aa_descriptor_calculator(df):
    """Return, for every name in `amino_acids`, 1 if `sum_rolroe_aa` is positive, else 0."""
    flags = []
    for residue in amino_acids:
        flags.append(1 if sum_rolroe_aa(df, residue) > 0 else 0)
    return flags
    
## for elements
def yon_el_descriptor_calculator(df):
    """Return, for every key of `elements`, 1 if `sum_rolroe_el` is positive, else 0."""
    flags = []
    for atomic_number in elements:
        flags.append(1 if sum_rolroe_el(df, atomic_number) > 0 else 0)
    return flags

Creating an empty dictionary whose keys are the future columns to fill in

In [11]:
# Column-oriented accumulator for the feature table: every key is a future
# column and starts with its own empty list. Key order is the column order.
d = {
    column: []
    for column in (
        # quantitative
        'pdb_id', 'num_of_crit_points', 'num_of_3_minus_1', 'num_of_3_plus_1',
        'num_of_3_plus_3', 'sum_rol', 'sum_roe', 'sum_rolroe',
        *(f'sum_rolroe_{aa.lower()}' for aa in amino_acids),
        *(f'sum_rol_{symbol}' for symbol in elements.values()),
        # qualitative
        'yon_3_minus_1', 'yon_3_plus_1', 'yon_3_plus_3',
        *(f'yon_rolroe_{aa.lower()}' for aa in amino_acids),
        *(f'yon_rol_{symbol}' for symbol in elements.values()),
    )
}

Applying the functions to fill the dictionary

In [16]:
# Start from empty columns so that re-running this cell does not append every
# complex a second time (the lists in `d` otherwise keep growing).
d = {column: [] for column in d}

for csv_name in csv_files:
    df = pd.read_csv(map_csv_path + csv_name)
    # File names look like 'c_1a30map.csv': drop the 2-character prefix and
    # the 7-character 'map.csv' suffix to get the PDB id.
    d['pdb_id'].append(csv_name[2:-7])

    # quantitative
    ## whole system
    (num_crit_points, num_3_minus_1, num_3_plus_1, num_3_plus_3,
     rol_sum, roe_sum, rolroe_sum) = whole_sys_descriptor_calculator(df)
    d['num_of_crit_points'].append(num_crit_points)
    d['num_of_3_minus_1'].append(num_3_minus_1)
    d['num_of_3_plus_1'].append(num_3_plus_1)
    d['num_of_3_plus_3'].append(num_3_plus_3)
    d['sum_rol'].append(rol_sum)
    d['sum_roe'].append(roe_sum)
    d['sum_rolroe'].append(rolroe_sum)

    ## amino acids
    for aa, total in zip(amino_acids, aa_descriptor_calculator(df)):
        d['sum_rolroe_' + aa.lower()].append(total)

    ## elements
    # NOTE: despite the 'sum_rol_' column prefix, the stored value is
    # sum('rol') + sum('roe') as returned by `sum_rolroe_el`.
    for symbol, total in zip(elements.values(), el_descriptor_calculator(df)):
        d['sum_rol_' + symbol].append(total)

    # qualitative
    ## whole system
    yon_3_minus_1, yon_3_plus_1, yon_3_plus_3 = yon_whole_sys_descriptor_calculator(df)
    d['yon_3_minus_1'].append(yon_3_minus_1)
    d['yon_3_plus_1'].append(yon_3_plus_1)
    d['yon_3_plus_3'].append(yon_3_plus_3)

    ## amino acids
    for aa, flag in zip(amino_acids, yon_aa_descriptor_calculator(df)):
        d['yon_rolroe_' + aa.lower()].append(flag)

    ## elements
    for symbol, flag in zip(elements.values(), yon_el_descriptor_calculator(df)):
        d['yon_rol_' + symbol].append(flag)

In [17]:
# Preview the first rows of the table built from the accumulator `d`.
pd.DataFrame(d).head()

Unnamed: 0,pdb_id,num_of_crit_points,num_of_3_minus_1,num_of_3_plus_1,num_of_3_plus_3,sum_rol,sum_roe,sum_rolroe,sum_rolroe_ala,sum_rolroe_arg,...,yon_rol_h,yon_rol_c,yon_rol_n,yon_rol_o,yon_rol_f,yon_rol_p,yon_rol_s,yon_rol_cl,yon_rol_br,yon_rol_i
0,1a30,51,37,14,0,1.730304,1.661475,3.391779,0.228078,0.13826,...,1,1,1,1,0,0,0,0,0,0
1,1a94,112,82,29,1,3.481085,3.479742,6.960827,0.860011,0.30715,...,1,1,1,1,0,0,0,0,0,0
2,1a9m,106,69,36,1,2.784076,2.933608,5.717684,0.408293,0.282534,...,1,1,1,1,0,0,0,0,0,0
3,1aaq,75,52,23,0,2.801469,4.168071,6.969541,0.853034,0.357454,...,1,1,0,1,0,0,0,0,0,0
4,1aid,45,35,10,0,0.845498,0.795053,1.64055,0.014875,0.0,...,1,1,0,0,1,0,1,1,0,0


Converting to a dataframe, concatenating with the target and saving it to a CSV file

In [None]:
# Build the feature table, place it side by side with the target (both
# indexed by PDB id, so rows are aligned on that index) and save the result.
features_data = pd.DataFrame(d)
target_by_pdb = biol_activity_data.set_index('PDB')
features_by_pdb = features_data.set_index('pdb_id')
data = pd.concat([target_by_pdb, features_by_pdb], axis=1)
data.to_csv('calculation/crit_points_hiv_dataset.csv')
data.head()