# LOAD DATA 

In [None]:
import pandas as pd
from sklearn.metrics  import f1_score,accuracy_score, precision_score

In [None]:
# SCA3-only dataset: keep the IC50 and cytotoxicity targets plus the SMILES column.
targets = ['TARGET#1 QPCR IC50 ATXN3', 'TARGET#2 CYTOTOX WINDOW']
#targets = ['TARGET#1 QPCR IC50 ATXN3']
keep = [*targets, 'SMILES']
data = pd.read_csv('DATA/Sca3_ic50_cyto_DATA.csv')[keep]
# Non-null count for every retained column.
print(data.count())
directory = 'SCA3_only_ic50_cyto'
# Independent snapshot of the frame as loaded, before later cells add columns to `data`.
training_data = data.copy()

In [None]:
# Load the new SCA3 test points and drop rows whose IC50 equals 222.0.
# NOTE(review): 222.0 looks like a placeholder/sentinel value — confirm with the data owner.
new_data = pd.read_csv('new_sca3_test_points.csv')
# .copy() makes the filtered result an independent frame, so the column
# assignments performed on it in later cells do not raise SettingWithCopyWarning.
new_data = new_data[new_data['QPCR IC50 ATXN3'] != 222.0].copy()
# Positional rename: requires the CSV to have exactly these four columns in this order.
new_data.columns = [0, 'SMILES', 'QPCR IC50 ATXN3', 'PTC_ID']

# Benchmarks

In [None]:
# Project helpers: assign_label turns a numeric value into a class label,
# modeling fits and returns a model bundle (both are defined in modeling.py).
from modeling import assign_label, modeling
import descriptor_generation_essentials 
import importlib
# Re-execute the package so edits made on disk are picked up without restarting the kernel.
# NOTE(review): importlib.reload only reloads the module it is given; if
# generate_descriptors is a submodule it may still be the cached version — confirm.
importlib.reload(descriptor_generation_essentials)
# Must come after the reload so the name is bound from the refreshed package.
from descriptor_generation_essentials import generate_descriptors

In [None]:
# Label both frames with the same rule: assign_label(value, 'less', 45) applied
# to each frame's IC50 column (the column name differs between the two frames).
for frame, ic50_col in ((data, 'TARGET#1 QPCR IC50 ATXN3'), (new_data, 'QPCR IC50 ATXN3')):
    frame['TARGET_QUALITY'] = frame[ic50_col].apply(assign_label, args=('less', 45))

# with all descriptors - SCA3 IC50 - No Changes


In [None]:
from descriptor_generation_essentials import generate_descriptors

In [None]:
# Generate descriptors for the training frame. The call returns the updated frame
# together with a list of descriptor column names.
# NOTE(review): the meaning of the 'no' flag is defined inside generate_descriptors — confirm.
data, descriptorlist_full = generate_descriptors.generate_all_descriptors(data, 'no')

In [None]:
# Benchmark 1: train on every generated descriptor, then score on the new test points.
data, descriptorlist_full = generate_descriptors.generate_all_descriptors(data, 'no')
new_data, descriptorlist_full2 = generate_descriptors.generate_all_descriptors(new_data, 'no')

data = data.fillna(0)
# Keep only the numeric descriptor columns.
descriptorlist_full = data[descriptorlist_full].select_dtypes(['number']).columns.tolist()
cols = data.columns.tolist()

# Build a filtered list rather than calling .remove() while iterating over the
# same list, which silently skips the element that follows each removal.
known_cols = set(cols)
descriptorlist_full = [name for name in descriptorlist_full if name in known_cols]

# results1 is indexed below as: [0] estimator, [1] selected feature names, [3] fitted transformer.
# NOTE(review): 'yes' presumably switches scaling on inside modeling — confirm.
results1 = modeling(descriptorlist_full, data, 'yes')

# Descriptors that exist for the training set but not for new_data become all-zero columns.
cols = new_data.columns.tolist()
for name in descriptorlist_full:
    if name not in cols:
        new_data[name] = 0

# Mirror the training-side fillna(0) so NaN descriptors do not reach the transformer/estimator.
new_data[descriptorlist_full] = new_data[descriptorlist_full].fillna(0)

new_data_temp = results1[3].transform(new_data[descriptorlist_full])
new_data_temp = pd.DataFrame(new_data_temp, columns=descriptorlist_full)
new_data_temp = new_data_temp[results1[1]]
vals = results1[0].predict(new_data_temp)
# precision_score takes (y_true, y_pred); with the arguments swapped it reports recall instead.
precision_score(new_data['TARGET_QUALITY'], vals, average='binary', pos_label='GOOD')

In [None]:
# Re-fit the all-descriptor model and score it on the new test points.
# results1 is indexed below as: [0] estimator, [1] selected feature names, [3] fitted transformer.
results1 = modeling(descriptorlist_full, data, 'yes')

# Descriptors missing from new_data become all-zero columns so the column sets match.
cols = new_data.columns.tolist()
for name in descriptorlist_full:
    if name not in cols:
        new_data[name] = 0

# The training frame had its NaNs replaced by 0; apply the same treatment here.
new_data[descriptorlist_full] = new_data[descriptorlist_full].fillna(0)

new_data_temp = results1[3].transform(new_data[descriptorlist_full])
new_data_temp = pd.DataFrame(new_data_temp, columns=descriptorlist_full)
new_data_temp = new_data_temp[results1[1]]
vals = results1[0].predict(new_data_temp)
# precision_score takes (y_true, y_pred); with the arguments swapped it reports recall instead.
precision_score(new_data['TARGET_QUALITY'], vals, average='binary', pos_label='GOOD')

# with all descriptors + all fingerprints - SCA3 IC50


In [None]:
# Benchmark 2: all descriptors plus every fingerprint family.
# calc_fingerprints returns the updated frame and a list of column-name lists
# (one inner list per fingerprint family, as iterated below).
data, descriptorlist_full_fps = generate_descriptors.calc_fingerprints(data)
new_data, descriptorlist_fps_full2 = generate_descriptors.calc_fingerprints(new_data)

# Drop fingerprint columns with a single unique value in the training set.
# Slice-assignment updates each inner list in place (later cells reuse
# descriptorlist_full_fps) without removing items from a list that is being
# iterated, which would skip the column following each removal.
for fp_cols in descriptorlist_full_fps:
    fp_cols[:] = [col for col in fp_cols if len(data[col].unique()) != 1]

# Flatten the fingerprint families and append the descriptor columns.
desc_fps = [col for fp_cols in descriptorlist_full_fps for col in fp_cols]
desc_fps = desc_fps + descriptorlist_full

data[desc_fps] = data[desc_fps].fillna(0)
# Same NaN treatment for the evaluation frame as for the training frame.
new_data[desc_fps] = new_data[desc_fps].fillna(0)
# results2 is indexed below as: [0] estimator, [1] selected feature names, [3] fitted transformer.
results2 = modeling(desc_fps, data, 'yes')

new_data_temp = results2[3].transform(new_data[desc_fps])
new_data_temp = pd.DataFrame(new_data_temp, columns=desc_fps)
new_data_temp = new_data_temp[results2[1]]
vals = results2[0].predict(new_data_temp)
# precision_score takes (y_true, y_pred); with the arguments swapped it reports recall instead.
precision_score(new_data['TARGET_QUALITY'], vals, average='binary', pos_label='GOOD')

In [None]:
# with ALL fingerprints, fingerprints only - SCA3 IC50
# Flatten every fingerprint family and keep only columns that vary in the training set.
# A comprehension replaces the original remove-while-iterating loop, which skipped
# the column following each removed one.
desc_fps = [
    col
    for fp_cols in descriptorlist_full_fps
    for col in fp_cols
    if len(data[col].unique()) != 1
]

# No transformer is applied to new_data below, so the features are used as-is.
# NOTE(review): 'no' presumably switches scaling off inside modeling — confirm.
results3 = modeling(desc_fps, data, 'no')

new_data_temp = new_data[results3[1]]
vals = results3[0].predict(new_data_temp)
# The original line was missing its closing parenthesis (SyntaxError), and it passed
# the predictions as y_true; precision_score takes (y_true, y_pred).
precision_score(new_data['TARGET_QUALITY'], vals, average='binary', pos_label='GOOD')

# Evaluate individual fingerprints - SCA3 IC50 

In [None]:
data, descs = generate_descriptors.calc_fingerprints(data)

print('TOTAL FINGERPRINTS' , len(descs))
import itertools

for i in descs:
    for col in i:
        if len(data[col].unique()) == 1:
            i.remove(col)

subsets = []
for L in range(1, 2):
    print(L)
    for subset in itertools.combinations(descs, L):
        i = []
        for x in subset:
            i.extend(x)
        subsets.append(i)
        
for i in descs:
    print(len(i))
    
print('COMBINATIONS:',len(subsets))

import multiprocessing as mp
from functools import partial

ns = mp.Manager()
ns.df = data

def worker(n, data):
    rank = mp.current_process()._identity[0]
    print(f'I am processor {rank}.')
    return modeling(n, data,  'no')

if __name__ == '__main__':  
    pool = mp.Pool(processes = mp.cpu_count()-1)
    %time results4 = pool.map(partial(worker, data=ns.df) ,subsets)
    pool.close()
    pool.join()

    scores = []
    models = []
    feature_list = []
    scaler = []
    params = []
    for i in results4: 
            models.append(i[0])
            feature_list.append(i[1])
            scores.append(i[2])
            scaler.append(i[3])
            params.append(i[4])
            
new_data, descs = generate_descriptors.calc_fingerprints(new_data)

for i in range(len(scores)):
    new_data_temp = new_data[feature_list[i]]
    vals = models[i].predict(new_data_temp)
    print(i, precision_score(vals, new_data['TARGET_QUALITY'], average='binary', pos_label='GOOD'))

# Different combinations of fingerprints - SCA3 IC50 


In [None]:
data, descs = generate_descriptors.calc_fingerprints(data)

print('TOTAL FINGERPRINTS' , len(descriptorlist_full))
import itertools

for i in descs:
    for col in i:
        if len(data[col].unique()) == 1:
            i.remove(col)

subsets = []
for L in range(2, 4):
    print(L)
    for subset in itertools.combinations(descs, L):
        i = []
        for x in subset:
            i.extend(x)
        subsets.append(i)
        
for i in descs:
    print(len(i))
    
print('COMBINATIONS:',len(subsets))

import multiprocessing as mp
from functools import partial

ns = mp.Manager()
ns.df = data

def worker(n, data):
    rank = mp.current_process()._identity[0]
    print(f'I am processor {rank}.')
    return modeling(n, data,  'no')

if __name__ == '__main__':  
    pool = mp.Pool(processes = mp.cpu_count()-1)
    %time results4 = pool.map(partial(worker, data=ns.df) ,subsets)
    pool.close()
    pool.join()

    scores = []
    models = []
    feature_list = []
    scaler = []
    params = []
    for i in results4: 
            models.append(i[0])
            feature_list.append(i[1])
            scores.append(i[2])
            scaler.append(i[3])
            params.append(i[4])

# Using newly created functional group tree

In [None]:
from importlib import reload 
import descriptor_generation_essentials 
# Re-execute the package so on-disk edits are picked up without restarting the kernel.
# NOTE(review): reload() only reloads the module it is given; if tree_search_main is a
# submodule it may still be the cached version — confirm.
reload(descriptor_generation_essentials)
# Imported after the reload so the name is bound from the refreshed package.
from descriptor_generation_essentials import tree_search_main 
# Build the functional-group tree; returns the node list and a cluster list.
list_of_nodes, cluster_list = tree_search_main.initialize_nodes()

In [None]:
# Store the functional-group hierarchy descriptors computed from `data` and the
# initialized node list in a new 'FGS' column.
# NOTE(review): the result must align row-for-row with `data` for this assignment — confirm.
data['FGS'] = tree_search_main.generate_fg_hierarchy_descriptors_no_clustering(data, list_of_nodes)

# Averaging Results Of Multiple Models

In [None]:
# Positions of the models whose recorded score is above 0.50.
score_indexes = [pos for pos, score in enumerate(scores) if score > .50]

# Store each retained model's predictions on new_data in a LABEL_<position> column.
for pos in score_indexes:
    new_data_temp = new_data[feature_list[pos]]
    new_data['LABEL_' + str(pos)] = models[pos].predict(new_data_temp)
  

In [None]:
# Majority vote over the retained models: a row is GOOD when at least half of
# the LABEL_<i> columns say GOOD.
cols = ['LABEL_' + str(i) for i in score_indexes]
# 1 where a model predicted GOOD, 0 otherwise. A boolean comparison replaces the
# chained .replace() calls, which rely on implicit object-dtype downcasting.
new_data_fps = (new_data[cols] == 'GOOD').astype(int)
new_data_fps['SUMMED'] = new_data_fps.sum(axis=1)
# Fraction of models voting GOOD; with no retained models this is NaN and the row falls to BAD.
vote_fraction = new_data_fps['SUMMED'] / len(score_indexes)
# Build the label column in one step instead of overwriting a float column with strings.
new_data['SUMMED'] = vote_fraction.ge(.5).map({True: 'GOOD', False: 'BAD'})
# precision_score takes (y_true, y_pred); with the arguments swapped it reports recall instead.
print('PRECISION REGULAR', precision_score(new_data['TARGET_QUALITY'], new_data['SUMMED'], average='binary', pos_label='GOOD'))

In [None]:
# Rebuild the functional-group tree; returns the node list and a cluster list.
from descriptor_generation_essentials import tree_search_main 
list_of_nodes, cluster_list = tree_search_main.initialize_nodes()

  """)
  start43 = pd.read_csv('descriptor_generation_essentials/11182021_SMILES_SMARTS_Hierarchy/D2/D2_Node_0a_SMARTS_FGs_2_left_c.txt',sep='/t')


DONE HERE
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
