In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from openeye.oechem import *
from chem.transform import SmilesToOEGraphMol, SmartsToOEGraphMol
from Fingerprint.ECFP4 import CalcECFPSparse_tracker, ConvertAnnotationsIntoStr

In [2]:
# Every input table belongs to the same target, so all three paths are built
# from `target`; analysing another target only requires changing this one line.
target = 'melanocortin_receptor_4'
# Each file is tab-separated and uses its first column as the index.
data = pd.read_csv('./Dataset/Data/%s.tsv' % target, sep='\t', index_col=0)
df = pd.read_csv('./Dataset/ECFP/%s.tsv' % target, sep='\t', index_col=0)
df_row = pd.read_csv('./Dataset/ECFP/%s_wosubdiff.tsv' % target, sep='\t', index_col=0)

In [3]:
# Take the first entry of the 'sub1' and 'sub2' columns of `data` ...
smi1, smi2 = data['sub1'].iloc[0], data['sub2'].iloc[0]
# ... and convert each one with SmilesToOEGraphMol.
mol1, mol2 = SmilesToOEGraphMol(smi1), SmilesToOEGraphMol(smi2)

In [4]:
# Settings shared by both fingerprint runs; the two runs differ only in `pos`.
_ecfp_opts = dict(diameter=4, nbits='unfold')

# pos=1 run on mol1 -> the four "...f" strings.
anno1f = CalcECFPSparse_tracker(mol1, pos=1, **_ecfp_opts)
str_hashf, str_pidxf, str_patf, str_smif = ConvertAnnotationsIntoStr(anno1f, is_folded=False)

# pos=2 run on the same molecule -> the four "...r" strings.
# NOTE: `anno1f` is rebound here, so after this cell it holds the pos=2 annotations.
anno1f = CalcECFPSparse_tracker(mol1, pos=2, **_ecfp_opts)
str_hashr, str_pidxr, str_patr, str_smir = ConvertAnnotationsIntoStr(anno1f, is_folded=False)

  value = (mult*value) ^ Int(ord(char))
  value = mult*(value ^ myHashLong(item))


In [5]:
# Show the space-separated `str_smif` (from the pos=1 run on mol1) as a list of tokens.
str_smif.split(' ')

['[R1]',
 '[C]',
 '[CH]',
 '[CH2]',
 '[N]',
 '[C]',
 '[O]',
 '[CH3]',
 '[R1][C]',
 '[R1]C(=[C])[CH]',
 '[CH]=C[C]',
 '[CH]=C[CH]',
 '[CH2]C(=[CH])[C]',
 '[CH2]C(=[C])[C]',
 '[C]C[N]',
 '[CH2]N([CH2])[C]',
 '[CH2]C[N]',
 '[CH2]C[C]',
 'CC(=O)[N]',
 '[C]=O',
 'C[C]',
 '[R1]C(=C([CH2])[C])C=[CH]',
 '[R1]C(=[C])C=C[CH]',
 '[C]=CC=C[C]',
 '[CH2]C(=CC=[CH])[C]',
 '[CH2]CC(=C[CH])C(=[C])[CH2]',
 '[R1]C(=C(C[N])C(=[CH])[CH2])[CH]',
 '[CH2]N([C])CC(=[C])[C]',
 'CC(=O)N(C[CH2])C[C]',
 '[CH2]N([C])CC[C]',
 '[CH]=C([C])CC[N]',
 'CC(=O)N([CH2])[CH2]']

In [6]:
# Show the space-separated `str_smir` (from the pos=2 run on mol1) as a list of tokens.
str_smir.split(' ')

['[R1]',
 '[C]',
 '[CH]',
 '[CH2]',
 '[N]',
 '[C]',
 '[O]',
 '[CH3]',
 '[R1][C]',
 '[R1]C(=[C])[CH]',
 '[CH]=C[C]',
 '[CH]=C[CH]',
 '[CH2]C(=[CH])[C]',
 '[CH2]C(=[C])[C]',
 '[C]C[N]',
 '[CH2]N([CH2])[C]',
 '[CH2]C[N]',
 '[CH2]C[C]',
 'CC(=O)[N]',
 '[C]=O',
 'C[C]',
 '[R1]C(=C([CH2])[C])C=[CH]',
 '[R1]C(=[C])C=C[CH]',
 '[C]=CC=C[C]',
 '[CH2]C(=CC=[CH])[C]',
 '[CH2]CC(=C[CH])C(=[C])[CH2]',
 '[R1]C(=C(C[N])C(=[CH])[CH2])[CH]',
 '[CH2]N([C])CC(=[C])[C]',
 'CC(=O)N(C[CH2])C[C]',
 '[CH2]N([C])CC[C]',
 '[CH]=C([C])CC[N]',
 'CC(=O)N([CH2])[CH2]']

In [7]:
# Check whether the third string returned by ConvertAnnotationsIntoStr is the same
# for the pos=1 and pos=2 runs on mol1.
str_patf==str_patr

False

In [8]:
# Distinct outcomes of the row-wise equality test between the two SMILES columns;
# a single `True` means every row matches.
np.unique(df_row['sub2_smi_forward'].eq(df_row['sub2_smi_reverse']))

array([ True])

In [9]:
# Display the first 'sub1' entry of `data` (the SMILES that mol1 was built from).
smi1

'[R1]c1cccc2c1CN(CC2)C(=O)C'

In [10]:
# Number of distinct space-separated tokens over all rows of 'sub2_forward'.
# The tokens are flattened in one linear pass; summing per-row lists with
# Series.sum() re-copies the growing list for every row and yields 0 for an empty column.
np.unique([token for entry in df_row['sub2_forward'] for token in entry.split(' ')]).shape

(1418,)

In [11]:
# Number of distinct space-separated tokens over all rows of 'sub2_reverse'.
# The tokens are flattened in one linear pass; summing per-row lists with
# Series.sum() re-copies the growing list for every row and yields 0 for an empty column.
np.unique([token for entry in df_row['sub2_reverse'] for token in entry.split(' ')]).shape

(1419,)

In [12]:
# Number of distinct space-separated tokens over all rows of 'sub2_smi_forward'.
# The tokens are flattened in one linear pass; summing per-row lists with
# Series.sum() re-copies the growing list for every row and yields 0 for an empty column.
np.unique([token for entry in df_row['sub2_smi_forward'] for token in entry.split(' ')]).shape

(1266,)

In [13]:
# Number of distinct space-separated tokens over all rows of 'sub2_smi_reverse'.
# The tokens are flattened in one linear pass; summing per-row lists with
# Series.sum() re-copies the growing list for every row and yields 0 for an empty column.
np.unique([token for entry in df_row['sub2_smi_reverse'] for token in entry.split(' ')]).shape

(1266,)

In [14]:
def _flatten_tokens(column):
    """Split every space-separated entry of ``column`` and return all tokens as one flat list.

    Runs in a single pass and returns ``[]`` for an empty column, whereas
    ``column.apply(str.split).sum()`` re-copies the accumulated list for every
    row and returns the integer 0 when the column is empty.
    """
    return [token for entry in column for token in entry.split(' ')]


# Flat token lists for the forward / reverse columns of `df_row`; row order and
# within-row token order are preserved, so the lists can be zipped pairwise later.
bitf = _flatten_tokens(df_row['sub2_forward'])
bitr = _flatten_tokens(df_row['sub2_reverse'])

smif = _flatten_tokens(df_row['sub2_smi_forward'])
smir = _flatten_tokens(df_row['sub2_smi_reverse'])

In [15]:
# For each direction (1: bitf/smif, 2: bitr/smir) collect, per SMILES token,
# the list of bit tokens that were paired with it.
d = {1: defaultdict(list), 2: defaultdict(list)}

for direction, bits, smiles in ((1, bitf, smif), (2, bitr, smir)):
    for bit, smi in zip(bits, smiles):
        d[direction][smi].append(bit)

# Rows are SMILES tokens, columns 1 and 2 are the two directions.
count = pd.DataFrame.from_dict(d)

In [16]:
# Replace every cell of `count` by the length of the list it holds.
df_count = count.applymap(len)

In [17]:
# True where both directions have the same number of entries for that row.
df_count['bool'] = df_count[1].sub(df_count[2]).eq(0)

In [18]:
def _unique_int_bits(tokens):
    """Return the distinct values of ``tokens`` converted to int, as a sorted list."""
    return list(np.unique([int(token) for token in tokens]))


# De-duplicate the bit tokens stored in every cell of `count`.
count = count.applymap(_unique_int_bits)

In [19]:
# Number of entries held in columns 1 and 2 for each row.
count['#1'] = count[1].apply(len)
count['#2'] = count[2].apply(len)
# Despite its name, 'bool' stores the signed difference '#1' - '#2' (0 means equal counts).
count['bool'] = count['#1'].sub(count['#2'])

In [20]:
# Write the current `count` table to ./count.csv.
# NOTE: the last cell of this notebook writes to the same path and replaces this file.
count.to_csv('./count.csv')

In [21]:
# Rows whose two directions hold a different number of entries ('bool' is the difference).
count.loc[count['bool'].ne(0)]

Unnamed: 0,1,2,#1,#2,bool


In [22]:
# Total of the per-row entry counts stored in column '#2'.
count['#2'].sum()

1499

In [23]:
# Display the table: columns 1/2 hold the de-duplicated int lists, '#1'/'#2' their
# lengths, and 'bool' the difference '#1' - '#2'.
count

Unnamed: 0,1,2,#1,#2,bool
[R1],[-1704794629],[-1704794632],1,1,0
[C],"[965199744, 966199747, 1827616958, 1828616945]","[965199747, 966199744, 1827616957, 1828616946]",4,4,0
[CH],"[1942913796, 1943913799, 2113710432, 2114710435]","[1942913799, 1943913796, 2113710435, 2114710432]",4,4,0
[CH2],"[1457709166, 1987549200, 1988549203]","[1457709165, 1987549203, 1988549200]",3,3,0
[N],"[-80336274, -79336287, 2112315142, 2113315257]","[-80336275, -79336286, 2112315141, 2113315258]",4,4,0
...,...,...,...,...,...
[CH2]C=[N],[-566514575],[-482680872],1,1,0
[R1]CC=[N],[-525561444],[669029778],1,1,0
[R1]CC=N[NH],[-900547723],[-1619324806],1,1,0
[CH2]C=NN[C],[723861848],[145422003],1,1,0


In [24]:
# Total number of entries across all lists in column 1. Adding up the per-row
# lengths avoids concatenating every list via Series.sum() (quadratic copying),
# and it yields 0 for an empty table instead of failing on len(0).
sum(len(bits) for bits in count[1])

1499

In [26]:
# Number of distinct values across all lists in column 1. The lists are
# flattened in one linear pass rather than concatenated with Series.sum(),
# which re-copies the growing list per row and yields 0 for an empty table.
np.unique([bit for bits in count[1] for bit in bits]).shape

(1418,)

In [27]:
# Inverse of the earlier mapping: for each direction (1: bitf/smif, 2: bitr/smir)
# collect, per integer bit value, the SMILES tokens that were paired with it.
d = {1: defaultdict(list), 2: defaultdict(list)}

for direction, bits, smiles in ((1, bitf, smif), (2, bitr, smir)):
    for bit, smi in zip(bits, smiles):
        d[direction][int(bit)].append(smi)

# Rows are integer bit values, columns 1 and 2 are the two directions.
# NOTE: this rebinds `d` and `count`, replacing the tables built earlier.
count = pd.DataFrame.from_dict(d)

In [31]:
# Replace every cell of `count` by the array of its distinct values.
count = count.applymap(np.unique)

In [35]:
# Size of the unique-value array in columns 1 and 2. A missing cell shows up as
# a one-element [nan] array in the displayed table, so it contributes a size of 1.
count[3] = count[1].apply(len)
count[4] = count[2].apply(len)
# Flag rows whose bit value is associated with more than one distinct entry.
count[5] = count[3].gt(1)
count[6] = count[4].gt(1)
# Logical OR of the two flags: multiple entries in at least one direction.
count[7] = count[5] | count[6]

In [36]:
# Display the per-bit table: columns 1/2 hold the unique-value arrays, 3/4 their
# sizes, 5/6 the "size > 1" flags, and 7 the combination of both flags.
count

Unnamed: 0,1,2,3,4,5,6,7
-1704794629,[[R1]],[nan],1,1,False,False,False
966199747,[[C]],[nan],1,1,False,False,False
1942913796,[[CH]],[nan],1,1,False,False,False
1988549203,[[CH2]],[nan],1,1,False,False,False
-79336287,[[N]],[nan],1,1,False,False,False
...,...,...,...,...,...,...,...
-482680872,[nan],[[CH2]C=[N]],1,1,False,False,False
669029778,[nan],[[R1]CC=[N]],1,1,False,False,False
-1619324806,[nan],[[R1]CC=N[NH]],1,1,False,False,False
145422003,[nan],[[CH2]C=NN[C]],1,1,False,False,False


In [39]:
# Keep only the rows flagged in column 7 and write them to ./count.csv
# (the same path an earlier cell wrote to, so that file is replaced).
count.loc[count[7]].to_csv('./count.csv')