In [22]:
#Mapping the RNA and small molecule features to create the final dataset for model training

import sys
import csv
import pandas as pd
import random

def derangement(lst):
    shuffled = lst[:]
    while any(a == b for a, b in zip(lst, shuffled)):
        random.shuffle(shuffled)
    return dict(zip(lst, shuffled))

rna_feat = "./Riboswitch_output/RNA_features_v1.csv"
mol_feat = "./Riboswitch_output/Mol_features_Riboswitch.csv"
dataset_raw = "./data/Riboswitch_dataset_v1.csv"
outfile = "./Riboswitch_output/Final_dataset_Riboswitch_v4.csv"

# Ajout de la gestion de l'argument swapping
swapping = True
if len(sys.argv) > 5:
    swapping = sys.argv[5].lower() == "true"

rna_df = pd.read_csv(rna_feat, sep="\t", header=0)
rna_df.dropna(axis=1, how='any', subset=None, inplace=True)

mol_df = pd.read_csv(mol_feat, sep=",", header=0)
mol_df.dropna(axis=1, how='any', subset=None, inplace=True)

data_df = pd.read_csv(dataset_raw, sep="\t", header=0)

entries = list(data_df['Entry_ID'])

# Swapping conditionnel
if swapping:
    unique_rnas = list(data_df['Target_RNA_ID'].unique())
    rna_swap_dict = derangement(unique_rnas)
    rna_ids = list(data_df['Target_RNA_ID'].map(rna_swap_dict))
else:
    rna_ids = list(data_df['Target_RNA_ID'])

mol_ids = list(data_df['Molecule_ID'])

colnames = []
colnames.extend(list(rna_df.columns))
colnames.extend(list(mol_df.columns))
colnames.extend(["pKd"])
colnames.remove("name")
colnames.insert(2, "name")
final_df = pd.DataFrame(columns = colnames)
print(len(rna_df.index), len(mol_df.index), len(data_df.index))

for entry, rid, mid in zip(entries, rna_ids, mol_ids):
    try:
        data_point = pd.DataFrame(columns = colnames)
        rna_feat = rna_df.loc[rna_df['Target_RNA_ID'] == rid].to_dict('list')
        mol_feat = mol_df.loc[mol_df['name'] == mid].to_dict('list')
        # kd_val = {'pKd': [float(data_df.loc[data_df['Entry_ID'] == entry]['pKd'])]}
        kd_val = {'pKd': [float(data_df.loc[data_df['Entry_ID'] == entry]['pKd'].iloc[0])]}


        data_point = rna_feat
        data_point.update(mol_feat)
        data_point.update(kd_val)
        row = pd.DataFrame.from_dict(data_point)
        #print(data_point)
        final_df = pd.concat([final_df, row], ignore_index=True)  #Append the row to the final dataframe to create the dataset
        #print(final_df)
    except:
        print("Failed: ", entry, rid, mid)
        continue


34 64 100


  final_df = pd.concat([final_df, row], ignore_index=True)  #Append the row to the final dataframe to create the dataset


Failed:  742 Target_191 Target_lig_504


In [24]:
final_df

Unnamed: 0,Target_RNA_ID,Target_RNA_name,name,A,G,C,U,AA_x,AG,AC,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2,pKd
0,Target_416,Tetracycline riboswitch G51U mutant,Target_lig_165,0.29577,0.22535,0.29577,0.18310,0.07042,0.05634,0.08451,...,9.161465,53.745115,135.054495,9.003633,105,12,52.0,61.0,2.222222,6.397940
1,Target_416,Tetracycline riboswitch G51U mutant,Target_lig_166,0.29577,0.22535,0.29577,0.18310,0.07042,0.05634,0.08451,...,9.097508,53.600166,135.054495,9.003633,108,11,52.0,60.0,2.194444,5.789147
2,Target_416,Tetracycline riboswitch G51U mutant,Target_lig_167,0.29577,0.22535,0.29577,0.18310,0.07042,0.05634,0.08451,...,8.760767,37.736937,125.070145,7.816884,84,9,42.0,45.0,2.000000,4.698970
3,Target_416,Tetracycline riboswitch G51U mutant,Target_lig_168,0.29577,0.22535,0.29577,0.18310,0.07042,0.05634,0.08451,...,9.317849,55.364239,150.065394,8.827376,138,14,58.0,68.0,2.388889,7.698970
4,Target_416,Tetracycline riboswitch G51U mutant,Target_lig_169,0.29577,0.22535,0.29577,0.18310,0.07042,0.05634,0.08451,...,9.511555,56.191602,150.065394,8.827376,135,15,60.0,71.0,2.333333,8.397940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Target_352,Fusibacterium ulcerans ZTP riboswitch,Target_lig_496,0.32000,0.25333,0.20000,0.22667,0.08000,0.02667,0.10667,...,10.991157,77.031951,543.174061,7.987854,4392,83,220.0,275.0,8.416667,4.978811
95,Target_189,"165 ribD FMN Riboswitch Aptamer, B. subtilis",Target_lig_663,0.28485,0.32727,0.18182,0.20606,0.06667,0.09091,0.03636,...,10.113384,75.174897,382.157580,7.799134,2021,39,142.0,165.0,5.972222,7.795880
96,Target_189,"165 ribD FMN Riboswitch Aptamer, B. subtilis",Target_lig_1187,0.28485,0.32727,0.18182,0.20606,0.06667,0.09091,0.03636,...,10.113384,75.174897,382.157580,7.799134,2021,39,142.0,165.0,5.972222,5.000000
97,Target_189,"165 ribD FMN Riboswitch Aptamer, B. subtilis",Target_lig_663,0.28485,0.32727,0.18182,0.20606,0.06667,0.09091,0.03636,...,10.113384,75.174897,382.157580,7.799134,2021,39,142.0,165.0,5.972222,8.180456


In [23]:
mol_feat

{'name': ['Target_lig_664'],
 'nAcid': [0],
 'nBase': [1],
 'nAromAtom': [22],
 'nAromBond': [22],
 'nAtom': [51],
 'nHeavyAtom': [30],
 'nSpiro': [0],
 'nBridgehead': [0],
 'nHetero': [9],
 'nH': [21],
 'nB': [0],
 'nC': [21],
 'nN': [7],
 'nO': [1],
 'nS': [1],
 'nP': [0],
 'nF': [0],
 'nCl': [0],
 'nBr': [0],
 'nI': [0],
 'nX': [0],
 'ATS0dv': [408.44444444444446],
 'ATS1dv': [425.6666666666667],
 'ATS2dv': [593.6666666666667],
 'ATS3dv': [539.3333333333333],
 'ATS4dv': [542.3333333333334],
 'ATS5dv': [447.6666666666667],
 'ATS6dv': [444.6666666666667],
 'ATS7dv': [349.66666666666663],
 'ATS8dv': [357.6666666666667],
 'ATS0d': [183.0],
 'ATS1d': [233.0],
 'ATS2d': [366.0],
 'ATS3d': [397.0],
 'ATS4d': [390.0],
 'ATS5d': [381.0],
 'ATS6d': [369.0],
 'ATS7d': [331.0],
 'ATS8d': [299.0],
 'ATS0Z': [1440.0],
 'ATS1Z': [1579.0],
 'ATS2Z': [2331.0],
 'ATS3Z': [2278.0],
 'ATS4Z': [2246.0],
 'ATS5Z': [2302.0],
 'ATS6Z': [2131.0],
 'ATS7Z': [1894.0],
 'ATS8Z': [1715.0],
 'ATS0m': [5708.06782

In [25]:
data_df[]

Unnamed: 0,Entry_ID,SMILES,Target_RNA_sequence,Molecule_name,Molecule_ID,Target_RNA_name,Target_RNA_ID,pKd
0,221,C1=NC2=NC=NC(=C2N1)N,GGACAUAUAAUCGCGUGGAUAUGGCACGCAAGUUUCUACCGGGCAC...,Adenine,Target_lig_165,ADENINE RIBOSWITCH,Target_69,6.397940
1,222,C1=C2C(=NC(=N1)N)N=CN2,GGACAUAUAAUCGCGUGGAUAUGGCACGCAAGUUUCUACCGGGCAC...,2-aminopurine,Target_lig_166,ADENINE RIBOSWITCH,Target_69,5.789147
2,223,C1=C(N=C(N=C1N)N)N,GGACAUAUAAUCGCGUGGAUAUGGCACGCAAGUUUCUACCGGGCAC...,"2,4,6-triaminopyrimidine",Target_lig_167,ADENINE RIBOSWITCH,Target_69,4.698970
3,224,C1=NC2=NC(=NC(=C2N1)N)N,GGACAUAUAAUCGCGUGGAUAUGGCACGCAAGUUUCUACCGGGCAC...,"2,6- diaminopurine",Target_lig_168,ADENINE RIBOSWITCH,Target_69,7.698970
4,225,C1=NC(=NC2(C1=NC=N2)N)N,GGACAUAUAAUCGCGUGGAUAUGGCACGCAAGUUUCUACCGGGCAC...,"2,4-diaminopurine",Target_lig_169,ADENINE RIBOSWITCH,Target_69,8.397940
...,...,...,...,...,...,...,...,...
95,2284,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...,GGGCCUAAAACAUACCAGAUCGCCACCCGCGCUUUAAUCUGGAGAG...,dox,Target_lig_496,Tetracycline riboswitch A50U mutant,Target_415,4.978811
96,2320,CNC1=NC=C(C=N1)CN2CCC[C@@H](C2)C3=NC(=CC(=O)N3...,GGCGUGUAGGAUAUGCUUCGGCAGAAGGACACGCC,Ribocil,Target_lig_663,FMN RIBOSWITCH APTAMER,Target_36,7.795880
97,2321,CNc1ncc(cn1)CN1CCC[C@H](C1)c1nc(O)cc(n1)c1cccs1,GGCGUGUAGGAUAUGCUUCGGCAGAAGGACACGCC,Ribocil-A,Target_lig_1187,FMN RIBOSWITCH APTAMER,Target_36,5.000000
98,2322,CNC1=NC=C(C=N1)CN2CCC[C@@H](C2)C3=NC(=CC(=O)N3...,GGCGUGUAGGAUAUGCUUCGGCAGAAGGACACGCC,Ribocil-B,Target_lig_663,FMN RIBOSWITCH APTAMER,Target_36,8.180456


In [18]:
mid

'Target_lig_504'

In [19]:
data_df['Molecule_ID'].unique()

array(['Target_lig_165', 'Target_lig_166', 'Target_lig_167',
       'Target_lig_168', 'Target_lig_169', 'Target_lig_173',
       'Target_lig_162', 'Target_lig_248', 'Target_lig_249',
       'Target_lig_250', 'Target_lig_314', 'Target_lig_318',
       'Target_lig_68', 'Target_lig_331', 'Target_lig_1248',
       'Target_lig_31', 'Target_lig_502', 'Target_lig_503',
       'Target_lig_504', 'Target_lig_70', 'Target_lig_505',
       'Target_lig_57', 'Target_lig_506', 'Target_lig_507',
       'Target_lig_508', 'Target_lig_509', 'Target_lig_662',
       'Target_lig_663', 'Target_lig_664', 'Target_lig_1254',
       'Target_lig_665', 'Target_lig_1255', 'Target_lig_666',
       'Target_lig_667', 'Target_lig_668', 'Target_lig_336',
       'Target_lig_669', 'Target_lig_714', 'Target_lig_1260',
       'Target_lig_1261', 'Target_lig_1044', 'Target_lig_1045',
       'Target_lig_1046', 'Target_lig_1047', 'Target_lig_1048',
       'Target_lig_1049', 'Target_lig_1050', 'Target_lig_1051',
       'Target_

In [20]:
mol_df['name'].unique()

array(['Target_lig_165', 'Target_lig_166', 'Target_lig_167',
       'Target_lig_168', 'Target_lig_169', 'Target_lig_173',
       'Target_lig_162', 'Target_lig_248', 'Target_lig_249',
       'Target_lig_250', 'Target_lig_314', 'Target_lig_318',
       'Target_lig_68', 'Target_lig_331', 'Target_lig_1248',
       'Target_lig_31', 'Target_lig_502', 'Target_lig_503',
       'Target_lig_70', 'Target_lig_505', 'Target_lig_57',
       'Target_lig_506', 'Target_lig_507', 'Target_lig_508',
       'Target_lig_509', 'Target_lig_662', 'Target_lig_663',
       'Target_lig_664', 'Target_lig_1254', 'Target_lig_665',
       'Target_lig_1255', 'Target_lig_666', 'Target_lig_667',
       'Target_lig_668', 'Target_lig_336', 'Target_lig_669',
       'Target_lig_714', 'Target_lig_1260', 'Target_lig_1261',
       'Target_lig_1044', 'Target_lig_1045', 'Target_lig_1046',
       'Target_lig_1047', 'Target_lig_1048', 'Target_lig_1049',
       'Target_lig_1050', 'Target_lig_1051', 'Target_lig_1121',
       'Target