# Creation of csv databases for complexes and ligands

In [None]:
import json

import numpy as np
import pandas as pd

First, let us regroup all the raw data we want for our project. For the ligands, this corresponds to the ID, SMILES, stoichiometry, charge and number of atoms.

In [None]:
# load the two raw ligand tables (both are ';'-separated)
df_ligands1 = pd.read_csv('../Raw_Data/ligands_fingerprints.csv', sep=';')
df_ligands2 = pd.read_csv('../Raw_Data/ligands_misc_info.csv', sep=';')

# keep only the desired columns: identifiers come from the misc-info table,
# charge and atom count from the fingerprint table (aligned on the row index)
df_ligands = (
    df_ligands2[['name', 'smiles', 'stoichiometry']]
    .rename(columns={'name': 'ID', 'smiles': 'Smiles', 'stoichiometry': 'Stoichiometry'})
    .assign(**{'Charge': df_ligands1['charge'], 'Nb atoms': df_ligands1['n_atoms']})
)

Now we move on to the complexes. The information available for them is the ID, charge, molecular mass, number of atoms and number of electrons.

In [None]:
df_complex1 = pd.read_csv('../Raw_Data/tmQMg_properties_and_targets.csv', sep=',')
# keep the first five columns; .copy() makes df_complex an independent frame,
# so the columns added to it later are not written to a slice of df_complex1
# (which would trigger SettingWithCopyWarning)
df_complex = df_complex1.iloc[:, 0:5].copy()

We notice that the stoichiometry of the complexes is available in another file that has to be parsed differently due to the fact that it is originally in xyz form. It is first converted into csv, then operated upon.

In [None]:
df_complexname = pd.read_csv('../Raw_Data/tmQM_X1.csv', sep='|')

# keep the text dump of every row that mentions a CSD code; the extraction
# function defined below reads the ID and stoichiometry out of these dumps
row_dumps = (str(df_complexname.iloc[row]) for row in range(len(df_complexname)))
l_complexname1 = [dump for dump in row_dumps if 'CSD_code;=' in dump]

def idtostoi(list1, id_offset=37, stoich_offset=82):
    '''
    function that extracts the ID and corresponding stoichiometry of complex
    Every entry is converted with str() and read at fixed character positions,
    so the offsets have to match the layout of the text dumps.
    input : list1 = list containing misc info of complex (one entry per complex)
            id_offset = position where the 6-character ID starts; when that
                        character is a space the ID is read one position later
            stoich_offset = position from which the first 'C' is searched; the
                            stoichiometry is the text from that 'C' up to the
                            next space (or the end of the entry)
    output : dict with key = complex id and object = complex stoichiometry
    raises : ValueError if an entry holds no 'C' at or after stoich_offset
    '''
    id_to_stoichiometry = {}
    for entry in list1:
        text = str(entry)

        # the ID field is shifted by one character when it starts with a space
        id_start = id_offset + 1 if text[id_offset] == ' ' else id_offset
        complex_id = text[id_start:id_start + 6]

        formula_start = text.find('C', stoich_offset)
        if formula_start == -1:
            raise ValueError(f"no stoichiometry found for complex '{complex_id}'")

        # the formula ends at the next space; if it is the last token of the
        # entry there is no such space and the formula runs to the end
        formula_end = text.find(' ', formula_start)
        if formula_end == -1:
            formula_end = len(text)

        id_to_stoichiometry[complex_id] = text[formula_start:formula_end]
    return id_to_stoichiometry

# map every complex ID found in the tmQM_X1 dumps to its stoichiometry
# (despite its name, l_complexname2 is the dict returned by idtostoi)
l_complexname2 = idtostoi(l_complexname1)

In [None]:
# Create the stoichiometry column in one assignment: a complex whose ID is a
# key of l_complexname2 gets its stoichiometry, every other row keeps its row
# number as an integer placeholder (a later cell turns these into 'N/A').
# The column is assigned as a whole instead of element by element through
# df_complex['Stoichiometry'][j] = ..., because such chained assignment is not
# guaranteed to write back to the dataframe.
complex_ids = [str(value) for value in df_complex.iloc[:, 0]]
df_complex['Stoichiometry'] = pd.Series(
    [l_complexname2.get(complex_id, row) for row, complex_id in enumerate(complex_ids)],
    index=df_complex.index,
    dtype=object,  # keeps the placeholders as plain Python ints next to the strings
)

This process is repeated a second time as the amount of complexes required the list to be separated into two files.

In [None]:
df_complexname1 = pd.read_csv('../Raw_Data/tmQM_X2.csv', sep='|')

# text dump of every row of the second file that mentions a CSD code
l_complexname3 = [
    dump
    for dump in map(str, (df_complexname1.iloc[row] for row in range(len(df_complexname1))))
    if 'CSD_code;=' in dump
]

def idtostoi2(list1, id_offset=36, stoich_offset=81):
    '''
    function that extracts the ID and corresponding stoichiometry of complex
    (modified to fit new database: the default offsets are one character
    smaller than in idtostoi)
    Every entry is converted with str() and read at fixed character positions,
    so the offsets have to match the layout of the text dumps.
    input : list1 = list containing misc info of complex (one entry per complex)
            id_offset = position where the 6-character ID starts; when that
                        character is a space the ID is read one position later
            stoich_offset = position from which the first 'C' is searched; the
                            stoichiometry is the text from that 'C' up to the
                            next space (or the end of the entry)
    output : dict with key = complex id and object = complex stoichiometry
    raises : ValueError if an entry holds no 'C' at or after stoich_offset
    '''
    id_to_stoichiometry = {}
    for entry in list1:
        text = str(entry)

        # the ID field is shifted by one character when it starts with a space
        id_start = id_offset + 1 if text[id_offset] == ' ' else id_offset
        complex_id = text[id_start:id_start + 6]

        formula_start = text.find('C', stoich_offset)
        if formula_start == -1:
            raise ValueError(f"no stoichiometry found for complex '{complex_id}'")

        # the formula ends at the next space; if it is the last token of the
        # entry there is no such space and the formula runs to the end
        formula_end = text.find(' ', formula_start)
        if formula_end == -1:
            formula_end = len(text)

        id_to_stoichiometry[complex_id] = text[formula_start:formula_end]
    return id_to_stoichiometry

# map every complex ID found in the tmQM_X2 dumps to its stoichiometry
l_complexname4 = idtostoi2(l_complexname3)

# Complexes listed in the second file get their stoichiometry; every other row
# keeps whatever the column already holds. The column is rebuilt and assigned
# as a whole instead of using df_complex['Stoichiometry'][j] = ..., because
# such chained assignment is not guaranteed to write back to the dataframe.
complex_ids = [str(value) for value in df_complex.iloc[:, 0]]
df_complex['Stoichiometry'] = pd.Series(
    [
        l_complexname4.get(complex_id, current)
        for complex_id, current in zip(complex_ids, df_complex['Stoichiometry'])
    ],
    index=df_complex.index,
    dtype=object,  # the column mixes strings and integer placeholders
)

Some values are missing; we replace them with N/A. Not all the complexes are listed in the tmQM_X1.csv and tmQM_X2.csv files, so it is normal for some stoichiometries to be missing from the final dataframe.

In [None]:
# Rows that still hold an integer were never matched to a stoichiometry:
# mark them as 'N/A'. The column is rebuilt and assigned as a whole instead of
# using df_complex['Stoichiometry'][i] = 'N/A', because such chained
# assignment is not guaranteed to write back to the dataframe.
df_complex['Stoichiometry'] = pd.Series(
    ['N/A' if type(value) is int else value for value in df_complex['Stoichiometry']],
    index=df_complex.index,
    dtype=object,
)

In [None]:
# number of complexes for which no stoichiometry was found
missing_stoichiometry = sum(1 for value in df_complex["Stoichiometry"] if value == "N/A")
print(missing_stoichiometry)

We note this number so that we can later verify that the same number of complexes is missing the other pieces of information.

Now we want to show which metal appears in which complex.

In [None]:
# from here on df_ligands1 holds the ligand name and the raw
# 'parent_metal_occurrences' text of every ligand
df_ligands1 = pd.read_csv('../Raw_Data/ligands_misc_info.csv', sep=';')
df_ligands1 = df_ligands1[['name', "parent_metal_occurrences"]]

# We want to make a dictionary where the keys are the metals and their
# respective values a list containing all complexes they appear in.
metal_complex = {}
for occurrences in df_ligands1['parent_metal_occurrences']:
    # the text uses single quotes, JSON requires double quotes
    # (requires `import json` in the import cell)
    metal_to_entries = json.loads(occurrences.replace("'", "\""))
    for metal, entries in metal_to_entries.items():
        # only the first six characters of an entry are kept as the complex ID
        metal_complex.setdefault(metal, []).extend(entry[:6] for entry in entries)


In [None]:
# We make a new column in the dataframe to include the metals: every complex
# gets the list of metals (in the key order of metal_complex) whose complex
# list contains its id; the list is empty when the id is found nowhere.
# Sets make the membership test cheap, and the column is assigned as a whole
# instead of being filled through df_complex["Metals"][j] = ..., because such
# chained assignment is not guaranteed to write back to the dataframe.
metal_members = {metal: set(complex_ids) for metal, complex_ids in metal_complex.items()}
df_complex["Metals"] = pd.Series(
    [
        [metal for metal, members in metal_members.items() if complex_id in members]
        for complex_id in df_complex['id']
    ],
    index=df_complex.index,
    dtype=object,
)


In [None]:
# number of complexes for which no metal was found
complexes_without_metal = sum(1 for metals in df_complex["Metals"] if metals == [])
print(complexes_without_metal)

This gives the same number as the complexes without stoichiometry: we verified they are the same.

Now we are going to link the ligands to the complexes they appear in.

In [None]:
# plain list of all complex names: iterating it is faster than iterating the dataframe
tot_complex = list(df_complex['id'])

In [None]:
ligand_column = "Complex in which they appear and number of appearances"
complex_column = "Ligands and number of occurance"

# first completing the ligand dataframe
# one dictionary per ligand: key = complex id, value = number of times this id
# occurs in the ligand's 'parent_metal_occurrences' text
# NOTE(review): assumes df_ligands and df_ligands1 list the same ligands in the
# same row order (both are built from ligands_misc_info.csv) - confirm
ligand_dicts = [{} for _ in range(len(df_ligands))]
occurrence_texts = df_ligands1['parent_metal_occurrences']
for complex_id in tot_complex:
    # vectorized count over all ligands at once (pandas reads complex_id as a
    # regular expression)
    counts = occurrence_texts.str.count(complex_id).to_numpy()
    for ligand_pos in np.flatnonzero(counts > 0):
        # every ligand gets its OWN count (not the count of the first matching
        # ligand), stored as a plain int so that it is written to the csv as a
        # bare number
        ligand_dicts[ligand_pos][complex_id] = int(counts[ligand_pos])
df_ligands[ligand_column] = ligand_dicts

# then completing the complex dataframe
# one dictionary per complex: key = text of column 1 of df_ligands for the
# ligand, value = number of occurrences recorded above
rows_by_complex = {}
for row, complex_id in enumerate(df_complex['id']):
    rows_by_complex.setdefault(complex_id, []).append(row)

complex_dicts = [{} for _ in range(len(df_complex))]
# walking through the ligands once (instead of once per complex) fills the
# dictionaries in the same ligand order
for ligand_key, ligand_dict in zip(df_ligands.iloc[:, 1], ligand_dicts):
    for complex_id, occurrences in ligand_dict.items():
        for row in rows_by_complex.get(complex_id, ()):
            complex_dicts[row][f'{ligand_key}'] = occurrences
df_complex[complex_column] = complex_dicts

In [None]:
# number of complexes to which no ligand could be linked
complexes_without_ligand = sum(
    1 for ligands in df_complex["Ligands and number of occurance"] if ligands == {}
)
print(complexes_without_ligand)

We verify that the same number of complexes is missing this information, which is still expected.

In [None]:
# write both tables to disk as csv files
df_ligands.to_csv('../Created_Data/Ligands_info.csv')
df_complex.to_csv('../Created_Data/Complex_info.csv')