# Designing metal organic frameworks for carbon dioxide capture
## Data wrangling
## by Zachary Brown

Let's start off by installing and importing the libraries we'll need for this notebook.

In [3]:
!pip install numpy==1.24.2
!pip install pandas==1.5.3
!pip install requests==2.28.2

Collecting requests==2.28.2
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
     ---------------------------------------- 62.8/62.8 kB 3.3 MB/s eta 0:00:00
Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.0.1-cp310-cp310-win_amd64.whl (96 kB)
     ---------------------------------------- 96.5/96.5 kB ? eta 0:00:00
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
     -------------------------------------- 140.6/140.6 kB 8.2 MB/s eta 0:00:00
Installing collected packages: charset-normalizer, urllib3, requests
Successfully installed charset-normalizer-3.0.1 requests-2.28.2 urllib3-1.26.14


In [1]:
import numpy as np
import pandas as pd
import requests

Now we'll download the qmof dataset and then unzip it into the raw data folder.

In [2]:
url = 'https://figshare.com/ndownloader/articles/13147324/versions/14'

# Stream the archive to disk in 1 MiB chunks rather than holding the whole
# download in memory, and raise on an HTTP error instead of saving an error
# page as qmof.zip. The context managers release the connection and the file
# handle even if the transfer is interrupted.
with requests.get(url, allow_redirects=True, stream=True) as r:
    r.raise_for_status()
    with open('../data/raw/qmof.zip', 'wb') as f:
        bytes_written = sum(f.write(chunk) for chunk in r.iter_content(chunk_size=1024 * 1024))

# Show how many bytes were saved
bytes_written

407219972

In [3]:
from zipfile import ZipFile
  
# Unpack the downloaded figshare bundle next to the archive itself.
with ZipFile("../data/raw/qmof.zip", mode='r') as archive:
    archive.extractall(path="../data/raw/")

In [4]:
# The bundle contains a nested archive with the actual database; unpack that too.
with ZipFile("../data/raw/qmof_database.zip", mode='r') as archive:
    archive.extractall(path="../data/raw/")

Next we'll download the hmof dataset (which is the subset of the MOFXDB with carbon dioxide isotherms) and unzip that in the raw data folder.

In [5]:
url = 'https://mof.tech.northwestern.edu/Datasets/hMOF-10%201039%20C2EE23201D-CarbonDioxide-mofdb-version:dc8a0295db.zip'

# Stream the archive to disk in 1 MiB chunks rather than holding the whole
# download in memory, and raise on an HTTP error instead of saving an error
# page as hmof.zip. The context managers release the connection and the file
# handle even if the transfer is interrupted.
with requests.get(url, allow_redirects=True, stream=True) as r:
    r.raise_for_status()
    with open('../data/raw/hmof.zip', 'wb') as f:
        bytes_written = sum(f.write(chunk) for chunk in r.iter_content(chunk_size=1024 * 1024))

# Show how many bytes were saved
bytes_written

982292348

In [6]:
# Unpack the hMOF JSON files into their own sub-folder of the raw data directory.
with ZipFile("../data/raw/hmof.zip", mode='r') as archive:
    archive.extractall(path="../data/raw/hmof")

Now I'll load the qmof dataset into a dataframe.

In [7]:
# Read the extracted QMOF property table into a dataframe.
qmof = pd.read_csv(
    '../data/raw/qmof_database/qmof.csv',
    low_memory=False,
)

In [8]:
qmof.shape

(20375, 94)

In [9]:
qmof.head()

Unnamed: 0,qmof_id,name,info.formula,info.formula_reduced,info.mofid.mofid,info.mofid.mofkey,info.mofid.smiles_nodes,info.mofid.smiles_linkers,info.mofid.smiles,info.mofid.topology,...,outputs.hse06.energy_elec,outputs.hse06.net_magmom,outputs.hse06.bandgap,outputs.hse06.cbm,outputs.hse06.vbm,outputs.hse06.directgap,outputs.hse06.bandgap_spins,outputs.hse06.cbm_spins,outputs.hse06.vbm_spins,outputs.hse06.directgap_spins
0,qmof-8a95c27,ABACUF01_FSR,Ba2CuC6H14O16,Ba2CuC6H14O16,,,"['O', '[Ba]', '[Cu]']",['[O-]C=O'],O.[Ba].[Cu].[O-]C=O,,...,,,,,,,,,,
1,qmof-019ba28,ABALOF_FSR,Cu12C36H56I16N4S4,Cu3C9H14I4NS,,,,,,,...,,,,,,,,,,
2,qmof-830ed1c,ABAVIJ_FSR,Co4C48H32N8O16,CoC12H8N2O4,[Co].[O-]C(=O)c1ccncc1 MOFid-v1.rtl.cat0,Co.TWBYWOBDOCUKOW.MOFkey-v1.rtl,['[Co]'],['[O-]C(=O)c1ccncc1'],[Co].[O-]C(=O)c1ccncc1,rtl,...,,,,,,,,,,
3,qmof-5bd4a24,ABAVOP_FSR,Co4C48H32N8O16,CoC12H8N2O4,[Co].[O-]C(=O)c1ccncc1 MOFid-v1.rtl.cat0,Co.TWBYWOBDOCUKOW.MOFkey-v1.rtl,['[Co]'],['[O-]C(=O)c1ccncc1'],[Co].[O-]C(=O)c1ccncc1,rtl,...,,,,,,,,,,
4,qmof-644aab4,ABAXUZ_FSR,Zn2C50H32N6O8S4,ZnC25H16N3O4S2,,,['[Zn][Zn]'],"['[O-]C(=O)c1cccc(c1)c1nccs1', 'n1ccc(cc1)c1cc...",[O-]C(=O)c1cccc(c1)c1nccs1.[Zn][Zn].n1ccc(cc1)...,,...,-811.553858,0.0,2.901747,2.246703,-0.655044,True,"[None, None]","[None, None]","[None, None]","[None, None]"


In [10]:
qmof['qmof_id'].str.startswith('hMOF').sum()

0

In [11]:
qmof.columns

Index(['qmof_id', 'name', 'info.formula', 'info.formula_reduced',
       'info.mofid.mofid', 'info.mofid.mofkey', 'info.mofid.smiles_nodes',
       'info.mofid.smiles_linkers', 'info.mofid.smiles', 'info.mofid.topology',
       'info.natoms', 'info.pld', 'info.lcd', 'info.density', 'info.volume',
       'info.symmetry.spacegroup', 'info.symmetry.spacegroup_number',
       'info.symmetry.spacegroup_crystal', 'info.symmetry.pointgroup',
       'info.synthesized', 'info.source', 'info.doi', 'inputs.pbe.theory',
       'inputs.pbe.pseudopotentials', 'inputs.pbe.encut', 'inputs.pbe.kpoints',
       'inputs.pbe.gamma', 'inputs.pbe.spin', 'outputs.pbe.energy_total',
       'outputs.pbe.energy_vdw', 'outputs.pbe.energy_elec',
       'outputs.pbe.net_magmom', 'outputs.pbe.bandgap', 'outputs.pbe.cbm',
       'outputs.pbe.vbm', 'outputs.pbe.directgap', 'outputs.pbe.bandgap_spins',
       'outputs.pbe.cbm_spins', 'outputs.pbe.vbm_spins',
       'outputs.pbe.directgap_spins', 'inputs.hle17.theory',

I noticed that the MOF ID columns are null for a few rows in the qmof data, so I want to check right now to make sure I'll have enough rows with identifiers that I can still generate a suitable dataset since I plan on merging based on those IDs. I have 20,375 rows of data total.

In [12]:
qmof['info.mofid.mofid'].isna().sum()

12911

Ok, taking into account those rows with no identifiers I'll still have ~7,500 rows of data to work with (20,375 − 12,911 = 7,464). That should be enough to get by. Next I'll need to create a dataframe for the hmof data, loop through the hmof JSON files and pull in those data points for the isotherms. I'll start by loading one JSON file to identify which information I need to pull from them.

In [13]:
import json

# Load a single hMOF record as a dictionary to see which fields hold the
# isotherm data. The with-block closes the file even if parsing fails.
with open('../data/raw/hmof/hMOF-7.json') as f:
    data = json.load(f)

print(data)

{'id': 15340, 'cif': "data_functionalizedCrystal\n_audit_creation_method\t'MofGen! by Chris Wilmer'\n_symmetry_space_group_name_H-M\t'P1'\n_symmetry_Int_Tables_number\t1\n_symmetry_cell_setting\ttriclinic\nloop_\n_symmetry_equiv_pos_as_xyz\n  x,y,z\n_cell_length_a\t12.759393\n_cell_length_b\t12.759401\n_cell_length_c\t12.759399\n_cell_angle_alpha\t89.983359\n_cell_angle_beta\t89.967969\n_cell_angle_gamma\t90.019837\nloop_\n_atom_site_label\n_atom_site_type_symbol\n_atom_site_fract_x\n_atom_site_fract_y\n_atom_site_fract_z\nZn1\tZn\t-0.594017\t-1.606252\t-0.418161\nZn2\tZn\t-0.094017\t-1.106252\t0.081839\nZn3\tZn\t-0.418336\t-1.606145\t-0.593880\nZn4\tZn\t0.081664\t-1.106145\t-0.093880\nZn5\tZn\t-0.418352\t-1.430554\t-0.418220\nZn6\tZn\t0.081648\t-0.930554\t0.081780\nZn7\tZn\t-0.593973\t-1.430541\t-0.593803\nZn8\tZn\t-0.093973\t-0.930541\t-0.093803\nO9\tO\t-0.506170\t-1.518373\t-0.506016\nO10\tO\t-0.006170\t-1.018373\t-0.006016\nO11\tO\t-0.570724\t-1.753259\t-0.441478\nO12\tO\t-0.070724

Unfortunately these isotherms only have datapoints at 0.01, 0.05, 0.1, 0.5, and 2.5 bar, not at 1. As pressure rises above atmospheric pressure, you begin packing in gas tighter than would be expected, and the pore volume becomes disproportionately more important because that free volume allows more gas to get packed in. At sub-atmospheric pressure the adsorption is driven by attraction between the gas and framework. It is beneficial to have high attraction in a carbon dioxide capture process, so I believe it makes more sense to use the 0.5 bar measurements as a substitute.

In [14]:
import os

# Collect one record per 0.5 bar datapoint in a plain list and build the
# dataframe once at the end. Concatenating inside the loop copies the whole
# frame on every iteration and leaves the numeric column as object dtype.
records = []

# Loop through files
for filename in os.listdir('../data/raw/hmof'):
    if not filename.endswith('.json'):
        continue

    # The with-block closes each file as soon as it has been parsed
    with open(os.path.join('../data/raw/hmof', filename)) as f:
        file = json.load(f)

    # Only the first isotherm of each file is inspected
    isotherm = file['isotherms'][0]

    # Identify CO2 isotherms within JSON files
    if isotherm['adsorbates'][0]['formula'] != 'CO2':
        continue

    # Keep every datapoint recorded at exactly 0.5 bar
    for point in isotherm['isotherm_data']:
        if point['pressure'] == 0.5:
            records.append({
                'hmof_mofid': file['mofid'],
                'co2_298_0.5': point['species_data'][0]['adsorption'],
            })

# Passing the column list keeps the expected schema even if no record matched
hmof = pd.DataFrame(records, columns=['hmof_mofid', 'co2_298_0.5'])

In [15]:
hmof.shape

(137652, 2)

In [16]:
hmof.head()

Unnamed: 0,hmof_mofid,co2_298_0.5
0,[O-]C(=O)c1ccc(cc1)C(=O)[O-].[Zn][O]([Zn])([Zn...,0.885221
1,[O-]C(=O)c1ccc(cc1)C(=O)[O-].[Zn][O]([Zn])([Zn...,4.77901
2,[O-]C(=O)c1cc(F)c(c(c1F)F)C(=O)[O-].[Zn][O]([Z...,3.90947
3,COc1cc(cc(c1C(=O)[O-])OC)C(=O)[O-].COc1cc(ccc1...,1.75638
4,CCc1cc(C(=O)[O-])c(c(c1C(=O)[O-])CC)CC.[O-]C(=...,2.6942


Awesome! I have my hmof dataframe, now I need to confirm that the MOF ID column is unique since it's going to be the column I'm joining on and I only want one CO2 capacity value for each.

In [17]:
hmof['hmof_mofid'].nunique()

118693

Ok, there are quite a few duplicate values. Let's explore.

In [18]:
hmof[hmof.duplicated('hmof_mofid')]

Unnamed: 0,hmof_mofid,co2_298_0.5
26,* MOFid-v1.NA.NA,3.52241
27,* MOFid-v1.NA.NA,2.80731
37,N#Cc1cc(ccc1C(=O)[O-])C(=O)[O-].N#Cc1cc(ccc1c1...,2.84016
39,* MOFid-v1.NA.NA,3.1969
120,* MOFid-v1.NA.NA,1.79466
...,...,...
137647,,2.01503
137648,,4.29052
137649,,0.575496
137650,,2.29727


Let's start by dropping missing values.

In [19]:
# Keep only the rows that actually carry a MOFid.
hmof_not_none = hmof.loc[hmof['hmof_mofid'].notnull()]

In [20]:
hmof_not_none.shape

(127194, 2)

In [21]:
hmof_not_none[hmof_not_none.duplicated('hmof_mofid')]

Unnamed: 0,hmof_mofid,co2_298_0.5
26,* MOFid-v1.NA.NA,3.52241
27,* MOFid-v1.NA.NA,2.80731
37,N#Cc1cc(ccc1C(=O)[O-])C(=O)[O-].N#Cc1cc(ccc1c1...,2.84016
39,* MOFid-v1.NA.NA,3.1969
120,* MOFid-v1.NA.NA,1.79466
...,...,...
130961,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,1.34559
130962,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,1.8604
130963,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,0.887344
130970,[O-]C(=O)c1cc(F)c(c(c1F)F)C(=O)[O-].[O-]C(=O)c...,4.51262


That's a start, now let's drop these MOF IDs with .NA.NA, since they don't have actual identifying information.

In [22]:
# Drop placeholder MOFids such as "* MOFid-v1.NA.NA". regex=False makes '.NA.'
# a literal substring; read as a regex, each '.' would match ANY character and
# could discard valid ids. na=True keeps the earlier behaviour of dropping any
# row whose id is not a string.
is_placeholder = hmof_not_none['hmof_mofid'].str.contains('.NA.', regex=False, na=True)
hmof_not_na = hmof_not_none[~is_placeholder]

In [23]:
hmof_not_na.shape

(120014, 2)

In [24]:
# Rows whose MOFid has already appeared earlier in the frame
# (the first occurrence of each id is not flagged).
dups = hmof_not_na.loc[hmof_not_na.duplicated(subset='hmof_mofid')]
dups

Unnamed: 0,hmof_mofid,co2_298_0.5
37,N#Cc1cc(ccc1C(=O)[O-])C(=O)[O-].N#Cc1cc(ccc1c1...,2.84016
147,[O-]C(=O)c1ccc(c2c1CC2)C(=O)[O-].[O].[V] MOFid...,2.18198
181,N#Cc1cc(ccc1C(=O)[O-])C(=O)[O-].[O-]C(=O)C#CC(...,2.25383
196,[O-]C(=O)C#CC(=O)[O-].[O].[V] MOFid-v1.rna.cat0,0.766846
197,[O-]C(=O)C#CC(=O)[O-].[O].[V] MOFid-v1.rna.cat1,0
...,...,...
130961,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,1.34559
130962,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,1.8604
130963,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,0.887344
130970,[O-]C(=O)c1cc(F)c(c(c1F)F)C(=O)[O-].[O-]C(=O)c...,4.51262


We're getting there. Now I'm going to look for duplicates and if the CO2 value is 0 I'll drop those, because these calculations are precise enough that you wouldn't get a flat 0 in one case and a value in another. The zeroes are likely entry errors.

In [25]:
# Select zero-uptake rows whose MOFid also has at least one non-zero value.
# Working from the full frame (not from `duplicated()`, which never flags the
# first occurrence of an id) makes the selection independent of row order: a
# zero listed first would otherwise survive and drag down the later average.
co2_uptake = pd.to_numeric(hmof_not_na['co2_298_0.5'])
id_has_nonzero = co2_uptake.ne(0).groupby(hmof_not_na['hmof_mofid']).transform('any')
dup_zero = hmof_not_na[co2_uptake.eq(0) & id_has_nonzero]

In [26]:
# Remove the flagged zero-uptake rows by their index labels.
hmof_not_zero = hmof_not_na.drop(index=dup_zero.index)

In [27]:
hmof_not_zero[hmof_not_zero.duplicated('hmof_mofid')]

Unnamed: 0,hmof_mofid,co2_298_0.5
37,N#Cc1cc(ccc1C(=O)[O-])C(=O)[O-].N#Cc1cc(ccc1c1...,2.84016
147,[O-]C(=O)c1ccc(c2c1CC2)C(=O)[O-].[O].[V] MOFid...,2.18198
181,N#Cc1cc(ccc1C(=O)[O-])C(=O)[O-].[O-]C(=O)C#CC(...,2.25383
196,[O-]C(=O)C#CC(=O)[O-].[O].[V] MOFid-v1.rna.cat0,0.766846
226,[O-]C(=O)C#CC(=O)[O-].[O].[O][C]C#CC(=O)[O-].[...,0.434989
...,...,...
130961,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,1.34559
130962,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,1.8604
130963,[O-]C(=O)C#CC(=O)[O-].[O-]C(=O)C=C(C=C(C(=O)[O...,0.887344
130970,[O-]C(=O)c1cc(F)c(c(c1F)F)C(=O)[O-].[O-]C(=O)c...,4.51262


That only dropped about 400 rows, but it's still a necessary improvement. Next I'll average any remaining duplicates.

In [56]:
# Average the remaining duplicates so that each MOFid has a single uptake value.
# The uptake column can arrive as object dtype (it was assembled row by row), so
# cast it to float explicitly rather than relying on implicit coercion in mean().
# The result is indexed by MOFid and has one column.
hmof_no_dups = (
    hmof_not_zero
    .astype({'co2_298_0.5': 'float64'})
    .groupby('hmof_mofid')[['co2_298_0.5']]
    .mean()
)

In [57]:
hmof_no_dups.shape

(115173, 1)

Great! Now I'll check the qmof dataframe for uniqueness in the mofid column, and once that's clean I'll inner join them to see how many rows I have to work with.

In [29]:
qmof[qmof.duplicated('info.mofid.mofid')]

Unnamed: 0,qmof_id,name,info.formula,info.formula_reduced,info.mofid.mofid,info.mofid.mofkey,info.mofid.smiles_nodes,info.mofid.smiles_linkers,info.mofid.smiles,info.mofid.topology,...,outputs.hse06.energy_elec,outputs.hse06.net_magmom,outputs.hse06.bandgap,outputs.hse06.cbm,outputs.hse06.vbm,outputs.hse06.directgap,outputs.hse06.bandgap_spins,outputs.hse06.cbm_spins,outputs.hse06.vbm_spins,outputs.hse06.directgap_spins
1,qmof-019ba28,ABALOF_FSR,Cu12C36H56I16N4S4,Cu3C9H14I4NS,,,,,,,...,,,,,,,,,,
3,qmof-5bd4a24,ABAVOP_FSR,Co4C48H32N8O16,CoC12H8N2O4,[Co].[O-]C(=O)c1ccncc1 MOFid-v1.rtl.cat0,Co.TWBYWOBDOCUKOW.MOFkey-v1.rtl,['[Co]'],['[O-]C(=O)c1ccncc1'],[Co].[O-]C(=O)c1ccncc1,rtl,...,,,,,,,,,,
4,qmof-644aab4,ABAXUZ_FSR,Zn2C50H32N6O8S4,ZnC25H16N3O4S2,,,['[Zn][Zn]'],"['[O-]C(=O)c1cccc(c1)c1nccs1', 'n1ccc(cc1)c1cc...",[O-]C(=O)c1cccc(c1)c1nccs1.[Zn][Zn].n1ccc(cc1)...,,...,-811.553858,0.0,2.901747,2.246703,-0.655044,True,"[None, None]","[None, None]","[None, None]","[None, None]"
5,qmof-eaa4957,ABAYEI_FSR,Ag6C10H10N4O12,Ag3C5H5N2O6,,,"['[OH2][Ag][Ag][Ag][Ag][OH2]', '[OH2][Ag][Ag][...","['[O-]C(=O)C1=NN=C([CH]1)C(=O)[O-]', '[O-]C(=O...",[O-]C(=O)C1=NN=C([CH]1)C(=O)[O-].[O-]C(=O)C1=N...,,...,,,,,,,,,,
6,qmof-ffa4c4a,ABAYIM_FSR,Ag8C24H8N8O16,AgC3HNO2,,,"['[Ag]1[Ag][Ag][Ag]1', '[Ag][Ag]']",['[O-]C(=O)c1nccnc1C(=O)[O-]'],[Ag]1[Ag][Ag][Ag]1.[Ag][Ag].[O-]C(=O)c1nccnc1C...,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20232,qmof-61a517e,tobacco_SR_rtl_v1-3c_pyrrole_Ch_v2-6c_Hf_1_Ch_...,Hf12C52H52N4O64,Hf3C13H13NO16,,,['O[Hf]123([OH2])[OH]4[Hf]56([O]3[Hf]37([OH]2[...,['[O-]C(=O)C#Cn1c(C#CC(=O)[O-])ccc1C#CC(=O)[O-...,O[Hf]123([OH2])[OH]4[Hf]56([O]3[Hf]37([OH]2[Hf...,rtl,...,,,,,,,,,,
20243,qmof-2f9c233,tobacco_SR_tsd_v1-6c_Hf_1_Ch_v2-4c_B_Ch_v3-4c_...,Hf6C46H30N2O32,Hf3C23H15NO16,,,"['O', '[Hf]', '[OH]', '[O]']",['[O-]C(=O)C#Cc1cc(C#CC(=O)[O-])c(cc1C#CC(=O)[...,O.[Hf].[O-]C(=O)C#Cc1cc(C#CC(=O)[O-])c(cc1C#CC...,,...,,,,,,,,,,
20249,qmof-2c7b82d,tobacco_SR_tsd_v1-6c_Zr_1_Ch_v2-4c_B_Ch_v3-4c_...,Zr6C46H30N2O32,Zr3C23H15NO16,,,"['O', '[OH]', '[O]', '[Zr]']",['[O-]C(=O)C#Cc1cc(C#CC(=O)[O-])c(cc1C#CC(=O)[...,O.[O-]C(=O)C#Cc1cc(C#CC(=O)[O-])c(cc1C#CC(=O)[...,,...,,,,,,,,,,
20321,qmof-fc08082,tobacco_bor_sym_3_mc_0_sym_4_on_8_L_15,Cu12C120H90N24,Cu2C20H15N4,C(=C[C]1C=NN=C1)C=Cc1cc(C=CC=CC2=C[N]N=C2)cc(c...,Cu.DFUGZOGIFOXCHO.MOFkey-v1.bor,['[Cu]'],['C(=C[C]1C=NN=C1)C=Cc1cc(C=CC=CC2=C[N]N=C2)cc...,C(=C[C]1C=NN=C1)C=Cc1cc(C=CC=CC2=C[N]N=C2)cc(c...,bor,...,,,,,,,,,,


Let's drop those missing mofid rows.

In [30]:
# Keep only the QMOF entries that have a MOFid to join on.
qmof_no_null = qmof.loc[qmof['info.mofid.mofid'].notnull()]
qmof_no_null.shape

(7464, 94)

In [31]:
qmof_no_null[qmof_no_null.duplicated('info.mofid.mofid')]

Unnamed: 0,qmof_id,name,info.formula,info.formula_reduced,info.mofid.mofid,info.mofid.mofkey,info.mofid.smiles_nodes,info.mofid.smiles_linkers,info.mofid.smiles,info.mofid.topology,...,outputs.hse06.energy_elec,outputs.hse06.net_magmom,outputs.hse06.bandgap,outputs.hse06.cbm,outputs.hse06.vbm,outputs.hse06.directgap,outputs.hse06.bandgap_spins,outputs.hse06.cbm_spins,outputs.hse06.vbm_spins,outputs.hse06.directgap_spins
3,qmof-5bd4a24,ABAVOP_FSR,Co4C48H32N8O16,CoC12H8N2O4,[Co].[O-]C(=O)c1ccncc1 MOFid-v1.rtl.cat0,Co.TWBYWOBDOCUKOW.MOFkey-v1.rtl,['[Co]'],['[O-]C(=O)c1ccncc1'],[Co].[O-]C(=O)c1ccncc1,rtl,...,,,,,,,,,,
9,qmof-d2e3fe6,ABAZAF_FSR,Zn4C104H76N12O20,ZnC26H19N3O5,Oc1cc(cc(c1)C(=O)[O-])C(=O)[O-].[Zn].n1ccc(cc1...,Zn.MGFJDEHFNMWYBD.QNVNLUSHGRBCLO.MOFkey-v1.dia,['[Zn]'],"['Oc1cc(cc(c1)C(=O)[O-])C(=O)[O-]', 'n1ccc(cc1...",Oc1cc(cc(c1)C(=O)[O-])C(=O)[O-].[Zn].n1ccc(cc1...,dia,...,,,,,,,,,,
162,qmof-2023658,AFOSIY_FSR,Zn4C48H52N4O20,ZnC12H13NO5,[O-]C(=O)c1ccc(cc1)C(=O)[O-].[Zn][Zn] MOFid-v1...,Zn.KKEYFWRCBNTPAC.MOFkey-v1.sql,['[Zn][Zn]'],['[O-]C(=O)c1ccc(cc1)C(=O)[O-]'],[O-]C(=O)c1ccc(cc1)C(=O)[O-].[Zn][Zn],sql,...,-953.140980,0.0,4.602255,2.497769,-2.104486,True,"[None, None]","[None, None]","[None, None]","[None, None]"
251,qmof-582aea9,AJABOD_FSR,Cu6C42H60N18O12,CuC7H10N3O2,CCC1=NN=C([N]1)CC.[Cu] MOFid-v1.nbo-a.cat0,Cu.DCBGWRCUTFNURX.MOFkey-v1.nbo-a,['[Cu]'],['CCC1=NN=C([N]1)CC'],CCC1=NN=C([N]1)CC.[Cu],nbo-a,...,-1008.552505,0.0,3.466100,3.030994,-0.435106,True,"[None, None]","[None, None]","[None, None]","[None, None]"
287,qmof-461be91,AKAROT_FSR,Ni2C36H24N4O8,NiC18H12N2O4,[Ni].[O-]C(=O)c1cccc(c1)C(=O)[O-].n1ccc(cc1)c1...,Ni.MWVTWFVJZLCBMC.QQVIHTHCMHWDBS.MOFkey-v1.sql,['[Ni]'],"['[O-]C(=O)c1cccc(c1)C(=O)[O-]', 'n1ccc(cc1)c1...",[Ni].[O-]C(=O)c1cccc(c1)C(=O)[O-].n1ccc(cc1)c1...,sql,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19521,qmof-ae7c007,gmof_Uio66Zr-BDC_A_No8,Zr6C48Cl12H16O32,Zr3C24Cl6H8O16,[O-]C(=O)c1cc(Cl)c(cc1Cl)C(=O)[O-].[O]12[Zr]34...,Zr.LMOSYFZLPBHEOW.MOFkey-v1.fcu,['[O]12[Zr]34[OH]5[Zr]62[OH]2[Zr]71[OH]4[Zr]14...,['[O-]C(=O)c1cc(Cl)c(cc1Cl)C(=O)[O-]'],[O-]C(=O)c1cc(Cl)c(cc1Cl)C(=O)[O-].[O]12[Zr]34...,fcu,...,,,,,,,,,,
19632,qmof-605d233,gmof_Uio66Zr-irmof8_A_No17,Zr6C72H40O44,Zr3C36H20O22,[O-]C(=O)c1cc(O)c2c(c1)c(O)cc(c2)C(=O)[O-].[O]...,Zr.POXYDFYKMVBSTR.MOFkey-v1.fcu,['[O]12[Zr]34[OH]5[Zr]62[OH]2[Zr]71[OH]4[Zr]14...,['[O-]C(=O)c1cc(O)c2c(c1)c(O)cc(c2)C(=O)[O-]'],[O-]C(=O)c1cc(O)c2c(c1)c(O)cc(c2)C(=O)[O-].[O]...,fcu,...,,,,,,,,,,
19909,qmof-a33f7c9,gmof_Zn4O13-BDC_A-NiC4_No16,Zn8C48F24O26,Zn4C24F12O13,[O-]C(=O)c1c(F)c(F)c(c(c1F)F)C(=O)[O-].[Zn][O]...,Zn.WFNRNCNCXRGUKN.MOFkey-v1.pcu,['[Zn][O]([Zn])([Zn])[Zn]'],['[O-]C(=O)c1c(F)c(F)c(c(c1F)F)C(=O)[O-]'],[O-]C(=O)c1c(F)c(F)c(c(c1F)F)C(=O)[O-].[Zn][O]...,pcu,...,,,,,,,,,,
20321,qmof-fc08082,tobacco_bor_sym_3_mc_0_sym_4_on_8_L_15,Cu12C120H90N24,Cu2C20H15N4,C(=C[C]1C=NN=C1)C=Cc1cc(C=CC=CC2=C[N]N=C2)cc(c...,Cu.DFUGZOGIFOXCHO.MOFkey-v1.bor,['[Cu]'],['C(=C[C]1C=NN=C1)C=Cc1cc(C=CC=CC2=C[N]N=C2)cc...,C(=C[C]1C=NN=C1)C=Cc1cc(C=CC=CC2=C[N]N=C2)cc(c...,bor,...,,,,,,,,,,


In [33]:
# Persist the de-duplicated hMOF table (MOFid index included) as interim data.
hmof_no_dups.to_csv(path_or_buf='../data/interim/hmof.csv')

To handle the rest of the duplicates I'm going to sort the dataframe by how many nulls are present in each row, then I'll drop duplicates based on the MOF ID, keeping those rows with the fewest null values.

In [42]:
# Count the missing values in each row. .assign returns a new frame, so this
# neither writes into a filtered slice of `qmof` (SettingWithCopyWarning) nor
# mutates an object an earlier cell displayed; re-running the cell is harmless
# because the new column never contains nulls itself.
qmof_no_null = qmof_no_null.assign(null_sum=qmof_no_null.isna().sum(axis=1))

In [43]:
# Order rows from most to least complete, then keep the first (most complete)
# row for every MOFid.
qmof_dropped_dups = (
    qmof_no_null
    .sort_values(by='null_sum')
    .drop_duplicates(subset=['info.mofid.mofid'], keep='first')
)

In [44]:
qmof_dropped_dups.shape

(7134, 95)

In [59]:
# Persist the de-duplicated QMOF table as interim data.
qmof_dropped_dups.to_csv(path_or_buf='../data/interim/qmof.csv')

Now to merge the two dataframes. I'll do an inner join on the MOF ID since those should align.

In [53]:
# Inner-join the QMOF rows to the hMOF uptake values: the QMOF MOFid column is
# matched against the hMOF index (which holds the MOFid after the groupby).
merged = qmof_dropped_dups.merge(
    hmof_no_dups,
    how='inner',
    left_on='info.mofid.mofid',
    right_index=True,
    suffixes=('', ''),
)

In [54]:
merged.shape

(30, 96)

In [58]:
merged.head()

Unnamed: 0,qmof_id,name,info.formula,info.formula_reduced,info.mofid.mofid,info.mofid.mofkey,info.mofid.smiles_nodes,info.mofid.smiles_linkers,info.mofid.smiles,info.mofid.topology,...,outputs.hse06.bandgap,outputs.hse06.cbm,outputs.hse06.vbm,outputs.hse06.directgap,outputs.hse06.bandgap_spins,outputs.hse06.cbm_spins,outputs.hse06.vbm_spins,outputs.hse06.directgap_spins,null_sum,co2_298_0.5
8612,qmof-a2d95c3,MIBQAR01_FSR,Zn8C48H24O26,Zn4C24H12O13,[O-]C(=O)c1ccc(cc1)C(=O)[O-].[Zn][O]([Zn])([Zn...,Zn.KKEYFWRCBNTPAC.MOFkey-v1.pcu,['[Zn][O]([Zn])([Zn])[Zn]'],['[O-]C(=O)c1ccc(cc1)C(=O)[O-]'],[O-]C(=O)c1ccc(cc1)C(=O)[O-].[Zn][O]([Zn])([Zn...,pcu,...,4.650938,-1.313401,-5.964339,True,"[None, None]","[None, None]","[None, None]","[None, None]",0,0.885221
5893,qmof-7245894,IDIWOH02_FSR,V2C16H8O10,VC8H4O5,[O-]C(=O)c1ccc(cc1)C(=O)[O-].[O].[V] MOFid-v1....,V.KKEYFWRCBNTPAC.MOFkey-v1.rna,"['[O]', '[V]']",['[O-]C(=O)c1ccc(cc1)C(=O)[O-]'],[O-]C(=O)c1ccc(cc1)C(=O)[O-].[O].[V],rna,...,3.41075,-0.180123,-3.590873,True,"[3.41075, 3.4120269999999997]","[-0.180123, -0.18007]","[-3.590873, -3.592097]","[True, True]",0,1.68345
16968,qmof-83a9586,boydwoo_str_m3_o3_o15_pcu_sym_101,Zn2C52H32N2O8,ZnC26H16NO4,[O-]C(=O)c1c2ccccc2c(c2c1cccc2)C(=O)[O-].[O-]C...,Zn.FDFGHPKPHFUHBP.FZTIWOBQQYPTCJ.MAWKLXRVKVOYL...,['[Zn][Zn]'],"['[O-]C(=O)c1c2ccccc2c(c2c1cccc2)C(=O)[O-]', '...",[O-]C(=O)c1c2ccccc2c(c2c1cccc2)C(=O)[O-].[O-]C...,pcu,...,,,,,,,,,54,1.07822
16963,qmof-19f07bd,boydwoo_str_m3_o3_o14_pcu_sym_1,Zn2C48H30N2O8,ZnC24H15NO4,[O-]C(=O)c1ccc(c2c1cccc2)C(=O)[O-].[O-]C(=O)c1...,Zn.ABMFBCRYHDZLRD.FZTIWOBQQYPTCJ.MAWKLXRVKVOYL...,['[Zn][Zn]'],"['[O-]C(=O)c1ccc(c2c1cccc2)C(=O)[O-]', '[O-]C(...",[O-]C(=O)c1ccc(c2c1cccc2)C(=O)[O-].[O-]C(=O)c1...,pcu,...,,,,,,,,,54,0.739082
20049,qmof-9135e11,gmof_Zn4O13-irmof14_A-irmof16_A_No1,Zn8C116H64O26,Zn4C58H32O13,[O-]C(=O)c1cc2ccc3c4c2c(c1)ccc4cc(c3)C(=O)[O-]...,Zn.FZTIWOBQQYPTCJ.OTAJGWQCQIEFEV.MOFkey-v1.pcu,['[Zn][O]([Zn])([Zn])[Zn]'],['[O-]C(=O)c1cc2ccc3c4c2c(c1)ccc4cc(c3)C(=O)[O...,[O-]C(=O)c1cc2ccc3c4c2c(c1)ccc4cc(c3)C(=O)[O-]...,pcu,...,,,,,,,,,54,0.824963
