# Designing metal organic frameworks for carbon dioxide capture
## Data wrangling
## by Zachary Brown

Let's start off by installing and importing the libraries we'll need for this notebook.

In [3]:
!pip install numpy==1.24.2
!pip install pandas==1.5.3
!pip install requests==2.28.2

Collecting requests==2.28.2
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
     ---------------------------------------- 62.8/62.8 kB 3.3 MB/s eta 0:00:00
Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.0.1-cp310-cp310-win_amd64.whl (96 kB)
     ---------------------------------------- 96.5/96.5 kB ? eta 0:00:00
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
     -------------------------------------- 140.6/140.6 kB 8.2 MB/s eta 0:00:00
Installing collected packages: charset-normalizer, urllib3, requests
Successfully installed charset-normalizer-3.0.1 requests-2.28.2 urllib3-1.26.14


In [4]:
import numpy as np
import pandas as pd
import requests

Now we'll download the qmof dataset and then unzip it into the raw data folder.

In [6]:
# Fetch the qmof archive from figshare (following redirects) and save it to the raw data folder.
url = 'https://figshare.com/ndownloader/articles/13147324/versions/14'
r = requests.get(url, allow_redirects=True)
# Fail loudly on an HTTP error instead of silently saving an error page as a .zip file.
r.raise_for_status()

# The context manager guarantees the file is flushed and closed before it is unzipped later.
with open('../data/raw/qmof.zip', 'wb') as zip_file:
    bytes_written = zip_file.write(r.content)

# Display the number of bytes written as the cell output.
bytes_written

407219972

In [7]:
from zipfile import ZipFile

# Unpack every member of the downloaded qmof archive into the raw data folder.
with ZipFile("../data/raw/qmof.zip") as qmof_archive:
    qmof_archive.extractall("../data/raw/")

In [8]:
# Unpack the qmof_database.zip archive into the raw data folder as well.
with ZipFile("../data/raw/qmof_database.zip") as database_archive:
    database_archive.extractall("../data/raw/")

Next we'll download the hmof dataset (which is the subset of the MOFXDB with carbon dioxide isotherms) and unzip that in the raw data folder.

In [9]:
# Fetch the hmof carbon dioxide archive (following redirects) and save it to the raw data folder.
url = 'https://mof.tech.northwestern.edu/Datasets/hMOF-10%201039%20C2EE23201D-CarbonDioxide-mofdb-version:dc8a0295db.zip'
r = requests.get(url, allow_redirects=True)
# Fail loudly on an HTTP error instead of silently saving an error page as a .zip file.
r.raise_for_status()

# The context manager guarantees the file is flushed and closed before it is unzipped later.
with open('../data/raw/hmof.zip', 'wb') as zip_file:
    bytes_written = zip_file.write(r.content)

# Display the number of bytes written as the cell output.
bytes_written

982292348

In [11]:
# Extract the hmof archive into its own subfolder of the raw data directory.
with ZipFile("../data/raw/hmof.zip") as hmof_archive:
    hmof_archive.extractall("../data/raw/hmof")

Now I'll load the qmof dataset into a dataframe.

In [13]:
# Load the extracted qmof table; low_memory=False makes pandas parse the file
# in a single pass instead of in internal chunks.
qmof = pd.read_csv(
    filepath_or_buffer='../data/raw/qmof_database/qmof.csv',
    low_memory=False,
)

In [15]:
# Check the table's dimensions as (rows, columns).
qmof.shape

(20375, 94)

In [16]:
# Preview the first few rows of the table.
qmof.head()

Unnamed: 0,qmof_id,name,info.formula,info.formula_reduced,info.mofid.mofid,info.mofid.mofkey,info.mofid.smiles_nodes,info.mofid.smiles_linkers,info.mofid.smiles,info.mofid.topology,...,outputs.hse06.energy_elec,outputs.hse06.net_magmom,outputs.hse06.bandgap,outputs.hse06.cbm,outputs.hse06.vbm,outputs.hse06.directgap,outputs.hse06.bandgap_spins,outputs.hse06.cbm_spins,outputs.hse06.vbm_spins,outputs.hse06.directgap_spins
0,qmof-8a95c27,ABACUF01_FSR,Ba2CuC6H14O16,Ba2CuC6H14O16,,,"['O', '[Ba]', '[Cu]']",['[O-]C=O'],O.[Ba].[Cu].[O-]C=O,,...,,,,,,,,,,
1,qmof-019ba28,ABALOF_FSR,Cu12C36H56I16N4S4,Cu3C9H14I4NS,,,,,,,...,,,,,,,,,,
2,qmof-830ed1c,ABAVIJ_FSR,Co4C48H32N8O16,CoC12H8N2O4,[Co].[O-]C(=O)c1ccncc1 MOFid-v1.rtl.cat0,Co.TWBYWOBDOCUKOW.MOFkey-v1.rtl,['[Co]'],['[O-]C(=O)c1ccncc1'],[Co].[O-]C(=O)c1ccncc1,rtl,...,,,,,,,,,,
3,qmof-5bd4a24,ABAVOP_FSR,Co4C48H32N8O16,CoC12H8N2O4,[Co].[O-]C(=O)c1ccncc1 MOFid-v1.rtl.cat0,Co.TWBYWOBDOCUKOW.MOFkey-v1.rtl,['[Co]'],['[O-]C(=O)c1ccncc1'],[Co].[O-]C(=O)c1ccncc1,rtl,...,,,,,,,,,,
4,qmof-644aab4,ABAXUZ_FSR,Zn2C50H32N6O8S4,ZnC25H16N3O4S2,,,['[Zn][Zn]'],"['[O-]C(=O)c1cccc(c1)c1nccs1', 'n1ccc(cc1)c1cc...",[O-]C(=O)c1cccc(c1)c1nccs1.[Zn][Zn].n1ccc(cc1)...,,...,-811.553858,0.0,2.901747,2.246703,-0.655044,True,"[None, None]","[None, None]","[None, None]","[None, None]"


In [17]:
# List the column labels to see which properties are available.
qmof.columns

Index(['qmof_id', 'name', 'info.formula', 'info.formula_reduced',
       'info.mofid.mofid', 'info.mofid.mofkey', 'info.mofid.smiles_nodes',
       'info.mofid.smiles_linkers', 'info.mofid.smiles', 'info.mofid.topology',
       'info.natoms', 'info.pld', 'info.lcd', 'info.density', 'info.volume',
       'info.symmetry.spacegroup', 'info.symmetry.spacegroup_number',
       'info.symmetry.spacegroup_crystal', 'info.symmetry.pointgroup',
       'info.synthesized', 'info.source', 'info.doi', 'inputs.pbe.theory',
       'inputs.pbe.pseudopotentials', 'inputs.pbe.encut', 'inputs.pbe.kpoints',
       'inputs.pbe.gamma', 'inputs.pbe.spin', 'outputs.pbe.energy_total',
       'outputs.pbe.energy_vdw', 'outputs.pbe.energy_elec',
       'outputs.pbe.net_magmom', 'outputs.pbe.bandgap', 'outputs.pbe.cbm',
       'outputs.pbe.vbm', 'outputs.pbe.directgap', 'outputs.pbe.bandgap_spins',
       'outputs.pbe.cbm_spins', 'outputs.pbe.vbm_spins',
       'outputs.pbe.directgap_spins', 'inputs.hle17.theory',

Next I'll need to create a dataframe for the hmof data by looping through the hmof JSON files and pulling out the isotherm data points. I'll start by loading one JSON file to identify which information I need to pull from it.

In [20]:
import json

# Load a single hMOF record to see which fields it contains.
# The context manager closes the file even if parsing raises, and the explicit
# encoding keeps the read independent of the platform's default text encoding.
with open('../data/raw/hmof/hMOF-7.json', encoding='utf-8') as f:
    # json.load returns the parsed JSON object as a dictionary
    data = json.load(f)

print(data)

{'id': 15340, 'cif': "data_functionalizedCrystal\n_audit_creation_method\t'MofGen! by Chris Wilmer'\n_symmetry_space_group_name_H-M\t'P1'\n_symmetry_Int_Tables_number\t1\n_symmetry_cell_setting\ttriclinic\nloop_\n_symmetry_equiv_pos_as_xyz\n  x,y,z\n_cell_length_a\t12.759393\n_cell_length_b\t12.759401\n_cell_length_c\t12.759399\n_cell_angle_alpha\t89.983359\n_cell_angle_beta\t89.967969\n_cell_angle_gamma\t90.019837\nloop_\n_atom_site_label\n_atom_site_type_symbol\n_atom_site_fract_x\n_atom_site_fract_y\n_atom_site_fract_z\nZn1\tZn\t-0.594017\t-1.606252\t-0.418161\nZn2\tZn\t-0.094017\t-1.106252\t0.081839\nZn3\tZn\t-0.418336\t-1.606145\t-0.593880\nZn4\tZn\t0.081664\t-1.106145\t-0.093880\nZn5\tZn\t-0.418352\t-1.430554\t-0.418220\nZn6\tZn\t0.081648\t-0.930554\t0.081780\nZn7\tZn\t-0.593973\t-1.430541\t-0.593803\nZn8\tZn\t-0.093973\t-0.930541\t-0.093803\nO9\tO\t-0.506170\t-1.518373\t-0.506016\nO10\tO\t-0.006170\t-1.018373\t-0.006016\nO11\tO\t-0.570724\t-1.753259\t-0.441478\nO12\tO\t-0.070724

Unfortunately, these isotherms only have data points at 0.01, 0.05, 0.1, 0.5, and 2.5 bar, not at 1 bar. As pressure rises above atmospheric pressure, gas is packed into the pores more densely, and the pore volume becomes disproportionately more important because that free volume determines how much gas can be packed in. At sub-atmospheric pressure, adsorption is instead driven by the attraction between the gas and the framework. Strong attraction is beneficial in a carbon dioxide capture process, so I believe it makes more sense to use the 0.5 bar measurements as a substitute for the missing 1 bar data point.

In [44]:
# Reload the sample hMOF record and look at a single isotherm data point.
# The context manager closes the file even if parsing raises, and the explicit
# encoding keeps the read independent of the platform's default text encoding.
with open('../data/raw/hmof/hMOF-7.json', encoding='utf-8') as f:
    # json.load returns the parsed JSON object as a dictionary
    data = json.load(f)

# Second data point of the first isotherm in the record.
print(data['isotherms'][0]['isotherm_data'][1])

{'pressure': 0.1, 'species_data': [{'name': 'CarbonDioxide', 'InChIKey': 'CURLTUGMZLYLDI-UHFFFAOYSA-N', 'adsorption': 3.16225, 'composition': 1}], 'total_adsorption': 3.16225}


In [None]:
def co2_uptake_at_pressure(mof, pressure=0.5, species='CarbonDioxide'):
    """Return the adsorption of `species` at `pressure` from a parsed hMOF record.

    Parameters
    ----------
    mof : dict
        Parsed hMOF JSON record. Only the first entry of ``mof['isotherms']``
        is inspected.
    pressure : float
        Pressure of the isotherm data point to look up, in the same units as
        the record's ``'pressure'`` values.
    species : str
        Value of the ``'name'`` field identifying the adsorbate inside each
        data point's ``'species_data'`` list.

    Returns
    -------
    float or None
        The ``'adsorption'`` value of the matching species at the requested
        pressure, or None if the record has no isotherms, no data point at
        that pressure, or no entry for that species.
    """
    isotherms = mof.get('isotherms') or []
    if not isotherms:
        return None

    for point in isotherms[0]['isotherm_data']:
        # Compare with a small tolerance rather than exact float equality.
        if abs(point['pressure'] - pressure) > 1e-9:
            continue
        # The per-species uptake is nested under 'species_data', not on the point itself.
        for entry in point['species_data']:
            if entry['name'] == species:
                return entry['adsorption']
    return None


# Identifier of the record; it is also the stem of the JSON file name.
hmof_name = 'hMOF-7'
with open(f'../data/raw/hmof/{hmof_name}.json', encoding='utf-8') as f:
    file = json.load(f)

# Collect rows as plain dicts and build the dataframe once, rather than
# concatenating row by row.
records = []
uptake = co2_uptake_at_pressure(file, pressure=0.5)
if uptake is not None:
    # NOTE(review): the column name assumes the first isotherm was measured at 298 K — confirm.
    records.append({'hmof_id': hmof_name, 'co2_298_0.5': uptake})

hmof = pd.DataFrame(records, columns=['hmof_id', 'co2_298_0.5'])
hmof
