#credit to https://www.kaggle.com/nofreewill

In [1]:
# Install RDKit into the notebook's conda environment; the normalize_inchis.py
# script written in the next cell imports it (`from rdkit import Chem`).
!conda install -y -c rdkit rdkit

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - rdkit


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    boost-1.74.0               |   py37h6dcda5c_3         342 KB  conda-forge
    ca-certificates-2021.5.30  |       ha878542_0         136 KB  conda-forge
    certifi-2021.5.30          |   py37h89c1867_0         141 KB  conda-forge
    conda-4.10.1               |   py37h89c1867_0         3.1 MB  conda-forge
    openssl-1.1.1k             |       h7f98852_0         2.1 MB  conda-forge
    rdkit-2021.03.2            |   py37haf5a968_0        38.3 MB  conda-forge
    reportlab-3.5.67           |   py37h69800bb_0         2.4 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        46.5 MB


In [2]:
%%writefile normalize_inchis.py

from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from pathlib import Path

def normalize_inchi(inchi):
    """Round-trip an InChI string through RDKit, falling back to the input.

    The string is parsed with ``Chem.MolFromInchi`` and re-serialized with
    ``Chem.MolToInchi``.  The input is returned unchanged when parsing yields
    ``None`` or when either RDKit call raises an ordinary exception.

    Args:
        inchi: The InChI string to normalize.

    Returns:
        The re-generated InChI string, or ``inchi`` itself on failure.
    """
    try:
        mol = Chem.MolFromInchi(inchi)
        if mol is None:
            return inchi
        return Chem.MolToInchi(mol)
    except Exception:
        # Best effort: keep the original prediction on any RDKit error.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be stopped.
        return inchi
        
submission_name = '../input/bms-efficientnetv2-tpu-ensemble/submission.csv'
norm_path = Path('submission_norm.csv')

# Resume support: every line already present in the output file corresponds to
# one input line that has been handled, so that many input lines are skipped.
N = norm_path.read_text().count('\n') if norm_path.exists() else 0
print(N, 'number of predictions already normalized')

write_mode = 'w' if N == 0 else 'a'

# buffering=1 makes the output line-buffered, so each finished row is flushed
# before the next RDKit call; the line count above therefore stays accurate
# even if the process dies inside normalize_inchi.
with open(submission_name, 'r') as r, \
        open(str(norm_path), write_mode, buffering=1) as w:
    for _ in range(N):
        r.readline()

    # The next line is either the header (fresh start) or the row being
    # processed when the previous run died; copy it through without calling
    # RDKit so a crashing row cannot stall progress forever.
    line = r.readline()
    if line and not line.endswith('\n'):
        # Keep one output line per input line so the resume count stays exact.
        line += '\n'
    w.write(line)

    with tqdm() as pbar:
        for line in r:
            # Strip only the line terminator; slicing off the last character
            # would truncate the InChI on a final line with no newline.
            row = line.rstrip('\n')
            image_id, _, rest = row.partition(',')
            # Everything after the first comma is the InChI field, which may
            # itself contain commas; drop the CSV quotes.
            inchi = rest.replace('"', '')
            inchi_norm = normalize_inchi(inchi)
            w.write(f'{image_id},"{inchi_norm}"\n')
            pbar.update(1)

Writing normalize_inchis.py


In [3]:
# List the working directory to confirm normalize_inchis.py was written.
!ls

__notebook__.ipynb  normalize_inchis.py


In [4]:
# Re-run the script until it exits with status 0: `&& break` leaves the loop
# only on success, so a crashed run is simply started again. Each run counts
# the rows already in submission_norm.csv and continues from there.
!while [ 1 ]; do python normalize_inchis.py && break; done

0 number of predictions already normalized
97164it [01:21, 1213.50it/s]/bin/bash: line 1:  9530 Segmentation fault      (core dumped) python normalize_inchis.py
97285 number of predictions already normalized
56726it [00:46, 1197.67it/s]/bin/bash: line 1:  9532 Segmentation fault      (core dumped) python normalize_inchis.py
154096 number of predictions already normalized
92693it [01:17, 1222.40it/s]/bin/bash: line 1:  9534 Segmentation fault      (core dumped) python normalize_inchis.py
246895 number of predictions already normalized
31562it [00:26, 1235.78it/s]/bin/bash: line 1:  9536 Segmentation fault      (core dumped) python normalize_inchis.py
278526 number of predictions already normalized
27432it [00:22, 1206.65it/s]/bin/bash: line 1:  9538 Segmentation fault      (core dumped) python normalize_inchis.py
306079 number of predictions already normalized
33003it [00:27, 1245.54it/s]/bin/bash: line 1:  9540 Segmentation fault      (core dumped) python normalize_inchis.py
339182 num

In [None]:
import pandas as pd
import Levenshtein
from pathlib import Path
from tqdm import tqdm

# Path of the original submission; must match the one used in
# normalize_inchis.py (the previous value ended in ".csvv" and could not be read).
submission_name = '../input/bms-efficientnetv2-tpu-ensemble/submission.csv'
norm_path = Path('submission_norm.csv')

sub_df = pd.read_csv(submission_name)
sub_norm_df = pd.read_csv(norm_path)

# Rows are compared by position, so both files must have the same length.
assert len(sub_df) == len(sub_norm_df), (
    f'row count mismatch: {len(sub_df)} original vs '
    f'{len(sub_norm_df)} normalized'
)

# Mean Levenshtein distance between each original prediction (column 1)
# and its normalized counterpart.
lev = 0
N = len(sub_df)
for inchi, inchi_norm in tqdm(zip(sub_df.iloc[:, 1], sub_norm_df.iloc[:, 1]),
                              total=N):
    lev += Levenshtein.distance(inchi, inchi_norm)

print(lev/N)