### Exporting the data to CSV for a quick look

```
$ cd src
$ ipython3 ../scripts/make_csv.py ir ../data/sample/m150_ir.p ../data/sample/m150_ir.csv
$ ipython3 ../scripts/make_csv.py nmr ../data/sample/m150_nmr.p ../data/sample/m150_nmr.csv
```

### Making fingerprints

The current approach divides the bins evenly: specify the number of bins for the IR data and for the NMR data.

**TODO:** Should we instead specify the bin positions used to make the fingerprints, or the width of each bin?

In [1]:
import os
import sys

# Make the project's `src` directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path a second time.
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
from chemspace import Fingerprints, load_data, NMRVector, IRVector

In [3]:
# Load the sample NMR and IR data; both paths are relative to this notebook.
# NOTE(review): the `.p` files are presumably pickles -- only load trusted files.
nmr, ir = load_data(
    '../rawdata/sample/nmr_data/150.042927432_vectors.p',
    '../rawdata/sample/ir_data/loader.p',
)

In [4]:
# Convert the first NMR record to the new vector type.
sample_nmr = NMRVector.from_old(nmr[0])
# The SMILES string of that record is the key used to look up the matching IR data.
sample_smiles = sample_nmr.smiles
sample_ir = IRVector.from_old(ir, sample_smiles)
# Show which molecule the sample corresponds to.
print(sample_smiles)

N#CCC1OCc2cnoc21


In [5]:
# Build a single Fingerprints object from the sample's NMR and IR vectors.
sample_fingerprints = Fingerprints(sample_nmr, sample_ir)

In [9]:
# List (index, value) pairs for every strictly positive fingerprint entry.
[(idx, val) for idx, val in enumerate(sample_fingerprints.data) if val > 0]

[(182, 0.18294444444444447),
 (272, 0.27216666666666667),
 (277, 0.2775),
 (303, 0.30394444444444446),
 (399, 0.3993333333333333),
 (622, 7.8562115384615385),
 (627, 8.205788461538463),
 (640, 9.092769230769232),
 (652, 9.876826923076923),
 (664, 10.650740384615384),
 (690, 12.421942307692309),
 (699, 13.033942307692307),
 (715, 14.08689423076923),
 (923, 28.015846153846155),
 (924, 28.081105769230767),
 (928, 28.368182692307695)]

In [10]:
# Convert every NMR record, then build IR vectors only for those molecules
# whose SMILES string has an entry in `ir.data`.
nmr_vectors = [NMRVector.from_old(record) for record in nmr]
ir_vectors = [
    IRVector.from_old(ir, vec.smiles)
    for vec in nmr_vectors
    if vec.smiles in ir.data
]

In [11]:
# Compare the list sizes: NMR entries without IR data were skipped when
# building `ir_vectors`, so the two counts may differ.
print(len(nmr_vectors), len(ir_vectors))

1305 1297


In [12]:
# Keep only the NMR vectors whose SMILES also appears in the IR data, so that
# `nmr_vectors` is filtered by the same condition used to build `ir_vectors`.
nmr_vectors = [vec for vec in nmr_vectors if vec.smiles in ir.data]

In [16]:
# Sanity check: the NMR and IR lists must pair up one-to-one by SMILES.
# zip() stops at the shorter input, so a length mismatch would otherwise go
# unnoticed -- check the lengths explicitly first.
assert len(nmr_vectors) == len(ir_vectors)
# Loop variables are named distinctly from the loaded `nmr` / `ir` objects
# to avoid confusing the per-item vectors with the raw data.
assert all(
    nmr_vec.smiles == ir_vec.smiles
    for nmr_vec, ir_vec in zip(nmr_vectors, ir_vectors)
)

In [17]:
# Build one Fingerprints object per (NMR vector, IR vector) pair.
all_fingerprints = [
    Fingerprints(nmr_vec, ir_vec)
    for nmr_vec, ir_vec in zip(nmr_vectors, ir_vectors)
]

In [21]:
import numpy as np
import pandas as pd

In [24]:
# Assemble a table with one column per molecule: the column label is the
# SMILES string and the column values are that fingerprint's data.
df = pd.DataFrame(
    {fingerprint.smiles: fingerprint.data for fingerprint in all_fingerprints}
)

In [27]:
# Persist the fingerprint table to HDF5 under the key 'r1'.
# `key` is passed by keyword: pandas 2.x deprecates passing it positionally
# and pandas 3.0 makes every argument after the path keyword-only.
df.to_hdf('../data/sample/all_fingerprints.hdf5', key='r1')

In [57]:
# Take the `.data` of the first two fingerprints for a pairwise comparison.
x1, x2 = [fingerprint.data for fingerprint in all_fingerprints[:2]]

In [60]:
def nonzeros(arr):
    """Return ``(index, value)`` pairs for every non-zero entry of ``arr``.

    Args:
        arr: Any iterable of numbers (e.g. a list or a 1-D array).

    Returns:
        list: ``(position, value)`` tuples, in iteration order, for each
        element that is not equal to zero.
    """
    return [(idx, val) for idx, val in enumerate(arr) if val != 0]
# Number of non-zero entries in each of the two vectors.
print(len(nonzeros(x1)), len(nonzeros(x2)))
# Difference of the two vectors (presumably element-wise; assumes array-like data -- TODO confirm).
x3 = x1 - x2
# Number of non-zero entries in the difference.
print(len(nonzeros(x3)))

16 6
22


In [61]:
# Show the (index, value) pairs of the non-zero entries of the difference x1 - x2.
nonzeros(x3)

[(182, 0.18294444444444447),
 (272, 0.27216666666666667),
 (277, 0.2775),
 (303, 0.30394444444444446),
 (334, -0.6688333333333333),
 (338, -0.3383888888888889),
 (399, 0.3993333333333333),
 (442, -0.4424444444444445),
 (546, -2.787326923076923),
 (600, -6.361567307692307),
 (622, 7.8562115384615385),
 (627, 8.205788461538463),
 (640, 9.092769230769232),
 (652, 9.876826923076923),
 (664, 10.650740384615384),
 (690, 12.421942307692309),
 (699, 13.033942307692307),
 (715, 14.08689423076923),
 (752, -16.558798076923075),
 (923, 28.015846153846155),
 (924, 28.081105769230767),
 (928, 28.368182692307695)]