# Dataset conversion and inspection

In [None]:
%matplotlib inline
import logging
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tips.io import load_ds
from glob import glob
from pathlib import Path
# Keep a handle on the 'tips' logger and set both it and the logger returned
# by tf.get_logger() to the INFO level.
logger = logging.getLogger('tips')
for _verbose_logger in (logger, tf.get_logger()):
    _verbose_logger.setLevel('INFO')

In [None]:
# Maps a trajectory directory name to the short label used in plot legends.
# A key is 'hoac-c1im-<composition>-<rho tag>'; its label is
# '<percentage>-<rho value rounded to two decimals>'.
# NOTE(review): the percentage presumably tracks the 'i' count in the
# composition tag (i0 -> 0%, i16 -> 50%, i32 -> 100%) -- confirm.
_composition_labels = {
    'a32b32i0': '0%',
    'a16b16i16': '50%',
    'a0b0i32': '100%',
}
_rho_labels = {
    'rho1.0753': '1.08',
    'rho1.1551': '1.16',
}

# The rho tag is the outer loop so that entries are inserted grouped by rho,
# with the compositions in 0% / 50% / 100% order inside each group.
key2name = {
    f'hoac-c1im-{composition}-{rho}': f'{percent}-{rho_label}'
    for rho, rho_label in _rho_labels.items()
    for composition, percent in _composition_labels.items()
}

## To get the PiNN formatted datasets from CP2K logs...

```bash
#!/bin/bash
# Collect the cp2k-md project path under every nvt-*/<init> directory.
# Expansions are quoted so paths containing whitespace or glob characters
# stay single array elements instead of being word-split.
projs=()
for f in nvt-*/*; do
    projs+=("$f/cp2k-md")
done
echo "${projs[@]}"
# Convert all collected CP2K projects into one shuffled PiNN-format dataset.
tips convert "${projs[@]}" -f cp2k -of pinn -o pils-40ps --shuffle
```

In [None]:
# Load one CP2K dataset per initial configuration.
# `inits` holds the unique <init> directory names found under any nvt-* run;
# `ds_all` maps each name to the dataset built from all of its cp2k-md
# projects.
inits = {Path(init).name for init in glob('../trajs/cp2k/nvt-*/*/')}
ds_all = {}
# glob() returns matches in arbitrary order and set iteration order is not
# stable between interpreter runs; sorting both makes the order of frames
# inside each dataset and the printed report reproducible.
for init in sorted(inits):
    projs = [
        Path(proj) / 'cp2k-md'
        for proj in sorted(glob(f'../trajs/cp2k/nvt-*/{init}/'))
    ]
    ds_all[init] = load_ds(projs, fmt='cp2k')
    # Show which 'spec' entries the loaded dataset's metadata declares.
    print(init, list(ds_all[init].meta['spec'].keys()))

In [None]:
# For every initial configuration, print the energy divided by 640 and the
# largest absolute force component of the last frame of its nvt-30-40ps run.
# NOTE(review): 640 is presumably the number of atoms per frame -- confirm.
for init in inits:
    last_run = load_ds(f'../trajs/cp2k/nvt-30-40ps/{init}/cp2k-md', fmt='cp2k')
    # Alternative kept for reference: export that last frame as extxyz.
    #last_run[-1:].convert(init, fmt='extxyz')
    datum = last_run[-1]
    scaled_energy = datum['energy']/640
    peak_force = np.abs(datum['force']).max()
    print(init, scaled_energy, peak_force)


In [None]:
# Per-dataset statistics, filled in sorted key order so the three lists stay
# aligned: legend label, array of energies, and one flat array holding every
# force component.
energy_stat, force_stat, labels = [], [], []

for key in sorted(ds_all):
    # Look the label up first so an unknown key fails before any data is read.
    labels.append(key2name[key])
    # Single pass over (at most) the first 5000 frames of the shuffled dataset.
    sampled = [
        (frame['energy'], frame['force'].ravel())
        for frame in ds_all[key].shuffle()[:5000]
    ]
    energy_stat.append(np.array([energy for energy, _ in sampled]))
    force_stat.append(np.concatenate([forces for _, forces in sampled]))

In [None]:
# Same statistics for the converted PiNN-format dataset: energies and
# flattened forces of (at most) its first 5000 entries, read in one pass.
_gen_frames = [
    (datum['energy'], datum['force'].ravel())
    for datum in load_ds('../datasets//pils-40ps.yml', fmt='pinn')[:5000]
]
e_stat = [energy for energy, _ in _gen_frames]
f_stat = [forces for _, forces in _gen_frames]
energy_stat_gen = np.array(e_stat)
force_stat_gen = np.concatenate(f_stat)

In [None]:
# Two stacked panels: energy distribution on top, force distribution below.
f, (ax1, ax2) = plt.subplots(2, 1, figsize=[4,6])

# Keyword sets shared by both panels: the per-dataset histograms are stacked
# and labelled; the converted dataset is overlaid as a dashed black outline.
_stacked_kw = dict(bins=100, stacked=True, label=labels, density=True)
_outline_kw = dict(bins=100, color='k', ls='--', histtype='step', fill=False, density=True)

# Energies are divided by 640 to match the per-atom axis label below.
# NOTE(review): 640 is presumably the number of atoms per frame -- confirm.
scaled_energies = [stat/640. for stat in energy_stat]
ax1.hist(scaled_energies, **_stacked_kw)
ax1.hist(energy_stat_gen/640., **_outline_kw)

ax2.hist(list(force_stat), **_stacked_kw)
ax2.hist(force_stat_gen, **_outline_kw)

ax1.legend()
ax1.set_xlabel('Pot. Ener. [eV/atom]')
# Only the force panel uses a logarithmic count axis.
#ax1.set_yscale('log')
ax2.set_yscale('log')