In [9]:
import numpy as np

In [10]:
d = np.load('data/qm9_eV.npz')

In [11]:
d

<numpy.lib.npyio.NpzFile at 0x7fe57d625190>

In [12]:
list(d.keys())

['R',
 'N',
 'Z',
 'id',
 'A',
 'B',
 'C',
 'mu',
 'alpha',
 'homo',
 'lumo',
 'gap',
 'r2',
 'zpve',
 'U0',
 'U',
 'H',
 'G',
 'Cv',
 'meta']

In [38]:
d['N'].mean()

18.02485649425595

In [18]:
np.unique(d['Z'])

array([1, 6, 7, 8, 9], dtype=int32)

In [15]:
d['R'].shape

(2358210, 3)

In [16]:
d['id']

array([     0,      1,      2, ..., 133882, 133883, 133884])

In [54]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm


def _transform_df(df:pd.DataFrame):
    d = df.to_numpy()
    f2index = np.array(d[:, 0])
    # 0: HOMO
    # 1: LUMO
    labels = d[:, 1:].astype(np.float)
    
    return labels, f2index

def load_data(pth, sheet):
    df = pd.read_excel(os.path.join(pth, 'labels.xlsx'), sheet)
    labels, f2index = _transform_df(df)

    element_mapping = {'H': 1, 'B': 5, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'Si': 14, 'P': 15, 'S': 16, 'Cl': 17, 'Se': 34, 'Br': 35}

    xyz_pth = os.path.join(pth, 'xyz')
    R = []
    N = []
    Z = []
    for index in tqdm(range(f2index.shape[0])):  
        with open(os.path.join(xyz_pth, f'{f2index[index]}.xyz'), 'r') as fp:
            txt = fp.readlines()
            z = np.zeros(len(txt[2:]), dtype=np.int)
            atm_xyz = np.zeros((len(txt[2:]), 3))
            for i, l in enumerate(txt[2:]):
                valid_str = [s for s in l.split(' ') if s != '']
                assert(len(valid_str) == 4)
                z[i] = element_mapping[valid_str[0]]
                for j in range(3):
                    # axis x, y, z
                    atm_xyz[i, j] = float(valid_str[j+1])
            R.append(atm_xyz)
            N.append(len(txt[2:]))
            Z.append(z)

    Rnp = np.vstack(R)
    Nnp = np.array(N)
    Znp = np.hstack(Z)
    homo, lumo = labels[:, 0], labels[:, 1]
    dataset = {'id': np.arange(Nnp.shape[0]), 'N': Nnp, 'Z': Znp, 'R': Rnp, 'homo': homo, 'lumo': lumo}
    return dataset

In [55]:
pth = '/mnt/exp/eri/data/homolumo'
test = load_data(pth, 'TestFinal')

train = load_data(pth, 'Dataset')


100%|██████████████████████████████████████████████████████████████| 13199/13199 [00:01<00:00, 7756.64it/s]
100%|██████████████████████████████████████████████████████████████| 98526/98526 [00:13<00:00, 7495.03it/s]


In [59]:
a1 = np.unique(test['Z'])
for a in a1:
    print(a, np.sum(test['Z'] == a)/test['Z'].shape[0])

1 0.3967708320095566
5 1.9062384831424978e-05
6 0.44768646189429273
7 0.04975917853829633
8 0.06512346071242486
9 0.009734524520581022
14 0.00027322751591709135
15 0.00028593577247137465
16 0.024853113734660075
17 0.004130183380142078
34 0.00037277552559231066
35 0.0009912440112340988


In [60]:
a1 = np.unique(train['Z'])
for a in a1:
    print(a, np.sum(train['Z'] == a)/train['Z'].shape[0])

1 0.3900602083558872
5 2.1651690913784633e-05
6 0.45395212755621805
7 0.04746772371332051
8 0.06331509594752519
9 0.011128691543904355
14 0.0004088838553333944
15 0.0002595427051844696
16 0.02899661067761465
17 0.0031797451207359355
34 0.0004405286343612335
35 0.0007691901990012464


In [62]:
import pickle

with open('data/dye_train.npz', 'wb') as fp:
    pickle.dump(train, fp)
    
with open('data/dye_test.npz', 'wb') as fp:
    pickle.dump(test, fp)

In [44]:
homo, lumo = labels[:, 0], labels[:, 1]
dataset['homo'] = homo
dataset['lumo'] = lumo

In [None]:
pd.read_excel(os.path.join(pth, 'labels.xlsx'), 'Dataset')

In [42]:
np.save('data/dye', dataset)

In [39]:
Nnp.mean()

36.563851166189636

In [61]:
test['homo'].shape

(13199,)

In [63]:
type(test['id'][0])

numpy.int64

In [64]:
from collections import iterable

ImportError: cannot import name 'iterable' from 'collections' (/home/wenshi/anaconda3/envs/tf/lib/python3.8/collections/__init__.py)

In [68]:
type(train['id'][0])

numpy.int64

In [74]:
for k, v in train.items():
    print(k, type(v), v.dtype, v.shape)


id <class 'numpy.ndarray'> int64 (98526,)
N <class 'numpy.ndarray'> int64 (98526,)
Z <class 'numpy.ndarray'> int64 (3602490,)
R <class 'numpy.ndarray'> float64 (3602490, 3)
homo <class 'numpy.ndarray'> float64 (98526,)
lumo <class 'numpy.ndarray'> float64 (98526,)


In [76]:
for k, v in d.items():
    print(k, type(v), v.dtype, v.shape)


R <class 'numpy.ndarray'> float64 (2358210, 3)
N <class 'numpy.ndarray'> int64 (130831,)
Z <class 'numpy.ndarray'> int32 (2358210,)
id <class 'numpy.ndarray'> int64 (130831,)
A <class 'numpy.ndarray'> float64 (130831,)
B <class 'numpy.ndarray'> float64 (130831,)
C <class 'numpy.ndarray'> float64 (130831,)
mu <class 'numpy.ndarray'> float64 (130831,)
alpha <class 'numpy.ndarray'> float64 (130831,)
homo <class 'numpy.ndarray'> float64 (130831,)
lumo <class 'numpy.ndarray'> float64 (130831,)
gap <class 'numpy.ndarray'> float64 (130831,)
r2 <class 'numpy.ndarray'> float64 (130831,)
zpve <class 'numpy.ndarray'> float64 (130831,)
U0 <class 'numpy.ndarray'> float64 (130831,)
U <class 'numpy.ndarray'> float64 (130831,)
H <class 'numpy.ndarray'> float64 (130831,)
G <class 'numpy.ndarray'> float64 (130831,)
Cv <class 'numpy.ndarray'> float64 (130831,)


ValueError: Object arrays cannot be loaded when allow_pickle=False

In [79]:
train['N'].max()

159

In [80]:
d['N'].max()

29

In [90]:
t = train['id'].shape[0]

In [92]:
t - 9852*2

78822