In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import feather
import dscribe
from tqdm import tqdm
import ase

In [2]:
# Read the training table from its feather file, then keep only the first
# 1000 rows as a small working sample.
train = feather.read_dataframe('../data/input/train.feather')
train = train.head(1000)

In [3]:
# Preview the first rows of the training sample.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [4]:
# Read the per-atom structure table from its feather file, then keep only the
# first 10000 rows as a working sample.
# NOTE(review): a fixed row cut-off can end in the middle of a molecule, which
# would leave that molecule's remaining atoms out of any per-molecule
# computation — confirm this is acceptable.
structures = feather.read_dataframe('../data/input/structures.feather')
structures = structures.head(10000)

In [5]:
# Preview the first rows of the structure sample.
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [6]:
# An ase Atoms object takes a list of atomic symbols and a list of their positions.
# With that, the ACSF descriptor should be computable (not yet verified).

In [7]:
# Toy five-atom system (one C and four H) with hand-written coordinates.
a = ase.Atoms(
    symbols=['C', 'H', 'H', 'H', 'H'],
    positions=[[0, 0, 0], [0, 1, 1], [1, 0, 0], [1, 1, 0], [0, 0, 1]],
)

In [19]:
from dscribe.descriptors import SOAP

# Configure a SOAP descriptor for the five listed elements.
# Note that the object built here is SOAP, not ACSF.
soap = SOAP(
    species=["H", "O", 'N', 'C', 'F'],
    rcut=10.0,
    nmax=2,
    lmax=1,
)

In [20]:
# Evaluate the SOAP descriptor on the toy system `a`, then print the resulting
# array followed by its shape (one item per line).
acsf_des = soap.create(a)

print(acsf_des, acsf_des.shape, sep='\n')

[[1.5691590e+01 5.3696068e+01 1.8374603e+02 9.6476901e-01 5.0264993e+00
  2.6510244e+01 8.6807547e+00 1.4684865e+01 5.0251087e+01 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  4.8022857e+00 8.1238232e+00 1.3742727e+01 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000

In [10]:
# Group the atom rows by molecule so each molecule can be processed as a unit.
groups = structures.groupby(by='molecule_name')

In [17]:
# Compute one descriptor row per atom, molecule by molecule, and collect the
# rows into `acsf_descs` in the same row order as `structures`.
#
# NOTE(review): `acsf` is not defined in any cell visible in this notebook
# (only `soap` is). It presumably comes from a cell that was removed, so this
# cell fails on a fresh kernel until that descriptor is defined again — TODO
# restore its definition above.

# Kept only because the original cell created it; nothing ever appends to it.
molecule_names = []

descriptor_blocks = []  # one 2-D array per molecule (atoms x features)
source_labels = []      # matching `structures` row labels for each block
for _, molecule in groups:
    atoms_obj = ase.Atoms(molecule['atom'].values,
                          molecule[['x', 'y', 'z']].values)
    acsf_des = acsf.create(atoms_obj)
    descriptor_blocks.append(np.asarray(acsf_des))
    source_labels.append(molecule.index.values)

if descriptor_blocks:
    # Stack once instead of appending row by row, then sort by the original
    # row labels so the result lines up with `structures` regardless of the
    # order in which the groups were visited.
    acsf_descs = pd.DataFrame(
        np.vstack(descriptor_blocks),
        index=np.concatenate(source_labels),
    ).sort_index()
else:
    # No molecules at all: same empty frame the row-by-row version produced.
    acsf_descs = pd.DataFrame()

In [19]:
# Show (rows, columns) of the descriptor table.
acsf_descs.shape

(10000, 125)

In [20]:
# Preview up to the first 30 descriptor rows.
acsf_descs.head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115,116,117,118,119,120,121,122,123,124
0,3.883469,1.702629,3.576102,3.851579,1.341277e-10,0.34918,3.052128,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.770731,2.643446,2.757731,2.769428,5.24824e-08,0.468098,2.319362,0.970867,0.42566,0.894026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.770731,2.643447,2.757731,2.769428,5.24828e-08,0.468098,2.319362,0.970867,0.425659,0.894026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.770729,2.643455,2.75773,2.769426,5.248672e-08,0.468101,2.319362,0.970867,0.425655,0.894025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.770729,2.643455,2.75773,2.769426,5.248666e-08,0.468101,2.319362,0.970867,0.425656,0.894025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.92406,1.113009,2.654835,2.895952,4.821399e-11,0.244177,2.281173,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.873474,1.619862,1.846421,1.870751,8.623905e-09,0.27476,1.546245,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.873475,1.61986,1.846421,1.870751,8.623742e-09,0.274759,1.546245,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.87346,1.619963,1.84642,1.870739,8.630757e-09,0.27478,1.546246,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.954668,0.665643,1.755051,1.933725,1.855746e-11,0.154461,1.516517,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Renumber the rows from zero and give every column a positional name of the
# form acsf_<position>, then preview the result.
acsf_descs = acsf_descs.reset_index(drop=True)
acsf_descs.columns = [f'acsf_{i}' for i, _ in enumerate(acsf_descs.columns)]
acsf_descs.head()

Unnamed: 0,acsf_0,acsf_1,acsf_2,acsf_3,acsf_4,acsf_5,acsf_6,acsf_7,acsf_8,acsf_9,...,acsf_115,acsf_116,acsf_117,acsf_118,acsf_119,acsf_120,acsf_121,acsf_122,acsf_123,acsf_124
0,3.883469,1.702629,3.576102,3.851579,1.341277e-10,0.34918,3.052128,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.770731,2.643446,2.757731,2.769428,5.24824e-08,0.468098,2.319362,0.970867,0.42566,0.894026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.770731,2.643447,2.757731,2.769428,5.24828e-08,0.468098,2.319362,0.970867,0.425659,0.894026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.770729,2.643455,2.75773,2.769426,5.248672e-08,0.468101,2.319362,0.970867,0.425655,0.894025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.770729,2.643455,2.75773,2.769426,5.248666e-08,0.468101,2.319362,0.970867,0.425656,0.894025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Attach the per-atom descriptor columns to `structures`, side by side.
# concat with axis=1 matches rows by index label; a row-count mismatch would
# silently pad with NaN, so fail loudly instead.
assert len(structures) == len(acsf_descs), (
    f'row count mismatch: structures={len(structures)}, '
    f'acsf_descs={len(acsf_descs)}'
)
# Drop descriptor columns left over from an earlier run of this cell, so that
# re-running it does not add the same columns a second time.
structures = pd.concat(
    [structures.drop(columns=acsf_descs.columns, errors='ignore'), acsf_descs],
    axis=1,
)

In [23]:
# Show (rows, columns) of `structures` after the descriptor columns were attached.
structures.shape

(10000, 131)

In [24]:
# Preview the first rows of `structures` with the descriptor columns attached.
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,acsf_0,acsf_1,acsf_2,acsf_3,...,acsf_115,acsf_116,acsf_117,acsf_118,acsf_119,acsf_120,acsf_121,acsf_122,acsf_123,acsf_124
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,3.883469,1.702629,3.576102,3.851579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.770731,2.643446,2.757731,2.769428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.770731,2.643447,2.757731,2.769428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.770729,2.643455,2.75773,2.769426,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.770729,2.643455,2.75773,2.769426,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Look at the training sample again.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074
