In [1]:
import json
import nglview
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
from pymatgen.core import Structure

from pymatgen.core import Lattice


# Hook tqdm into pandas so that `progress_apply` is available on frames/series.
tqdm.pandas()

# Notebook-wide matplotlib defaults: resolution and figure size.
plt.rcParams.update({
    "figure.dpi": 70,
    "figure.figsize": (10, 5),
})


def read_json_structures(root: Path) -> pd.DataFrame:
    """Load every ``*.json`` file found directly in ``root`` as a structure.

    Each file is parsed with ``json.load`` and the resulting object is handed
    to ``Structure.from_dict``.

    Args:
        root: Directory that is searched (non-recursively) for ``*.json`` files.

    Returns:
        DataFrame with one row per file and two columns: ``_id`` (the file
        name without its ``.json`` suffix) and ``structure`` (whatever
        ``Structure.from_dict`` returned). Both columns exist even when no
        file matches. Rows follow the order produced by ``Path.glob``; they
        are not sorted.
    """
    records = []
    for path in tqdm(root.glob('*.json')):
        # `with` closes the handle immediately; a bare `open(path)` would leave
        # one descriptor per file open until garbage collection.
        with path.open(encoding='utf-8') as handle:
            payload = json.load(handle)
        records.append({
            # `path.stem` removes only the suffix. `str.strip('.json')` treats
            # its argument as a character set and would also eat leading or
            # trailing '.', 'j', 's', 'o', 'n' characters of the id itself.
            '_id': path.stem,
            'structure': Structure.from_dict(payload),
        })
    # Explicit columns keep the schema stable for an empty directory.
    return pd.DataFrame(records, columns=['_id', 'structure'])


root_public_path = Path('../data/dichalcogenides_public')
root_private_path = Path('../data/dichalcogenides_private')

# Only structures are read for the private set; the public set is additionally
# joined with the rows of `targets.csv`.
df_private = read_json_structures(root_private_path / 'structures')
df_public = read_json_structures(root_public_path / 'structures').merge(
    pd.read_csv(root_public_path / 'targets.csv'),
    on='_id',  # explicit join key instead of "whatever columns happen to overlap"
)

# Formula string of every structure; needed further below.
for frame in (df_public, df_private):
    frame['formula'] = frame['structure'].apply(lambda x: x.formula)


def show(x):
    """Return the nglview widget produced by ``nglview.show_pymatgen(x)``."""
    return nglview.show_pymatgen(x)



2967it [00:19, 155.01it/s]
2966it [00:18, 162.41it/s]


### Создание идеального материала

In [2]:
# Fractional-coordinate description of the three atomic layers of the ideal
# cell. Every layer is an 8 x 8 grid in (a, b) at a fixed height `c`.
# 'high' and 'low' use the same (a, b) grid; 'mid' uses the two axes swapped.
coords = {
    'high': dict(
        a=np.linspace(0.08333333, 0.95833333, 8, endpoint=True),
        b=np.linspace(0.04166667, 0.91666667, 8, endpoint=True),
        c=0.355174,
        element=['S'],
    ),
    'mid': dict(
        a=np.linspace(0.04166667, 0.91666667, 8, endpoint=True),
        b=np.linspace(0.08333333, 0.95833333, 8, endpoint=True),
        c=0.25,
        element=['Mo'],
    ),
    'low': dict(
        b=np.linspace(0.04166667, 0.91666667, 8, endpoint=True),
        a=np.linspace(0.08333333, 0.95833333, 8, endpoint=True),
        c=0.144826,
        element=['S'],
    ),
}

# Expand each layer's grid into explicit [a, b, c] triples, `a` varying slowest.
for layer in coords.values():
    layer['position'] = [
        [a_frac, b_frac, layer['c']]
        for a_frac in layer['a']
        for b_frac in layer['b']
    ]

# Lattice of the ideal cell: equal a/b edge lengths and a 120-degree gamma
# angle. NOTE(review): lengths are presumably in angstrom - confirm.
lat = Lattice.from_parameters(
    a=25.5225256,
    b=25.5225256,
    c=14.879004,
    alpha=90,
    beta=90,
    gamma=120,
)


In [7]:
# Layers are concatenated bottom-up; species and positions must stay aligned,
# so both lists are derived from the same layer order.
layer_order = ('low', 'mid', 'high')

# One species entry per generated position. Taking the count from the position
# list (instead of a hard-coded 64) keeps both lists the same length if the
# grid size in `coords` is ever changed.
elements = [
    species
    for layer in layer_order
    for species in coords[layer]['element'] * len(coords[layer]['position'])
]
positions = [
    position
    for layer in layer_order
    for position in coords[layer]['position']
]

# Positions are fractional coordinates of `lat`, not cartesian ones.
ideal = Structure(lat, elements, positions, coords_are_cartesian=False)

# Sites of the ideal structure as a set, for set differences against other
# structures.
ideal_set = set(ideal)


### Отличие структур от идеальной

In [11]:
def diff_ideal(df, atol=1e-5):
    """Attach to a row the sites in which its structure differs from the ideal one.

    Reads the module-level ``ideal_set`` (set of sites of the ideal structure).
    Site membership relies on the ``__hash__`` / ``__eq__`` of the site objects.

    The resulting ``diff`` contains:

    * every site of ``df['structure']`` that has no equal site in ``ideal_set``;
    * every site of ``ideal_set`` that is missing from the structure AND whose
      position is not occupied by one of the sites above (i.e. the ideal atom
      is kept as a marker where nothing replaced it).

    Args:
        df: One row (the function is applied with ``axis=1``) providing
            item access to ``'structure'``; the ``'diff'`` item is written
            into it.
        atol: Absolute tolerance on fractional coordinates when deciding
            whether two sites share a position. The default matches the
            5-decimal rounding used previously.

    Returns:
        The same ``df`` object with ``df['diff']`` set to
        ``Structure.from_sites(...)`` of the collected sites, or to ``None``
        when the structure does not differ from the ideal one.
    """
    structure_sites = set(df['structure'])  # built once, used for both differences

    # Ideal sites that have no equal site in the structure.
    missing_ideal_sites = tuple(ideal_set - structure_sites)
    # Structure sites that have no equal site in the ideal structure.
    defects = list(structure_sites - ideal_set)

    # Snapshot of the foreign sites' coordinates, taken before `defects` grows.
    foreign_coords = np.array([site.frac_coords for site in defects], dtype=float)

    for site in missing_ideal_sites:
        # Compare whole coordinate rows. A component-wise `np.isin` against the
        # flattened pool of all coordinates reports a match as soon as each
        # single value occurs *somewhere*, which hides genuinely empty positions.
        position_taken = foreign_coords.size > 0 and bool(
            np.isclose(foreign_coords, site.frac_coords, rtol=0.0, atol=atol)
            .all(axis=1)
            .any()
        )
        if not position_taken:
            defects.append(site)

    # An empty site list cannot be turned into a structure; record "no diff".
    df['diff'] = Structure.from_sites(defects) if defects else None
    return df


# Run the comparison on the first six public rows only, with a progress bar.
result = df_public.head(6).progress_apply(diff_ideal, axis=1)

100%|██████████| 6/6 [00:16<00:00,  2.77s/it]


In [12]:
# Display the rows processed by `diff_ideal`, including their `diff` column.
result

Unnamed: 0,_id,structure,band_gap,formula,diff
0,6141eb094e27a1844a5f03dc,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,1.1388,Mo63 W1 Se1 S126,"[[ 1.59515781 11.97254266 2.15486663] Se, [-7..."
1,6141d6544e27a1844a5f01ea,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,1.1381,Mo63 W1 Se1 S126,"[[11.16610482 17.49833154 5.28463537] Se, [19..."
2,61421dc931cf3ef3d4a9f318,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,0.3966,Mo63 Se1 S126,"[[20.73705192 6.44675377 5.28463537] Se, [ 6..."
3,6141d0dabaaf234b35290260,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,1.0837,Mo63 W1 S126,"[[7.97578938 4.604824 3.719751 ] W, [20.737..."
4,6143483831cf3ef3d4a9f722,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,0.3566,Mo63 Se1 S126,"[[11.16610482 0.92096489 2.15486663] Se, [9...."
5,6142db3c4e27a1844a5f0954,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,1.1474,Mo63 W1 Se1 S126,"[[ 4.78547342 17.49833154 5.28463537] Se, [17..."


In [29]:
# Pick one processed row at random. `example` and `diff` are length-1 Series
# holding that row's structure and its diff. Plain column selection replaces
# `DataFrame.iteritems()` (removed in pandas 2.0) and avoids rebinding `_`,
# which IPython uses for the last cell output.
sampled = result.sample(1)
example, diff = sampled['structure'], sampled['diff']

In [30]:
# Render the sampled structure as an nglview widget.
show(example.iloc[0])

NGLWidget()

In [32]:
# Render only the sites stored in the sampled row's `diff`.
show(diff.iloc[0])

NGLWidget()