## MPContribs

* Project: contribute the DJ perovskite dataset to the Materials Project MPContribs platform
- Using environment: ML
- Related directory: struc_info(r), refine_set(r), organic_molecule/organic_molecule.csv(r),  ?(w)
- Input files: organic_genome, structure_info, Form_energy, PBEEnergyLevel, HSEEnergyLevel
- Output files: ./organic_molecule/*.xyz, ./organic_molecule/organic_molecule.csv


### Set up project info

In [1]:
import os
import json
import pandas as pd
import numpy as np

from mpcontribs.client import Client
from pymatgen.core import Structure
from pymatgen.io.cif import CifParser
from mp_api.client import MPRester

import sys
sys.path.append('../03-code/')

from config import PROJECT_ROOT_DIRECTORY

In [31]:
import os

# The MPContribs API key (obtained from the Materials Project dashboard page)
# must never be written into the notebook source. Use the value already present
# in the environment; prompt for it (without echo) only when it is not set.
# NOTE(review): the key that was previously committed in this cell should be
# revoked and regenerated on the dashboard.
if not os.environ.get("MPCONTRIBS_API_KEY"):
    from getpass import getpass

    os.environ["MPCONTRIBS_API_KEY"] = getpass("MPContribs API key: ")

In [32]:
# Name of the MPContribs project used by the cells below.
project_name = "dj_perovskite"

# Bind a client to that project, then list the metadata fields it exposes.
client = Client(project=project_name)
project_info = client.get_project(project_name)
project_info.keys()

dict_keys(['name', 'is_public', 'title', 'owner', 'is_approved', 'unique_identifiers', 'long_title', 'authors', 'description', 'references', 'license', 'columns', 'stats'])

In [None]:
#client.make_public()

{'published': True}

In [None]:
# Project-level metadata fields to overwrite on MPContribs.
scholar_reference = {
    "label": "scholar",
    "url": "https://scholar.google.com/citations?user=UMQWBRQAAAAJ&hl=en",
}
project_description = "A repository of structural and electronic properties for two-dimensional Dion-Jacobson perovskites, derived from density functional theory calculation and machine learning."
project_authors = "Yongxin Lyu, Yifan Zhou, Yu Zhang, Yang Yang, Bosen Zou, Qiang Weng, Tong Xie, Claudio Cazorla, Jianhua Hao, Jun Yin*, Tom Wu*"

update = {
    "unique_identifiers": True,
    "references": [scholar_reference],
    "description": project_description,
    # Currently disabled fields:
    #"is_approved": True,
    #"is_public": False,
    "title": "DJ Perovskite Repository",
    "long_title": "Dion-Jacobson Perovskite Repository",
    "authors": project_authors,
}

# Push the metadata to the project the client is bound to.
client.update_project(update)
# Alternative low-level call, kept for reference:
#client.projects.updateProjectByName(pk=project_name, project=update).result()

### Contribute data to the DJ Perovskite Database

### Initialize data columns

In [34]:
# DESTRUCTIVE: deletes the contributions currently stored in the project the
# client is bound to (the recorded run removed 126), so that the upload further
# down starts from an empty project. Do not re-run casually.
client.delete_contributions()

It took 0.1min to delete 126 contributions.


In [None]:
# Load dataframes from CSV files
organic_genome_dataframe = pd.read_csv(
    PROJECT_ROOT_DIRECTORY + '02-metadata/06-csv-files/01-organic-genome.csv', index_col='identifier'
)

organic_fingeprint_dataframe = pd.read_csv(
    PROJECT_ROOT_DIRECTORY + '02-metadata/06-csv-files/02-organic-fingerprints.csv', index_col='identifier'
)

mo_energetics_dataframe = pd.read_csv(
    PROJECT_ROOT_DIRECTORY + '02-metadata/06-csv-files/04-mo-energetics.csv', index_col='identifier'
)

structure_info_dataframe = pd.read_csv(
    PROJECT_ROOT_DIRECTORY + '02-metadata/06-csv-files/07-structure-info.csv', index_col='identifier'
)

hse_frontier_dataframe = pd.read_csv(
    PROJECT_ROOT_DIRECTORY + '02-metadata/06-csv-files/08-hse-frontier.csv', index_col='identifier'
)



In [3]:

# Synthesis-feasibility table. Unlike the other tables it is read with the
# default integer index (no 'identifier' index column is requested here).
synthesis_feasibility_path = PROJECT_ROOT_DIRECTORY + '02-metadata/06-csv-files/12-synthesis-feasibility.csv'
synthesis_feasibility_dataframe = pd.read_csv(synthesis_feasibility_path)

In [4]:
synthesis_feasibility_dataframe.head()

Unnamed: 0,smiles_canonical,ringcount,linkage_p,six_ring_p,primaryamine,linker_length,linker_position,hetero_nitrogen,fluorination,furan,...,formability_score,STEI,NumRot_tail,eccentricity,disNN,formability_decision,cid,PubChem_existence,synthesizability_decision,generation
0,[NH3+]Cc1ccc(C[NH3+])cc1,1,0.0,1.0,2,2,1.0,0,0,0,...,0.999963,1.245869,2.0,7.0,0.020408,True,68315.0,True,True,0
1,[NH3+]Cc1ccc(C[NH3+])s1,1,0.0,0.0,2,2,1.0,0,0,0,...,0.999911,1.242954,2.0,6.0,0.027778,True,12979461.0,True,True,1
2,[NH3+]Cc1csc(C[NH3+])c1,1,0.0,0.0,2,2,1.0,0,0,0,...,0.999911,1.242954,2.0,6.0,0.027778,True,84038569.0,True,True,1
3,[NH3+]Cc1ccc(-c2ccc(C[NH3+])cc2)cc1,2,1.0,1.0,2,2,1.0,0,0,0,...,0.999991,1.255814,2.0,11.0,0.008264,True,1394282.0,True,True,1
4,[NH3+]Cc1ccc(-c2csc(C[NH3+])c2)cc1,2,1.0,0.5,2,2,1.0,0,0,0,...,0.999989,1.257739,2.0,10.0,0.01,True,,False,False,1


In [16]:
# Keep only the SMILES join key plus the feasibility descriptors, then attach
# them to every organic-genome row (left join on the canonical SMILES).
# NOTE: this rebinds synthesis_feasibility_dataframe, so the cell expects the
# freshly loaded table as its input.
feasibility_columns = [
    'smiles_canonical',
    'STEI',
    'NumRot_tail',
    'eccentricity',
    'disNN',
    'formability_score',
    'cid',
]
synthesis_feasibility_dataframe = organic_genome_dataframe.merge(
    synthesis_feasibility_dataframe[feasibility_columns],
    on='smiles_canonical',
    how='left',
)

In [19]:
# Re-attach the 'identifier' labels: expose the genome index as a column, join
# on the canonical SMILES, then promote 'identifier' back to the index.
# 'generation' is dropped from the feasibility side so that only the genome's
# copy of that column remains after the join.
genome_with_identifier = organic_genome_dataframe.reset_index()
feasibility_without_generation = synthesis_feasibility_dataframe.drop(columns=['generation'])
synthesis_feasibility_dataframe = (
    genome_with_identifier
    .merge(feasibility_without_generation, on='smiles_canonical')
    .set_index('identifier')
)

In [20]:
synthesis_feasibility_dataframe

Unnamed: 0_level_0,smiles_canonical,generation,STEI,NumRot_tail,eccentricity,disNN,formability_score,cid
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,[NH3+]Cc1cc2sc(C[NH3+])cc2s1,2.0,1.255822,2.0,8.0,0.015625,0.999979,
2,[NH3+]Cc1cc2sc([NH3+])cc2s1,3.0,1.370869,1.5,7.0,0.020408,0.999855,
3,[NH3+]c1cc2sc([NH3+])cc2s1,4.0,1.367954,1.0,6.0,0.027778,0.985222,69289713.0
4,[NH3+]Cc1ccc(C[NH3+])s1,1.0,1.242954,2.0,6.0,0.027778,0.999911,12979461.0
5,[NH3+]Cc1ccc(C[NH3+])o1,2.0,1.242954,2.0,6.0,0.027778,0.999911,13561170.0
...,...,...,...,...,...,...,...,...
40605,[NH3+]Cc1ccc2nc(-c3cs[nH+]n3)[nH]c2c1,6.0,2.352702,1.0,9.0,0.012346,0.970471,79004987.0
40606,[NH3+]Cc1ccc2nc(-c3cn[nH+]s3)[nH]c2c1,6.0,2.352702,1.0,9.0,0.012346,0.970471,136279877.0
40607,Cc1c[nH+]nc2oc(C[NH3+])nc12,6.0,2.404991,1.0,6.0,0.027778,0.868909,82414954.0
40608,Cc1c[nH+]nc2sc(C(C)[NH3+])nc12,6.0,2.409620,1.0,6.0,0.027778,0.856921,82400975.0


In [21]:
# Inner-join the genome table with each property table in turn, matching rows
# on the shared 'identifier' index; only identifiers present in every table
# survive.
dataframe_all_data = organic_genome_dataframe
for property_table in (
    hse_frontier_dataframe,
    organic_fingeprint_dataframe,
    mo_energetics_dataframe,
):
    dataframe_all_data = pd.merge(
        dataframe_all_data,
        property_table,
        left_index=True,
        right_index=True,
        how='inner',
    )
dataframe_all_data

Unnamed: 0_level_0,smiles_canonical,generation,inorganic_cbm_gamma,inorganic_cbm_z,inorganic_vbm_gamma,inorganic_vbm_z,organic_LUMO,organic_HOMO,alignment_type,ringcount,...,linker_position,hetero_nitrogen,fluorination,furan,pyrrole,sidechain_on_linker,sidechain_on_backbone,HOMO,LUMO,HOMO_LUMO_gap
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,[NH3+]Cc1cc2sc(C[NH3+])cc2s1,2.0,2.618151,2.618151,-0.066349,-0.066349,3.901251,-0.772949,Ia,2,...,1.0,0,0,0,0,0,0,-12.6300,-7.88800,4.74200
4,[NH3+]Cc1ccc(C[NH3+])s1,1.0,2.583026,2.583026,0.006826,0.006826,4.142126,-1.545974,Ia,1,...,1.0,0,0,0,0,0,0,-14.2210,-8.49400,5.72700
5,[NH3+]Cc1ccc(C[NH3+])o1,2.0,2.419301,2.419301,0.184801,0.184801,4.525701,-1.222399,Ia,1,...,1.0,0,0,1,0,0,0,-14.1470,-8.37000,5.77700
6,[NH3+]Cc1ccc(C[NH3+])[nH]1,2.0,2.296376,2.296376,0.046376,0.046376,4.524576,-0.671524,Ia,1,...,1.0,0,0,0,1,0,0,-13.4490,-7.46300,5.98600
7,Cc1cc(C[NH3+])ccc1C[NH3+],1.0,2.588476,2.588476,-0.046924,-0.046924,4.209476,-1.704724,Ia,1,...,1.0,0,0,0,0,0,1,-13.7241,-8.10519,5.61891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34296,CC([NH3+])c1cn[nH+]cn1,4.0,2.671026,2.623126,-0.211774,-0.053574,1.813926,-4.157974,IIb,1,...,1.0,2,0,0,0,1,0,-17.4591,-12.52620,4.93290
34299,[NH3+]Cc1nn[nH+]cc1F,4.0,2.589251,2.596551,-0.182949,0.190351,1.707651,-4.296349,IIb,1,...,1.0,2,1,0,0,0,0,-17.6714,-12.65600,5.01540
34300,[NH3+]Cc1cc(F)[nH+]nn1,4.0,2.500526,2.441726,-0.115174,0.218726,1.490026,-4.347574,IIb,1,...,1.0,2,1,0,0,0,0,-17.8243,-12.79340,5.03090
34301,CC([NH3+])c1cc[nH+]nn1,4.0,2.646776,2.595976,-0.149524,-0.013324,2.043376,-3.812524,IIb,1,...,1.0,2,0,0,0,1,0,-17.2425,-12.39100,4.85150


In [22]:
# Add the feasibility descriptors to the combined table. 'generation' and
# 'smiles_canonical' already exist in dataframe_all_data, so they are removed
# from the right-hand side before the index-on-index inner join.
feasibility_descriptors = synthesis_feasibility_dataframe.drop(
    columns=['generation', 'smiles_canonical']
)
dataframe_all_data_2 = dataframe_all_data.merge(
    feasibility_descriptors,
    left_index=True,
    right_index=True,
    how='inner',
)

In [23]:
dataframe_all_data_2

Unnamed: 0_level_0,smiles_canonical,generation,inorganic_cbm_gamma,inorganic_cbm_z,inorganic_vbm_gamma,inorganic_vbm_z,organic_LUMO,organic_HOMO,alignment_type,ringcount,...,sidechain_on_backbone,HOMO,LUMO,HOMO_LUMO_gap,STEI,NumRot_tail,eccentricity,disNN,formability_score,cid
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,[NH3+]Cc1cc2sc(C[NH3+])cc2s1,2.0,2.618151,2.618151,-0.066349,-0.066349,3.901251,-0.772949,Ia,2,...,0,-12.6300,-7.88800,4.74200,1.255822,2.0,8.0,0.015625,0.999979,
4,[NH3+]Cc1ccc(C[NH3+])s1,1.0,2.583026,2.583026,0.006826,0.006826,4.142126,-1.545974,Ia,1,...,0,-14.2210,-8.49400,5.72700,1.242954,2.0,6.0,0.027778,0.999911,12979461.0
5,[NH3+]Cc1ccc(C[NH3+])o1,2.0,2.419301,2.419301,0.184801,0.184801,4.525701,-1.222399,Ia,1,...,0,-14.1470,-8.37000,5.77700,1.242954,2.0,6.0,0.027778,0.999911,13561170.0
6,[NH3+]Cc1ccc(C[NH3+])[nH]1,2.0,2.296376,2.296376,0.046376,0.046376,4.524576,-0.671524,Ia,1,...,0,-13.4490,-7.46300,5.98600,1.242954,2.0,6.0,0.027778,0.999911,17975041.0
7,Cc1cc(C[NH3+])ccc1C[NH3+],1.0,2.588476,2.588476,-0.046924,-0.046924,4.209476,-1.704724,Ia,1,...,1,-13.7241,-8.10519,5.61891,1.261494,2.0,7.0,0.020408,0.999963,23453398.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34296,CC([NH3+])c1cn[nH+]cn1,4.0,2.671026,2.623126,-0.211774,-0.053574,1.813926,-4.157974,IIb,1,...,0,-17.4591,-12.52620,4.93290,2.318662,1.0,5.0,0.040000,0.982395,82593607.0
34299,[NH3+]Cc1nn[nH+]cc1F,4.0,2.589251,2.596551,-0.182949,0.190351,1.707651,-4.296349,IIb,1,...,0,-17.6714,-12.65600,5.01540,2.347699,1.0,5.0,0.040000,0.973376,
34300,[NH3+]Cc1cc(F)[nH+]nn1,4.0,2.500526,2.441726,-0.115174,0.218726,1.490026,-4.347574,IIb,1,...,0,-17.8243,-12.79340,5.03090,2.435662,1.0,5.0,0.040000,0.808631,
34301,CC([NH3+])c1cc[nH+]nn1,4.0,2.646776,2.595976,-0.149524,-0.013324,2.043376,-3.812524,IIb,1,...,0,-17.2425,-12.39100,4.85150,2.318662,1.0,5.0,0.040000,0.982395,69776878.0


In [45]:
# Column layout registered with the project. Every leaf is None (no value is
# attached to any column here); the nested groups mirror the "data" section
# that get_contrib() builds for each contribution.
organic_spacer_fields = ["SMILES"]
molecular_fingerprint_fields = [
    "NoRing",
    "PercentageLinkage",
    "PercentageSixRing",
    "NoPrimaryAmine",
    "LinkerLength",
    "LinkerPositions",
    "NoNitrogenPyridine",
    "NoFluorine",
    "NoOxygenFuran",
    "NoNitrogenPyrrole",
    "NoSideChainLinker",
    "NoSideChainBackbone",
]
energy_level_fields = ["VBMg", "VBMz", "CBMg", "CBMz", "HOMO", "LUMO"]
# "cid" is currently disabled and therefore not listed.
synthesis_feasibility_fields = [
    "STEI",
    "NumRotTail",
    "Eccentricity",
    "DisNN",
    "FormabilityScore",
]

columns = {
    "identifier": None,
    "formula": None,
    "OrganicSpacer": dict.fromkeys(organic_spacer_fields),
    "MolecularFingerprint": dict.fromkeys(molecular_fingerprint_fields),
    "EnergyLevelAlignment": dict.fromkeys(energy_level_fields),
    "SynthesisFeasibility": dict.fromkeys(synthesis_feasibility_fields),
    "structures": None,
    "attachments": None,
}

client.init_columns(columns)

{'count': 1}

In [25]:
# Independent copy of the fully merged table; get_contrib() reads its values
# from this module-level frame, looked up by 'identifier' index label.
raw_data = dataframe_all_data_2.copy()

In [49]:
def get_contrib(identifier):  # identifier must be an integer
    """Assemble the MPContribs contribution dict for one perovskite entry.

    The structure is read from
    ``PROJECT_ROOT_DIRECTORY + '02-metadata/03-final-perovskite-cif/<id>.cif'``
    (``<id>`` is ``identifier`` zero-padded to five digits). The tabulated
    properties are read from the row of the module-level ``raw_data`` frame
    whose index label equals ``identifier``.

    Parameters
    ----------
    identifier : int
        Index label of the entry in ``raw_data``.

    Returns
    -------
    dict
        ``{"identifier": "dj-<id>", "formula": ..., "data": {...},
        "structures": [structure]}``, where ``data`` holds the
        ``OrganicSpacer``, ``MolecularFingerprint``, ``EnergyLevelAlignment``
        and ``SynthesisFeasibility`` groups.

    Raises
    ------
    KeyError
        If ``identifier`` is not an index label of ``raw_data``.
    ValueError
        If a column that must hold a count contains a missing value.
    """
    structure_file = PROJECT_ROOT_DIRECTORY + '02-metadata/03-final-perovskite-cif/' + str(identifier).zfill(5) + '.cif'
    structure = Structure.from_file(filename=structure_file)

    def lookup(column):
        # Scalar value of `column` for this entry.
        return raw_data.loc[identifier, column]

    def rounded(column):
        # Continuous quantity, reported with two decimals.
        return round(lookup(column), 2)

    def count(column):
        # Integer-valued quantity; fail with a message that names the culprit
        # instead of the bare "cannot convert float NaN to integer".
        value = lookup(column)
        if pd.isna(value):
            raise ValueError(
                "column {!r} is missing for identifier {}".format(column, identifier)
            )
        return int(value)

    def count_or_fraction(column):
        # NumRot_tail is usually integral but the table also contains
        # fractional values (e.g. 1.5); int() would silently truncate those.
        # Integral values stay ints, fractional ones keep two decimals.
        value = float(lookup(column))
        if not np.isfinite(value):
            raise ValueError(
                "column {!r} is missing or non-finite for identifier {}".format(column, identifier)
            )
        return int(value) if value.is_integer() else round(value, 2)

    OrganicSpacer = {
        "SMILES": lookup('smiles_canonical'),
    }

    MolecularFingerprint = {
        "NoRing": count('ringcount'),
        "PercentageLinkage": rounded('linkage_p'),
        "PercentageSixRing": rounded('six_ring_p'),
        "NoPrimaryAmine": count('primaryamine'),
        "LinkerLength": count('linker_length'),
        "LinkerPositions": rounded('linker_position'),
        "NoNitrogenPyridine": count('hetero_nitrogen'),
        "NoFluorine": count('fluorination'),
        "NoOxygenFuran": count('furan'),
        "NoNitrogenPyrrole": count('pyrrole'),
        "NoSideChainLinker": count('sidechain_on_linker'),
        "NoSideChainBackbone": count('sidechain_on_backbone'),
    }

    SynthesisFeasibility = {
        "STEI": rounded('STEI'),
        "NumRotTail": count_or_fraction('NumRot_tail'),
        "Eccentricity": rounded('eccentricity'),
        "DisNN": rounded('disNN'),
        "FormabilityScore": rounded('formability_score'),
        #"cid": raw_data.loc[identifier, 'cid'],
    }

    EnergyLevelAlignment = {
        "VBMg": rounded('inorganic_vbm_gamma'),
        "VBMz": rounded('inorganic_vbm_z'),
        "CBMg": rounded('inorganic_cbm_gamma'),
        "CBMz": rounded('inorganic_cbm_z'),
        "HOMO": rounded('organic_HOMO'),
        "LUMO": rounded('organic_LUMO'),
    }

    return {
        "identifier": 'dj-'+'{:05}'.format(identifier),
        "formula": structure.formula,
        "data": {
            "OrganicSpacer": OrganicSpacer,
            "MolecularFingerprint": MolecularFingerprint,
            "EnergyLevelAlignment": EnergyLevelAlignment,
            "SynthesisFeasibility": SynthesisFeasibility,
        },
        "structures": [structure],
    }

In [50]:
# Build one contribution per row of raw_data (in index order) and upload them
# in a single call. The client receives its own list object, so the
# `contributions` list kept in the notebook is not the one handed over.
identifier_list = raw_data.index.to_list()
contributions = [get_contrib(identifier) for identifier in identifier_list]

client.submit_contributions(list(contributions))

Prepare:   0%|          | 0/363 [00:00<?, ?it/s]

Submit: 100%|##########| 363/363 [00:46<00:00,  7.86it/s]

It took 1.4min to submit 363/363 contributions.


In [42]:
contributions

[{'identifier': 'dj-00001',
  'formula': 'H48 Pb4 C32 S8 I16 N8',
  'data': {'OrganicSpacer': {'smiles_canonical': '[NH3+]Cc1cc2sc(C[NH3+])cc2s1'},
   'MolecularFingerprint': {'no_ring': 2,
    'percentage_linkage': 0.0,
    'percentage_six_ring': 0.0,
    'no_primary_amine': 2,
    'linker_length': 2,
    'linker_positions': 1.0,
    'no_nitrogen_pyridine': 0,
    'no_fluorine': 0,
    'no_oxygen_furan': 0,
    'no_nitrogen_pyrrole': 0,
    'no_side_chain_linker': 0,
    'no_side_chain_backbone': 0},
   'EnergyLevelAlignment': {'VBMg': -0.07,
    'VBMz': -0.07,
    'CBMg': 2.62,
    'CBMz': 2.62,
    'HOMO': -0.77,
    'LUMO': 3.9},
   'SynthesisFeasibility': {'STEI': 1.26,
    'NumRot_tail': 2,
    'eccentricity': 8.0,
    'disNN': 0.02,
    'formability_score': 1.0,
    'cid': nan}},
  'structures': [Structure Summary
   Lattice
       abc : 12.30655697 12.094636120000002 14.73509557
    angles : 91.21603026000001 88.90032604 88.94805456999998
    volume : 2191.936485521268
        