# Inference of all algorithms for IDAO-22 Track 1
Team: NESCafe Gold 3in1

## ALIGNNs

In [None]:
from functools import reduce

from alignn.models.alignn import ALIGNN
from jarvis.core.atoms import Atoms
from glob import glob
import torch
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
from pathlib import Path

# Load every evaluation structure (`*.vasp` POSCAR files) together with its id.
# The id is the file name without the `.vasp` extension; `atoms_array[k]` and
# `ids_array[k]` always describe the same file.
atoms_array = []
ids_array = []
# sorted() makes the processing order reproducible (glob order is arbitrary).
for poscar_path in tqdm(sorted(glob("../data/eval/defects/jarvis/*.vasp"))):
    atoms_array.append(Atoms.from_poscar(poscar_path))
    # Path.stem replaces the manual '/' splitting and avoids rebinding the
    # builtin `id`, which the previous loop variable shadowed notebook-wide.
    ids_array.append(Path(poscar_path).stem)

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on a CPU-only host.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _load_alignn(checkpoint_path):
    """Build an ALIGNN model, restore its weights and switch it to eval mode.

    The checkpoint is expected to keep the state dict under the "model" key.
    `map_location` remaps the stored tensors onto `device`, so a checkpoint
    saved on a GPU can also be restored on a CPU-only machine.
    """
    network = ALIGNN()
    checkpoint = torch.load(checkpoint_path, map_location=device)
    network.load_state_dict(checkpoint["model"])
    network.to(device)
    network.eval()
    return network


# ALIGNN models restored from the `defects` checkpoints.
qm9_gap_alignn = _load_alignn('../models/ALIGNN/defects/qm9_gap_alignn/checkpoint_60.pt')
mp_gappbe_alignnn = _load_alignn('../models/ALIGNN/defects/mp_gappbe_alignnn/checkpoint_95.pt')
jv_optb88vdw_bandgap_alignn = _load_alignn('../models/ALIGNN/defects/jv_optb88vdw_bandgap_alignn/checkpoint_52.pt')
jv_mbj_bandgap_alignn = _load_alignn('../models/ALIGNN/defects/jv_mbj_bandgap_alignn/checkpoint_70.pt')

In [None]:
%%time
predict(atoms_array=atoms_array,
        idx_array=ids_array,
        model=qm9_gap_alignn,
        filename='../predictions/qm9_gap_alignn_pred_eval_no_defects.json',
        )

predict(atoms_array=atoms_array,
        idx_array=ids_array,
        model=mp_gappbe_alignnn,
        filename='../predictions/mp_gappbe_alignnn_pred_eval_no_defects.json',
        )

predict(atoms_array=atoms_array,
        idx_array=ids_array,
        model=jv_optb88vdw_bandgap_alignn,
        filename='../predictions/jv_optb88vdw_bandgap_alignn_pred_eval_no_defects.json',
        )

predict(atoms_array=atoms_array,
        idx_array=ids_array,
        model=jv_mbj_bandgap_alignn,
        filename='../predictions/jv_mbj_bandgap_alignn_pred_eval_no_defects.json',
        )

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on a CPU-only host.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _load_alignn(checkpoint_path):
    """Build an ALIGNN model, restore its weights and switch it to eval mode.

    Defined in this cell so that it can be run on its own. The checkpoint is
    expected to keep the state dict under the "model" key; `map_location`
    remaps the stored tensors onto `device`, so a checkpoint saved on a GPU
    can also be restored on a CPU-only machine.
    """
    network = ALIGNN()
    checkpoint = torch.load(checkpoint_path, map_location=device)
    network.load_state_dict(checkpoint["model"])
    network.to(device)
    network.eval()
    return network


# ALIGNN models restored from the `no_defects` checkpoints; these rebind the
# same variable names used for the `defects` models.
qm9_gap_alignn = _load_alignn('../models/ALIGNN/no_defects/qm9_gap_alignn/checkpoint_60.pt')
mp_gappbe_alignnn = _load_alignn('../models/ALIGNN/no_defects/mp_gappbe_alignnn/checkpoint_95.pt')
jv_optb88vdw_bandgap_alignn = _load_alignn('../models/ALIGNN/no_defects/jv_optb88vdw_bandgap_alignn/checkpoint_52.pt')
jv_mbj_bandgap_alignn = _load_alignn('../models/ALIGNN/no_defects/jv_mbj_bandgap_alignn/checkpoint_70.pt')

In [None]:
from pathlib import Path

# Combine the per-model prediction tables into one frame indexed by `id`,
# with one column per file (named after the file, without `.csv`).
# NOTE(review): the `predict` calls in this notebook write `.json` files while
# this cell reads `.csv`; the conversion step is not visible here.
prediction_paths = sorted(glob('../predictions/*.csv'))  # deterministic column order
if not prediction_paths:
    # reduce() over an empty list would otherwise fail with an opaque TypeError.
    raise FileNotFoundError("no prediction files match '../predictions/*.csv'")

preds_train = []
for path in prediction_paths:
    pred_col = Path(path).stem
    prediction = pd.read_csv(path, index_col=0)
    # Each file is expected to provide an `id` column and a `pred` column.
    prediction = prediction.set_index('id').rename(columns={'pred': pred_col})
    preds_train.append(prediction)

# Inner join on the `id` index: only ids present in every file are kept.
preds_train = reduce(lambda a, x: a.merge(x, left_index=True, right_index=True), preds_train)

In [None]:
!rm -rf ../predictions/*.csv
!rm -rf ../predictions/*.json

## MEGNet

In [None]:
from megnet.models import MEGNetModel
from scripts.utils import structures_to_df, read_json_structures
from pathlib import Path


# structures_to_df() returns a pair; only its second element is used here.
_, df_private = structures_to_df()

# Attach the evaluation structures read from the pymatgen directory, joining
# on `_id`. The earlier `.rename({'structure': 'diff'}, axis=0)` on the right
# frame relabelled index entries (axis=0), not columns, so it never renamed
# `structure`; it is dropped. The suffixes are spelled out because the code
# below relies on the right-hand `structure` column surfacing as `structure_y`.
df_private = df_private.merge(
    read_json_structures(Path('../data/eval/defects/pymatgen')),
    on=['_id'],
    suffixes=('_x', '_y'),
)

model = MEGNetModel()
model.load_weights('../models/megnet/defects/val_mae_00481_0.010377.hdf5')
# NOTE(review): if predict_structures returns a plain array, this assignment is
# positional; it presumably relies on df_private rows being in the same order
# as the preds_train index. Confirm.
preds_train['megnet_defects'] = model.predict_structures(df_private['structure_y'])

## CatBoost

In [None]:
# Names of the feature columns selected from `eval_dataframe` when building the
# CatBoost pool. The `defects_` / `no_defects_` / `graphs_` prefixes match the
# prefixes added to the CFID and graph tables when `eval_dataframe` is built;
# the `*_alignn*_pred_*` and `megnet_defects` entries are presumably the model
# prediction columns read from the eval predictions CSV (verify against it).
# NOTE(review): the order presumably has to match the one used when the
# CatBoost model was trained. Confirm before reordering.
super_good_features = ['megnet_defects', 'graphs_pagerank_min', 'graphs_adamic_adar_max',
                       'no_defects_adfa_81', 'defects_mean_chg_334',
                       'defects_mean_chg_194', 'defects_therm_cond_divi_first_ion_en',
                       'no_defects_mean_chg_83', 'no_defects_nn_43',
                       'defects_mean_chg_346', 'defects_mean_chg_178',
                       'no_defects_ddf_40', 'defects_hfus_add_X', 'defects_mean_chg_327',
                       'graphs_num_edges', 'defects_X_subs_bp',
                       'no_defects_first_ion_en_subs_therm_cond', 'graphs_pagerank_max',
                       'defects_mean_chg_343', 'no_defects_nn_94', 'defects_nsunfill',
                       'no_defects_voro_coord_divi_bp', 'no_defects_nn_54',
                       'defects_mean_chg_205', 'graphs_max_formula_band_gap',
                       'no_defects_adfb_81', 'no_defects_adfb_134',
                       'graphs_distance_mean', 'no_defects_mean_chg_314',
                       'no_defects_nn_30', 'defects_atom_mass_divi_atom_rad',
                       'no_defects_rdf_23', 'defects_mean_chg_273',
                       'defects_atom_rad_mult_X', 'defects_mp_divi_hfus',
                       'no_defects_mean_chg_3', 'no_defects_rdf_62',
                       'defects_mean_chg_158', 'no_defects_bp_subs_hfus',
                       'defects_mean_chg_203', 'defects_hfus_mult_mp',
                       'no_defects_cell_2', 'defects_mp_divi_therm_cond',
                       'no_defects_mean_chg_242', 'defects_mean_chg_130',
                       'no_defects_mean_chg_228', 'defects_cell_2', 'graphs_density',
                       'no_defects_adfb_130', 'graphs_global_efficiency',
                       'no_defects_rdf_30', 'no_defects_cell_0', 'no_defects_nn_50',
                       'no_defects_rdf_94', 'defects_mean_chg_369',
                       'defects_mean_chg_322', 'defects_nsvalence',
                       'defects_mean_chg_305', 'defects_mean_chg_230',
                       'no_defects_mp_divi_atom_mass', 'defects_mean_chg_268',
                       'no_defects_ddf_95', 'no_defects_adfb_89',
                       'graphs_degree_assortativity_coefficient',
                       'no_defects_mean_chg_19', 'defects_mean_chg_317',
                       'defects_mean_chg_61', 'defects_mean_chg_318', 'no_defects_ddf_26',
                       'defects_mean_chg_55', 'defects_C-13', 'no_defects_rdf_39',
                       'defects_mean_chg_288', 'graphs_min_formula_band_gap',
                       'defects_first_ion_en_subs_atom_rad', 'defects_rdf_23',
                       'defects_mean_chg_157', 'defects_mp_mult_voro_coord',
                       'graphs_mean_formula_band_gap', 'no_defects_elec_aff_subs_X',
                       'no_defects_mol_vol_subs_atom_rad', 'no_defects_rdf_92',
                       'defects_mean_chg_169', 'defects_nn_23', 'no_defects_mean_chg_271',
                       'defects_mean_chg_367', 'defects_polzbl_subs_voro_coord',
                       'no_defects_X_subs_atom_rad', 'graphs_num_sites',
                       'defects_hfus_subs_bp', 'defects_mean_chg_372', 'no_defects_nn_80',
                       'defects_adfb_110', 'no_defects_nn_23', 'defects_mean_chg_221',
                       'defects_nn_43', 'no_defects_mp_subs_X', 'no_defects_adfa_134',
                       'no_defects_rdf_97', 'no_defects_mean_chg_204',
                       'graphs_pagerank_mean', 'mp_gappbe_alignnn_pred_no_defects',
                       'graphs_adamic_adar_mean', 'no_defects_nn_97',
                       'defects_mean_chg_250', 'defects_polzbl_mult_therm_cond',
                       'defects_rdf_43', 'no_defects_polzbl_subs_X',
                       'defects_mean_chg_341', 'graphs_atomic_numbers_mean',
                       'no_defects_nn_83', 'defects_nn_50', 'defects_mean_chg_163',
                       'defects_voro_coord_divi_polzbl', 'defects_first_ion_en_add_bp',
                       'jv_mbj_bandgap_alignn_pred_defects', 'defects_polzbl_divi_hfus',
                       'no_defects_rdf_67', 'no_defects_mol_vol_add_bp',
                       'jv_mbj_bandgap_alignn_pred_no_defects', 'defects_mean_chg_74',
                       'defects_voro_coord_subs_therm_cond', 'defects_mean_chg_126',
                       'no_defects_mean_chg_73', 'defects_cell_3', 'defects_mean_chg_270',
                       'defects_bp_add_X', 'no_defects_mean_chg_185',
                       'no_defects_mean_chg_188', 'no_defects_nn_62',
                       'no_defects_mean_chg_317', 'defects_mp_subs_polzbl',
                       'no_defects_mol_vol_subs_voro_coord', 'no_defects_mol_vol_mult_X',
                       'defects_mean_chg_14', 'defects_mean_chg_370',
                       'defects_hfus_divi_polzbl', 'no_defects_mean_chg_258',
                       'defects_first_ion_en_mult_X', 'defects_therm_cond_divi_mol_vol',
                       'defects_mp_subs_atom_rad', 'no_defects_rdf_54',
                       'defects_mol_vol_divi_polzbl', 'defects_C-23',
                       'no_defects_mean_chg_119', 'defects_polzbl_divi_bp',
                       'no_defects_adfb_59', 'no_defects_mean_chg_216',
                       'defects_polzbl_subs_bp', 'qm9_gap_alignn_pred_no_defects',
                       'no_defects_mean_chg_374', 'no_defects_mean_chg_264',
                       'no_defects_rdf_43', 'defects_hfus_subs_first_ion_en',
                       'no_defects_C-25', 'defects_mol_vol_mult_therm_cond',
                       'no_defects_adfb_48', 'defects_rdf_80', 'defects_mean_chg_131',
                       'defects_mean_chg_149', 'no_defects_atom_mass',
                       'defects_mean_chg_290', 'no_defects_mean_chg_133',
                       'defects_hfus_mult_X', 'defects_adfa_86', 'defects_mean_chg_281',
                       'defects_C-10', 'defects_nn_39', 'defects_mol_vol_mult_bp',
                       'defects_elec_aff_subs_hfus', 'defects_mp_mult_atom_rad',
                       'defects_atom_rad_add_therm_cond', 'no_defects_mean_chg_89',
                       'defects_mean_chg_153', 'defects_mean_chg_185', 'defects_adfb_115',
                       'defects_elec_aff_mult_therm_cond',
                       'defects_first_ion_en_divi_therm_cond', 'defects_mean_chg_137',
                       'defects_mol_vol_mult_mp', 'defects_mean_chg_142',
                       'defects_mean_chg_144', 'no_defects_bp_add_mp', 'defects_cell_1',
                       'defects_mean_chg_118', 'defects_mean_chg_57',
                       'defects_mean_chg_347', 'defects_nn_54', 'defects_mean_chg_251',
                       'no_defects_bp_subs_therm_cond', 'defects_elec_aff_subs_mp',
                       'defects_adfb_72', 'no_defects_mean_chg_143',
                       'no_defects_mean_chg_58', 'defects_mean_chg_113',
                       'defects_first_ion_en_subs_hfus', 'no_defects_adfb_178',
                       'defects_voro_coord_subs_polzbl', 'defects_bp_subs_elec_aff',
                       'defects_adfa_122', 'defects_mol_vol_add_therm_cond',
                       'defects_adfb_137', 'defects_mean_chg_95',
                       'defects_first_ion_en_subs_voro_coord', 'defects_adfb_76',
                       'defects_polzbl_subs_first_ion_en', 'defects_bp_divi_polzbl',
                       'defects_mean_chg_134', 'defects_adfa_76', 'defects_mol_vol_add_X',
                       'defects_mp_subs_atom_mass', 'defects_bp_subs_atom_mass',
                       'defects_elec_aff_add_atom_rad', 'defects_rdf_54',
                       'defects_adfb_122', 'defects_adfa_137', 'defects_polzbl_add_X',
                       'no_defects_bp_subs_atom_rad', 'defects_mean_chg_223',
                       'defects_mean_chg_241', 'defects_mean_chg_350',
                       'defects_mean_chg_298', 'defects_mean_chg_133',
                       'defects_mean_chg_193', 'defects_mean_chg_287',
                       'defects_mean_chg_101', 'defects_mean_chg_293',
                       'defects_mean_chg_85', 'defects_mean_chg_172',
                       'defects_voro_coord_subs_hfus']

In [None]:
from functools import reduce
import pandas as pd

from catboost import CatBoostRegressor, Pool

def _read_prefixed(csv_path, prefix):
    """Read a CSV (first column as the index) and prepend `prefix` to every column name."""
    return pd.read_csv(csv_path, index_col=0).add_prefix(prefix)


# Per-source feature tables; the prefix records which table a column came from.
eval_defects_cfid = _read_prefixed('../input/idao-22/data/eval/defects/cfid/eval.csv', 'defects_')
eval_no_defects_graphs = _read_prefixed('../input/idao-22/data/eval/no_defects/graph/eval.csv', 'graphs_')
eval_no_defects_cfid = _read_prefixed('../input/idao-22/data/eval/no_defects/cfid/eval.csv', 'no_defects_')

# Index-on-index inner joins, applied left to right over the three tables.
eval_dataframe = (
    eval_defects_cfid
    .merge(eval_no_defects_graphs, left_index=True, right_index=True)
    .merge(eval_no_defects_cfid, left_index=True, right_index=True)
)

eval_preds = pd.read_csv('../input/idao-22/predictions/eval_predictions.csv', index_col=0)

# Strip the `_eval` marker from the prediction column names before joining them in.
eval_preds.columns = eval_preds.columns.str.replace('_eval', '')
eval_dataframe = eval_dataframe.merge(eval_preds, right_index=True, left_index=True)

In [None]:
# Restore the trained CatBoost regressor; the value returned by load_model is
# what gets used for prediction below.
model = CatBoostRegressor().load_model('../models/CatBoost/boosting.cbm')

# Only the selected feature columns are passed to the model.
# NOTE(review): `has_header` looks like an option for file-based pools and is
# probably ignored for a DataFrame. Confirm against the CatBoost Pool docs.
test_pool = Pool(
    data=eval_dataframe[super_good_features],
    has_header=True,
)

preds = model.predict(test_pool)

# Build the two-column submission straight from the index. The earlier
# reset_index() + rename({'index': 'id'}) only produced an `id` column when the
# index was unnamed (or already named `id`), and copied the whole feature frame
# just to keep two columns.
result = pd.DataFrame({'id': eval_dataframe.index, 'predictions': preds})
result.to_csv('../predictions/final_answer.csv', index=False)

Yay!