In [1]:
from pymatgen import MPRester
import pandas as pd
import os
import ast

In [86]:
def dist(p1, p2):
    "Returns the distance between two 3D points"
    return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2+(p1[2]-p2[2])**2)**(1/2)

def carlos(dic):
    """Maybe one day Carlos will use this input. But nobody knows fo sure..."""
    if not (type(dic) is dict):
        dic = ast.literal_eval(dic)
    string = ''
    for e in dic.keys():
        string += f'{e}_{str(int(dic[e]))}_'
   
    return string[:-1]

def bs_prop(dictionary, bs):
    """ This function takes the original dictionary and add band gap features from the band sctructure dictionary (bs)"""
   
    dictionary['direct'] = bs['band_gap']['direct']
   
    dictionary['vbm_energy'] = bs['vbm']['energy']
    dictionary['cbm_energy'] = bs['cbm']['energy']
    dictionary['band_gap_bs'] = dictionary['cbm_energy'] - dictionary['vbm_energy']
   
    dictionary['vb_index_list'] = bs['vbm']['band_index']['1']
    dictionary['vb_index'] = dictionary['vb_index_list'][-1]
    dictionary['vbm_kpoint_index'] = bs['vbm']['kpoint_index'][0]
    dictionary['vbm_kpoint'] = bs['kpoints'][dictionary['vbm_kpoint_index']]
   
    dictionary['cb_index_list'] = bs['cbm']['band_index']['1']
    dictionary['cb_index'] = dictionary['cb_index_list'][0]
    dictionary['cbm_kpoint_index'] = bs['cbm']['kpoint_index'][0]
    dictionary['cbm_kpoint'] = bs['kpoints'][dictionary['cbm_kpoint_index']]
   
    dictionary['dist'] = dist(dictionary['vbm_kpoint'], dictionary['cbm_kpoint'])
    dictionary['carlos_input'] = carlos(dictionary['unit_cell_formula'])


def bs_error1(dictionary):
    """ This function assigns 'None' to the band gap features if an error occurs"""
   
    dictionary['direct'] = None
   
    dictionary['vbm_energy'] = None
    dictionary['cbm_energy'] = None
    dictionary['band_gap_bs'] = None
   
    dictionary['vb_index_list'] = None
    dictionary['vb_index'] = None
    dictionary['vbm_kpoint_index'] = None
    dictionary['vbm_kpoint'] = None
   
    dictionary['cb_index_list'] = None
    dictionary['cb_index'] = None
    dictionary['cbm_kpoint_index'] = None
    dictionary['cbm_kpoint'] = None
   
    dictionary['dist'] = None
    dictionary['carlos_input'] = None

In [1]:
# A query must have a criteria and a set of properties we want.
# List of fields we can query https://github.com/materialsproject/mapidoc/tree/master/materials
# List of properties (access using mpr.supported_properties method)

crit = {'magnetic_type': 'NM',
        'band_gap': {'$gt': 0},
        'has': {'$all': ['bandstructure']}, 'nsites': {'$lte': 30}}

prop = ['material_id', 'pretty_formula', 'elements', 'nelements', 'nsites', 'energy',
        'energy_per_atom', 'formation_energy_per_atom', 'e_above_hull', 'volume', 'density',
        'unit_cell_formula', 'spacegroup','is_hubbard', 'hubbards',
        'band_gap', 'icsd_ids', 'cif']

# Querying and saving the results ('results' is a list of dictionaries with keys from 'prop')
with MPRester(api_key='40biQ81yTxRukLV3') as mpr:
    results = mpr.query(criteria=crit,properties=prop)
   
i=1
update_int = 20

for r in results:
    m_id = r['material_id']
    print(f'{i}/{len(results)}: {m_id}')
   
    try:
        # Getting the bandstructure
        bs = mpr.get_bandstructure_by_material_id(m_id).as_dict()
        bs_prop(r, bs)

    except:
        # If we get an error, bs_error1 assigns 'None' to all the band gap features
        bs_error1(r)
        continue
   
    # just saving every update_int iteration (temp_mp_dataset.csv)
    if i == 1:
        temp_df = pd.DataFrame(columns=r.keys())
        temp_df.to_csv('temp_mp_dataset.csv')
   
    if (i % update_int) == 0:
        temp_df = pd.DataFrame(results[i-update_int:i+1])
        temp_df.to_csv('temp_mp_dataset.csv', mode='a', header= False)
        print('update', i)
   
    i += 1

df = pd.DataFrame(results)    

df['spacegroup'] = df.apply(lambda x: x['spacegroup']['number'], axis=1)

# Saving our final dataset
df.to_csv('mp_dataset.csv')

HBox(children=(IntProgress(value=0, max=11810), HTML(value='')))

1/11810: mp-1019740
2/11810: mp-10281
3/11810: mp-1287
4/11810: mp-13926
5/11810: mp-14037
6/11810: mp-16136
7/11810: mp-165
8/11810: mp-20337
9/11810: mp-21892
10/11810: mp-22375
11/11810: mp-2245
12/11810: mp-22894
13/11810: mp-22898
14/11810: mp-23037
15/11810: mp-23173
16/11810: mp-23326
17/11810: mp-23879
18/11810: mp-23885
19/11810: mp-2412
20/11810: mp-24181
update 20
21/11810: mp-2542
22/11810: mp-28020
23/11810: mp-28240
24/11810: mp-28369
25/11810: mp-28944
26/11810: mp-29196
27/11810: mp-29331
28/11810: mp-29754
29/11810: mp-29761
30/11810: mp-2979
31/11810: mp-29946
32/11810: mp-30530
33/11810: mp-3098
34/11810: mp-31235
35/11810: mp-32684
36/11810: mp-3277
37/11810: mp-33449
38/11810: mp-33746
39/11810: mp-34081
40/11810: mp-34467
update 40
41/11810: mp-35035
42/11810: mp-35907
43/11810: mp-4731
44/11810: mp-510032
45/11810: mp-540635
46/11810: mp-540787
47/11810: mp-541097
48/11810: mp-541937
49/11810: mp-542136
50/11810: mp-556880
51/11810: mp-559252
52/11810: mp-559893


In [87]:
df = pd.read_csv('mp_dataset.csv')

In [88]:
df.set_index('material_id', inplace=True)

In [90]:
# The 'lista' list has all the material-ids with empty band_gap variables 
# This might have happened due to a temporary server error when requesting the bandstructure (or the entry simply was empty for whatever reason)
lista = df[pd.isnull(df['band_gap_bs'])].index

# We are going to try again to request the bandstructures with empty fields 
mpr = MPRester(api_key='40biQ81yTxRukLV3')

for m_id in lista:
    try:
        print(m_id)
        bs = mpr.get_bandstructure_by_material_id(m_id).as_dict()
        # Just in case there's spin_polarized guys... We don't want them
        if bs['is_spin_polarized']:
            print('spin_pol')
            continue
        
        # Filling in the missing fields for band gap features
        df.at[m_id, 'direct'] = bs['band_gap']['direct']
   
        df.at[m_id, 'vbm_energy'] = bs['vbm']['energy']
        df.at[m_id, 'cbm_energy'] = bs['cbm']['energy']
        df.at[m_id, 'band_gap_bs'] = df.at[m_id, 'cbm_energy'] - df.at[m_id, 'vbm_energy']

        df.at[m_id, 'vb_index_list'] = bs['vbm']['band_index']['1']
        df.at[m_id, 'vb_index'] = df.at[m_id, 'vb_index_list'][-1]
        df.at[m_id, 'vbm_kpoint_index'] = bs['vbm']['kpoint_index'][0]
        df.at[m_id, 'vbm_kpoint'] = bs['kpoints'][int(df.at[m_id, 'vbm_kpoint_index'])]
   
        df.at[m_id, 'cb_index_list'] = bs['cbm']['band_index']['1']
        df.at[m_id, 'cb_index'] = df.at[m_id, 'cb_index_list'][0]
        df.at[m_id, 'cbm_kpoint_index'] = bs['cbm']['kpoint_index'][0]
        df.at[m_id, 'cbm_kpoint'] = bs['kpoints'][int(df.at[m_id, 'cbm_kpoint_index'])]
   
        df.at[m_id, 'dist'] = dist(df.at[m_id, 'vbm_kpoint'], df.at[m_id, 'cbm_kpoint'])
        df.at[m_id, 'carlos_input'] = carlos(df.at[m_id, 'unit_cell_formula'])
        print('done')
        
    except Exception as error:
        print(error)

mp-684904
'NoneType' object has no attribute 'as_dict'
mp-771484
spin_pol
mvc-12752
'NoneType' object has no attribute 'as_dict'
mp-29510
'NoneType' object has no attribute 'as_dict'
mp-636827
'NoneType' object has no attribute 'as_dict'
mp-675189
done
mp-675376
'NoneType' object has no attribute 'as_dict'
mp-675769
spin_pol
mp-675832
'NoneType' object has no attribute 'as_dict'
mp-676
'NoneType' object has no attribute 'as_dict'
mp-676489
spin_pol
mp-684897
'NoneType' object has no attribute 'as_dict'
mp-771556
spin_pol
mp-8845
spin_pol
mp-978114
done
mp-9807
'NoneType' object has no attribute 'as_dict'
mvc-11360
'NoneType' object has no attribute 'as_dict'
mvc-14312
'NoneType' object has no attribute 'as_dict'
mvc-5096
'NoneType' object has no attribute 'as_dict'
mp-567318
'NoneType' object has no attribute 'as_dict'
mp-568286
'NoneType' object has no attribute 'as_dict'
mp-616541
'NoneType' object has no attribute 'as_dict'
mp-651311
done
mp-676284
'NoneType' object has no attribute

In [99]:
# Number of entries with empty band gap features
df[pd.isnull(df['band_gap_bs'])].shape

(130, 32)

In [106]:
# Filtering for the guys with all band gap features
final_df = df[~pd.isnull(df['band_gap_bs'])]

final_df.shape

In [107]:
# Saving to the final dataset
final_df.to_csv('mp_dataset2.csv')

In [110]:
#Number of indirect band gap entries
final_df[final_df['direct'] == False].shape

(8581, 32)

In [9]:
df = pd.read_csv('mp_dataset2.csv')