# Build Sample Data

This tutorial walks through how the sample data is built, step by step. It follows exactly the same procedure as `xenonpy.datatools.Preset#build`.


### dataset

We randomly selected 1,000 inorganic compounds from the [Materials Project](https://materialsproject.org) database for testing and benchmarking.
You can find all of their **MP ids** in `mp_ids.txt`.



### API key

Before starting, you need to create your own API key. See [The Materials API](https://materialsproject.org/open) to learn how to get one.

In [1]:
# your Materials Project API key; it is passed to `data_fetcher` below

api_key = ''

### import packages

In [2]:
from itertools import zip_longest
from pathlib import Path

from pymatgen import MPRester
from tqdm import tqdm

import pandas as pd
import numpy as np

### fetch function

In [3]:
def data_fetcher(api_key, mp_ids, chunk_size=10):
    """Fetch a fixed set of properties for the given Materials Project ids.

    The ids are queried in groups of ``chunk_size`` so that every request
    stays small; a progress bar advances once per group.

    Parameters
    ----------
    api_key : str
        API key handed to ``MPRester``.
    mp_ids : iterable of str
        Material ids to fetch. Falsy entries (``None``, ``''``) are skipped.
    chunk_size : int, optional
        Number of ids sent per query. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per fetched entry, indexed by ``material_id``, with the
        columns sorted alphabetically and ``unit_cell_formula`` renamed to
        ``composition``. When nothing is fetched, an empty frame with the
        same columns is returned.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    # split requests into fixed number groups
    # eg: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    def grouper(iterable, n, fillvalue=None):
        """Collect data into fixed-length chunks or blocks"""
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)

    # the following props will be fetched
    mp_props = [
        'band_gap',
        'density',
        'volume',
        'material_id',
        'pretty_formula',
        'elements',
        'efermi',
        'e_above_hull',
        'formation_energy_per_atom',
        'final_energy_per_atom',
        'unit_cell_formula',
        'structure',
    ]

    # strip the `None` padding that `grouper` adds to the last group and
    # drop groups that end up empty, so no pointless request is sent
    id_chunks = []
    for group in grouper(mp_ids, chunk_size):
        chunk_ids = [mp_id for mp_id in group if mp_id]
        if chunk_ids:
            id_chunks.append(chunk_ids)

    entries = []
    if id_chunks:
        with MPRester(api_key) as mpr:
            for chunk_ids in tqdm(id_chunks):
                chunk = mpr.query({"material_id": {"$in": chunk_ids}}, mp_props)
                entries.extend(chunk)

    if not entries:
        # an empty frame has no 'material_id' column to drop, so build the
        # expected (renamed, sorted) columns directly instead of failing
        columns = sorted(
            'composition' if prop == 'unit_cell_formula' else prop
            for prop in mp_props
            if prop != 'material_id'
        )
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(entries, index=[e['material_id'] for e in entries])
    df = df.drop('material_id', axis=1)
    df = df.rename(columns={'unit_cell_formula': 'composition'})
    df = df.reindex(columns=sorted(df.columns))

    return df

In [4]:
# read ids, one per line; `dtype=str` keeps ids of any length (a fixed 'S20'
# dtype would silently truncate them) and `ndmin=1` keeps the result
# iterable even when the file holds a single id
mp_ids = np.loadtxt('mp_ids.txt', dtype=str, ndmin=1).tolist()

# fetch data as pandas.DataFrame
df = data_fetcher(api_key, mp_ids)
df.head(5)

100%|██████████| 100/100 [01:06<00:00,  1.63it/s]


Unnamed: 0,band_gap,composition,density,e_above_hull,efermi,elements,final_energy_per_atom,formation_energy_per_atom,pretty_formula,structure,volume
mp-20866,0.1849,"{'Ge': 4.0, 'Rh': 4.0}",9.755532,0.039943,,"[Ge, Rh]",-6.496651,-0.506775,GeRh,"[[0.80283811 1.66009496 3.26577118] Ge, [1.660...",119.521991
mp-30759,0.0,"{'Li': 1.0, 'Mg': 2.0, 'Tl': 1.0}",5.02291,0.027278,4.641088,"[Li, Mg, Tl]",-1.970913,-0.099951,LiMg2Tl,"[[2.85976352 2.02215817 4.95325571] Li, [1.429...",85.932461
mp-3416,6.7145,"{'F': 12.0, 'Na': 6.0, 'Al': 2.0}",2.844098,0.0,-1.60253,"[Al, F, Na]",-5.035544,-3.414627,Na3AlF6,"[[3.6307153 1.31968004 3.40159567] F, [4.5775...",245.15029
mp-505412,2.0103,"{'K': 8.0, 'In': 8.0, 'S': 16.0}",3.080923,0.0,1.459451,"[In, K, S]",-3.971909,-1.274151,KInS2,"[[ 6.09352925 1.20936514 14.43416479] K, [ 6....",940.171218
mp-684652,6.8002,"{'Be': 3.0, 'F': 6.0}",1.246978,0.223526,-5.588439,"[Be, F]",-5.541302,-3.346732,BeF2,"[[3.76534884 1.64321 7.5707047 ] Be, [1.460...",187.798605
