In [None]:
# WARNING: destructive reset. Recursively deletes everything under SubmitJobs/
# and every path in the working directory whose name ends in "tinydb"
# (presumably including the project database, db.tinydb -- confirm before running).
!rm -rf SubmitJobs/* *tinydb

In [1]:
# Python stdlib imports
import copy
import os
import sys

# Module imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import qgrid
from pymatgen import Structure

# DFTman imports
sys.path.append('../../lib/')
import dftmanlib
from dftmanlib.pwscf import pwcalculation_helper, pseudo_helper, pseudo_table, PWOutput
from dftmanlib.job import SubmitJob, submitjob_statuses, submit_status
from dftmanlib.matproj import mpquery_helper
from dftmanlib.db import load_db

# Database imports
from tinydb import Query

# Jupyter setup
# NOTE(review): qgrid.enable() presumably switches DataFrame rendering to
#     interactive qgrid tables -- confirm against the qgrid documentation
qgrid.enable()
# Select matplotlib's interactive notebook backend for figures
%matplotlib notebook

# Load database
# load_db() comes from dftmanlib.db; the 'SubmitJob' table is the one the
#     later cells read SubmitJob documents from
db = load_db()
table = db.table('SubmitJob')

# Pseudopotential constants
# PSEUDO_TABLE = '/data/tools/shared/dftman/pseudo_table.json'
# available pseudo families:
# ['SSSP_EFFICIENCY', 'SSSP_PRECISION', 'GBRV_LDA_US',
#  'GBRV_PBE_US', 'GBRV_PBEsol_US', 'DOJO_STANDARD_LDA_NC',
#  'DOJO_STANDARD_PBE_NC', 'DOJO_STANDARD_PBEsol_NC',
#  'DOJO_STRINGENT_LDA_NC', 'DOJO_STRINGENT_PBE_NC',
#  'DOJO_STRINGENT_PBEsol_NC',]
# Notebook-wide pseudopotential family; pick one of the names listed above
PSEUDO_FAMILY = 'GBRV_PBE_US'

# Materials Project API Key
# Read the key from the MP_API_KEY environment variable so that the secret is
#     never stored in (or committed with) the notebook.
# NOTE(review): a literal key used to be hardcoded here; it should be
#     regenerated on the Materials Project dashboard since it has been shared.
MP_API_KEY = os.environ.get('MP_API_KEY', '')
if not MP_API_KEY:
    # Warn instead of raising so cells that do not query the Materials
    #     Project (e.g. loading finished jobs from the database) still run
    print('WARNING: MP_API_KEY is not set; Materials Project queries will '
          'fail. Export MP_API_KEY before starting Jupyter.')

### Materials Project Query
The first step in any ab-initio calculation is to define the structure which should be simulated.
DFTman uses pymatgen `Structure` objects to handle crystal structures, so there are a few convenient ways to load structures into the programming environment:
* Manually input structure parameters by calling `pymatgen.Structure`
* Load structure from file using `pymatgen.io`
* Query for structures using the Materials Project API

Querying the Materials Project (MP) is often the simplest and most powerful way to load structures.
This process is further simplified by included functions in DFTmanLib: `MPQuery` and `mpquery_helper`.
Queries have 3 inputs:
* Query criteria, which specify properties such as chemistry and crystal structure
* Query parameters which define what information the query returns
* API key to authenticate with the Materials Project

The available criteria and properties are described on the [Materials Project website](https://materialsproject.org/docs/api) and in their [Github repository](https://github.com/materialsproject/mapidoc/tree/master/materials/elasticity).

In [None]:
# Define the criteria for retrieving FCC Aluminum
criteria = {
    'elements': 'Al',  # Aluminum in elements (with nsites == 1 this means Al only)
    'nsites': 1,  # one site per cell
    'spacegroup.number': 225,  # FCC International space group
}
# List properties to return
# NOTE(review): a later cell reads the `material_id` and `structure` columns of
#     mp_query.df, which are not requested here -- presumably mpquery_helper
#     adds them; verify against dftmanlib.matproj
properties = ['band_gap', 'elasticity', 'spacegroup.symbol']
# Create an MPQuery object using mpquery_helper
mp_query = mpquery_helper(criteria, properties, MP_API_KEY)
mp_query.query()  # Run the query
mp_query.display()  # Display query results in a pretty table

### Configure Calculation
After loading an input structure into the programming environment, other calculation parameters must be defined before running a calculation.
The available input parameters for the DFT package used here, Quantum Espresso PWscf, are listed in the [pw.x input documentation](https://www.quantum-espresso.org/Doc/INPUT_PW.html).

DFTmanLib uses `dftmanlib.pwscf` for interacting with pw.x's inputs and outputs.
Inputs are handled by `dftmanlib.pwscf.PWInput`, and outputs by `dftmanlib.pwscf.PWOutput`.
Entire calculations are represented by the object `dftmanlib.pwscf.PWCalculation`, which is a convenient wrapper around a pair of `PWInput` and `PWOutput` objects.
There are a lot of unnecessarily complicated details about directly configuring `PWInput`, `PWOutput`, and `PWCalculation` objects, so some nice helper functions are provided, similar to `mpquery_helper` for `MPQuery` objects.
These are `dftmanlib.pwscf.pwinput_helper` and `dftmanlib.pwscf.pwcalculation_helper`. **Almost always, `pwcalculation_helper` will be most convenient.**

Aside from calculation parameters, pseudopotentials are another critical input to DFT calculations. They are selected using `dftmanlib.pwscf.pseudo_helper` by choosing a pseudopotential family from the collection provided by DFTman, which includes [Standard Solid State Pseudopotentials (SSSP)](https://www.materialscloud.org/discover/sssp/table/efficiency), [Garrity-Bennett-Rabe-Vanderbilt (GBRV)](https://www.physics.rutgers.edu/gbrv/), and [PseudoDojo (DOJO)](http://www.pseudo-dojo.org/) potentials. Together these cover ultrasoft and norm-conserving pseudopotentials with LDA and GGA exchange-correlation functionals.

In [None]:
# Load Al structure from the MPQuery
mp_id = 'mp-134'
# Collect every structure in the query results whose material_id matches
matching_structures = mp_query.df[mp_query.df.material_id == mp_id].structure.tolist()
if not matching_structures:
    # Same exception type as the bare [0] lookup, but with an actionable message
    raise IndexError('No structure with material_id {!r} in the query results; '
                     're-run the query cell or change mp_id'.format(mp_id))
structure = matching_structures[0]
# Lattice parameter `a` of the Materials Project structure
mp_alat = structure['lattice']['a']  # We will use this later!
# Declare a pseudopotential family
# Reuse the PSEUDO_FAMILY constant from the setup cell so the family is
#     configured in exactly one place
pseudo_family = PSEUDO_FAMILY
# Create pseudopotential dictionary
pseudo = pseudo_helper(structure, pseudo_family)

# Declare pw.x inputs
pw_inputs = dict(
    # DFTman-specific entries: pw.x itself does not take the structure
    #     and pseudopotentials in this form, but this is how DFTman
    #     expects to receive them
    structure=structure,
    pseudo=pseudo,

    # Everything below mirrors the traditional pw.x input sections
    control=dict(
        calculation='vc-relax',
        verbosity='high',
        disk_io='none',
    ),
    system=dict(
        ibrav=0,
        ecutwfc=45.0,
        occupations='smearing',
        degauss=0.025,
        smearing='mv',
    ),
    electrons=dict(
        electron_maxstep=500,
        conv_thr=1.0e-8,
    ),
    ions={},  # no ions settings given
    cell={},  # no cell settings given
    kpoints_mode='automatic',
    kpoints_grid=(20, 20, 20),
    kpoints_shift=(0, 0, 0),
)

# Create a PWCalculation
# nanoHUB requires `additional_inputs`; for Quantum Espresso pw.x
#     calculations it should _always_ be list(pseudo.values())
pw_calculation = pwcalculation_helper(
    **pw_inputs,
    additional_inputs=list(pseudo.values()),
)
display(pw_calculation)

### Create, Store, and Run using nanoHUB Submit
After loading an input structure, finding appropriate pseudopotentials, and configuring calculation parameters, all the necessary components are present to run a DFT calculation. The last step in this process is to create a `Job`, store it in the database, and run it. DFTman provides the following types of `Job`:
* `SubmitJob` is used on nanoHUB for remotely submitting to the nanoHUB cluster
* `PBSJob` can be used on any cluster running Torque to submit to the cluster
* `LocalJob` runs the calculation locally, e.g. in a nanoHUB workspace, on a cluster workspace, or on an actually local workstation

`Jobs` generally require the following inputs:
* A `Calculation` object which has the necessary properties described in `dftmanlib.base`, e.g. `PWCalculation`
* Which program to use. This is implemented differently in different types of `Job`.
* Resource parameters describing how many processors, nodes, and which queue to run on. These also vary slightly depending on the type of `Job`

The best way to figure out what specific inputs a type of `Job` needs is to run `help(JobClass)` where `JobClass` is the type of `Job` you're interested in (e.g. `help(SubmitJob)`)

`Jobs` should be automatically stored in the database when they are run, but often when their data are updated, they are **not** automatically updated in the database, to improve performance. Make sure you update jobs in the database by running `job.update()` at critical points in your code, like when you check a job's status or parse its output.

In [None]:
# Build the job from the calculation configured above and submit it
submit_job = SubmitJob(calculation=pw_calculation,  # the calculation configured above
                       code='espresso-6.2.1_pw',  # QE v6.2.1 on nanoHUB
                       walltime='00:30:00',  # 30 minutes max run time
                       ncpus=2,  # 2 processors requested
                       runname='DFTmanExample',  # run name label in nanoHUB queue
                       metadata={'mp_alat': mp_alat})  # saving MP alat for later comparison

doc_id = submit_job.run()  # SubmitJob.run() returns the job's database document ID

### Loading Jobs from the Database
When `Job`s are run, they are automatically stored in the project database, located at `dftman/projects/PROJECT DIRECTORY/db.tinydb`. DFTman uses the [TinyDB module](https://tinydb.readthedocs.io/en/latest/) to manage flat-file JSON-encoded document-based databases. DFTman implements a modified storage protocol based on the [monty module](http://guide.materialsvirtuallab.org/monty/)'s MSON encoder and decoder to automatically convert MSONable python objects to and from serializable python dictionaries. All of this is to say that you can directly insert and retrieve objects like `Job`s which have `from_dict()` and `as_dict()` methods.

The database is organized by tables which are named after the type of thing that they store. For example, the `'SubmitJob'` table stores `SubmitJob`s, and the `'PBSJob'` table stores `PBSJob`s. So, for a project using `SubmitJob`s, the way to load the appropriate database table is the following:
```
db = load_db()  # This is from dftmanlib.db
table = db.table('SubmitJob')
```

In practice, this is useful for storing input and output information for all `Job`s and `Workflow`s run using DFTman. A lot of information about how to get and query for database entries is listed in the [TinyDB documentation](https://tinydb.readthedocs.io/en/latest/), but the simplest way to load `Job`s into the programming environment from the database is to use the `TinyDB.table.get()` method with a `doc_id` (which is shown and returned whenever something new is added to the database).

Any time a project's notebook is shut down, all the variables and data stored in the python environment are lost. So, in order to check on the status of submitted `Job`s or load completed `Job`s and their output, they must be loaded from the database back into the python environment.

In [2]:
# The database and table are already loaded
#     in the first cell of the notebook (see above)
# doc_id is created in the submit cell when submit_job is run.
#     Keep that value if it is still in this kernel's namespace;
#     after a kernel restart fall back to the first document (1)
#     instead of silently overwriting the submitted job's ID.
doc_id = globals().get('doc_id') or 1

loaded_submit_job = table.get(doc_id=doc_id)
if loaded_submit_job is None:
    # Stop here with a clear message rather than failing later on None
    raise LookupError("No document with doc_id {} in the 'SubmitJob' table; "
                      "submit a job first or set doc_id to an existing "
                      "entry".format(doc_id))
display(loaded_submit_job)

# Later cells operate on `submit_job`, so point it at the loaded job
submit_job = loaded_submit_job

{'@class': 'SubmitJob',
 '@module': 'dftmanlib.job.SubmitJob',
 'calculation': {'directory': '/home/nanohub/azadoks/git/dftman/projects/SubmitJob '
              'Example/SubmitJobs/DFTmanExample_b0e8379349f6',
 'input': {'@class': 'PWInput',
 '@module': 'dftmanlib.pwscf.pwscf',
 'cell': {},
 'control': {'calculation': 'vc-relax',
             'disk_io': 'none',
             'prefix': 'pwscf',
             'pseudo_dir': '/data/tools/dftman/GBRV_PBE_US',
             'restart_mode': 'from_scratch',
             'verbosity': 'high'},
 'electrons': {'conv_thr': 1e-08, 'electron_maxstep': 500},
 'ions': {},
 'kpoints_grid': [20, 20, 20],
 'kpoints_mode': 'automatic',
 'kpoints_shift': [0, 0, 0],
 'pseudo': {'Al': 'al_pbe_v1.uspp.F.UPF'},
 'structure': {'@class': 'Structure',
               '@module': 'pymatgen.core.structure',
               'charge': None,
               'lattice': {'a': 2.8559545504615884,
                           'alpha': 59.99999994324255,
                           

### Watch Job Statuses
Once `Job`s are run on a remote resource like the nanoHUB cluster using `SubmitJob` or a research cluster using `PBSJob`, it's important to watch the status of the `Job` so DFTman knows when it's complete. There are generally three ways to check `Job` statuses:
* Queue status functions like `dftmanlib.job.submit_status()` provide information on **currently running** jobs and are the fastest status functions.
* Batch status functions like `dftmanlib.job.submitjob_statuses()`, which act on a list of `Job` objects, provide the status of every `Job` they are given and are the slowest status functions.
* Individual status functions `Job.check_status()` which are provided by every `Job` class give the status of an individual `Job` and are useful when running few jobs or checking on specific statuses. 

Batch and individual status functions often also have the option to automatically update `Job`s in the database after checking their status, but this can be a *very* slow process and should only be done when necessary (i.e. when first checking job statuses and when jobs are complete).

In [None]:
# Queue status: reports on currently running jobs only (fastest check)
submit_status()

In [None]:
# Batch status: checks every job in the list it is given (slowest check);
#     update_in_db=False skips writing the statuses back to the database
submitjob_statuses([submit_job], update_in_db=False)

In [None]:
# Individual status
status = submit_job.check_status(update_in_db=False)
display(status)
# The notebook text describes the job status as a mapping read via
#     job.status['status'], so unwrap that form before comparing; a plain
#     string status is compared directly, as before.
job_state = submit_job.status
if isinstance(job_state, dict):
    job_state = job_state.get('status')
# Only pay for the (slow) database write once the job has finished
if job_state == 'Complete':
    submit_job.update()

### Retrieve Output
When `Job`s complete successfully (and often when they complete unsuccessfully), their output can be parsed and brought into the programming environment using `job.parse_output()`. Behind the scenes, this function calls the `Job`'s `calculation.parse_output()` which in turn creates a new instance of the calculation's output class which is passed back up the chain. In the case of Quantum Espresso PWscf, this means creating a `PWOutput` object which provides many of the outputs in the pw.x stdout.

`job.parse_output()` always checks whether `job.status['status'] == 'Complete'`, so it is important to ensure that `job.check_status()` is called before trying to parse output and that the job is up-to-date in the database.

To see the properties provided by `PWOutput`, use `help(PWOutput)` and look under Data Descriptors.
To list the entries in `PWOutput.data`, use `display(list(dftmanlib.pwscf.pwoutput.patterns.keys()))`

In [3]:
# Parse the finished job's pw.x output into the notebook
#     (run the status check above first so the job's status is current)
pw_output = submit_job.parse_output()
display(pw_output)  # Note: bands data are not shown to avoid verbosity

Updated Job b0e8379349f6 in database with doc_id 1


{'a1': [],
 'a2': [],
 'a3': [],
 'absolute_magnetization': [],
 'atomic_positions': [],
 'b1': [],
 'b2': [],
 'b3': [],
 'bands_data': 'EXCLUDED FROM PRINTING',
 'cell_parameters': [],
 'celldm1': [],
 'celldm2': [],
 'celldm3': [],
 'celldm4': [],
 'celldm5': [],
 'celldm6': [],
 'conv_iters': [],
 'conv_thr': [],
 'cpu_time_exceeded': [],
 'date': [],
 'degauss': [],
 'density': [],
 'deprecated_feature_used': [],
 'echutrho': [],
 'ecutwfc': [],
 'eigenvalues_not_converged': [],
 'energy': [],
 'enthalpy': [],
 'exc': [],
 'fermi_energy': [],
 'final_energy': [],
 'final_enthalpy': [],
 'force': [],
 'general_error': ['Error in routine readpp (2):\n'
                   '     file '
                   '/data/tools/dftman/GBRV_PBE_US/al_pbe_v1.uspp.F.UPF not '
                   'found',
                   'Error in routine readpp (2):\n'
                   '     file '
                   '/data/tools/dftman/GBRV_PBE_US/al_pbe_v1.uspp.F.UPF not '
                   'found'],
 'initi

In [None]:
# Check this flag before using any results: the saved output above lists
#     'general_error' entries (pseudopotential file not found).
# NOTE(review): exact semantics of `succeeded` are defined in PWOutput -- see help(PWOutput)
pw_output.succeeded