
<div style="
    background-color: #f7f7f7;
    background-image: url(''), url('') ;
    background-position: left bottom, right top;
    background-repeat: no-repeat,  no-repeat;
    background-size: auto 60px, auto 160px;
    border-radius: 5px;
    box-shadow: 0px 3px 1px -2px rgba(0, 0, 0, 0.2), 0px 2px 2px 0px rgba(0, 0, 0, 0.14), 0px 1px 5px 0px rgba(0,0,0,.12);">

<h1 style="
    color: #2a4cdf;
    font-style: normal;
    font-size: 2.25rem;
    line-height: 1.4em;
    font-weight: 600;
    padding: 30px 200px 0px 30px;"> 
        Battery Research Data Management (RDM) with NOMAD</h1>
<p style="font-size: 1.25em; font-style: italic; padding: 5px 200px 30px 30px;">
    Uday Gajera, Axel Gross.</p>
</div>

The notebook is divided into the following sections:

  1. **Data**: Overview of the data we are working with
  2. **Schema**: Designing a structure for our data
  3. **ELN**: Manually creating entries from a schema
  4. **Parser**: Automatically creating entries from a schema 
  5. **App**: Creating a customized search experience
  6. **Plugin**: Wrapping everything into a plugin

## Data

We start with a pre-cleaned version of the data, stored as JSON files that look like this:

```json
{
        "elements": {
            "Ti": "1.0",
            "O": "2.0"
        },
        "doi": "https://doi.org/10.1016/j.jallcom.2018.01.359",
        "compound_names": "['TiO2 electrodes']",
        "capacity": null,
        "capacity_unit": null,
        "capacity_raw_unit": null,
        "conductivity": null,
        "conductivity_unit": null,
        "conductivity_raw_unit": null,
        "coulombic_efficiency": null,
        "coulombic_efficiency_unit": null,
        "coulombic_efficiency_raw_unit": null,
        "energy": null,
        "energy_unit": null,
        "energy_raw_unit": null,
        "voltage": 3.0,
        "voltage_unit": "Volt^(1.0)",
        "voltage_raw_unit": "V"
    }
```

### Data loading and inspection

In [None]:
import pandas as pd
# Path of the raw dataset, given relative to the notebook's working directory.
file_path = 'battery.csv'
# Read the whole CSV into `df`, the working DataFrame used by the cells below.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows to see which columns the file provides.
df.head()

In [None]:
# Show the first row whose Property is 'Capacity' as a raw array of its column values.
df[df.Property == 'Capacity'].values[0]

In [None]:
# Drop the columns that are not needed for the first pass over the data.
# Only columns that are still present are removed, so re-running the cell is
# safe and one missing column does not stop the others from being dropped.
unused_columns = [
    'Unnamed: 0', 'Raw_unit', 'Raw_value', 'Extracted_name', 'Tag', 'Warning',
    'Type', 'Info', 'Title', 'Journal', 'Date', 'Correctness',
]
columns_to_drop = [column for column in unused_columns if column in df.columns]

if columns_to_drop:
    df = df.drop(columns=columns_to_drop)
else:
    print("Columns already dropped")

df.head()

In [None]:
# Summarise a few key columns to get a feel for what the dataset contains.
print("Unique values in the 'Property' column: {}".format(df['Property'].unique()))
print("Unique values in the 'Unit' column: {}".format(df['Unit'].unique()))
# Distinct compounds and distinct source papers (one DOI per paper).
print("Number of unique compounds: {}".format(len(df['Name'].unique())))
print("Number of unique papers as data source: {}".format(len(df['DOI'].unique())))


### Data Cleaning

In [None]:
# Show the raw property labels so unexpected categories are spotted early.
print(df['Property'].unique())

# Missing labels are skipped explicitly: there is no sensible column name for them.
property_names = df['Property'].dropna().unique()

# Spread the long-format Value/Unit pair into one pair of columns per property.
for property_name in property_names:
    is_property = df['Property'] == property_name
    # Rows belonging to other properties are absent from the selection and end
    # up as NaN through index alignment when the column is assigned.
    df[f'{property_name}_value'] = pd.to_numeric(df.loc[is_property, 'Value'], errors='coerce')
    df[f'{property_name}_unit'] = df.loc[is_property, 'Unit']

df.head()

In [None]:
# Lowercase every entry of the 'Specifier' column so that spellings differing only
# in case compare equal; the mapping defined below in this cell lists lowercase
# variants only.
df['Specifier'] = df['Specifier'].str.lower()

# Canonical specifier term -> the spellings that should be collapsed onto it.
specifier_mapping = {
    'charge capacities': [
        'charge capacity',
    ],
    'discharge capacities': [
        'discharge capacity',
        'discharge specific capacities',
    ],
    'capacities': [
        'capacity',
        'specific capacity',
        'theoretical capacities',
        'theoretical capacity',
    ],
    'reversible capacities': [
        'reversible capacity',
        'reversible discharge capacities',
        'reversible specific capacities',
    ],
    'coulombic': [
        'coulombic efficiency',
        'coulombic',
    ],
    'voltage': [
        'voltage',
        'potential',
    ],
}


def map_specifier(specifier):
    """Return the canonical term for ``specifier``.

    The first entry of ``specifier_mapping`` whose variant list contains
    ``specifier`` wins; values that match no variant are returned unchanged.
    """
    canonical_terms = (
        canonical
        for canonical, variants in specifier_mapping.items()
        if specifier in variants
    )
    return next(canonical_terms, specifier)

# Replace every specifier by its canonical term.
df['Specifier'] = df['Specifier'].map(map_specifier)

# Inspect which specifier labels remain after the harmonisation.
df['Specifier'].unique()


In [None]:
# %pip installs into the environment of the running kernel; `!pip` may pick up a different one.
%pip install pubchempy

In [None]:
import pubchempy as pcp
import time
import pprint
import json

def standardize_chemical_name(name):
    """Look up ``name`` on PubChem and return a standardized name for it.

    Returns the IUPAC name of the first matching compound, falling back to its
    first synonym. Returns None when nothing matches, when the match has neither
    an IUPAC name nor synonyms, or when the lookup raises. Errors are reported
    with ``print`` rather than propagated, so that one failing name does not
    abort a whole batch.

    Every call pauses for one second before returning, including calls that
    failed, to avoid overloading the server.
    """
    try:
        # Search for compounds in PubChem by name
        compounds = pcp.get_compounds(name, 'name')
        if not compounds:
            return None
        best_match = compounds[0]
        # Prefer the IUPAC name; otherwise use the first synonym if there is one
        return best_match.iupac_name or (best_match.synonyms[0] if best_match.synonyms else None)
    except Exception as e:
        print(f"An error occurred for chemical {name}: {e}")
        return None
    finally:
        # Throttle after every request, whether it succeeded or failed
        time.sleep(1)

# Every distinct entry of the 'Name' column is looked up once.
chemicals = df['Name'].unique()

# Original name -> standardized name, keeping only the names that could be resolved.
mapping_dict = {}
for chemical in chemicals:
    standardized_name = standardize_chemical_name(chemical)
    if standardized_name is None:
        continue
    mapping_dict[chemical] = standardized_name

# Pretty-print the mapping for a quick visual check.
pprint.pprint(mapping_dict)

# Keep a copy of the mapping on disk as JSON.
with open('chemical_mapping.json', 'w') as f:
    f.write(json.dumps(mapping_dict))

# Add the standardized name as a new column; names without a match become None.
df['Standardized_IUPAC_Name'] = df['Name'].apply(lambda name: mapping_dict.get(name))

# Show the first rows including the new column.
print(df.head())


In [None]:
import ast

def extract_names(names_str):
    """Split a stringified list of names into ``(longest, second_longest)``.

    ``names_str`` is expected to be the text form of a Python list of strings,
    e.g. ``"['LiCoO2', 'lithium cobalt oxide']"``. The names are ordered by
    length, longest first; the longest is used as the IUPAC name and the next
    one as the abbreviated name.

    Returns:
        A 2-tuple. The second element is None when only one name is present.
        ``(None, None)`` is returned when the input cannot be parsed, is empty,
        or does not evaluate to a collection of sized items.
    """
    try:
        # literal_eval only accepts Python literals, so it is safe on file contents
        names_list = ast.literal_eval(names_str)

        # A bare string literal would otherwise be split into single characters
        if isinstance(names_list, str):
            return None, None

        # Longest name first
        sorted_names = sorted(names_list, key=len, reverse=True)

        return sorted_names[0], (sorted_names[1] if len(sorted_names) > 1 else None)
    except (ValueError, SyntaxError, TypeError, IndexError):
        # Malformed text, a non-string input such as NaN, an empty list or
        # unsized items: report "no names" instead of failing the whole column
        return None, None

# Run extract_names on every entry of 'Name'. Wrapping the returned 2-tuple in a
# pd.Series makes `apply` produce a two-column frame, which is assigned to the
# new 'iupac_name' and 'abbreviated_name' columns.
df[['iupac_name', 'abbreviated_name']] = df['Name'].apply(
    lambda x: pd.Series(extract_names(x))
)

# Preview the result including the two new columns.
df.head()

In [None]:
# Count the distinct labels present in the 'Specifier' column.
specifier_count = len(df['Specifier'].unique())
print("Number of unique values for the specifier components: {}".format(specifier_count))

### Data Aggregation

In [None]:
# Collapse all rows that share the same Name and DOI into a single row, so that
# every (compound, paper) pair is represented once. 'first' takes, per column,
# the first non-null value found in the group.
df_grouped = df.groupby(['Name', 'DOI']).agg('first').reset_index()

# Check how many unique chemicals and papers there are (counted on the ungrouped data)
print(f"Number of unique chemicals: {len(df['Name'].unique())}")
print(f"Number of unique papers as data source: {len(df['DOI'].unique())}")

# Convert the NaN to None for cleaner output
df_grouped = df_grouped.where(pd.notnull(df_grouped), None)

# Display a sample of the grouped data
df_grouped.head()

In [None]:
import plotly.express as px

# Scatter plot of capacity against voltage for the aggregated (compound, paper) rows.
fig = px.scatter(
    df_grouped,
    x="Voltage_value",
    y="Capacity_value",
    color="Specifier",  # One colour per specifier category
    hover_data=['DOI']  # Show the DOI of the source paper on hover
)
fig.show()

In [None]:
# Numeric property columns to visualise; also reused by the plots further down.
properties = [
    'Capacity_value', 'Voltage_value', 'Coulombic Efficiency_value',
    'Conductivity_value', 'Energy_value'
]

# One histogram per property. The loop variable is called `prop` so that it
# does not shadow the builtin `property` for the rest of the kernel session.
for prop in properties:
    fig = px.histogram(
        df_grouped,
        x=prop,
        title=f'Histogram of {prop}',
        labels={prop: prop.replace('_', ' ')},  # Clean up the label for display
        nbins=100  # Number of bins can be adjusted depending on the data range and distribution
    )
    fig.show()

# Number of aggregated rows that went into the histograms
num_entries = df_grouped.shape[0]
print(f"Number of entries in the DataFrame: {num_entries}")


In [None]:
import plotly.graph_objects as go

def _axis_dropdown(axis, x_position):
    """Build the dropdown menu that swaps the data shown on ``axis`` ('x' or 'y').

    One button is created per entry of ``properties``; pressing it restyles the
    trace with the corresponding column of ``df_grouped``.
    """
    buttons = [
        dict(
            args=[{axis: [df_grouped[prop]]}],
            label=prop.replace('_', ' '),
            method="restyle",
        )
        for prop in properties
    ]
    return dict(
        buttons=buttons,
        direction="down",
        pad={"r": 10, "t": 10},
        showactive=True,
        x=x_position,
        xanchor="left",
        y=1.1,
        yanchor="top",
    )


# Start with the first two properties on the x and y axes.
axis_labels = {prop: prop.replace('_', ' ') for prop in properties}
fig = px.scatter(
    df_grouped,
    x=properties[0],
    y=properties[1],
    title="Dynamic Scatter Plot of Battery Properties",
    labels=axis_labels,
    hover_data=['DOI', 'iupac_name'],
)

# Two dropdowns placed side by side above the plot: the left one drives x, the right one y.
fig.update_layout(
    updatemenus=[
        _axis_dropdown("x", 0.1),
        _axis_dropdown("y", 0.3),
    ]
)

fig.show()


In [None]:
# Min-max normalise every property to the range [0, 1] so that quantities with
# very different scales can share one axis.
normalized_data = {}
annotations = []

for prop in properties:
    # Coerce to numeric first so that None placeholders behave like NaN
    values = pd.to_numeric(df_grouped[prop], errors='coerce')
    min_value = values.min()
    max_value = values.max()
    value_range = max_value - min_value
    if value_range == 0:
        # Constant column: avoid 0/0 and place every available value at 0
        normalized_data[prop] = values - min_value
    else:
        normalized_data[prop] = (values - min_value) / value_range
    # The annotation shows the original maximum, i.e. the value mapped to 1
    annotations.append(f"{max_value}")

# Convert the dictionary to DataFrame for plotting
df_normalized = pd.DataFrame(normalized_data)

# One box per property
fig = go.Figure()

for prop in properties:
    fig.add_trace(go.Box(y=df_normalized[prop], name=prop.replace('_', ' ')))

# Write the original maximum above each box; the x position is the index of
# the box on the categorical axis, y is given in paper coordinates.
for idx, ann in enumerate(annotations):
    fig.add_annotation(
        x=idx, y=1.05,
        text=ann,
        showarrow=False,
        xref="x",
        yref="paper",
        align="center"
    )

fig.update_layout(
    title="Normalized Properties with Original Values",
    yaxis_title="Normalized Value",
    xaxis_title="Property",
    boxmode='group'  # Allows grouping of boxes if more than one group exists
)

fig.show()


In [None]:
from plotly.subplots import make_subplots

# How many rows carry a value for each property.
non_nan_counts = {prop: df_grouped[prop].count() for prop in properties}

# Same information as a two-column frame for plotting.
df_non_nan_counts = pd.DataFrame({
    'Property': list(non_nan_counts.keys()),
    'Non-NaN Count': list(non_nan_counts.values()),
})

# How many of the properties are filled in for each row (compound).
df_grouped['Non-NaN Count'] = df_grouped[properties].count(axis=1)

# Number of compounds per "properties available" bucket, ordered by bucket.
count_distribution = df_grouped['Non-NaN Count'].value_counts().sort_index()

# Data for the second pie chart.
df_count_distribution = pd.DataFrame({
    'Number of Values': count_distribution.index,
    'Count of Compounds': count_distribution.values
})

# Two pie charts side by side.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'pie'}, {'type': 'pie'}]],
    subplot_titles=["Property Data Availability", "Compound Data Completeness"],
)

# Left: share of available values per property.
property_pie = go.Pie(
    labels=df_non_nan_counts['Property'],
    values=df_non_nan_counts['Non-NaN Count'],
    name="Properties Non-NaN",
    title='Distribution of Non-NaN Counts Across Properties',
)
fig.add_trace(property_pie, row=1, col=1)

# Right: number of compounds by how many properties they have.
compound_pie = go.Pie(
    labels=df_count_distribution['Number of Values'],
    values=df_count_distribution['Count of Compounds'],
    name="Compound Counts",
    title='Number of Non-NaN Properties per Compound',
)
fig.add_trace(compound_pie, row=1, col=2)

fig.update_layout(title_text="Overview of Data Completeness in the Dataset")

fig.show()


## Schema

To adapt the schema concept for a battery database, we can define a new schema called BatteryProperties that encapsulates various physical and chemical properties relevant to battery materials. This schema can include properties like capacity, voltage, energy density, and other metrics crucial for battery performance evaluation.

### Defining the structure for battery data

In [18]:
from nomad.metainfo import Quantity
from nomad.datamodel.data import Schema
import numpy as np

class BatteryProperties(Schema):
    """
    Schema collecting the key performance figures of a battery material.

    The values may originate from experiments or from simulations; keeping
    them in one fixed structure makes entries comparable with each other.
    """

    # Charge per unit mass
    capacity = Quantity(
        description="""
        The electric charge a battery can deliver at the rated voltage, per unit mass.
        This is a measure of the energy storage capability of the battery material.
        """,
        type=np.float64,
        unit='mA*hour/g',
    )

    # Potential difference between the terminals
    voltage = Quantity(
        description="""
        The electrical potential difference between the positive and negative terminals 
        when the battery is connected in a circuit. It represents the battery's ability 
        to drive electric currents through external devices.
        """,
        type=np.float64,
        unit='V',
    )

    # Declared with an empty unit string
    coulombic_efficiency = Quantity(
        description="""
        The ratio of the charge extracted from the battery to the charge put into the 
        battery over a complete charge/discharge cycle. It is a measure of the energy 
        efficiency of the battery.
        """,
        type=np.float64,
        unit='',
    )

    # Energy per unit mass
    energy_density = Quantity(
        description="""
        The amount of energy stored in a battery per unit mass. This metric is crucial 
        for evaluating the effectiveness of a battery in terms of its potential 
        applications in portable electronic devices and electric vehicles.
        """,
        type=np.float64,
        unit='W*hour/kg',
    )

    # Electrical conductivity of the material
    conductivity = Quantity(
        description="""
        The ability of a battery material to conduct electric current. This property is 
        fundamental for assessing the internal resistance of the battery and its 
        performance in electrical circuits.
        """,
        type=np.float64,
        unit='S/m',
    )

Further data can be added as shown in the example below.

In [None]:
from nomad.units import ureg
from IPython.display import JSON

# Create a BatteryProperties entry; the trailing comments give the intended units.
battery = BatteryProperties(
    capacity = 250.0,  # 250 mAh/g
    voltage = 3.7      # 3.7 Volts
)

# Quantities can also be assigned after construction.
# NOTE(review): coulombic_efficiency is declared with an empty unit and described
# as a ratio, so 95.0 may need to be 0.95 -- confirm the intended convention.
battery.coulombic_efficiency = 95.0  # 95 percent
battery.energy_density = 500.0       # 500 Wh/kg

# Unit information is carried over when setting/getting quantities.
# Here we will add conductivity with a unit using ureg
battery.conductivity = 1.5 * ureg('S/m')

# The values read back already carry their unit, so no unit suffix is appended
# by hand (a trailing " V" would print the unit twice).
print(f"Voltage: {battery.voltage}")
print(f"Conductivity: {battery.conductivity}")

# Using JSON to display the structured data from our schema instance
JSON(battery.m_to_dict())


### Creating a class using a base section

In [20]:
from nomad.metainfo import Quantity
from nomad.datamodel.data import Schema
from nomad.datamodel.metainfo.basesections import PublicationReference
import numpy as np

class BatteryData(Schema, PublicationReference):
    """
    Schema for battery material and device properties taken from the literature.

    Combining the property fields with ``PublicationReference`` lets each set of
    performance figures be stored together with the publication it came from.
    """

    # Charge per unit mass
    capacity = Quantity(
        description="""
        The electric charge a battery can deliver at the rated voltage, per unit mass.
        This is a measure of the energy storage capability of the battery material.
        """,
        type=np.float64,
        unit='mA*hour/g',
    )

    # Potential difference between the terminals
    voltage = Quantity(
        description="""
        The electrical potential difference between the positive and negative terminals 
        when the battery is connected in a circuit. It represents the battery's ability 
        to drive electric currents through external devices.
        """,
        type=np.float64,
        unit='V',
    )

    # Declared with an empty unit string
    coulombic_efficiency = Quantity(
        description="""
        The ratio of the charge extracted from the battery to the charge put into the 
        battery over a complete charge/discharge cycle. It is a measure of the energy 
        efficiency of the battery.
        """,
        type=np.float64,
        unit='',
    )

    # Energy per unit mass
    energy_density = Quantity(
        description="""
        The amount of energy stored in a battery per unit mass. This metric is crucial 
        for evaluating the effectiveness of a battery in terms of its potential 
        applications in portable electronic devices and electric vehicles.
        """,
        type=np.float64,
        unit='W*hour/kg',
    )

    # Electrical conductivity of the material
    conductivity = Quantity(
        description="""
        The ability of a battery material to conduct electric current. This property is 
        fundamental for assessing the internal resistance of the battery and its 
        performance in electrical circuits.
        """,
        type=np.float64,
        unit='S/m',
    )

In [21]:
# `battery` still refers to the BatteryProperties instance created earlier, so a
# BatteryData entry is created here before the publication fields are filled in.
battery = BatteryData(
    capacity=250.0,  # 250 mAh/g
    voltage=3.7,     # 3.7 Volts
)

# NOTE(review): this version of BatteryData declares no `IUPAC` quantity, so the
# value is presumably kept as a plain Python attribute only -- confirm.
battery.IUPAC = "Lithium hexafluorophosphate"
# NOTE(review): DOI_number is presumably provided by PublicationReference -- verify.
battery.DOI_number = "10.1000/182"

## Define automation through a `normalize` function

In addition to defining the structure for our data, schemas can also contain a `normalize` function. This function is run automatically whenever the data changes and it can help in automating certain tasks. You could use a normalize function to read in files, post- or pre-process data, fill in information that can be derived from other fields etc. Let's add a `normalize` function to our schema:

In [22]:
from nomad.metainfo import Quantity
from nomad.datamodel.data import Schema
from nomad.datamodel.metainfo.basesections import PublicationReference
import numpy as np

class BatteryData(Schema, PublicationReference):
    """
    A schema describing key properties of battery materials and devices, with
    information extracted from the literature. This schema facilitates the documentation
    of battery performance metrics alongside their source publications.

    Besides the data fields, the class provides ``normalize``, which runs the
    inherited normalization and then ``normalize_battery_properties``.
    """
    capacity = Quantity(
        type=np.float64,
        unit='mA*hour/g',
        description='''
        The electric charge a battery can deliver at the rated voltage, per unit mass.
        This is a measure of the energy storage capability of the battery material.
        '''
    )
    voltage = Quantity(
        type=np.float64,
        unit='V',
        description='''
        The electrical potential difference between the positive and negative terminals 
        when the battery is connected in a circuit. It represents the battery's ability 
        to drive electric currents through external devices.
        '''
    )
    coulombic_efficiency = Quantity(
        type=np.float64,
        unit='',
        description='''
        The ratio of the charge extracted from the battery to the charge put into the 
        battery over a complete charge/discharge cycle. It is a measure of the energy 
        efficiency of the battery.
        '''
    )
    energy_density = Quantity(
        type=np.float64,
        unit='W*hour/kg',
        description='''
        The amount of energy stored in a battery per unit mass. This metric is crucial 
        for evaluating the effectiveness of a battery in terms of its potential 
        applications in portable electronic devices and electric vehicles.
        '''
    )
    conductivity = Quantity(
        type=np.float64,
        unit='S/m',
        description='''
        The ability of a battery material to conduct electric current. This property is 
        fundamental for assessing the internal resistance of the battery and its 
        performance in electrical circuits.
        '''
    )
    iupac_name = Quantity(
        type=str,
        description="The IUPAC name for the chemical composition of the battery material."
    )
    DOI_number = Quantity(
        type=str,
        description="The Digital Object Identifier (DOI) for the publication referencing this battery data."
    )

    def normalize(self, archive, logger) -> None:
        """Run the base-class normalization followed by the battery-specific step.

        Args:
            archive: The archive this section belongs to.
            logger: Logger handed on to the base class and to the battery step.
        """
        # Here you can trigger base class normalization
        super().normalize(archive, logger)

        # Custom normalization process for battery data
        self.normalize_battery_properties(archive, logger)

    def normalize_battery_properties(self, archive=None, logger=None) -> None:
        """Battery-specific normalization step; intentionally does nothing yet.

        Kept as a separate method so that the battery-specific logic can be
        extended without touching ``normalize``. Both arguments are optional so
        that the method can also be called without them.
        """
        # Example of what could go here: keep values within practical physical
        # limits, e.g. cap `self.capacity` at 500 mAh/g and `self.voltage` at 5 V.
        return None

In [None]:
# Example usage: fill the measured values together with the material name and
# the DOI of the publication they were taken from.
battery = BatteryData(
    capacity=200,   # 200 mAh/g
    voltage=3.7,    # 3.7 Volts
    iupac_name="Lithium hexafluorophosphate",
    DOI_number="10.1000/182",
)

print("IUPAC Name: {}".format(battery.iupac_name))
print("DOI: {}".format(battery.DOI_number))

In [None]:
# %pip installs into the environment of the running kernel; `!pip` may pick up a different one.
%pip install rdkit

### Populating the `results` section

In addition to the completely custom `archive.data` subsection, there is also a pre-defined `archive.results` subsection. This is a section that is found in all entries, and it can be used to provide data that is cross-schema compatible. It contains things like formulas, compositions, methodologies, and a subset of relevant physical properties. NOMAD contains built-in tooling that can pick up on data contained in this section to make queries and visualizations.

We should use our `normalize` function to populate this part of the archive. By populating this section, we make sure that people can find our data e.g. by querying `results.material.elements:all: ['C', 'H', 'O', 'N']` and we can also store the full 3D structure of the molecule in `results.material.topology` to visualize it automatically. Let's populate `results` by extending our `normalize` function:

In [27]:
from ase import Atoms
from nomad.datamodel.results import Material, System
from nomad.normalizing.common import nomad_atoms_from_ase_atoms
from nomad.normalizing.topology import add_system_info, add_system
import logging

# Set up logging
logger = logging.getLogger(__name__)

def normalize_battery_properties(self, archive=None, logger=None) -> None:
    """Populate ``archive.results`` with a structure derived from ``self.iupac_name``.

    Written as a method of ``BatteryData`` and bound to the class at the end of
    this cell, so that ``BatteryData.normalize`` finds it on the instance.

    Args:
        archive: The archive whose ``results`` section is filled. When it is
            None (the method was called without arguments) nothing is done.
        logger: Optional logger; an info message is emitted after the structure
            has been stored.

    Only the name 'Lithium Cobalt Oxide' is handled, with hard-coded placeholder
    coordinates; every other name leaves the archive untouched.
    """
    if archive is None or not self.iupac_name:
        return

    # Assume we have a method to convert IUPAC to 3D structure using ASE, which is not straightforward
    # This is a placeholder for the conversion logic:
    # Example structure for Lithium Cobalt Oxide (simplified)
    if 'Lithium Cobalt Oxide' not in self.iupac_name:
        return

    # Imported locally: the import line of this cell only brings in Material and System
    from nomad.datamodel.results import Results

    positions = [(0, 0, 0), (1, 1, 1), (2, 2, 2)]  # Hypothetical positions
    atomic_numbers = [3, 27, 8]  # Li, Co, O
    ase_atoms = Atoms(numbers=atomic_numbers, positions=positions)

    # Convert ASE atoms to NOMAD atoms
    nomad_atoms = nomad_atoms_from_ase_atoms(ase_atoms)

    # Ensure the results section and its material subsection exist; a freshly
    # created archive may have neither
    if not archive.results:
        archive.results = Results()
    if not archive.results.material:
        archive.results.material = Material()

    # Create a System: this stores structural and chemical information
    # NOTE(review): confirm that 'material' is an accepted structural_type value
    system = System(
        atoms=nomad_atoms,
        label='3D structure',
        description='3D reconstruction of the material generated from the IUPAC name.',
        structural_type='material',
        dimensionality='3D',
    )

    # The topology helpers collect systems in a plain dict; the collected systems
    # are then stored under results.material.topology so they become part of the archive
    topology = {}
    add_system_info(system, topology)
    add_system(system, topology)
    for topology_system in topology.values():
        archive.results.material.m_add_sub_section(Material.topology, topology_system)

    if logger:
        logger.info("3D structure generated and stored in the NOMAD archive.")


# Bind the function to the schema so that `self.normalize_battery_properties` resolves to it.
BatteryData.normalize_battery_properties = normalize_battery_properties


In [None]:
# Example use: wrap a BatteryData section in an archive, normalize it and show the data.

import logging
from nomad.datamodel.datamodel import EntryArchive

logger = logging.getLogger(__name__)

battery_entry = BatteryData(
    iupac_name='Lithium Cobalt Oxide',
    DOI_number="10.1021/jp5126624",
)
archive = EntryArchive(data=battery_entry)

# Normalization is triggered by hand here.
archive.data.normalize(archive, logger)
JSON(archive.data.m_to_dict())