Use this notebook to generate the features or format the dataset for the various models.

# Random Forest

In [8]:
# =============================================================================
# This module contains featurizers from Matminer for use with Random Forest models.
# =============================================================================

import matminer.featurizers.composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
import os
from pymatgen.core.composition import Composition
import local_pkgs.proj_pkg.preprocessing as preprocess
import local_pkgs.proj_pkg.data_handler as dh
import pandas as pd
import numpy as np
import local_pkgs.proj_pkg.utils as utils

# Composition-only featurizer set (no oxidation states), following MODNet's
# CompositionOnlyMatminer2023Featurizer.
# DOI: 10.1038/s41524-021-00552-2
comp_feat_ls = [
    cf.AtomicOrbitals(),
    cf.AtomicPackingEfficiency(),
    cf.BandCenter(),
    cf.ElementFraction(),
    cf.ElementProperty.from_preset("magpie"),
    cf.IonProperty(),
    cf.Miedema(),
    cf.Stoichiometry(),
    cf.TMetalFraction(),
    cf.ValenceOrbital(),
    cf.YangSolidSolution(),
]


def _to_composition(raw):
    """Parse ``raw`` as a Composition, then re-parse that object's string form."""
    return Composition(str(Composition(raw)))


# Load the dataset and turn the composition column into Composition objects.
df = pd.read_excel('data/sysTEm_dataset.xlsx')
df['reduced_compositions'] = df['reduced_compositions'].apply(_to_composition)

# Featurize the compositions with the featurizer list defined above.
# NOTE(review): errors_df is returned but not used in this cell; presumably it
# records the entries that failed to featurize - confirm in preprocess.
feat_df, errors_df = preprocess.batch_matminer_featurize_comp_to_df(
    df['reduced_compositions'],
    comp_feat_ls,
    oxidation_states=False,
    ignore_err=True,
)
feat_df = preprocess.clean_df_cols(feat_df)

# Write the cleaned feature table to CSV, omitting the index column.
file_dir = "data/matminer_features.csv"
feat_df.to_csv(file_dir, index=False)

  mixing_dataset = pd.read_csv(


There will be a maximum of 271 types of features generated.


MultipleFeaturizer:   0%|          | 0/8458 [00:00<?, ?it/s]

  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribution
  from pkg_resources import DistributionNotFound, get_distribu

Shape of the DataFrame: (8458, 271)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8458 entries, 0 to 8457
Columns: 271 entries, AtomicOrbitals|HOMO_character to YangSolidSolution|Yang delta
dtypes: float64(232), int64(38), object(1)
memory usage: 17.5+ MB
None


# DopNet
We split the dataset into one file per target property so that the example code from the DopNet repository can be reused largely unchanged.

In [1]:
import numpy
import torch
import itertools
import pandas
import local_pkgs.dopnet_pkg.autoencoder as ae
import local_pkgs.proj_pkg.preprocessing as preprocess
from torch.utils.data import DataLoader
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from local_pkgs.dopnet_pkg.ml import get_k_folds_list, get_k_folds_by_index_ls
import os
import local_pkgs.dopnet_pkg.dopnet as dp 
from local_pkgs.proj_pkg.preprocessing import CompositionKFold
import yaml
import pandas as pd
from pymatgen.core import Composition


# Load the target-property definitions from the YAML file.
with open("thermoelectric_properties.yaml", "r", encoding="utf-8") as yaml_file:
    properties = yaml.safe_load(yaml_file)

  from pkg_resources import DistributionNotFound, get_distribution


In [3]:
# Dataset generation: write one Excel file per target property.

# The source workbook and the composition column name do not depend on the
# target, so they are set up once instead of on every loop iteration.
te_dataset = pd.read_excel('data/sysTEm_dataset.xlsx')
composition_col = "reduced_compositions"

for target, v in properties.items():

    target_col = v['column_name']

    # Drop rows with no value for this target. dropna returns a new frame,
    # so te_dataset stays intact for the next target.
    print(f"Original data size: {len(te_dataset)}")
    df = te_dataset.dropna(subset=[target_col])
    print(f"Data size after removing rows with missing target values: {len(df)}\n")

    # Average duplicates (rows counted as identical on composition and temperature).
    compositions_and_temp_count = preprocess.count_identical_rows(df, composition_col, 'Temperature (K)')
    df = preprocess.average_duplicates(df.copy(), compositions_and_temp_count, target_col)

    # Rows with a missing '#' receive sequential labels AVG1, AVG2, ... in row
    # order; every other '#' is rendered as an integer string.
    # NOTE(review): presumably the rows without '#' are the averaged rows
    # created by average_duplicates - confirm in preprocess.
    n_missing_ids = int(df['#'].isna().sum())
    avg_labels = (f'AVG{i}' for i in range(1, n_missing_ids + 1))
    df['#'] = df['#'].apply(lambda x: next(avg_labels) if pd.isna(x) else str(int(x)))

    # we need to expand the brackets as dopnet's parse formula function does not support brackets
    df[composition_col] = df[composition_col].apply(lambda x: str(Composition(x)))

    # columns to keep
    cols_to_keep = ['#', 'Source Paper', 'Initial Dataset', composition_col, 'Temperature (K)', target_col]
    df = df[cols_to_keep]

    dataset_path = f'data/TE_reduced_comp_DopNet_{target}.xlsx'
    df.to_excel(dataset_path, index=False)

Original data size: 8458
Data size after removing rows with missing target values: 7840

Removed 275 duplicate rows. Generated 175 averaged rows.
Original data size: 8458
Data size after removing rows with missing target values: 2036

Removed 122 duplicate rows. Generated 95 averaged rows.
Original data size: 8458
Data size after removing rows with missing target values: 2036

Removed 122 duplicate rows. Generated 95 averaged rows.
Original data size: 8458
Data size after removing rows with missing target values: 8387

Removed 322 duplicate rows. Generated 207 averaged rows.
Original data size: 8458
Data size after removing rows with missing target values: 8392

Removed 322 duplicate rows. Generated 207 averaged rows.
Original data size: 8458
Data size after removing rows with missing target values: 8424

Removed 331 duplicate rows. Generated 215 averaged rows.
Original data size: 8458
Data size after removing rows with missing target values: 7807

Removed 274 duplicate rows. Generated

# MT CrabNet
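For multi-task (MT) training, we expand the dataset into long format: each non-missing target value becomes its own row, labelled with the name of the target it belongs to.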

In [5]:
import pandas as pd
import yaml

def expand_targets(df, target_columns, additional_columns=None):
    """
    Reshape a wide DataFrame into long format, one row per non-missing target value.

    Each column listed in ``target_columns`` is unpivoted so that its values
    land in a single 'target' column, with the originating column name stored
    in 'target_name'. Rows whose target value is missing are dropped. Before
    reshaping, the number of non-missing values in each target column is printed.

    Parameters:
        df (pd.DataFrame): Input DataFrame holding the target columns and any metadata columns.
        target_columns (list): Names of the target columns to unpivot.
        additional_columns (list, optional): Metadata columns to carry onto every
            output row (e.g., 'temperature', 'composition').

    Returns:
        pd.DataFrame: Long-format DataFrame with the additional columns (if any),
        followed by 'target_name' and 'target'.

    Raises:
        ValueError: If ``target_columns`` is empty or None.
    """
    if not target_columns:
        raise ValueError("Please provide a list of target column names.")

    # Report how many usable values each target contributes.
    print("Non-missing values count for each target column:")
    for column in target_columns:
        print(f"{column}: {df[column].count()} non-missing values")

    # Unpivot the targets; an empty/None metadata list means no id columns.
    long_df = pd.melt(
        df,
        id_vars=additional_columns or None,
        value_vars=target_columns,
        var_name='target_name',
        value_name='target',
    )

    # Keep only the rows that actually carry a target value.
    return long_df.dropna(subset=['target'])

# Read the target-property definitions from the YAML file.
with open("thermoelectric_properties.yaml", "r", encoding="utf-8") as yaml_file:
    properties = yaml.safe_load(yaml_file)

# Collect the dataset column name declared for each property.
column_names = [spec['column_name'] for spec in properties.values()]

df = pd.read_excel("data/sysTEm_dataset.xlsx")
print(f'original dataset size : {len(df)}')

# Metadata columns carried onto every expanded row.
metadata_columns = [
    '#',
    'Initial Dataset',
    'Source Paper',
    'Pymatgen Composition',
    'Pretty Formula',
    'Year',
    'Temperature (K)',
]
expanded_df = expand_targets(
    df,
    target_columns=column_names,
    additional_columns=metadata_columns,
)
print(f'expanded dataset size : {len(expanded_df)}')

original dataset size : 8458
Non-missing values count for each target column:
Total Thermal Conductivity (W/mK): 7840 non-missing values
Electronic Thermal Conductivity (W/mK): 2036 non-missing values
Lattice Thermal Conductivity (W/mK): 2036 non-missing values
Electrical Conductivity (S/cm): 8387 non-missing values
Seebeck Coefficient (µV/K): 8392 non-missing values
Power Factor (µW/cmK²): 8424 non-missing values
zT: 7807 non-missing values
expanded dataset size : 44922


In [6]:
# Save expanded_df to an Excel workbook, omitting the index column.
expanded_df.to_excel('data/dataset_expanded.xlsx', index=False)