In [8]:
import re
import pandas as pd
from collections import defaultdict

# Function to extract data from the structure info file and group by space group
def extract_structure_data(file_path, functional_name, grouped_data, energy_data):
    """Parse one structure-info file and record its phases per space group.

    The file is read in full and cut into phase sections on the dashed
    separator line.  A section is used only when all four fields are found:
    the ``International:`` space-group symbol, three ``Lattice Constants:``
    values, three ``Lattice Angles:`` values and an ``E0=`` energy.
    Sections missing any field are skipped silently.

    Args:
        file_path: Path of the structure-info text file to read.
        functional_name: Key under which this file's values are stored
            (the callers in this file use 'PBE', 'PBESol' or 'HSE').
        grouped_data: Mapping updated in place; for each space group it
            holds a dict whose ``functional_name`` entry is set to the
            six-element list ``[a, b, c, alpha, beta, gamma]``.
        energy_data: Mapping updated in place; for each space group it
            holds a dict whose ``functional_name`` entry is set to E0.

    Returns:
        None.  Results are delivered through the two mappings.  If a space
        group occurs more than once in the file, the last section wins.
    """
    space_group_re = re.compile(r'International:\s*(\S+)')
    constants_re = re.compile(r'Lattice Constants:\s*([\d\.]+\s+[\d\.]+\s+[\d\.]+)')
    angles_re = re.compile(r'Lattice Angles:\s*([\d\.]+\s+[\d\.]+\s+[\d\.]+)')
    energy_re = re.compile(r'E0=\s*([-\d\.E+]+)')

    with open(file_path, 'r') as handle:
        contents = handle.read()

    for section in contents.split('-----------------------------------------'):
        found = [
            pattern.search(section)
            for pattern in (space_group_re, constants_re, angles_re, energy_re)
        ]
        if not all(found):
            # Incomplete section (e.g. header text or trailing whitespace).
            continue

        symbol_m, constants_m, angles_m, energy_m = found
        symbol = symbol_m.group(1)
        cell = [float(token) for token in constants_m.group(1).split()]
        cell += [float(token) for token in angles_m.group(1).split()]
        total_energy = float(energy_m.group(1))

        # Create the per-space-group slots on first sight, then fill in the
        # entry that belongs to this functional.
        lattice_slots = grouped_data.setdefault(
            symbol, {'PBE': [], 'PBESol': [], 'HSE': []}
        )
        lattice_slots[functional_name] = cell

        energy_slots = energy_data.setdefault(
            symbol, {'PBE': None, 'PBESol': None, 'HSE': None}
        )
        energy_slots[functional_name] = total_energy

# Function to save the grouped data into a CSV file for lattice constants and angles
def save_to_csv(file_paths, output_csv, energy_csv="different_xc_energy_data.csv"):
    """Collect structure data for several functionals and write two CSV files.

    Each input file is parsed with ``extract_structure_data``.  Two tables
    are then written, both with one row per space group:

    * ``output_csv``: lattice constants and angles for PBE, PBESol and HSE
      (six columns per functional, NaN where a functional has no entry).
    * ``energy_csv``: E0 of each functional relative to that functional's
      lowest E0, sorted by the PBE column.

    For every functional that has at least one energy, the minimum energy
    and its space group are printed.  A functional without any energy is
    reported and its column is left as NaN instead of raising.

    Args:
        file_paths: Iterable of ``(file_path, functional_name)`` pairs.
        output_csv: Destination path of the lattice-parameter CSV.
        energy_csv: Destination path of the relative-energy CSV.  Defaults
            to the file name this function has always written.

    Returns:
        None.
    """
    functionals = ['PBE', 'PBESol', 'HSE']
    lattice_fields = ("a", "b", "c", "alpha", "beta", "gamma")
    nan = float('nan')

    grouped_data = defaultdict(lambda: {name: [] for name in functionals})
    energy_data = defaultdict(lambda: {name: None for name in functionals})

    # Extract data from each file
    for file_path, functional_name in file_paths:
        extract_structure_data(file_path, functional_name, grouped_data, energy_data)

    # --- Lattice constants and angles -----------------------------------
    headers = ["Space Group"] + [
        f"{functional} {field}"
        for functional in functionals
        for field in lattice_fields
    ]

    all_data = []
    for space_group, data in grouped_data.items():
        row = [space_group]
        for functional in functionals:
            # An empty list means this functional has no entry for the
            # space group; pad so every row keeps the same width.
            row.extend(data[functional] or [nan] * len(lattice_fields))
        all_data.append(row)

    df = pd.DataFrame(all_data, columns=headers)
    df.to_csv(output_csv, index=False)

    # --- Energies --------------------------------------------------------
    energy_headers = ["Space Group"] + [name + " Energy" for name in functionals]
    energy_rows = [
        [space_group]
        + [nan if energies[name] is None else energies[name] for name in functionals]
        for space_group, energies in energy_data.items()
    ]
    energy_df = pd.DataFrame(energy_rows, columns=energy_headers)

    # Convert each column to energies relative to its own minimum.
    for functional in functionals:
        column = functional + " Energy"
        available = energy_df[column].dropna()

        if available.empty:
            # Without this guard the minimum is NaN, no row compares equal
            # to it, and looking up the matching space group fails.
            print(f"No energy data for {functional}; relative energies not computed")
            continue

        # idxmin returns the first row holding the minimum, matching the
        # "first match" choice when several space groups tie.
        min_row = available.idxmin()
        min_energy = available.loc[min_row]
        min_energy_space_group = energy_df.loc[min_row, "Space Group"]
        print(f"Minimum energy for {functional}: {min_energy}, corresponding to space group: {min_energy_space_group}")

        energy_df[column] = energy_df[column] - min_energy

    # Sort the energy data by PBE Energy (rows without a PBE value go last)
    energy_df = energy_df.sort_values(by="PBE Energy")

    energy_df.to_csv(energy_csv, index=False)

# Structure-info file produced by each exchange-correlation functional,
# listed in the order the files are read.
structure_info_files = {
    'HSE': 'HSE_structure_info.txt',
    'PBE': 'PBE_structure_info.txt',
    'PBESol': 'PBESol_structure_info.txt',
}

# save_to_csv expects (file path, functional name) pairs.
file_paths = [
    (info_file, functional)
    for functional, info_file in structure_info_files.items()
]

# Destination of the lattice-parameter table; the relative-energy table is
# written by save_to_csv under its own file name.
output_csv = 'structure_data_grouped_.csv'
save_to_csv(file_paths, output_csv)


Minimum energy for PBE: -133.13938, corresponding to space group: P2_1/c
Minimum energy for PBESol: -138.4667, corresponding to space group: P2_1/c
Minimum energy for HSE: -156.62427, corresponding to space group: P2_1/c
