In [3]:
import re
import pandas as pd
from pathlib import Path

# Set the directory path containing .log files
log_dir = Path("ML_dataset")  # Update this path
# Sorted so that files are processed (and rows are written) in a reproducible
# order; Path.glob() yields entries in an arbitrary, filesystem-dependent order.
log_files = sorted(log_dir.glob("*.log"))

# Full periodic table for element lookup
periodic_table = {
    1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", 10: "Ne",
    11: "Na", 12: "Mg", 13: "Al", 14: "Si", 15: "P", 16: "S", 17: "Cl", 18: "Ar", 19: "K", 20: "Ca",
    21: "Sc", 22: "Ti", 23: "V", 24: "Cr", 25: "Mn", 26: "Fe", 27: "Co", 28: "Ni", 29: "Cu", 30: "Zn",
    31: "Ga", 32: "Ge", 33: "As", 34: "Se", 35: "Br", 36: "Kr", 37: "Rb", 38: "Sr", 39: "Y", 40: "Zr",
    41: "Nb", 42: "Mo", 43: "Tc", 44: "Ru", 45: "Rh", 46: "Pd", 47: "Ag", 48: "Cd", 49: "In", 50: "Sn",
    51: "Sb", 52: "Te", 53: "I", 54: "Xe", 55: "Cs", 56: "Ba", 57: "La", 58: "Ce", 59: "Pr", 60: "Nd",
    61: "Pm", 62: "Sm", 63: "Eu", 64: "Gd", 65: "Tb", 66: "Dy", 67: "Ho", 68: "Er", 69: "Tm", 70: "Yb",
    71: "Lu", 72: "Hf", 73: "Ta", 74: "W", 75: "Re", 76: "Os", 77: "Ir", 78: "Pt", 79: "Au", 80: "Hg",
    81: "Tl", 82: "Pb", 83: "Bi", 84: "Po", 85: "At", 86: "Rn", 87: "Fr", 88: "Ra", 89: "Ac", 90: "Th",
    91: "Pa", 92: "U", 93: "Np", 94: "Pu", 95: "Am", 96: "Cm", 97: "Bk", 98: "Cf", 99: "Es", 100: "Fm",
    101: "Md", 102: "No", 103: "Lr", 104: "Rf", 105: "Db", 106: "Sg", 107: "Bh", 108: "Hs", 109: "Mt",
    110: "Ds", 111: "Rg", 112: "Cn", 113: "Nh", 114: "Fl", 115: "Mc", 116: "Lv", 117: "Ts", 118: "Og"
}

# --- Patterns and markers used to recognise sections of the log text ---
FREQUENCY_LINE = re.compile(r"Frequencies --\s+(.+)")
# The label inside E(...) can contain characters outside \w (for example a
# hyphen), so accept anything up to the closing parenthesis.
SCF_ENERGY_LINE = re.compile(r"SCF Done:  E\([^)]+\) =\s*(-?\d+\.\d+)")
# Geometry rows have three integer columns followed by three numbers.
GEOMETRY_ROW = re.compile(r"\s*\d+\s+\d+\s+\d+\s+[-\d.Ee]+\s+[-\d.Ee]+\s+[-\d.Ee]+")
# Force rows have two integer columns followed by three numbers.
FORCE_ROW = re.compile(r"\s*\d+\s+\d+\s+[-\d.Ee]+\s+[-\d.Ee]+\s+[-\d.Ee]+")
# Dashed rules that frame the two kinds of table (they differ in length).
GEOMETRY_RULE = '---------------------------------------------------------------------'
FORCE_RULE = '-------------------------------------------------------------------'

# Keys of every output row, in column order.
ROW_KEYS = (
    "Filename", "Step", "Element",
    "X", "Y", "Z",
    "Fx", "Fy", "Fz",
    "SCF_Energy", "ImaginaryFrequency", "ErrorTermination",
)


def count_imaginary_frequencies(lines):
    """Return how many negative values appear on 'Frequencies --' lines.

    Args:
        lines: iterable of text lines from one log file.

    Returns:
        Total number of negative frequency values over all matching lines.

    Raises:
        ValueError: if a token on a frequency line is not a valid float.
    """
    total = 0
    for line in lines:
        found = FREQUENCY_LINE.search(line)
        if found:
            total += sum(1 for value in found.group(1).split() if float(value) < 0)
    return total


def extract_table_blocks(lines, header, rule, row_pattern):
    """Collect the data rows of every table introduced by ``header``.

    Scanning starts when a line contains ``header``. Lines containing ``rule``
    are skipped until at least one row has been collected; the next ``rule``
    line then closes the table. Only lines matching ``row_pattern`` are kept
    (stripped of surrounding whitespace). A table still open at the end of the
    input is discarded, because its closing rule was never seen.

    Args:
        lines: iterable of text lines from one log file.
        header: substring that marks the start of a table.
        rule: substring that marks the dashed separator lines of the table.
        row_pattern: compiled regex a data row must match from its start.

    Returns:
        List of tables, each a list of stripped row strings.
    """
    blocks = []
    current = []
    capturing = False
    for line in lines:
        if header in line:
            # A new header while rows are pending closes the previous table.
            if current:
                blocks.append(current)
                current = []
            capturing = True
        elif capturing:
            if rule in line:
                # Rules above the first row are part of the table heading.
                if current:
                    blocks.append(current)
                    current = []
                    capturing = False
            elif row_pattern.match(line):
                current.append(line.strip())
    return blocks


def extract_final_step(lines):
    """Return the last geometry, its forces and the last SCF energy of a log.

    The first 'Standard orientation:' table is discarded; ``step`` is the
    zero-based index of the last remaining table. The energy is the last
    'SCF Done' value in the file, so that geometry, forces and energy all come
    from the end of the log instead of being paired through an index that is
    only right when the table counts happen to be offset by exactly one.

    NOTE(review): confirm that the force table is reported in the same
    coordinate frame as the 'Standard orientation' geometry before relying on
    the pairing of coordinates and forces.

    Args:
        lines: list of text lines from one log file.

    Returns:
        Tuple ``(step, std_block, force_block, energy)``.

    Raises:
        ValueError: if geometry, force or energy data is missing, or if the
            final geometry and force tables list a different number of atoms.
    """
    std_blocks = extract_table_blocks(
        lines, 'Standard orientation:', GEOMETRY_RULE, GEOMETRY_ROW
    )[1:]
    force_blocks = extract_table_blocks(
        lines, 'Forces (Hartrees/Bohr)', FORCE_RULE, FORCE_ROW
    )
    scf_energies = [
        float(found.group(1))
        for found in map(SCF_ENERGY_LINE.search, lines)
        if found
    ]

    if not (std_blocks and force_blocks and scf_energies):
        raise ValueError("missing geometry, force or SCF energy data")

    std_block = std_blocks[-1]
    force_block = force_blocks[-1]
    if len(std_block) != len(force_block):
        raise ValueError(
            f"atom count mismatch: {len(std_block)} geometry rows, "
            f"{len(force_block)} force rows"
        )

    return len(std_blocks) - 1, std_block, force_block, scf_energies[-1]


def build_rows(filename, step, std_block, force_block, energy,
               num_imaginary, has_error_termination):
    """Build one output row per atom for a single log file.

    File-level values (filename, energy, imaginary-frequency count and error
    flag) are written on the first atom's row only; the other rows carry None.

    Args:
        filename: name of the log file.
        step: step index stored in the ``Step`` column of every row.
        std_block: stripped geometry rows (atomic number in column 2,
            coordinates in columns 4-6).
        force_block: stripped force rows (components in columns 3-5).
        energy: SCF energy for the file.
        num_imaginary: number of negative frequencies found in the file.
        has_error_termination: whether the file reported an error termination.

    Returns:
        List of dicts keyed by ``ROW_KEYS``.
    """
    rows = []
    for position, (geometry_entry, force_entry) in enumerate(zip(std_block, force_block)):
        geometry = geometry_entry.split()
        force = force_entry.split()
        atomic_number = int(geometry[1])
        is_first = position == 0
        rows.append({
            "Filename": filename if is_first else None,
            "Step": step,
            # Unknown atomic numbers fall back to a "Z<number>" label.
            "Element": periodic_table.get(atomic_number, f"Z{atomic_number}"),
            "X": float(geometry[3]),
            "Y": float(geometry[4]),
            "Z": float(geometry[5]),
            "Fx": float(force[2]),
            "Fy": float(force[3]),
            "Fz": float(force[4]),
            "SCF_Energy": energy if is_first else None,
            "ImaginaryFrequency": num_imaginary if is_first else None,
            "ErrorTermination": has_error_termination if is_first else None
        })
    return rows


all_data = []

for file_path in log_files:
    # errors="replace" keeps one stray undecodable byte from aborting the run.
    with file_path.open(errors="replace") as handle:
        lines = handle.readlines()

    # Check for error termination
    has_error_termination = any("Error termination" in line for line in lines)

    # Count imaginary frequencies
    num_imaginary = count_imaginary_frequencies(lines)

    # Skip files with any imaginary frequencies or errors
    if has_error_termination or num_imaginary > 0:
        print(f"Skipping: {file_path.name} (Imaginary: {num_imaginary}, Error: {has_error_termination})")
        continue

    # Only take the final optimization step
    try:
        step, std_block, force_block, energy = extract_final_step(lines)
    except ValueError as problem:
        # Report the file instead of dropping it silently or crashing the run.
        print(f"Skipping: {file_path.name} ({problem})")
        continue

    all_data.extend(build_rows(
        file_path.name, step, std_block, force_block, energy,
        num_imaginary, has_error_termination,
    ))

    # Append a blank row to separate files
    all_data.append(dict.fromkeys(ROW_KEYS))


# --- Final combined DataFrame ---
# The columns are listed explicitly so that the table and the CSV header keep
# the same layout even when no log file produced any rows (an empty list would
# otherwise give a frame with no columns and a CSV without a header line).
# The order matches the key order of the row dictionaries in all_data.
OUTPUT_COLUMNS = [
    "Filename", "Step", "Element",
    "X", "Y", "Z",
    "Fx", "Fy", "Fz",
    "SCF_Energy", "ImaginaryFrequency", "ErrorTermination",
]
df = pd.DataFrame(all_data, columns=OUTPUT_COLUMNS)

# --- Save result to CSV ---
# The file is written next to the input logs, without the DataFrame index.
output_path = log_dir / "combined_converged_cleaned_output.csv"
df.to_csv(output_path, index=False)

print(f"\n✅ Cleaned combined output saved to: {output_path}")

Skipping: 2,6-PCB_rdkit_conf_1.log (Imaginary: 1, Error: False)
Skipping: Azintamide_rdkit_conf_4.log (Imaginary: 0, Error: True)
Skipping: Ethirimol_rdkit_conf_12.log (Imaginary: 0, Error: True)
Skipping: m-Xylene__rdkit_conf_1.log (Imaginary: 1, Error: False)
Skipping: phenothrin_rdkit_conf_23.log (Imaginary: 0, Error: True)
Skipping: phenothrin_rdkit_conf_24.log (Imaginary: 1, Error: False)

✅ Cleaned combined output saved to: ML_dataset\combined_converged_cleaned_output.csv


In [4]:
# Show the combined table as this cell's output.
df

Unnamed: 0,Filename,Step,Element,X,Y,Z,Fx,Fy,Fz,SCF_Energy,ImaginaryFrequency,ErrorTermination
0,"1,1,2,2-Tetrachloroethane_rdkit_conf_1.log",5.0,Cl,1.489917,1.621225,-0.117794,7.265000e-06,0.000003,5.914000e-06,-1894.274377,0.0,False
1,,5.0,C,0.748630,-0.026527,-0.225010,1.240400e-05,-0.000005,-1.789900e-05,,,
2,,5.0,Cl,1.668336,-1.194270,0.807756,7.419000e-06,0.000002,6.325000e-06,,,
3,,5.0,C,-0.748630,0.026527,0.225010,-1.240400e-05,0.000005,1.789900e-05,,,
4,,5.0,Cl,-1.489917,-1.621225,0.117794,-7.265000e-06,-0.000003,-5.914000e-06,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
26341,,13.0,H,3.143091,0.404970,-0.566556,1.290500e-05,-0.000004,5.870000e-07,,,
26342,,13.0,H,4.596085,0.121664,1.447520,-1.600000e-07,-0.000003,6.276000e-06,,,
26343,,13.0,H,2.952600,0.553299,1.918884,2.429000e-06,0.000004,-8.559000e-06,,,
26344,,13.0,H,3.508429,-1.114685,2.079297,1.826000e-06,0.000002,1.988000e-06,,,
