In [None]:
# csv_to_cif.py
import os, ast, json, math
from collections import Counter
import numpy as np
import pandas as pd


def vec_len(v):
    """Return the Euclidean norm of ``v`` as a built-in ``float``."""
    as_floats = np.array(v, dtype=float)
    length = np.linalg.norm(as_floats)
    return float(length)

def angle_deg(u, v):
    """Return the angle between vectors ``u`` and ``v`` in degrees.

    Args:
        u: Sequence of numbers (converted to a float array).
        v: Sequence of numbers of the same length as ``u``.

    Returns:
        The angle as a built-in ``float`` in the range [0, 180].

    Raises:
        ValueError: If the product of the two norms is zero or not finite.
            Without this check the 0/0 division yields NaN, which the
            clamp below would turn into a cosine of 1.0 and a silently
            wrong angle of 0 degrees.
    """
    u = np.array(u, float)
    v = np.array(v, float)
    norm_product = float(np.linalg.norm(u) * np.linalg.norm(v))
    if norm_product == 0.0 or not math.isfinite(norm_product):
        raise ValueError("cannot compute angle: zero-length or non-finite vector")
    cosang = float(np.dot(u, v) / norm_product)
    # Rounding can push the cosine slightly outside [-1, 1], which would
    # make math.acos raise; clamp it back into the valid domain.
    cosang = max(-1.0, min(1.0, cosang))
    return float(np.degrees(math.acos(cosang)))

def cart_to_frac(a_vec, b_vec, c_vec, r_cart):
    """Convert a Cartesian position to fractional coordinates wrapped to [0, 1).

    The lattice vectors are used as the columns of a 3x3 matrix ``M`` and the
    fractional coordinates ``f`` are the solution of ``M @ f = r_cart``.

    Args:
        a_vec, b_vec, c_vec: Lattice vectors (same units as ``r_cart``).
        r_cart: Cartesian position.

    Returns:
        A NumPy array of fractional coordinates, each in [0, 1).

    Raises:
        numpy.linalg.LinAlgError: If the lattice matrix is singular.
    """
    lattice = np.column_stack([a_vec, b_vec, c_vec])
    frac = np.linalg.solve(lattice, np.asarray(r_cart, dtype=float))
    wrapped = frac - np.floor(frac)
    # For a tiny negative coordinate (e.g. -1e-17) the subtraction rounds to
    # exactly 1.0, which is outside [0, 1); 1.0 and 0.0 are the same point
    # under periodicity, so map it back to 0.0.
    wrapped[wrapped >= 1.0] = 0.0
    return wrapped


def normalize_space_group(sg):
    """Return a spaced-out space-group symbol, or ``"P 1"`` as a fallback.

    Any value that is not a non-empty string (``None``, NaN, ``""``, numbers)
    yields ``"P 1"``. Otherwise every ``-3`` and every ``:`` is surrounded
    with spaces and the result is stripped of leading/trailing whitespace.
    """
    if isinstance(sg, str) and sg:
        spaced = sg
        # Order matters only for readability here; the two tokens never overlap.
        for token in ("-3", ":"):
            spaced = spaced.replace(token, f" {token} ")
        return spaced.strip()
    return "P 1"

def make_formula_sum(elements):
    """Build a formula-sum string such as ``"Ni4 Ta6"`` from a list of symbols.

    Each distinct symbol appears once, followed by its occurrence count
    (a count of 1 is written explicitly); entries are sorted by symbol and
    separated by single spaces. An empty input gives an empty string.
    """
    tally = Counter(elements)
    parts = [f"{symbol}{count}" for symbol, count in sorted(tally.items())]
    return " ".join(parts)


def parse_structure_field(value):
    """Parse a 'structure' cell into a non-empty list.

    A ``list`` or ``dict`` is used as-is. Any other non-NaN value is
    stringified, stripped, and parsed as JSON first, falling back to
    ``ast.literal_eval`` (which accepts Python-literal syntax such as
    single-quoted strings). A lone dict is wrapped in a one-element list.

    Returns:
        The parsed, non-empty list.

    Raises:
        ValueError: If the value is NaN, cannot be parsed by either parser,
            or parses to something other than a dict or a non-empty list.
    """
    if isinstance(value, (list, dict)):
        parsed = value
    else:
        if pd.isna(value):
            raise ValueError("structure is NaN")
        text = str(value).strip()
        try:
            parsed = json.loads(text)
        except Exception:
            # Not valid JSON; try Python literal syntax before giving up.
            try:
                parsed = ast.literal_eval(text)
            except Exception as e:
                raise ValueError(f"cannot parse structure: {e}")

    if isinstance(parsed, dict):
        return [parsed]
    if isinstance(parsed, list) and parsed:
        return parsed
    raise ValueError("parsed structure is neither list nor non-empty list")


def detect_and_to_angstrom(vec):
    """Return lattice vector ``vec`` as a float array, heuristically in Å.

    Unit heuristic: a vector whose norm is below 1e-6 is assumed to be given
    in meters and is divided by 1e-10; anything else is returned unscaled.
    """
    lattice_vec = np.array(vec, dtype=float)
    if np.linalg.norm(lattice_vec) < 1e-6:
        return lattice_vec / 1e-10
    return lattice_vec

def detect_pos_and_to_angstrom(pos):
    """Return atomic position ``pos`` as a float array, heuristically in Å.

    Unit heuristic: a position whose norm is below 1e-6 is assumed to be
    given in meters and is divided by 1e-10; anything else is returned
    unscaled.

    NOTE: the decision is made per position, so any non-zero position closer
    than 1e-6 to the origin is rescaled regardless of the units the other
    atoms use.
    """
    position = np.array(pos, dtype=float)
    looks_like_meters = np.linalg.norm(position) < 1e-6
    if looks_like_meters:
        position = position / 1e-10
    return position


def build_cif_text(formula_structural, space_group,
                   a_len, b_len, c_len, alpha, beta, gamma, volume,
                   site_rows, atom_types, data_prefix=""):
    """Render one structure as CIF text.

    Args:
        formula_structural: Formula used for the data block code and for
            ``_chemical_formula_structural``.
        space_group: Value written (single-quoted) as
            ``_symmetry_space_group_name_H-M``.
        a_len, b_len, c_len: Cell lengths, written with 8 decimals.
        alpha, beta, gamma: Cell angles, written with 8 decimals.
        volume: Cell volume, written with 8 decimals.
        site_rows: Iterable of dicts with keys ``type_symbol``, ``label``,
            ``mult``, ``fx``, ``fy``, ``fz`` and ``occ`` (one per atom site).
        atom_types: Iterable of element symbols; counted for
            ``_chemical_formula_sum`` and de-duplicated for the
            ``_atom_type_symbol`` loop.
        data_prefix: Text inserted between ``data_`` and the formula.

    Returns:
        The CIF document as a string ending with a newline.

    Note:
        Only the identity operation ``x, y, z`` is written to the symmetry
        loop, whatever ``space_group`` is.
    """
    formula_text = str(formula_structural)

    # A CIF data block code must not contain whitespace; collapse any runs
    # of whitespace to underscores (a no-op for whitespace-free formulas).
    block_code = "_".join(f"{data_prefix}{formula_text}".split())

    # A data value containing whitespace must be quoted to stay one token.
    if any(ch.isspace() for ch in formula_text):
        formula_value = f"'{formula_text}'"
    else:
        formula_value = formula_text

    lines = [
        "# generated using pymatgen",
        f"data_{block_code}",
        f"_symmetry_space_group_name_H-M   '{space_group}'",
        f"_cell_length_a   {a_len:.8f}",
        f"_cell_length_b   {b_len:.8f}",
        f"_cell_length_c   {c_len:.8f}",
        f"_cell_angle_alpha   {alpha:.8f}",
        f"_cell_angle_beta   {beta:.8f}",
        f"_cell_angle_gamma   {gamma:.8f}",
        f"_chemical_formula_structural   {formula_value}",
        f"_chemical_formula_sum   '{make_formula_sum(atom_types)}'",
        f"_cell_volume   {volume:.8f}",
        "loop_",
        " _symmetry_equiv_pos_site_id",
        " _symmetry_equiv_pos_as_xyz",
        "  1  'x, y, z'",
        "loop_",
        " _atom_type_symbol",
    ]
    lines.extend(f"  {el}" for el in sorted(set(atom_types)))

    lines.extend([
        "loop_",
        " _atom_site_type_symbol",
        " _atom_site_label",
        " _atom_site_symmetry_multiplicity",
        " _atom_site_fract_x",
        " _atom_site_fract_y",
        " _atom_site_fract_z",
        " _atom_site_occupancy",
    ])
    for r in site_rows:
        # ':g' prints full occupancy as "1" (same as before) but keeps a
        # partial occupancy such as 0.5, which ':.0f' rounded to 0 or 1.
        lines.append(
            f"  {r['type_symbol']}  {r['label']}  {r['mult']}  "
            f"{r['fx']:.8f}  {r['fy']:.8f}  {r['fz']:.8f}  {r['occ']:g}"
        )
    return "\n".join(lines) + "\n"


def write_cif_from_row(row, out_dir: str, number: int) -> str:
    """Write the structure stored in one table row to a CIF file.

    Only the first entry of the parsed ``structure`` field is used. Its
    ``data`` mapping must provide lattice vectors ``a``, ``b``, ``c`` and a
    non-empty ``atoms`` list whose items carry ``x``, ``y``, ``z`` and
    ``element``.

    Args:
        row: Mapping-like row (supports ``row["structure"]`` and
            ``row.get``) with a ``structure`` field and optional
            ``formula`` / ``space_group`` fields.
        out_dir: Directory that receives the file (created if missing).
        number: Number used as prefix of the data block and file name.

    Returns:
        Path of the written file, ``<out_dir>/<number>_<formula>.cif``.

    Raises:
        ValueError: If required structure fields are missing, the atom list
            is empty, or the structure cannot be parsed.
        numpy.linalg.LinAlgError: If the lattice matrix is singular.
    """
    s0 = parse_structure_field(row["structure"])[0]
    if "data" not in s0:
        raise ValueError("structure[0] has no 'data'")
    cell = s0["data"]
    for k in ("a", "b", "c", "atoms"):
        if k not in cell:
            raise ValueError(f"cell missing '{k}'")

    atoms = cell["atoms"]
    if not atoms:
        # An atom-site loop without any rows is not a usable CIF.
        raise ValueError("cell has no atoms")

    # Lattice parameters
    a_vec = detect_and_to_angstrom(cell["a"])
    b_vec = detect_and_to_angstrom(cell["b"])
    c_vec = detect_and_to_angstrom(cell["c"])
    a_len, b_len, c_len = vec_len(a_vec), vec_len(b_vec), vec_len(c_vec)
    alpha, beta, gamma = angle_deg(b_vec, c_vec), angle_deg(a_vec, c_vec), angle_deg(a_vec, b_vec)
    # The triple product is negative for a left-handed set of lattice
    # vectors; a cell volume is reported as a positive quantity.
    volume = abs(float(np.dot(a_vec, np.cross(b_vec, c_vec))))

    # Collect elements and raw Cartesian positions
    elements, raw_positions = [], []
    for at in atoms:
        if not all(k in at for k in ("x", "y", "z", "element")):
            raise ValueError("atom missing one of x,y,z,element")
        elements.append(at["element"])
        raw_positions.append(np.array([at["x"], at["y"], at["z"]], float))

    # Decide the position unit once for the whole structure, using the same
    # norm < 1e-6 threshold as the lattice heuristic. Deciding per atom would
    # scale an Å-unit atom lying within 1e-6 of the origin by 1e10.
    largest_norm = max(float(np.linalg.norm(p)) for p in raw_positions)
    if largest_norm < 1e-6:
        atom_cart = [p / 1e-10 for p in raw_positions]
    else:
        atom_cart = raw_positions
    atom_frac = [cart_to_frac(a_vec, b_vec, c_vec, rc) for rc in atom_cart]

    # Site labels: element symbol plus a per-element running index from 0
    cnt = Counter()
    site_rows = []
    for el, f in zip(elements, atom_frac):
        label = f"{el}{cnt[el]}"
        cnt[el] += 1
        site_rows.append({"type_symbol": el, "label": label, "mult": 1,
                          "fx": f[0], "fy": f[1], "fz": f[2], "occ": 1.0})

    # A missing, NaN or blank formula falls back to the sorted distinct
    # element symbols; non-string values are stringified instead of
    # crashing on .strip().
    formula_raw = row.get("formula")
    if isinstance(formula_raw, str):
        formula_structural = formula_raw.strip()
    elif formula_raw is None or pd.isna(formula_raw):
        formula_structural = ""
    else:
        formula_structural = str(formula_raw).strip()
    if not formula_structural:
        formula_structural = "".join(sorted(set(elements)))
    sg = normalize_space_group(row.get("space_group"))

    cif_text = build_cif_text(
        formula_structural, sg, a_len, b_len, c_len, alpha, beta, gamma, volume,
        site_rows, elements, data_prefix=f"{number}_"
    )

    # Keep only characters that are safe in a file name.
    safe_formula = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in formula_structural) or "structure"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{number}_{safe_formula}.cif")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cif_text)
    return out_path


def convert_csv_to_cif(csv_path: str, out_dir: str = "result") -> list[str]:
    """Convert every row of a CSV file into a CIF file inside ``out_dir``.

    Rows are numbered from 2 (the first data row is number 2). Processing
    stops at the first row whose ``formula`` is missing or blank. A row
    that fails for any other reason is recorded and skipped; if any rows
    were skipped, they are written to ``<out_dir>/skipped_rows.csv``.

    Args:
        csv_path: Path of the input CSV file.
        out_dir: Output directory (created if it does not exist).

    Returns:
        Paths of the CIF files that were written, in row order.
    """
    os.makedirs(out_dir, exist_ok=True)
    table = pd.read_csv(csv_path)

    written = []
    failures = []  # (row_number(2-based), df_index, reason)

    row_number = 1  # incremented before use, so the first data row is 2
    for df_index, row in table.iterrows():
        row_number += 1

        # A blank formula ends the run; it is not counted as a skipped row.
        formula_val = row.get("formula")
        formula_missing = pd.isna(formula_val) or str(formula_val).strip() == ""
        if formula_missing:
            print(f"Stop at row {row_number}: empty 'formula'.")
            break

        # Any failure for this row is logged and the loop carries on.
        try:
            has_structure = "structure" in row and not pd.isna(row["structure"])
            if not has_structure:
                raise ValueError("structure is NaN")
            cif_path = write_cif_from_row(row, out_dir, number=row_number)
            written.append(cif_path)
        except Exception as err:
            failures.append((row_number, df_index, str(err)))

    # Persist the skipped rows, if any, next to the generated files.
    if failures:
        report = pd.DataFrame(
            failures, columns=["row_number(2-based)", "df_index", "reason"]
        )
        report.to_csv(os.path.join(out_dir, "skipped_rows.csv"), index=False)

    print(f"Done. Created {len(written)} CIFs. Skipped {len(failures)} rows.")
    return written


if __name__ == "__main__":
    # Script entry point: convert the example CSV into ./result and list
    # every CIF file that was written.
    csv_path = "1_MatDX_EF.csv"
    files = convert_csv_to_cif(csv_path, out_dir="result")
    for written_path in files:
        print(f" - {written_path}")


Done. Created 4471 CIFs. Skipped 529 rows.
 - result\2_Ni4Ta6.cif
 - result\3_Mn4Sb2.cif
 - result\4_Ir2Na2.cif
 - result\5_MoSm.cif
 - result\6_Sn17P12.cif
 - result\7_Sr6Br4.cif
 - result\8_Ge4P12.cif
 - result\9_Mn8Sb.cif
 - result\10_Ba2Bi3.cif
 - result\11_Mo3W3.cif
 - result\12_InSn5.cif
 - result\13_Ga2Sr.cif
 - result\14_Pd2Ta2.cif
 - result\15_CuNb.cif
 - result\16_Ac4I6.cif
 - result\17_Fe6Sb18.cif
 - result\18_InSn5.cif
 - result\19_In4Sn2.cif
 - result\20_Sr6Br4.cif
 - result\21_In2Sn4.cif
 - result\22_RhGe3.cif
 - result\23_CuEu.cif
 - result\24_Cu2Ga2.cif
 - result\25_Fe4Na8.cif
 - result\26_PtSm.cif
 - result\27_Cr4Sr5.cif
 - result\28_CeSb.cif
 - result\29_Nb2W6.cif
 - result\30_In5Br.cif
 - result\31_CsNi.cif
 - result\32_In4Br12.cif
 - result\33_Ir3K.cif
 - result\34_Nb4W8.cif
 - result\35_Ag4Sn8.cif
 - result\36_Nb6Pt2.cif
 - result\37_AlMn.cif
 - result\38_SbP.cif
 - result\39_CrTa3.cif
 - result\40_Al2Rh.cif
 - result\41_TaV5.cif
 - result\42_Au2P.cif
 - result\43_