In [7]:
import pickle
import gzip
import sys
import pandas as pd

def open_and_modify_pkl_gz(file_path):
    """
    Load a gzip-compressed pickle, rename selected DataFrame columns, and
    save the result to a separate ``*_modified.pkl.gz`` file.

    If the unpickled object is a pandas DataFrame, the columns
    ``rwp_clean`` and ``ws_clean`` are renamed to ``rwp`` and ``wd``
    (``DataFrame.rename`` ignores mapping keys that are not present).
    Any other object is written back unchanged.

    The output path is derived from ``file_path`` by replacing only the
    trailing ``.pkl.gz`` suffix with ``_modified.pkl.gz``. If the path does
    not end in ``.pkl.gz``, ``_modified.pkl.gz`` is appended instead, so the
    input file is never overwritten.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Location of the gzip-compressed pickle to read.

    Returns
    -------
    object or None
        The loaded (and, for DataFrames, renamed) object, or ``None`` if any
        step failed. Errors are printed rather than raised.
    """
    suffix = ".pkl.gz"
    try:
        # NOTE: unpickling can execute arbitrary code; only use this on
        # files from a trusted source.
        with gzip.open(file_path, 'rb') as gz_file:
            data = pickle.load(gz_file)

        print("File loaded successfully.")

        # Check if the data is a Pandas DataFrame
        if isinstance(data, pd.DataFrame):
            # Rename columns if they exist
            data = data.rename(columns={"rwp_clean": "rwp", "ws_clean": "wd"})
            print("Columns renamed successfully.")
        else:
            print("Data is not a Pandas DataFrame. No columns to rename.")

        # Build the output path from the trailing suffix only. A plain
        # str.replace would also rewrite ".pkl.gz" inside directory names and
        # would return the input path unchanged (overwriting the source file)
        # when the suffix is missing.
        source_path = str(file_path)
        if source_path.endswith(suffix):
            stem = source_path[:-len(suffix)]
        else:
            stem = source_path
        modified_file_path = f"{stem}_modified{suffix}"

        # Save the modified data back to .pkl.gz
        with gzip.open(modified_file_path, 'wb') as gz_file:
            pickle.dump(data, gz_file, protocol=pickle.HIGHEST_PROTOCOL)

        print(f"Modified file saved as {modified_file_path}")
        return data
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"Error: {e}")
        return None

In [10]:
# Load the baseline deCIFerU_CompSG pickle, rename its rwp_clean/ws_clean
# columns, and write the "_modified" copy next to it. The call is left as the
# cell's last expression so the returned object is displayed as the output.
open_and_modify_pkl_gz(
    "../experiments/model__conditioned_mlp_augmentation__context_3076/baseline/deCIFerU_CompSG.pkl.gz"
)

File loaded successfully.
Columns renamed successfully.
Modified file saved as ../experiments/model__conditioned_mlp_augmentation__context_3076/baseline/deCIFerU_CompSG_modified.pkl.gz


Unnamed: 0,rwp_dirty,rwp,s12_dirty,s12_clean,hd_clean,wd,r2_dirty,r2_clean,soap_small_distance,soap_large_distance,...,spacegroup_sym_gen,spacegroup_num_gen,cif_sample,cif_gen,seq_len,formula_validity,spacegroup_validity,bond_length_validity,site_multiplicity_validity,validity
0,1.112630,1.112630,0.396529,0.396528,1.120046,0.114724,-0.402474,-0.402474,0.999629,0.992233,...,F-43m,216.0,# generated using pymatgen\ndata_Ca4CrPb\nloop...,data_Ca16Cr4Pb4\nloop_\n_atom_type_symbol\n_at...,331,True,True,True,True,True
1,1.166893,1.166893,0.304807,0.304807,5.731557,0.116283,-0.541161,-0.541161,0.999853,0.993417,...,R-3m,166.0,# generated using pymatgen\ndata_Sc2CdPt\nloop...,data_Sc6Cd3Pt3\nloop_\n_atom_type_symbol\n_ato...,332,True,True,True,True,True
2,1.248801,1.248802,0.302432,0.302432,17.409782,0.115931,-0.839260,-0.839264,0.999898,0.990321,...,Cm,8.0,# generated using pymatgen\ndata_In2TeSe\nloop...,data_In4Te2Se2\nloop_\n_atom_type_symbol\n_ato...,362,True,True,True,True,True
3,0.273393,0.273395,0.974780,0.974780,1.016977,0.012880,0.921040,0.921039,0.999981,0.999533,...,P4/mmm,123.0,# generated using pymatgen\ndata_BeRuSe2\nloop...,data_Be1Ru1Se2\nloop_\n_atom_type_symbol\n_ato...,327,True,True,True,True,True
4,1.270040,1.270041,0.174354,0.174355,1.433289,0.141093,-0.748189,-0.748192,0.999972,0.979130,...,I4/mmm,131.0,# generated using pymatgen\ndata_Sr3Ca\nloop_\...,data_Sr12Ca4\nloop_\n_atom_type_symbol\n_atom_...,337,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4967,0.657943,0.657943,0.791772,0.791771,1.642272,0.039449,0.532219,0.532218,0.999654,0.999654,...,Immm,71.0,# generated using pymatgen\ndata_HfCr2Ag\nloop...,data_Hf2Cr4Ag2\nloop_\n_atom_type_symbol\n_ato...,327,True,True,True,True,True
4968,0.529643,0.529643,0.871516,0.871516,0.084410,0.021520,0.711968,0.711968,0.999734,0.999739,...,Fm-3m,225.0,# generated using pymatgen\ndata_ScMn2Cl\nloop...,data_Sc4Mn8Cl4\nloop_\n_atom_type_symbol\n_ato...,328,True,True,True,True,True
4969,0.433146,0.433138,0.911859,0.911862,0.127536,0.018553,0.798771,0.798778,0.999958,0.999606,...,Fm-3m,225.0,# generated using pymatgen\ndata_LiAc2Co\nloop...,data_Li4Ac8Co4\nloop_\n_atom_type_symbol\n_ato...,328,True,True,True,True,True
4970,1.231889,1.231889,0.191623,0.191623,22.116280,0.256647,-0.664917,-0.664916,0.998316,0.989905,...,P4mm,99.0,# generated using pymatgen\ndata_AlNi2Se\nloop...,data_Al1Ni2Se1\nloop_\n_atom_type_symbol\n_ato...,361,True,True,True,True,True
