In [30]:
import os
import pandas as pd

# Directory containing your files
csv_directory = "directory_norm"  # Replace with your directory path

# Manually get the list of all files in the directory
csv_files = [f for f in os.listdir(csv_directory) if f.startswith("id_prop_") and f.endswith("_FiM.csv")]

# Exclude the specific files 'id_prop_Fe_FM.csv' and 'id_prop_all_FM.csv'
csv_files = [f for f in csv_files if "id_prop_Mo_FiM" not in f and "id_prop_all_FiM" not in f]

print(f"Filtered files for processing: {csv_files}")

# Initialize an empty list to hold the merged data
merged_data = []

# Loop through each file and append its content to the merged list
for csv_file in csv_files:
    try:
        # Build the full path to the file
        file_path = os.path.join(csv_directory, csv_file)
        
        # Read the file using pandas to better handle data
        df = pd.read_csv(file_path, header=None)    
        
        # Check if this is the first file; if it is, include the header
        if merged_data:
            # Skip the first row for subsequent files to avoid duplicate headers
            merged_data.append(df.iloc[1:])
        else:
            # For the first file, include all rows (no header skipping)
            merged_data.append(df)
        
        print(f"Appended data from file: {csv_file}, lines: {len(df)}")

    except Exception as e:
        print(f"Error processing file {csv_file}: {e}")

# Concatenate all the dataframes in merged_data into a single dataframe
final_df = pd.concat(merged_data, ignore_index=True)

# Save the merged data into a single file
output_file = os.path.join(csv_directory, "excluded_Mo_FiM.csv")
final_df.to_csv(output_file, index=False, header=False)

# Print the shape and head of the merged data
print(f"Shape of the merged file: {final_df.shape}")
print("\nFirst 5 rows of the merged file:")
print(final_df.head())


Filtered files for processing: ['id_prop_Co_FiM.csv', 'id_prop_Cr_FiM.csv', 'id_prop_Cu_FiM.csv', 'id_prop_Fe_FiM.csv', 'id_prop_Mn_FiM.csv', 'id_prop_Ni_FiM.csv', 'id_prop_V_FiM.csv']
Appended data from file: id_prop_Co_FiM.csv, lines: 978
Appended data from file: id_prop_Cr_FiM.csv, lines: 311
Appended data from file: id_prop_Cu_FiM.csv, lines: 334
Appended data from file: id_prop_Fe_FiM.csv, lines: 1265
Appended data from file: id_prop_Mn_FiM.csv, lines: 833
Appended data from file: id_prop_Ni_FiM.csv, lines: 484
Appended data from file: id_prop_V_FiM.csv, lines: 384
Shape of the merged file: (4583, 3)

First 5 rows of the merged file:
            0                                                  1         2
0  mp-1310548  Full Formula (Ca8 Co4 Ir4 O24)\nReduced Formul...  0.004852
1  mp-1219037  Full Formula (Sm2 Ga2 Co2)\nReduced Formula: S...  0.006517
2  mp-1275349  Full Formula (Li4 Co2 Cu2 O8)\nReduced Formula...  0.000182
3  mp-1221741  Full Formula (Mn3 Co1 P4)\nReduced For

In [31]:
import pandas as pd

# Path to the merged CSV output file
output_file = "directory_norm/excluded_Mo_FiM.csv"  # Replace with your actual path

def check_file_integrity(file_path):
    # Read the file into a pandas DataFrame
    df = pd.read_csv(file_path, header=None)

    # 1. Check for missing (NaN) values in the file
    missing_values = df.isna().sum().sum()
    if missing_values > 0:
        print(f"Warning: There are {missing_values} missing values in the file.")
    else:
        print("No missing values detected.")

    # 2. Check for duplicate rows
    duplicate_rows = df.duplicated().sum()
    if duplicate_rows > 0:
        print(f"Warning: There are {duplicate_rows} duplicate rows in the file.")
    else:
        print("No duplicate rows detected.")

    # 3. Check for rows with the wrong number of columns
    num_columns = df.shape[1]
    inconsistent_columns = df[df.apply(lambda row: len(row) != num_columns, axis=1)]
    if not inconsistent_columns.empty:
        print(f"Warning: There are {inconsistent_columns.shape[0]} rows with inconsistent number of columns.")
    else:
        print("All rows have the correct number of columns.")

    # 4. Check for empty rows
    empty_rows = df.isnull().all(axis=1).sum()
    if empty_rows > 0:
        print(f"Warning: There are {empty_rows} empty rows in the file.")
    else:
        print("No empty rows detected.")

    # Summary of the dataframe structure
    print("\nSummary of the DataFrame:")
    print(f"Shape of the DataFrame: {df.shape}")
    print(f"First 5 rows of the DataFrame:")
    print(df.head())

# Run the check on the merged file
check_file_integrity(output_file)


No missing values detected.
All rows have the correct number of columns.
No empty rows detected.

Summary of the DataFrame:
Shape of the DataFrame: (4583, 3)
First 5 rows of the DataFrame:
            0                                                  1         2
0  mp-1310548  Full Formula (Ca8 Co4 Ir4 O24)\nReduced Formul...  0.004852
1  mp-1219037  Full Formula (Sm2 Ga2 Co2)\nReduced Formula: S...  0.006517
2  mp-1275349  Full Formula (Li4 Co2 Cu2 O8)\nReduced Formula...  0.000182
3  mp-1221741  Full Formula (Mn3 Co1 P4)\nReduced Formula: Mn...  0.034877
4  mp-1188309  Full Formula (Tb6 Co1 Br10)\nReduced Formula: ...  0.001868


In [32]:
# Path to the merged CSV output file (update to your actual path)
#output_file = "directory_norm/excluded_Fe_FiM.csv"  # Replace with your actual path

def remove_duplicates(file_path):
    # Read the file into a pandas DataFrame
    df = pd.read_csv(file_path, header=None)

    # Check for and remove duplicate rows
    df_no_duplicates = df.drop_duplicates()

    # Save the cleaned DataFrame without duplicates back to the file
    cleaned_output_file = file_path.replace(".csv", "_cleaned.csv")
    df_no_duplicates.to_csv(cleaned_output_file, index=False, header=False)

    print(f"Duplicates removed. Cleaned file saved as: {cleaned_output_file}")
    print(f"Shape of the cleaned file: {df_no_duplicates.shape}")
    print(f"First 5 rows of the cleaned file:")
    print(df_no_duplicates.head())

# Run the function to remove duplicates and save the cleaned file
remove_duplicates(output_file)


Duplicates removed. Cleaned file saved as: directory_norm/excluded_Mo_FiM_cleaned.csv
Shape of the cleaned file: (3784, 3)
First 5 rows of the cleaned file:
            0                                                  1         2
0  mp-1310548  Full Formula (Ca8 Co4 Ir4 O24)\nReduced Formul...  0.004852
1  mp-1219037  Full Formula (Sm2 Ga2 Co2)\nReduced Formula: S...  0.006517
2  mp-1275349  Full Formula (Li4 Co2 Cu2 O8)\nReduced Formula...  0.000182
3  mp-1221741  Full Formula (Mn3 Co1 P4)\nReduced Formula: Mn...  0.034877
4  mp-1188309  Full Formula (Tb6 Co1 Br10)\nReduced Formula: ...  0.001868


In [33]:
import pandas as pd

# Path to the cleaned CSV file
#cleaned_file = "directory_norm/excluded_Mn_FiM_cleaned.csv"  # Replace with your actual path
cleaned_file = "directory_norm/excluded_Mo_FiM_cleaned.csv"

def remove_fe_materials(file_path):
    # Read the cleaned CSV file into a pandas DataFrame
    df = pd.read_csv(file_path, header=None)

    # Assuming the 2nd column (index 1) contains material structure data, check for 'Fe' in that column
    df_filtered = df[~df[1].str.contains("Mo", na=False)]  # ~ means not contains

    # Save the filtered DataFrame back to a new CSV file
    filtered_output_file = file_path.replace(".csv", "_no_Mo.csv")
    df_filtered.to_csv(filtered_output_file, index=False, header=False)

    print(f"Rows containing 'Fe' in the formula have been removed.")
    print(f"Filtered file saved as: {filtered_output_file}")
    print(f"Shape of the filtered file: {df_filtered.shape}")
    print(f"First 5 rows of the filtered file:")
    print(df_filtered.head())

# Run the function to remove materials containing 'Fe' and save the filtered file
remove_fe_materials(cleaned_file)


Rows containing 'Fe' in the formula have been removed.
Filtered file saved as: directory_norm/excluded_Mo_FiM_cleaned_no_Mo.csv
Shape of the filtered file: (3687, 3)
First 5 rows of the filtered file:
            0                                                  1         2
0  mp-1310548  Full Formula (Ca8 Co4 Ir4 O24)\nReduced Formul...  0.004852
1  mp-1219037  Full Formula (Sm2 Ga2 Co2)\nReduced Formula: S...  0.006517
2  mp-1275349  Full Formula (Li4 Co2 Cu2 O8)\nReduced Formula...  0.000182
3  mp-1221741  Full Formula (Mn3 Co1 P4)\nReduced Formula: Mn...  0.034877
4  mp-1188309  Full Formula (Tb6 Co1 Br10)\nReduced Formula: ...  0.001868
