# Dependencies

In [1]:
import numpy as np
import pandas as pd
import os
import re
from concurrent.futures import ThreadPoolExecutor

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from scipy.interpolate import PchipInterpolator

from tqdm import tqdm

In [2]:
def drop_rows_with_nan(df, how='any', subset=None):
    """
    Return a copy of ``df`` without the rows that contain missing values.

    Args:
        df: The Pandas DataFrame to filter.
        how: 'any' removes a row as soon as one inspected value is NaN,
             'all' removes it only when every inspected value is NaN.
        subset: Optional list of column names to inspect. None inspects every column.

    Returns:
        pandas.DataFrame: A new DataFrame holding the surviving rows.
                          The DataFrame passed in is left untouched.
    """
    # dropna() already builds a new frame, so the input is never modified.
    return df.dropna(axis=0, how=how, subset=subset)

# Select the RAW data folders to work on

In [None]:
# Root directory that holds the raw data folders
dir_path = "../files/data/RAW"

# Collect every sub-directory found directly under the root
master_folders = []
for entry in os.listdir(dir_path):
    if os.path.isdir(os.path.join(dir_path, entry)):
        master_folders.append(entry)

print(master_folders)

# Descend into the fifth master folder (position 4 in os.listdir order)
dir_path = "../files/data/RAW/{}".format(master_folders[4])

# Keep only the sub-directories whose name starts with "DOUBLE_"
folders = []
for entry in os.listdir(dir_path):
    if entry.startswith('DOUBLE_') and os.path.isdir(os.path.join(dir_path, entry)):
        folders.append(entry)

print(folders)

# .DAT to .csv

In [None]:
def process_file(file_info):
    """
    Convert one raw results file into a flat CSV table.

    The input is scanned line by line; the first whitespace-separated token of
    each line decides how the line is used:

    * "Geometry:"  -> W/R is read from ``W=h=<number>*R`` and b/t from
                      ``with b/t=<number>`` (each only when its pattern matches).
    * "Scenario:"  -> the second token becomes the current c_index (kept as text).
    * "ndom" followed by "a1/c1" or "a2/c2" -> tokens 1..3 of the NEXT line
                      become a/c, a/t and r/t.
    * "crack"      -> starts a data section.
    * a token made only of digits, inside a data section -> one output row:
                      token 0 is the crack id, tokens 5..8 are phi, K-T, K-B, K-P.

    Args:
        file_info: Tuple ``(input_file_path, output_file_path)``.

    Returns:
        None. The table is written to ``output_file_path`` without an index column.
    """
    input_file_path, output_file_path = file_info

    with open(input_file_path, 'r') as file:
        lines = file.readlines()

    rows = []
    W_R, a_c, a_t, r_t, b_t = None, None, None, None, None
    c_index = 0
    # Start outside any data section, so a digit-leading line that appears
    # before the first header is ignored instead of raising UnboundLocalError.
    crack = False

    for i, line in enumerate(lines):
        values = line.strip().split()
        if not values:
            continue

        if values[0] == "Geometry:":
            crack = False
            # Extract the coefficient multiplying R (which is W/R)
            wr_match = re.search(r"W=h=\s*([\d.]+)\*R", line)
            if wr_match:
                W_R = float(wr_match.group(1))

            # Extract b/t value
            bt_match = re.search(r"with\s*b/t=([\d.]+)", line)
            if bt_match:
                b_t = float(bt_match.group(1))

        elif values[0] == "Scenario:":
            crack = False
            c_index = values[1]

        elif values[0] == "ndom" and (values[1] == "a1/c1" or values[1] == "a2/c2"):
            crack = False
            # The numbers belonging to this header sit on the following line
            next_values = lines[i + 1].strip().split()
            a_c, a_t, r_t = map(float, next_values[1:4])

        elif values[0] == "crack":
            crack = True

        elif values[0].isdigit() and crack:
            # Convert the crack id as ONE number. Mapping float over the token
            # itself walks its characters, so "12" would yield two values and
            # the row would no longer fit the 11 output columns.
            row = [c_index, float(values[0]), W_R, a_c, a_t, r_t, b_t]
            row.extend(map(float, values[5:9]))
            rows.append(row)

    df = pd.DataFrame(rows, columns=['c_index', 'crack', 'W/R', 'a/c', 'a/t', 'r/t', 'b/t', 'phi', 'K-T', 'K-B', 'K-P'])
    df.to_csv(output_file_path, index=False)

master_folder = master_folders[4]

# One (input path, output path) pair per regular file found in the selected folders
tasks = []
for folder in folders:
    dir_path = os.path.join("../files/data/RAW", master_folder, folder)

    for file_name in os.listdir(dir_path):
        input_file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(input_file_path):
            continue
        # The CSV name is the input name minus its last six characters
        output_file_path = os.path.join(dir_path, f"{file_name[:-6]}.csv")
        tasks.append((input_file_path, output_file_path))


# Convert the files one after another, with a progress bar
for task in tqdm(tasks):
    process_file(task)

# with ThreadPoolExecutor() as executor:
#     list(tqdm(executor.map(process_file, tasks), total=len(tasks), desc="Processing Files"))

# Cleaning CSV

In [None]:
# This cell redefines drop_rows_with_nan. The signature stays compatible with
# the definition near the top of the notebook, so calls that pass `how` or
# `subset` keep working whichever cell ran last.
def drop_rows_with_nan(df, how='any', subset=None):
    """
    Return a new DataFrame without the rows that contain NaN values.

    Args:
        df: The Pandas DataFrame to filter.
        how: 'any' (default) drops a row when any inspected value is NaN,
             'all' drops it only when every inspected value is NaN.
        subset: Optional list of column names to inspect. None inspects every column.

    Returns:
        pandas.DataFrame: An independent copy holding the surviving rows.
    """
    # The explicit copy() lets callers assign into the result without a SettingWithCopyWarning.
    return df.dropna(axis=0, how=how, subset=subset).copy()


# --- Function to process a single c_index group ---
# This function will handle the logic for one unique c_index
def process_c_index_group(group_df):
    """
    Build the fixed-size output block for the rows of one c_index.

    The rows are divided into crack 1 and crack 2, the parameters are read from
    the first row of each part, and K-T, K-B and K-P of both cracks are
    PCHIP-interpolated onto one shared grid of 128 phi values.

    Args:
        group_df: DataFrame with the rows of a single c_index. The columns read
            are 'c_index', 'crack', 'W/R', 'a/c', 'a/t', 'r/t', 'b/t', 'phi',
            'K-T', 'K-B' and 'K-P'.

    Returns:
        A (128, 16) numpy array whose columns are
        c_index, W/R, a1/c1, a1/t, a2/c2, a2/t, r/t, b/t, phi_1, phi_2,
        K1-T, K2-T, K1-B, K2-B, K1-P, K2-P -- or None when a crack is missing,
        one of the steps fails, or the finished block contains NaN.
    """
    c_index = group_df['c_index'].iloc[0]

    # --- Separate the two cracks ---
    try:
        crack_ids = group_df['crack']
        crack1_df = group_df[crack_ids == 1]
        crack2_df = group_df[crack_ids == 2]

        if crack1_df.empty or crack2_df.empty:
            print(f"Warning: c_index {c_index} does not have data for both cracks 1 and 2 after grouping.")
            return None
    except KeyError:
        print(f"Error: 'crack' column not found for c_index {c_index}.")
        return None
    except Exception as e:
        print(f"Error splitting crack data for c_index {c_index}: {e}")
        return None

    # --- Scalar parameters, taken from the first row of each crack ---
    try:
        # W/R, r/t and b/t are read from crack 1 only; a/c and a/t are per crack.
        wr, rt, bt, a1c1, a1t = (
            crack1_df[name].iloc[0] for name in ('W/R', 'r/t', 'b/t', 'a/c', 'a/t')
        )
        a2c2, a2t = (crack2_df[name].iloc[0] for name in ('a/c', 'a/t'))
    except Exception as e:
        print(f"Error extracting parameters for c_index {c_index}: {e}")
        return None

    # --- Curve data as plain arrays: column 0 is phi, columns 1..3 are K-T, K-B, K-P ---
    curve_columns = ['phi', 'K-T', 'K-B', 'K-P']
    try:
        crack1_np = crack1_df[curve_columns].to_numpy()
        crack2_np = crack2_df[curve_columns].to_numpy()
    except KeyError:
        print(f"Error: Missing expected K or phi columns for c_index {c_index}.")
        return None
    except Exception as e:
        print(f"Error converting crack data to numpy for c_index {c_index}: {e}")
        return None

    # --- Shared phi grid ---
    # NOTE(review): the bounds come from the union of both phi ranges (smallest
    # minimum, largest maximum). Grid points outside one crack's own filtered
    # range come back as NaN and make the whole block be discarded below --
    # confirm this is intended rather than the overlap of the two ranges.
    try:
        phi_lo = min(crack1_np[:, 0].min(), crack2_np[:, 0].min()) + 0.035
        phi_hi = max(crack1_np[:, 0].max(), crack2_np[:, 0].max()) - 0.035

        if phi_lo >= phi_hi:
            print(f"Warning: Calculated phi_min >= phi_max for c_index {c_index}. Skipping interpolation.")
            return None

        # 132 evenly spaced points with two trimmed from each end leave 128
        phi_grid_target = np.linspace(phi_lo, phi_hi, 132)[2:-2]
    except Exception as e:
        print(f"Error calculating phi grid for c_index {c_index}: {e}")
        return None

    def resample(crack_np):
        """Interpolate K-T, K-B, K-P of one crack onto the shared grid, or return three Nones."""
        phi_all = crack_np[:, 0]
        in_range = (phi_all >= phi_lo) & (phi_all <= phi_hi)
        phi_kept = phi_all[in_range]

        # PCHIP needs at least two points
        if len(phi_kept) < 2:
            print(f"Warning: Not enough valid data points ({len(phi_kept)}) for interpolation.")
            return None, None, None

        order = np.argsort(phi_kept)
        phi_sorted = phi_kept[order]

        # Keep the first entry of every run of equal phi values so the
        # abscissae handed to PCHIP are strictly increasing.
        starts_run = np.ones(len(phi_sorted), dtype=bool)
        starts_run[1:] = phi_sorted[1:] > phi_sorted[:-1]
        knots = np.flatnonzero(starts_run)

        if len(knots) < 2:
            print(f"Warning: Not enough monotonic phi points ({len(knots)}) for interpolation.")
            return None, None, None

        try:
            return tuple(
                PchipInterpolator(
                    phi_sorted[knots],
                    crack_np[:, column][in_range][order][knots],
                    extrapolate=False,
                )(phi_grid_target)
                for column in (1, 2, 3)
            )
        except Exception as e:
            print(f"Error during Pchip interpolation: {e}")
            return None, None, None

    # Both cracks are always attempted before the results are inspected
    crack1_curves = resample(crack1_np)
    crack2_curves = resample(crack2_np)

    if any(curve is None for curve in (*crack1_curves, *crack2_curves)):
        print(f"Interpolation failed for c_index {c_index}. Skipping.")
        return None

    kt1_interp, kb1_interp, kp1_interp = crack1_curves
    kt2_interp, kb2_interp, kp2_interp = crack2_curves

    # --- Assemble the 128x16 block, one column at a time ---
    column_values = (
        c_index, wr, a1c1, a1t, a2c2, a2t, rt, bt,
        phi_grid_target, phi_grid_target,  # phi_1 and phi_2 share the grid
        kt1_interp, kt2_interp,
        kb1_interp, kb2_interp,
        kp1_interp, kp2_interp,
    )
    data_block = np.zeros((128, 16))
    for position, value in enumerate(column_values):
        data_block[:, position] = value

    # extrapolate=False yields NaN outside the interpolated range; such blocks are dropped silently
    if np.isnan(data_block).any():
        return None

    return data_block


# --- Main Processing Loop for Files ---

dir_path = "../files/data/TWIN/CS"
# Cleaned files go into a sub-folder that is created on demand
output_dir = os.path.join(dir_path, "CLEANED")
os.makedirs(output_dir, exist_ok=True)

csv_files = [name for name in os.listdir(dir_path) if name.endswith(".csv")]

# Column layout of the 128x16 blocks returned by process_c_index_group
output_columns = [
    "c_index", "W/R", "a1/c1", "a1/t", "a2/c2", "a2/t",
    "r/t", "b/t", "phi_1", "phi_2", "K1-T", "K2-T",
    "K1-B", "K2-B", "K1-P", "K2-P"
]

# Every CSV in the folder is visited; files that already have a cleaned
# counterpart are skipped, which makes the cell safe to re-run after a stop.
for csv_index in csv_files:
    print(f"Working on: {csv_index}")
    file_path = os.path.join(dir_path, csv_index)
    output_path = os.path.join(output_dir, f"{csv_index[:-4]}-CLEANED.csv")

    if os.path.exists(output_path):
        print(f"Cleaned file {output_path} already exists. Skipping.")
        continue

    try:
        df = pd.read_csv(file_path)

        # Remove incomplete rows and report how many were lost
        initial_rows = len(df)
        df = drop_rows_with_nan(df)
        if initial_rows > len(df):
            print(f"Dropped {initial_rows - len(df)} rows with NaN in {csv_index}")

        if df.empty:
            print(f"No data left in {csv_index} after dropping NaNs. Skipping.")
            continue

        # Both grouping columns must be present
        if not {'c_index', 'crack'}.issubset(df.columns):
            print(f"Error: Missing 'c_index' or 'crack' column in {csv_index}. Skipping.")
            continue

        # A c_index is usable only when it carries exactly two distinct crack ids
        unique_counts = df.groupby('c_index')['crack'].nunique()
        corrupt_indices = unique_counts[unique_counts != 2].index
        df = df[~df['c_index'].isin(corrupt_indices)].copy()

        unique_index_list = df['c_index'].unique().tolist()
        num_valid_indices = len(unique_index_list)
        if not unique_index_list:
            print(f"No valid c_index entries left in {csv_index} after cleaning. Skipping.")
            continue

        # --- One block per c_index; groups that fail return None and are left out ---
        processed_blocks = []
        grouped = df.groupby('c_index')
        for c_index, group_df in tqdm(grouped, desc=f"Processing {csv_index}", unit="group"):
            data_block = process_c_index_group(group_df)
            if data_block is None:
                continue
            processed_blocks.append(data_block)

        if len(processed_blocks) == 0:
            print(f"No valid processed blocks generated for {csv_index}. Skipping save.")
            continue

        # --- Stack the blocks vertically and write them out without an index column ---
        d_final = np.concatenate(processed_blocks, axis=0)
        output_df = pd.DataFrame(d_final, columns=output_columns)
        output_df.to_csv(output_path, index=False)
        print(f"Successfully saved cleaned data for {csv_index} to {output_path}")

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Skipping.")
    except Exception as e:
        # Any other failure is reported and the loop moves on to the next file
        print(f"An unexpected error occurred while processing {csv_index}: {e}")


print("Processing complete.")

In [None]:
# Earlier, loop-based version of the cleaning step: for every CSV it builds one
# 128x16 block per c_index by interpolating each crack on its OWN phi range,
# then writes the stacked blocks into the CLEANED sub-folder.
# NOTE(review): this cell does not create the CLEANED folder itself; it has to
# exist already when to_csv is reached.

# Specify the directory path
dir_path = "../files/data/TWIN/CS"

# Get all .csv files
csv_files = [f for f in os.listdir(dir_path) if f.endswith(".csv")]


# Only the entries from position 70 onward of the listing are processed
for csv_index in csv_files[70:]:
    print("Working on: ", csv_index)
    df = pd.read_csv("../files/data/TWIN/CS/{}".format(csv_index))
    df = drop_rows_with_nan(df)

    # This first value is overwritten below, after the corrupt c_index entries are removed
    unique_index = df['c_index'].unique()
    
    column_to_check = "crack" # Or replace with the actual column name string, e.g., 'crack_type'

    # --- Optimized Logic ---

    # 1. Group by 'c_index' and count unique values in the target column for each group
    unique_counts = df.groupby('c_index')[column_to_check].nunique()

    # 2. Filter the result to find indices where the unique count is NOT 2
    corrupt_indices = unique_counts[unique_counts != 2].index

    # 3. Convert the resulting index object to a list (if you need a list)
    corrupt_list = corrupt_indices.tolist()

    # Keep only the rows whose c_index has exactly two distinct crack values
    df = df[~df['c_index'].isin(corrupt_list)]

    unique_index = df['c_index'].unique()

    # Room for one 128-row block per remaining c_index; unused rows stay all-zero
    d_final = np.zeros((len(unique_index)*128, 16))

    # ui counts the blocks actually stored, so stored blocks are packed without gaps
    ui = 0
    for unq_index in tqdm(unique_index):
        data = np.zeros((128, 16))
        filtered_df = df[df["c_index"] == unq_index]

        # NOTE(review): columns are addressed by position from here on; this
        # presumably follows the order c_index, crack, W/R, a/c, a/t, r/t, b/t,
        # phi, K-T, K-B, K-P -- confirm against the CSV files being read.
        cracks = np.unique(filtered_df.to_numpy()[:,1])

        data[:,0] = unq_index
        for crack in cracks:
            filtered_df_ = filtered_df[filtered_df["crack"] == crack]

            filtered_d_ = filtered_df_.to_numpy()

            W_R = np.unique(filtered_d_[:,2])
            a_c = np.unique(filtered_d_[:,3])
            a_t = np.unique(filtered_d_[:,4])
            r_t = np.unique(filtered_d_[:,5])
            b_t = np.unique(filtered_d_[:,6])

            # Each parameter must hold a single value within one crack of one c_index
            assert len(W_R) == 1, W_R
            assert len(a_c) == 1, a_c
            assert len(a_t) == 1, a_t
            assert len(r_t) == 1, r_t
            assert len(b_t) == 1, b_t

            # The fourth column from the end is used as phi; 0.035 is trimmed
            # from each end of this crack's phi range before interpolating
            phi_vals = filtered_d_[:,-4]
            phi_min = phi_vals.min() + 0.035
            phi_max = phi_vals.max() - 0.035
            # Filter indices where phi values lie within [phi_min, phi_max]
            filtered_indices = (phi_vals >= phi_min) & (phi_vals <= phi_max)
            phi_vals = phi_vals[filtered_indices]
            # More than 16 points must remain inside the trimmed range
            assert len(phi_vals) > 16, len(phi_vals)
            phi_idxes = np.argsort(phi_vals)
            
            # Walk the points in ascending phi order and keep only those that are
            # strictly larger than the last kept one, so the abscissae passed to
            # PchipInterpolator are strictly increasing
            monotonic_phi_idxes = [phi_idxes[0]]
            prev_phi = phi_vals[phi_idxes[0]]
            for index in phi_idxes[1:]:
                now_phi = phi_vals[index]
                if now_phi > prev_phi:
                    monotonic_phi_idxes.append(index)
                    prev_phi = phi_vals[index]

            # 132 evenly spaced points; the [2:-2] slices below keep the inner 128
            phi_regular_128 = np.linspace(phi_min, phi_max, 132)

            # Tension
            K_vals = filtered_d_[:,-3]
            K_vals = K_vals[filtered_indices]
            
            interp_func = PchipInterpolator(phi_vals[monotonic_phi_idxes], K_vals[monotonic_phi_idxes], extrapolate=False)
            KT_regular_128 = interp_func(phi_regular_128)

            # Bending
            K_vals = filtered_d_[:,-2]
            K_vals = K_vals[filtered_indices]
            
            interp_func = PchipInterpolator(phi_vals[monotonic_phi_idxes], K_vals[monotonic_phi_idxes], extrapolate=False)
            KB_regular_128 = interp_func(phi_regular_128)

            # Pin
            K_vals = filtered_d_[:,-1]
            K_vals = K_vals[filtered_indices]
            
            interp_func = PchipInterpolator(phi_vals[monotonic_phi_idxes], K_vals[monotonic_phi_idxes], extrapolate=False)
            KP_regular_128 = interp_func(phi_regular_128)

            # Crack 1 fills the a1/phi_1/K1 columns, crack 2 the a2/phi_2/K2 columns;
            # W/R, r/t and b/t are written by both branches (the later one wins)
            if crack == 1:
                data[:,1] = W_R[0]
                data[:,2] = a_c[0]
                data[:,3] = a_t[0]
                data[:,6] = r_t[0]
                data[:,7] = b_t[0]
                data[:,8] = phi_regular_128[2:-2]
                data[:,10] = KT_regular_128[2:-2]
                data[:,12] = KB_regular_128[2:-2]
                data[:,14] = KP_regular_128[2:-2]
            
            elif crack == 2:
                data[:,1] = W_R[0]
                data[:,4] = a_c[0]
                data[:,5] = a_t[0]
                data[:,6] = r_t[0]
                data[:,7] = b_t[0]
                data[:,9] = phi_regular_128[2:-2]
                data[:,11] = KT_regular_128[2:-2]
                data[:,13] = KB_regular_128[2:-2]
                data[:,15] = KP_regular_128[2:-2]
            
            else:
                print("Something is wrong with crack")

        # Blocks containing any NaN are skipped without a message
        if np.isnan(data).any():
            continue
        else:
            d_final[ui*128:ui*128+128] = data
            ui += 1

    # Remove the all-zero rows, i.e. the slots left unused by skipped blocks
    d_final = d_final[~np.all(d_final == 0, axis=1)]

    df = pd.DataFrame(d_final, columns=["c_index", "W/R", "a1/c1", "a1/t", "a2/c2", "a2/t", 
                                    "r/t", "b/t", "phi_1", "phi_2", "K1-T", "K2-T", 
                                    "K1-B", "K2-B", "K1-P", "K2-P"])

    # NOTE(review): no index=False here, so pandas also writes the row index as
    # the first CSV column -- confirm downstream readers expect that.
    df.to_csv("../files/data/TWIN/CS/CLEANED/{}-CLEANED.csv".format(csv_index[:-4]))

In [None]:
# Visual spot check: draw K1-T and K2-T against phi for ten randomly chosen
# 128-row blocks of d_final (fixed seed, so the same blocks are picked each run)
np.random.seed(0)
fig, axs = plt.subplots(2, 5, figsize=(30,8))
unique_index = np.unique(d_final[:,0])
print(len(unique_index))

# (phi column, K column, legend label, colour) of each curve drawn per panel
curves = ((8, 10, "K1-T", 'purple'), (9, 11, "K2-T", 'green'))

# axs.flat visits the panels row by row, matching the random draw order of a nested row/column loop
for panel, ax in enumerate(axs.flat):
    idx = np.random.randint(0, len(unique_index))
    data = d_final[idx*128:idx*128+128]

    # Columns 1..6 hold W/R, a1/c1, a1/t, a2/c2, a2/t and r/t
    W_R, a1_c1, a1_t, a2_c2, a2_t, r_t = (np.unique(data[:,col]) for col in range(1, 7))

    # Every parameter has to be constant inside one block
    assert len(W_R) == 1, W_R
    for constant in (a1_c1, a1_t, a2_c2, a2_t, r_t):
        assert len(constant) == 1

    for phi_col, k_col, label, color in curves:
        ax.scatter(data[:,phi_col], data[:,k_col], label=label, color=color, s=10)
        ax.plot(data[:,phi_col], data[:,k_col], color=color, linestyle=":")

    ax.set_title("W/R:{} a1/c1:{} a1/t:{} a2/c2:{} a2/t:{} r/t: {}".format(
        W_R[0], a1_c1[0], a1_t[0], a2_c2[0], a2_t[0], r_t[0]))

    # A single legend, on the first panel only
    if panel == 0:
        ax.legend()

plt.show()