In [12]:
import pandas as pd, glob, re, os

# --- Path containing your .txt files ---
folder_path = r"C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1"

# --- Create output folder ---
output_folder = os.path.join(folder_path, "Potholes_clean_csvs")
os.makedirs(output_folder, exist_ok=True)

# --- Get list of all .txt files ---
txt_files = glob.glob(os.path.join(folder_path, "*.txt"))
print(f"Found {len(txt_files)} .txt files in {folder_path}")


def extract_type_and_number(name):
    """Return ``(type_letter, number)`` parsed from a point name.

    The first upper-case ``P``, ``C`` or ``E`` that is followed by digits
    (optionally separated by one underscore, so both ``P4`` and ``P_4`` are
    accepted) is used. Returns ``(None, -1)`` when nothing matches. The
    argument is converted with ``str()`` first, so a missing value (NaN)
    yields ``(None, -1)`` instead of raising.
    """
    m = re.search(r'([PCE])_?(\d+)', str(name))
    return (m.group(1), int(m.group(2))) if m else (None, -1)


def sort_site_points(df):
    """Return the ``Name``/``X``/``Y``/``Z`` columns of *df* in cleaned order.

    Rows are ordered by type (P, then C, then E, unrecognised names last),
    then by number, then by the name without its trailing ``_S``/``_F``,
    and finally with the ``_S`` row ahead of every other row of that base.
    The input frame is not modified.

    Raises:
        KeyError: if any of the four required columns is missing.
    """
    # Explicit copy: adding key columns to a bare column slice can trigger
    # pandas' SettingWithCopyWarning.
    points = df[["Name", "X", "Y", "Z"]].copy()

    # Plain Python lists keep this working for empty frames and for
    # non-string names (e.g. NaN from a blank row).
    names = [str(n) for n in points["Name"]]
    parsed = [extract_type_and_number(n) for n in names]
    order_map = {'P': 1, 'C': 2, 'E': 3}

    keyed = points.assign(
        TypeOrder=[order_map.get(kind, 99) for kind, _ in parsed],
        Number=[number for _, number in parsed],
        Base=[re.sub(r'_[SF]$', '', n) for n in names],
        SF=[0 if n.endswith('_S') else 1 for n in names],  # S before F
    )
    keyed = keyed.sort_values(by=['TypeOrder', 'Number', 'Base', 'SF'])
    return keyed[["Name", "X", "Y", "Z"]]


for txt_file in txt_files:
    try:
        # --- Read file (first line is skipped; second line is the header) ---
        df = pd.read_csv(txt_file, skiprows=1, sep=',', engine='python')

        # --- Sort by site type, number, then ensure S comes before F ---
        df = sort_site_points(df)

        # --- Define output file path ---
        # splitext handles any extension casing and never touches a ".txt"
        # that appears in the middle of the file name.
        stem = os.path.splitext(os.path.basename(txt_file))[0]
        csv_file = os.path.join(output_folder, f"{stem}_clean.csv")

        # --- Save cleaned CSV ---
        df.to_csv(csv_file, index=False, float_format="%.3f")
        print(f"✅ Saved cleaned CSV → {csv_file}")

    except Exception as e:
        # Best-effort batch run: report the failing file and keep going.
        print(f"❌ Error processing {txt_file}: {e}")

print(f"\n🎯 All cleaned CSVs saved in: {output_folder}")


Found 15 .txt files in C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\Potholes_clean_csvs\Site1_0_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\Potholes_clean_csvs\Site1_1_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\Potholes_clean_csvs\Site1_10_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\Potholes_clean_csvs\Site1_11_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\Potholes_clean_csvs\Site1_12_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\Potholes_clean_csvs\Site1_13_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documen

In [14]:
import pandas as pd, glob, re, os

# --- Path containing your .txt files ---
folder_path = r"C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1"  # change as needed

# --- Create output folder inside the same directory ---
output_folder = os.path.join(folder_path, "clean_csvs")
os.makedirs(output_folder, exist_ok=True)

# --- Get list of all .txt files ---
txt_files = glob.glob(os.path.join(folder_path, "*.txt"))
print(f"Found {len(txt_files)} .txt files in {folder_path}")


def extract_type_and_number(name):
    """Return ``(type_letter, number)`` parsed from a point name.

    Accepts the type letter (upper-case P, C or E) followed by digits with
    or without one underscore in between, e.g. ``Site1_0_P1_S`` and
    ``Site1_4_P_5_F``. Returns ``(None, -1)`` when nothing matches.
    """
    m = re.search(r'([PCE])_?(\d+)', str(name))
    return (m.group(1), int(m.group(2))) if m else (None, -1)


def sf_flag(name):
    """Return the S/F sort rank of a point name: 0 for S, 1 for F, 2 otherwise.

    ``_S`` / ``_F`` only count as a whole token (followed by ``_`` or the end
    of the name), so an unrelated substring such as ``_Site`` is not taken
    for an S marker. S is checked first, as before.
    """
    text = str(name)
    if re.search(r'_S(?:_|$)', text):
        return 0  # S first
    if re.search(r'_F(?:_|$)', text):
        return 1  # F second
    return 2  # others (if any)


def order_points(df):
    """Return the ``Name``/``X``/``Y``/``Z`` columns of *df*, sorted so that:

    1. Type: P → C → E (unrecognised names last)
    2. Number ascending
    3. Rows sharing a base name (name minus trailing ``_S``/``_F``) stay
       together, S before F.

    The input frame is not modified.

    Raises:
        KeyError: if any of the four required columns is missing.
    """
    # Explicit copy: adding key columns to a bare column slice can trigger
    # pandas' SettingWithCopyWarning.
    points = df[["Name", "X", "Y", "Z"]].copy()

    # Plain lists work for empty frames and non-string names alike.
    names = [str(n) for n in points["Name"]]
    parsed = [extract_type_and_number(n) for n in names]
    order_map = {'P': 1, 'C': 2, 'E': 3}

    keyed = points.assign(
        TypeOrder=[order_map.get(kind, 99) for kind, _ in parsed],
        Number=[number for _, number in parsed],
        # Example: Site1_0_P1_S → Site1_0_P1
        Base=[re.sub(r'(_[SF])$', '', n) for n in names],
        SF=[sf_flag(n) for n in names],
    )
    keyed = keyed.sort_values(by=['TypeOrder', 'Number', 'Base', 'SF'])
    return keyed[["Name", "X", "Y", "Z"]]


for txt_file in txt_files:
    try:
        # --- Read file (first line is skipped; second line is the header) ---
        df = pd.read_csv(txt_file, skiprows=1, sep=',', engine='python')
        df = order_points(df)

        # --- Define output file path inside new folder ---
        # splitext handles any extension casing and never touches a ".txt"
        # that appears in the middle of the file name.
        stem = os.path.splitext(os.path.basename(txt_file))[0]
        csv_file = os.path.join(output_folder, f"{stem}_clean.csv")

        # --- Save cleaned CSV ---
        df.to_csv(csv_file, index=False, float_format="%.3f")
        print(f"✅ Saved cleaned CSV → {csv_file}")

    except Exception as e:
        # Best-effort batch run: report the failing file and keep going.
        print(f"❌ Error processing {txt_file}: {e}")

print(f"\n🎯 All cleaned CSVs saved in: {output_folder}")


Found 15 .txt files in C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csvs\Site1_0_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csvs\Site1_1_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csvs\Site1_10_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csvs\Site1_11_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csvs\Site1_12_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csvs\Site1_13_clean.csv
✅ Saved cleaned CSV → C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/site1\clean_csv

In [2]:
import os
import re
import pandas as pd

# ----------------------------
# 1. FIXED & SAFE REGEX PATTERNS
# ----------------------------
# Every pattern shares the same head (site, optional numeric sub-index, one
# separator, then the type token) and differs only in how the number and the
# S/F marker are laid out after the type.
_SITE_PREFIX = r'^(?P<site>Site\d+)(?:[_-](?P<sub>\d+))?[_-]'
_TYPE_GROUP = r'(?P<type>P|C|E|Fl|FL|fl|Flushed|FLUSHED)'

_TAILS = (
    r'(?P<num>\d+)[_-](?P<sf>[SF])$',       # Pattern A — Site1_1_P4_S
    r'[_-](?P<num>\d+)[_-](?P<sf>[SF])$',   # Pattern B — Site1_4_P_5_F
    r'[_-](?P<sf>[SF])[_-](?P<num>\d+)$',   # Pattern C — Site2_1_P_S_11
    r'(?P<num>\d+)(?P<sf>[SF])$',           # Pattern D — fallback: Site1_1_P4F
)

# Order matters to callers that try the patterns in sequence: A, B, C, D.
# All patterns are compiled case-insensitively.
PATTERNS = [
    re.compile(_SITE_PREFIX + _TYPE_GROUP + tail, re.IGNORECASE)
    for tail in _TAILS
]


# -------------------------------------
# 2. FUNCTION TO PARSE A SINGLE LINE
# -------------------------------------
def parse_line(line, patterns=None):
    """Parse one label line into its components.

    The line is stripped of surrounding whitespace and matched against each
    pattern in order; the first pattern that matches wins.

    Args:
        line: Text of one line (a ``str``).
        patterns: Optional iterable of compiled regexes exposing the named
            groups ``site``, ``sub``, ``type``, ``num`` and ``sf``. Defaults
            to the module-level ``PATTERNS``.

    Returns:
        A dict with keys ``site``, ``sub`` (``""`` when absent), ``type``
        (upper-cased), ``num`` (``int``), ``SF`` (upper-cased) and ``raw``
        (the stripped line), or ``None`` when no pattern matches.
    """
    if patterns is None:
        patterns = PATTERNS

    line = line.strip()

    for pat in patterns:
        match = pat.match(line)
        if match is None:
            continue

        site = match.group("site")
        sub = match.group("sub")
        tp = match.group("type")
        num = match.group("num")
        sf = match.group("sf")

        # Strong filtering: type, number and S/F must all be present.
        # With the default PATTERNS these groups are mandatory, so this only
        # fires for caller-supplied patterns that make one of them optional.
        if tp is None or num is None or sf is None:
            return None

        return {
            "site": site,
            "sub": sub if sub else "",
            "type": tp.upper(),
            "num": int(num),
            "SF": sf.upper(),
            "raw": line
        }

    return None  # Not matching any pattern → discard


# -------------------------------------------
# 3. PROCESS ONE TXT FILE → CREATE ONE CSV OUT
# -------------------------------------------
def process_txt_file(txt_path, out_dir):
    """Parse one TXT file and write the complete S/F pairs to a CSV.

    Every line is passed to ``parse_line``; lines it rejects are dropped.
    The surviving rows are grouped by site, sub-index, type and number, and
    only groups containing both an ``S`` and an ``F`` row are kept. The CSV
    is written to *out_dir* under the input's base name with a ``.csv``
    extension.

    Args:
        txt_path: Path of the input text file.
        out_dir: Existing directory that receives the CSV.

    Returns:
        None. If no line parses, a warning is printed and no file is written.
    """
    filename = os.path.basename(txt_path)

    # Stream the file; only successfully parsed lines are retained.
    with open(txt_path, "r") as f:
        parsed = [rec for rec in map(parse_line, f) if rec is not None]

    if not parsed:
        print(f"[WARNING] All lines rejected in {filename}")
        return

    df = pd.DataFrame(parsed)

    # --- FORM PAIRS (S + F) -----------------
    df["pair_id"] = df["site"] + "_" + df["sub"].astype(str) + "_" + df["type"] + "_" + df["num"].astype(str)

    # Keep only pair_ids where both S and F exist
    valid_pairs = df.groupby("pair_id")["SF"].nunique()
    keep_ids = valid_pairs[valid_pairs == 2].index

    df_clean = df[df["pair_id"].isin(keep_ids)].copy()

    # Save CSV under the same base name as the txt. splitext swaps the
    # extension whatever its casing (".TXT" included) and leaves any ".txt"
    # inside the name untouched.
    stem = os.path.splitext(filename)[0]
    out_csv = os.path.join(out_dir, stem + ".csv")
    df_clean.to_csv(out_csv, index=False)

    print(f"[OK] Saved: {out_csv}   Rows: {len(df_clean)}")


# ----------------------------
# 4. MAIN EXECUTION
# ----------------------------
# Source folder holding the TXT label files.
input_folder = "C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/All_Nome/txt_lables"
# Destination folder for the generated CSVs (a sub-folder of the source folder).
output_folder = "C:/Users/umair.muhammad/Documents/PhD/Research Work/FedLearn/training/All_Nome/txt_lables/cleancsvs"

os.makedirs(output_folder, exist_ok=True)

# The directory listing is taken once, up front; entries are then filtered to
# names ending in ".txt" (any casing) and converted one by one.
txt_names = (
    entry for entry in os.listdir(input_folder)
    if entry.lower().endswith(".txt")
)
for entry in txt_names:
    process_txt_file(os.path.join(input_folder, entry), output_folder)

print("\nDONE! All CSVs saved inside /cleancsvs/")



DONE! All CSVs saved inside /cleancsvs/
