In [1]:
import pandas as pd
import numpy as np
import os

In [None]:
def get_bad_coils(file_path):
    """Collect the 'Pickle ID' values listed in an Excel workbook.

    Every sheet except the one named "All Data" is read. Sheets that have no
    'Pickle ID' column are ignored. Blank cells are dropped and the remaining
    values are cast to ``int``.

    Args:
        file_path: Path to the Excel workbook.

    Returns:
        set[int]: The distinct Pickle IDs found across the scanned sheets
        (empty if no sheet has a 'Pickle ID' column).

    Raises:
        ValueError: If a non-blank 'Pickle ID' cell cannot be converted to int
            (raised by ``astype(int)``).
    """
    bad_ids = set()

    # The context manager closes the workbook handle even if a sheet fails to
    # parse; an unclosed ExcelFile keeps the file open for the kernel's lifetime.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            if sheet_name == "All Data":
                continue

            df = pd.read_excel(xls, sheet_name=sheet_name)
            if 'Pickle ID' in df.columns:
                # dropna() first: NaN cannot be cast to int.
                bad_ids.update(df['Pickle ID'].dropna().astype(int).tolist())

    return bad_ids

In [None]:
# Folder holding the per-coil CSV files; it is listed for "*.csv" entries when
# the dataset is built, and each file name (minus ".csv") is parsed as a coil ID.
# NOTE(review): both paths below are absolute, machine-specific Windows paths —
# adjust them before running this notebook on another machine.
data_folder = r"D:\Stelco\Work\Dynamic Correlation\Key\Master whole Phase"
# Excel workbook handed to get_bad_coils() to obtain the set of bad coil IDs.
bad_coil_path = r"D:\Stelco\Work\Dynamic Correlation\QVW Data for S&R 2024-7-2025 - 1008GCQ.xlsx"

In [None]:
# Set of coil IDs read from the bad-coil workbook
bad_coil_ids = get_bad_coils(bad_coil_path)

# Features with high correlation in Phase 2.
# NOTE(review): the strings are used as column names, so spelling (e.g. "Neet")
# is kept exactly as written — presumably it matches the CSV headers; verify.
selected_features_phase2 = [
    'Neet Oil Concentration',
    'Exit Tension Reel Tension Reference',
    'Stand 1-3 Solution Temperature',
    'Stand 4 - Operator Side Force',
    'Stand 4 DS Total Bending Feedback',
    'Stand 4 Drive Speed Feedback',
    'Stand 4 OS Total Bending Feedback',
]

# === LOADING DATA ===
# One averaged feature vector per coil and its label, appended in lockstep so
# that X_list[i] always corresponds to y_list[i].
X_list = []
y_list = []

bad_count = 0
good_count = 0
MAX_PER_CLASS = 50  # cap on the number of coils loaded for each label

# Built once, outside the try block, so a misspelled variable name fails
# loudly here instead of being reported as a "read error" for every file.
required_columns = set(selected_features_phase2)

# os.listdir() returns entries in arbitrary order; sorting makes the capped
# sample of coils identical from run to run.
all_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))

for file in all_files:
    # File names are expected to be "<coil id>.csv". Skip anything else rather
    # than letting one stray file abort the whole load.
    try:
        coil_id = int(os.path.splitext(file)[0])
    except ValueError:
        print(f"Skipping {file}: file name is not a numeric coil ID")
        continue

    label = 'Bad' if coil_id in bad_coil_ids else 'Good'

    # Do not even read the file once this label's quota is full.
    if label == 'Bad' and bad_count >= MAX_PER_CLASS:
        continue
    if label == 'Good' and good_count >= MAX_PER_CLASS:
        continue

    try:
        df = pd.read_csv(os.path.join(data_folder, file))

        # Ensure selected features exist
        if not required_columns.issubset(df.columns):
            continue

        # Use all data (no phase split): one mean per selected feature
        feature_row = df[selected_features_phase2].mean()

        X_list.append(feature_row)
        y_list.append(label)

        if label == 'Bad':
            bad_count += 1
        else:
            good_count += 1

        if bad_count >= MAX_PER_CLASS and good_count >= MAX_PER_CLASS:
            break

    except Exception as e:
        # Best effort: a single unreadable/malformed CSV is reported and skipped.
        print(f"Error reading {file}: {e}")

# === CREATE DATASET ===
X = pd.DataFrame(X_list)
y = pd.Series(y_list, name='Label')

print(f"\nLoaded {len(X)} samples: {y.value_counts().to_dict()}\n")

# Fail early with a readable message: a stratified split and a binary
# classifier both need samples from each of the two labels.
if y.nunique() < 2:
    raise ValueError(
        f"Need samples of both classes to train, got {y.value_counts().to_dict()}"
    )

# XGBClassifier requires integer class labels 0..n_classes-1 and rejects the
# strings 'Bad'/'Good'. Codes follow alphabetical order so the report rows
# appear in the same order a string-label report would use.
CLASS_NAMES = ['Bad', 'Good']
y_encoded = y.map({name: code for code, name in enumerate(CLASS_NAMES)})

# === TRAIN MODELS ===
# NOTE(review): train_test_split, classification_report, XGBClassifier,
# CatBoostClassifier and LGBMClassifier are not imported in the cells visible
# here — make sure the import cell provides them before running on a fresh kernel.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.3, random_state=RANDOM_STATE
)

# Seed every model so repeated runs give the same reports.
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    # Flatten in case a wrapper returns predictions as an (n, 1) column.
    y_pred = np.asarray(model.predict(X_test)).ravel()
    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_test, y_pred, labels=[0, 1], target_names=CLASS_NAMES))
