# This is the Final Merged data 

### The process used to create a balanced dataset:
- Extract the combinations (iterations) of ["Vbus","Rg","Ls4","Ls5","Ls6","Ls7","Ls8","Ls9","Ls10","Ls11"] values that are common to all of the outlier-removed, cleaned MOSFET datasets
- Merge the 6 MOSFETs, keeping only the common iterations
- Additionally, save 5 MOSFETs for training and 1 UNSEEN MOSFET for the generalisation test

In [None]:
import os
import glob
import pandas as pd

# Build a balanced, merged dataset from the per-MOSFET cleaned CSV files:
# keep only the simulation setups that occur in every MOSFET file, then save
# the combined data plus a train / unseen-test split.

# Folders
input_folder = "mosfets_step3_final_cleaned"
output_folder = "MERGED_ORIGINAL"
os.makedirs(output_folder, exist_ok=True)

# Columns that identify one simulation setup (iteration). Rows are matched
# across MOSFETs on the exact values of these columns.
sim_columns = ["Vbus","Rg","Ls4","Ls5","Ls6","Ls7","Ls8","Ls9","Ls10","Ls11"]

# Locate the input files (sorted so the run is reproducible)
print("Input folder:", os.path.abspath(input_folder))
files = sorted(glob.glob(os.path.join(input_folder, "*_cleaned.csv")))
if not files:
    print("No files found")
    # exit() is an interactive-shell helper that is not meant for programs
    # (in a notebook it can shut the kernel down), so stop by raising instead.
    raise SystemExit(1)
for f in files:
    print(" -", os.path.basename(f))

# Load every outlier-removed, cleaned MOSFET CSV, keyed by MOSFET name
dataframes = {}
for file in files:
    name = os.path.basename(file).replace("_cleaned.csv", "")
    df = pd.read_csv(file)
    dataframes[name] = df

# Find the simulation setups common to all MOSFETs by successively
# inner-joining each file's unique setups onto the running intersection.
common_rows = None
for name, df in dataframes.items():
    unique_rows = df[sim_columns].drop_duplicates()
    if common_rows is None:
        common_rows = unique_rows
    else:
        common_rows = pd.merge(common_rows, unique_rows, on=sim_columns, how="inner")

print("Common simulation setups:", len(common_rows))

# Restrict each MOSFET to the common setups only
balanced_data = {}
for name, df in dataframes.items():
    balanced_df = pd.merge(common_rows, df, on=sim_columns, how="inner")
    balanced_data[name] = balanced_df
    print(f"{name}: {len(balanced_df)} rows after balancing")

# Complete dataset of all MOSFETs - All_6_MOSFETs.csv
all_6 = pd.concat(balanced_data.values(), ignore_index=True)
all_6.to_csv(os.path.join(output_folder, "All_6_MOSFETs.csv"), index=False)
print("Saved All_6_MOSFETs.csv")

# Hold one MOSFET out as the unseen test set for the generalisation check:
# the alphabetically last name is the test MOSFET, the rest are training data.
mosfet_names = sorted(balanced_data.keys())
if len(mosfet_names) < 2:
    # With a single MOSFET there is nothing left to train on; fail with a
    # clear message instead of pandas' "No objects to concatenate".
    raise ValueError("Need at least two MOSFET files to create a train/test split")
test_mosfet = mosfet_names[-1]
train_mosfets = [m for m in mosfet_names if m != test_mosfet]

# Save the train and test splits
train_5 = pd.concat([balanced_data[m] for m in train_mosfets], ignore_index=True)
test_1 = balanced_data[test_mosfet]

train_5.to_csv(os.path.join(output_folder, "Train_5_MOSFETs.csv"), index=False)
test_1.to_csv(os.path.join(output_folder, "Test_1_MOSFET.csv"), index=False)

print("Saved Train_5_MOSFETs.csv")
print(f"Saved Test_1_MOSFET.csv ({test_mosfet})")
print("Done! Output folder:", os.path.abspath(output_folder))


Input folder: c:\Users\pc\Desktop\TRAIL\mosfets_step3_final_cleaned
 - C2M0025120D_cleaned.csv
 - C2M0040120D_cleaned.csv
 - C2M0080120D_cleaned.csv
 - C2M0160120D_cleaned.csv
 - C2M0280120D_cleaned.csv
 - C2M1000170D_cleaned.csv
Common simulation setups: 86335
C2M0025120D: 86335 rows after balancing
C2M0040120D: 86335 rows after balancing
C2M0080120D: 86335 rows after balancing
C2M0160120D: 86335 rows after balancing
C2M0280120D: 86335 rows after balancing
C2M1000170D: 86335 rows after balancing
Saved All_6_MOSFETs.csv
Saved Train_5_MOSFETs.csv
Saved Test_1_MOSFET.csv (C2M1000170D)
Done! Output folder: c:\Users\pc\Desktop\TRAIL\MERGED_ORIGINAL


## Merging the feature engineered MOSFET data
The same merge is repeated for the feature-engineered dataset, so that we have one clean raw-input dataset kept separate for exploration and for adding different derived features, and one for just testing and understanding the feature importance of the derived merged dataset.

In [None]:
import os
import glob
import pandas as pd

# Build a balanced, merged dataset from the per-MOSFET feature-engineered CSV
# files: keep only the simulation setups that occur in every MOSFET file, then
# save the combined data plus a train / unseen-test split.

# Input and output folders
input_folder = "feature_engineered_MOSFET_data"
output_folder = "merged_feature_engineered"
os.makedirs(output_folder, exist_ok=True)

# Columns that identify one simulation setup (iteration). Rows are matched
# across MOSFETs on the exact values of these columns.
sim_columns = ["Vbus","Rg","Ls4","Ls5","Ls6","Ls7","Ls8","Ls9","Ls10","Ls11"]


# Locate the input files (sorted so the run is reproducible)
print("Input folder:", os.path.abspath(input_folder))
files = sorted(glob.glob(os.path.join(input_folder, "*_feature_engineered.csv")))
if not files:
    print("No files found")
    # exit() is an interactive-shell helper that is not meant for programs
    # (in a notebook it can shut the kernel down), so stop by raising instead.
    raise SystemExit(1)
for f in files:
    print(" -", os.path.basename(f))

# Load every feature-engineered MOSFET CSV, keyed by MOSFET name
dataframes = {}
for file in files:
    name = os.path.basename(file).replace("_feature_engineered.csv", "")
    # Input files are named "<MOSFET>_cleaned_feature_engineered.csv"; drop the
    # leftover "_cleaned" so the key is the bare MOSFET name.
    if name.endswith("_cleaned"):
        name = name[: -len("_cleaned")]
    df = pd.read_csv(file)
    dataframes[name] = df

# Find the simulation setups common to all MOSFETs by successively
# inner-joining each file's unique setups onto the running intersection.
common_rows = None
for name, df in dataframes.items():
    unique_rows = df[sim_columns].drop_duplicates()
    if common_rows is None:
        common_rows = unique_rows
    else:
        common_rows = pd.merge(common_rows, unique_rows, on=sim_columns, how="inner")

print("Common simulation setups:", len(common_rows))

# Restrict each MOSFET to the common setups only
balanced_data = {}
for name, df in dataframes.items():
    balanced_df = pd.merge(common_rows, df, on=sim_columns, how="inner")
    balanced_data[name] = balanced_df
    print(f"{name}: {len(balanced_df)} rows after balancing")

# Merged data of all MOSFETs - complete dataset
all_6 = pd.concat(balanced_data.values(), ignore_index=True)
all_6.to_csv(os.path.join(output_folder, "All_6_MOSFETs.csv"), index=False)
print("Saved All_6_MOSFETs.csv")

# Hold one MOSFET out as the unseen test set: the alphabetically last name is
# the test MOSFET, the rest are training data.
mosfet_names = sorted(balanced_data.keys())
if len(mosfet_names) < 2:
    # With a single MOSFET there is nothing left to train on; fail with a
    # clear message instead of pandas' "No objects to concatenate".
    raise ValueError("Need at least two MOSFET files to create a train/test split")
test_mosfet = mosfet_names[-1]
train_mosfets = [m for m in mosfet_names if m != test_mosfet]

# Save the training and testing splits
train_5 = pd.concat([balanced_data[m] for m in train_mosfets], ignore_index=True)
test_1 = balanced_data[test_mosfet]

train_5.to_csv(os.path.join(output_folder, "Train_5_MOSFETs.csv"), index=False)
test_1.to_csv(os.path.join(output_folder, "Test_1_MOSFET.csv"), index=False)

print("Saved Train_5_MOSFETs.csv")
print(f"Saved Test_1_MOSFET.csv ({test_mosfet})")
print("Done! Output folder:", os.path.abspath(output_folder))


Input folder: c:\Users\pc\Desktop\TRAIL\feature_engineered_MOSFET_data
 - C2M0025120D_cleaned_feature_engineered.csv
 - C2M0040120D_cleaned_feature_engineered.csv
 - C2M0080120D_cleaned_feature_engineered.csv
 - C2M0160120D_cleaned_feature_engineered.csv
 - C2M0280120D_cleaned_feature_engineered.csv
 - C2M1000170D_cleaned_feature_engineered.csv
Common simulation setups: 86335
C2M0025120D_cleaned: 86335 rows after balancing
C2M0040120D_cleaned: 86335 rows after balancing
C2M0080120D_cleaned: 86335 rows after balancing
C2M0160120D_cleaned: 86335 rows after balancing
C2M0280120D_cleaned: 86335 rows after balancing
C2M1000170D_cleaned: 86335 rows after balancing
Saved All_6_MOSFETs.csv
Saved Train_5_MOSFETs.csv
Saved Test_1_MOSFET.csv (C2M1000170D_cleaned)
Done! Output folder: c:\Users\pc\Desktop\TRAIL\merged_feature_engineered
