In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


# Build a 6:2:2 stratified train/validation/test split from an Excel sheet,
# oversample the training part with SMOTE, and write the three sets to disk.

# Read Excel file
file_path = 'ALL.xlsx'
data = pd.read_excel(file_path, sheet_name='Sheet1')


# Set target column name
target_column = 'labels'

# Single seed shared by both splits and by SMOTE so a rerun reproduces the
# same partitions and the same synthetic samples.
RANDOM_STATE = 42

# 1. Split dataset (6:2:2 ratio)
X = data.drop(columns=[target_column])
y = data[target_column]

# First split: 60% training set, 40% temporary set.
# stratify=y keeps the label proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE, stratify=y
)

# Second split: halve the 40% temporary set into 20% validation and 20% test,
# again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify=y_temp
)

# 2. Balance training data
# Only the training split is resampled; validation and test data are written
# out exactly as they were split.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# 3. Combine into DataFrames
# The resampled features are re-wrapped in a DataFrame in case fit_resample
# returned a plain array (NOTE(review): depends on the installed
# imbalanced-learn version - confirm). Passing the original feature names
# keeps the saved training file's header identical to the validation and test
# files instead of falling back to positional 0..N-1 column labels.
train_balanced = pd.concat(
    [
        pd.DataFrame(X_train_balanced, columns=X.columns),
        pd.Series(y_train_balanced, name=target_column),
    ],
    axis=1,
)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# 4. Save as Excel files
# index=False drops the row index so the files contain only features + label.
train_balanced.to_excel('training_set_balanced.xlsx', index=False)
val_data.to_excel('validation_set.xlsx', index=False)
test_data.to_excel('test_set.xlsx', index=False)

print("Processing completed!")
print(f"Training set size: {train_balanced.shape}")
print(f"Validation set size: {val_data.shape}")
print(f"Test set size: {test_data.shape}")

Processing completed!
Training set size: (3240, 32)
Validation set size: (817, 32)
Test set size: (817, 32)
