In [1]:
# Import necessary libraries
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Read the CSV into a DataFrame
file_path = '/IS6502.PROJECT/ai4i2020.csv'  # Update the path to your CSV file
data = pd.read_csv(file_path)

# Name the input (feature) columns and the output (target) columns once,
# then slice the DataFrame with them
feature_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
target_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
X = data[feature_columns]
y = data[target_columns]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only,
# then the same transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample, tune and fit one soft-voting ensemble per target column.
#
# Every target column has its own class imbalance, so SMOTETomek has to be run
# separately for each of them, and each run produces a different set of rows.
# The resampler therefore lives inside an imblearn pipeline that
# MultiOutputClassifier clones and fits once per target: each ensemble is
# trained on feature rows that belong to its own resampled labels. Because the
# pipeline is what GridSearchCV tunes, only the training folds are resampled;
# the validation folds keep the original rows.
smotetomek = SMOTETomek(random_state=42)

# LogisticRegression and SVC cannot be fitted on a target that has a single
# class, so fail early with a message that names the offending columns.
constant_targets = [column for column in y_train.columns if y_train[column].nunique() < 2]
if constant_targets:
    raise ValueError(
        f"Cannot train on targets with a single class in the training split: {constant_targets}"
    )

# Define the base models with hyperparameter tuning
param_grid = {
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [None, 10, 20],
    'lr__C': [0.1, 1, 10],
    'svm__C': [0.1, 1, 10],
    'gb__n_estimators': [50, 100],
    'gb__learning_rate': [0.01, 0.1, 0.2],
}

base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
]

voting_clf = VotingClassifier(estimators=base_models, voting='soft')

# Resampling step followed by the ensemble; samplers are applied during fit
# only, so prediction on the test set sees the data unchanged.
resampled_voting_clf = ImbPipeline(steps=[
    ('resample', smotetomek),
    ('vote', voting_clf),
])

# The ensemble is now a pipeline step, so its parameters are addressed
# through the step name.
pipeline_param_grid = {f'vote__{name}': values for name, values in param_grid.items()}

# Create the Voting Classifier with soft voting wrapped in MultiOutputClassifier.
# Validation folds are not resampled, so plain accuracy would reward always
# predicting the majority class; balanced accuracy weights both classes equally.
voting_clf_soft = MultiOutputClassifier(
    GridSearchCV(
        resampled_voting_clf,
        pipeline_param_grid,
        scoring='balanced_accuracy',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
)
voting_clf_soft.fit(X_train_scaled, y_train)

# Make predictions: one column per target, in the column order of y_train
y_pred_voting_soft = voting_clf_soft.predict(X_test_scaled)

# Evaluate the Voting Classifier with soft voting: one classification report and
# one confusion matrix per target column. The target names come from y_test so
# that column i of the prediction array is always matched with the column it
# was trained on.
# NOTE(review): labels=[0, 1] assumes every target is a 0/1 flag — confirm
# against the CSV. Pinning the labels keeps each confusion matrix 2x2 even when
# a class is absent from both the test labels and the predictions.
binary_labels = [0, 1]
classification_reports_voting_soft = {}
confusion_matrices_voting_soft = {}
for i, mode in enumerate(y_test.columns):
    true_labels = y_test[mode]
    predicted_labels = y_pred_voting_soft[:, i]

    # zero_division=0 reports undefined precision/recall as 0 without a warning
    classification_reports_voting_soft[mode] = classification_report(
        true_labels,
        predicted_labels,
        labels=binary_labels,
        zero_division=0,
    )
    confusion_matrices_voting_soft[mode] = confusion_matrix(
        true_labels,
        predicted_labels,
        labels=binary_labels,
    )

    # Display classification reports and confusion matrices
    print(f"Classification Report for {mode}:\n{classification_reports_voting_soft[mode]}")
    print(f"Confusion Matrix for {mode}:\n{confusion_matrices_voting_soft[mode]}\n")


ModuleNotFoundError: No module named 'imblearn'