In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import uniform, randint
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [11]:
# Load the dataset
file_path = 'ai4i2020.csv'  # Update the path to your CSV file
data = pd.read_csv(file_path)
print("Original Data:\n", data.head().to_markdown(index=False, numalign='left', stralign='left'))  # Display first 5 rows


Original Data:
 | UDI   | Product ID   | Type   | Air temperature [K]   | Process temperature [K]   | Rotational speed [rpm]   | Torque [Nm]   | Tool wear [min]   | Machine failure   | TWF   | HDF   | PWF   | OSF   | RNF   |
|:------|:-------------|:-------|:----------------------|:--------------------------|:-------------------------|:--------------|:------------------|:------------------|:------|:------|:------|:------|:------|
| 1     | M14860       | M      | 298.1                 | 308.6                     | 1551                     | 42.8          | 0                 | 0                 | 0     | 0     | 0     | 0     | 0     |
| 2     | L47181       | L      | 298.2                 | 308.7                     | 1408                     | 46.3          | 3                 | 0                 | 0     | 0     | 0     | 0     | 0     |
| 3     | L47182       | L      | 298.1                 | 308.5                     | 1498                     | 49.4          | 5                 |

In [12]:
# Define the features and target variables
X = data[['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']]
y = data[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']]

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nTrain Data Shapes: X:", X_train.shape, "y:", y_train.shape)
print("Test Data Shapes: X:", X_test.shape, "y:", y_test.shape)


Train Data Shapes: X: (8000, 5) y: (8000, 5)
Test Data Shapes: X: (2000, 5) y: (2000, 5)


In [14]:
# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nExample of Scaled X_train (First 5 Rows):\n", pd.DataFrame(X_train_scaled, columns=X.columns).head().to_markdown(numalign='left', stralign='left'))



Example of Scaled X_train (First 5 Rows):
 |    | Air temperature [K]   | Process temperature [K]   | Rotational speed [rpm]   | Torque [Nm]   | Tool wear [min]   |
|:---|:----------------------|:--------------------------|:-------------------------|:--------------|:------------------|
| 0  | -0.854066             | -0.609589                 | 0.427634                 | -0.892696     | 1.37504           |
| 1  | -0.904014             | -1.08053                  | -0.834945                | 1.38219       | 0.45762           |
| 2  | -0.904014             | -1.48419                  | -0.0596769               | -0.892696     | 1.35922           |
| 3  | 0.444571              | 0.534121                  | 0.333495                 | -0.702288     | -1.59866          |
| 4  | 0.694309              | 0.33229                   | 0.178441                 | -0.612094     | 1.58066           |


In [31]:
# Apply SMOTETomek separately for X and y
smotetomek = SMOTETomek(random_state=42)
X_train_resampled_list = []
y_train_resampled = pd.DataFrame()
for column in y_train.columns:
    X_res, y_res = smotetomek.fit_resample(X_train, y_train[column])
    X_train_resampled_list.append(pd.DataFrame(X_res, columns=X.columns))
    y_train_resampled = pd.concat([y_train_resampled, pd.DataFrame(y_res, columns=[column])], axis=0)

# Concatenate and shuffle resampled data
X_train_resampled = pd.concat(X_train_resampled_list, axis=0)
X_train_resampled, y_train_resampled = shuffle(X_train_resampled, y_train_resampled, random_state=42)

# Scale the resampled features
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
print("\nResampled Train Data Shapes: X:", X_train_resampled.shape, "y:", y_train_resampled.shape)

# Convert back to DataFrame for display
X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=X.columns)
print("Example of Resampled X_train (First 5 Rows):\n", X_train_resampled_df.head().to_markdown(index=False, numalign='left', stralign='left'))


Resampled Train Data Shapes: X: (79324, 5) y: (79324, 5)
Example of Resampled X_train (First 5 Rows):
 | Air temperature [K]   | Process temperature [K]   | Rotational speed [rpm]   | Torque [Nm]   | Tool wear [min]   |
|:----------------------|:--------------------------|:-------------------------|:--------------|:------------------|
| -0.735487             | 0.000984106               | -0.10875                 | -0.229793     | -1.04547          |
| -1.68162              | -1.55424                  | -0.656051                | 1.2623        | 1.13828           |
| 1.56525               | 1.17545                   | -0.648754                | 0.439461      | -0.503101         |
| 0.276051              | 0.806866                  | -0.0394249               | -0.280988     | -1.48793          |
| 0.94394               | 0.199082                  | -0.553888                | 0.551887      | -0.631556         |


In [32]:
# Handle missing values and re-clip to ensure 0/1 values in y_train_resampled
y_train_resampled = y_train_resampled.fillna(0)  # Or fill with another appropriate value
for column in y_train_resampled.columns:
    y_train_resampled[column] = np.clip(y_train_resampled[column], 0, 1)

In [33]:
# Ensure all resampled data have the same number of samples
min_samples = min(len(x) for x in X_train_resampled_list)
X_train_resampled = pd.concat([x.iloc[:min_samples].reset_index(drop=True) for x in X_train_resampled_list], axis=0)
y_train_resampled = y_train_resampled.iloc[:min_samples].reset_index(drop=True)


In [34]:
# Standardize features (after resampling)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [35]:
# Reshape the data for CNN-LSTM model
X_train_reshaped = X_train_resampled.values.reshape(X_train_resampled.shape[0], X_train_resampled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)


AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [28]:
print(X_train_reshaped.shape)
print(y_train_encoded.shape) 

(78950, 5, 1)


NameError: name 'y_train_encoded' is not defined

In [36]:
# Define the CNN-LSTM model
def create_cnn_lstm_model(input_shape, num_classes):
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape, padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(50, activation='relu', return_sequences=True))  
    model.add(Dropout(0.2))  
    model.add(LSTM(50, activation='relu'))  
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [37]:
# Train CNN-LSTM model for each target variable
cnn_lstm_models = {}
cnn_lstm_predictions = np.zeros((X_test_reshaped.shape[0], len(y_train.columns)))

for i, target in enumerate(y_train.columns):
    # Convert target variable to categorical
    y_train_encoded = to_categorical(y_train_resampled[target], num_classes=2)
    y_test_encoded = to_categorical(y_test[target], num_classes=2)
    
    # Create and train the model for the current target variable
    input_shape = (X_train_reshaped.shape[1], 1)
    cnn_lstm_model = create_cnn_lstm_model(input_shape, num_classes=2)

    # Slice y_train_encoded to match the current target variable
    cnn_lstm_model.fit(X_train_reshaped, y_train_encoded[:, i], epochs=10, batch_size=32, verbose=1)  

    # Store the trained model
    cnn_lstm_models[target] = cnn_lstm_model

    # Get predictions for the current target variable
    cnn_lstm_predictions[:, i] = np.argmax(cnn_lstm_model.predict(X_test_reshaped), axis=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 78950
'y' sizes: 15790


In [84]:
# Get predictions from the CNN-LSTM models
cnn_lstm_predictions = np.zeros((X_test_reshaped.shape[0], len(y_train.columns)))
for i, target in enumerate(y_train.columns):
    cnn_lstm_predictions[:, i] = np.argmax(cnn_lstm_models[target].predict(X_test_reshaped), axis=1)

# Define base models with hyperparameter grids
param_dist = {
    'estimator__rf__n_estimators': randint(50, 100),
    'estimator__rf__max_depth': [None, 10, 20],
    'estimator__lr__C': uniform(0.1, 10),
    'estimator__svm__C': uniform(0.1, 10),
    'estimator__gb__n_estimators': randint(50, 100),
    'estimator__gb__learning_rate': uniform(0.01, 0.2)
}

base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

# Create a Voting Classifier
voting_clf = VotingClassifier(estimators=base_models, voting='soft')

# Wrap the voting classifier in a MultiOutputClassifier
multi_voting_clf = MultiOutputClassifier(voting_clf)

# Perform Grid Search
grid_search = RandomizedSearchCV(estimator=multi_voting_clf, param_distributions=param_dist, n_iter=20, cv=3, n_jobs=-1, random_state=42, verbose=1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Extract best parameters for each model
best_rf_params = {key.split('__')[-1]: value for key, value in best_params.items() if 'rf' in key}
best_lr_params = {key.split('__')[-1]: value for key, value in best_params.items() if 'lr' in key}
best_svm_params = {key.split('__')[-1]: value for key, value in best_params.items() if 'svm' in key}
best_gb_params = {key.split('__')[-1]: value for key, value in best_params.items() if 'gb' in key}

# Define the base models with the best parameters
rf_best = RandomForestClassifier(random_state=42, **best_rf_params)
lr_best = LogisticRegression(random_state=42, max_iter=1000, **best_lr_params)
svm_best = SVC(kernel='linear', probability=True, random_state=42, **best_svm_params)
gb_best = GradientBoostingClassifier(random_state=42, **best_gb_params)

# Create a new Voting Classifier with the best models
voting_clf_best = VotingClassifier(estimators=[
    ('rf', rf_best),
    ('lr', lr_best),
    ('svm', svm_best),
    ('gb', gb_best)
], voting='soft')

# Train the Voting Classifier
multi_voting_clf_best = MultiOutputClassifier(voting_clf_best)
multi_voting_clf_best.fit(X_train_resampled, y_train_resampled)

# Get predictions from the Voting Classifier
voting_predictions = multi_voting_clf_best.predict(X_test_scaled)

# Combine predictions using majority voting or averaging
final_predictions = np.zeros_like(cnn_lstm_predictions)
for i in range(cnn_lstm_predictions.shape[0]):
    for j in range(cnn_lstm_predictions.shape[1]):
        combined_prediction = [cnn_lstm_predictions[i, j], voting_predictions[i, j]]
        final_predictions[i, j] = np.argmax(np.bincount(combined_prediction))

# Evaluate the combined model
combined_classification_reports = {}
combined_confusion_matrices = {}
for i, mode in enumerate(['TWF', 'HDF', 'PWF', 'OSF', 'RNF']):
    combined_classification_reports[mode] = classification_report(y_test[mode], final_predictions[:, i])
    combined_confusion_matrices[mode] = confusion_matrix(y_test[mode], final_predictions[:, i])

# Display classification reports and confusion matrices for the combined model
for mode in ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']:
    print(f"Classification Report for {mode}:\n{combined_classification_reports[mode]}")
    print(f"Confusion Matrix for {mode}:\n{combined_confusion_matrices[mode]}\n")


KeyError: 'TWF'