In [14]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")  # for final submission

# Feature engineering
df_train["Temperature ratio"] = df_train['Process temperature [K]'] / df_train['Air temperature [K]']
df_train['Torque * Rotational speed'] = df_train['Torque [Nm]'] * df_train['Rotational speed [rpm]']
df_train['Torque * Tool wear'] = df_train['Torque [Nm]'] * df_train['Tool wear [min]']

df_test["Temperature ratio"] = df_test['Process temperature [K]'] / df_test['Air temperature [K]']
df_test['Torque * Rotational speed'] = df_test['Torque [Nm]'] * df_test['Rotational speed [rpm]']
df_test['Torque * Tool wear'] = df_test['Torque [Nm]'] * df_test['Tool wear [min]']

x_train = df_train.copy()
x_train.drop('id', axis=1, inplace=True)
x_train.drop('Product ID', axis=1, inplace=True)
x_train.drop('Machine failure', axis=1, inplace=True)
y_train = df_train["Machine failure"]

x_test = df_test.copy()
x_test.drop('id', axis=1, inplace=True)
x_test.drop('Product ID', axis=1, inplace=True)

x_train = np.array(x_train)
y_train = np.array(y_train, dtype=np.float32)
x_test = np.array(x_test)

lm_column = 0

mapping = {'L': 0.0, 'M': 0.5, 'H': 1.0}
x_train[:, lm_column] = np.where(x_train[:, lm_column] == 'L', 0.0, np.where(x_train[:, lm_column] == 'M', 0.5, 1.0))
x_test[:, lm_column] = np.where(x_test[:, lm_column] == 'L', 0.0, np.where(x_test[:, lm_column] == 'M', 0.5, 1.0))

# Standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.2, random_state=42)


In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Initialize the Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(x_train, y_train)

# Predict on the cross-validation data
y_cv_pred = gnb.predict(x_cv)

# Calculate metrics for cross-validation predictions
accuracy = accuracy_score(y_cv, y_cv_pred)
f1 = f1_score(y_cv, y_cv_pred)
recall = recall_score(y_cv, y_cv_pred)

# Print cross-validation metrics
print("Cross-validation performance:")
print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Recall:", recall)

# Predict on the test data
y_test_pred = gnb.predict(x_test)

# Print predictions for the test data
print("Test data predictions:")
print(y_test_pred)


Cross-validation performance:
Accuracy: 0.9961518727552591
F1 score: 0.865211810012837
Recall: 0.769406392694064
Test data predictions:
[0. 0. 0. ... 0. 0. 0.]


In [12]:
indices = np.array(df_test["id"], dtype=int).reshape(-1, 1)

y_test_pred = y_test_pred.reshape(-1, 1)  

submission = np.hstack((indices, y_test_pred))

submission_df = pd.DataFrame(submission, columns=['id', 'Machine failure'])
submission_df['id'] = submission_df['id'].astype(int)  # Convert "id" column to integer
submission_df.to_csv("submissions/gnb.csv", index=False)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# train model
rfc = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)

# predict on test set
rfc_pred = rfc.predict(x_cv)

from sklearn.metrics import accuracy_score, f1_score, recall_score

print("Random Forest Classifier", "\n\tAccuracy:", accuracy_score(y_cv, rfc_pred), "\n\tF1:", f1_score(y_cv, rfc_pred), "\n\tRecall:", recall_score(y_cv, rfc_pred))

Random Forest Classifier 
	Accuracy: 0.9960052774316499 
	F1: 0.8604353393085787 
	Recall: 0.7671232876712328


In [15]:
import tensorflow as tf
from tensorflow import keras

num_features = x_train.shape[1] 

model = keras.Sequential([
    keras.layers.Dense(11, activation='relu', input_shape=(num_features,)),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f327c556640>

In [17]:
# Evaluate the model on the cross-validation set
cv_loss, cv_accuracy = model.evaluate(x_cv, y_cv)

# Predict probabilities for the cross-validation set
cv_predictions = model.predict(x_cv)

# Convert probabilities to binary predictions
cv_predictions_binary = np.round(cv_predictions)

# Calculate F1 score and recall for the cross-validation set
cv_f1_score = f1_score(y_cv, cv_predictions_binary)
cv_recall = recall_score(y_cv, cv_predictions_binary)

# Print the evaluation metrics
print("CV Loss:", cv_loss)
print("CV Accuracy:", cv_accuracy)
print("CV F1 Score:", cv_f1_score)
print("CV Recall:", cv_recall)


CV Loss: 0.022912541404366493
CV Accuracy: 0.9960419535636902
CV F1 Score: 0.8604651162790699
CV Recall: 0.7602739726027398
