In [1]:
import numpy as np
import pandas as pd 

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")  # for final submission

x_train = df_train.copy()
x_train.drop('id', axis=1, inplace=True)
x_train.drop('Product ID', axis=1, inplace=True)
x_train.drop('Machine failure', axis=1, inplace=True)
y_train = df_train["Machine failure"]

x_test = df_test.copy()
x_test.drop('id', axis=1, inplace=True)
x_test.drop('Product ID', axis=1, inplace=True)

x_train = np.array(x_train)
y_train = np.array(y_train, dtype=np.float32)
x_test = np.array(x_test)

lm_column = 0
x_train[:, lm_column] = np.where(x_train[:, lm_column] == 'L', 0, 1)
x_test[:, lm_column] = np.where(x_test[:, lm_column] == 'L', 0, 1)

x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)

print(x_train[:5, :])

[[0.000e+00 3.006e+02 3.096e+02 1.596e+03 3.610e+01 1.400e+02 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [1.000e+00 3.026e+02 3.121e+02 1.759e+03 2.910e+01 2.000e+02 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 2.993e+02 3.085e+02 1.805e+03 2.650e+01 2.500e+01 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 3.010e+02 3.109e+02 1.524e+03 4.430e+01 1.970e+02 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [1.000e+00 2.980e+02 3.090e+02 1.641e+03 3.540e+01 3.400e+01 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]]


In [4]:
from sklearn.ensemble import IsolationForest

# Split the training set into x_train and x_cv
split_index = int(len(x_train) * 0.8)  # 80% for training, 20% for cross-validation

x_train_data = x_train[:split_index]
x_cv_data = x_train[split_index:]

# Train the anomaly detection model
anomaly_detector = IsolationForest(contamination=0.05)  # Adjust the contamination parameter as needed
anomaly_detector.fit(x_train_data)

# Predict anomalies for x_cv_data
cv_predictions = anomaly_detector.predict(x_cv_data)

# Evaluate the performance of the anomaly detection model on x_cv_data
cv_accuracy = (cv_predictions == 1).mean()  # Percentage of non-anomalous data points

# Print the accuracy of the anomaly detection model
print("Cross-validation accuracy:", cv_accuracy)

# Predict anomalies for x_test
test_predictions = anomaly_detector.predict(x_test)

# Print the predictions for x_test
print("Test predictions:", test_predictions)


Cross-validation accuracy: 0.9510005130836326
Test predictions: [1 1 1 ... 1 1 1]


In [7]:
indices = np.array(df_test["id"], dtype=int).reshape(-1, 1)
predictions = np.where(test_predictions == -1, 1, 0)  # Consider -1 as anomalies (machine failure)
submission = np.hstack((indices, predictions.reshape(-1, 1)))

submission_df = pd.DataFrame(submission, columns=['id', 'Machine failure'])
submission_df['id'] = submission_df['id'].astype(int)  # Convert "id" column to integer
submission_df.to_csv("submissions/anomaly_detection.csv", index=False)