In [2]:
import numpy as np
import pandas as pd 

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")  # for final submission

x_train = df_train.copy()
x_train.drop('id', axis=1, inplace=True)
x_train.drop('Product ID', axis=1, inplace=True)
x_train.drop('Machine failure', axis=1, inplace=True)
y_train = df_train["Machine failure"]

x_test = df_test.copy()
x_test.drop('id', axis=1, inplace=True)
x_test.drop('Product ID', axis=1, inplace=True)

x_train = np.array(x_train)
y_train = np.array(y_train, dtype=np.float32)
x_test = np.array(x_test)

lm_column = 0

mapping = {'L': 0.0, 'M': 0.5, 'H': 1.0}
x_train[:, lm_column] = np.where(x_train[:, lm_column] == 'L', 0.0, np.where(x_train[:, lm_column] == 'M', 0.5, 1.0))
x_test[:, lm_column] = np.where(x_test[:, lm_column] == 'L', 0.0, np.where(x_test[:, lm_column] == 'M', 0.5, 1.0))

x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)

In [3]:
from sklearn.model_selection import train_test_split

x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# train model
rfc = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)

# predict on test set
rfc_pred = rfc.predict(x_cv)

In [5]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

print("Random Forest Classifier", "\n\tAccuracy:", accuracy_score(y_cv, rfc_pred), "\n\tF1:", f1_score(y_cv, rfc_pred), "\n\tRecall:", recall_score(y_cv, rfc_pred))

Random Forest Classifier 
	Accuracy: 0.9960052774316499 
	F1: 0.8604353393085787 
	Recall: 0.7671232876712328


In [6]:
rfc_pred

array([1., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [7]:
rfc_final = rfc.predict(x_test)
print(rfc_final)

[0. 0. 0. ... 0. 0. 0.]


In [9]:
indices = np.array(df_test["id"], dtype=int).reshape(-1, 1)

rfc_final = rfc_final.reshape(-1, 1)  # Reshape rfc_final to have 2 dimensions

submission = np.hstack((indices, rfc_final))

submission_df = pd.DataFrame(submission, columns=['id', 'Machine failure'])
submission_df['id'] = submission_df['id'].astype(int)  # Convert "id" column to integer
submission_df.to_csv("submissions/rfc.csv", index=False)


In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Define the hyperparameters to be fine-tuned
hidden_layer_sizes = (100, 50)  # Tuple representing the number of neurons in each hidden layer
activation = 'relu'  # Activation function ('relu', 'logistic', 'tanh')
alpha = 0.0001  # L2 regularization parameter
learning_rate_init = 0.001  # Initial learning rate

# Initialize the MLPClassifier with the specified hyperparameters
model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                      activation=activation,
                      alpha=alpha,
                      learning_rate_init=learning_rate_init)

# Train the model on the training data
model.fit(x_train, y_train)

# Evaluate performance on cross-validation data
y_cv_pred = model.predict(x_cv)
accuracy = accuracy_score(y_cv, y_cv_pred)
f1 = f1_score(y_cv, y_cv_pred)
recall = recall_score(y_cv, y_cv_pred)

print("Cross-validation performance:")
print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Recall:", recall)

# Generate predictions for the test data
y_test_pred = model.predict(x_test)

# Print the predictions for the test data
print("Test data predictions:")
print(y_test_pred)


Cross-validation performance:
Accuracy: 0.9959686286007476
F1 score: 0.8575129533678756
Recall: 0.7557077625570776
Test data predictions:
[0. 0. 0. ... 0. 0. 0.]


In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Initialize the Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(x_train, y_train)

# Predict on the cross-validation data
y_cv_pred = gnb.predict(x_cv)

# Calculate metrics for cross-validation predictions
accuracy = accuracy_score(y_cv, y_cv_pred)
f1 = f1_score(y_cv, y_cv_pred)
recall = recall_score(y_cv, y_cv_pred)

# Print cross-validation metrics
print("Cross-validation performance:")
print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Recall:", recall)

# Predict on the test data
y_test_pred = gnb.predict(x_test)

# Print predictions for the test data
print("Test data predictions:")
print(y_test_pred)


Cross-validation performance:
Accuracy: 0.9960419262625523
F1 score: 0.8625954198473281
Recall: 0.773972602739726
Test data predictions:
[0. 0. 0. ... 0. 0. 0.]


In [12]:
from sklearn.metrics import accuracy_score, f1_score, recall_score
from xgboost import XGBClassifier

# Initialize the XGBoost classifier
model = XGBClassifier()

# Train the model on the training data
model.fit(x_train, y_train)

# Make predictions on the cross-validation data
y_cv_pred = model.predict(x_cv)

# Evaluate performance on cross-validation data
accuracy = accuracy_score(y_cv, y_cv_pred)
f1 = f1_score(y_cv, y_cv_pred)
recall = recall_score(y_cv, y_cv_pred)

print("Cross-validation performance:")
print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Recall:", recall)

# Generate predictions for the test data
y_test_pred = model.predict(x_test)

# Print the predictions for the test data
print("Test data predictions:")
print(y_test_pred)


Cross-validation performance:
Accuracy: 0.9958586821080407
F1 score: 0.8567807351077312
Recall: 0.771689497716895
Test data predictions:
[0 0 0 ... 0 0 0]
