Machine Learning Exam

This is a classification task.
The two different machine learning models chosen are Linear SVM and Random forest

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)
from sklearn.svm import SVC

The dataset is loaded and preprocessed

In [41]:

# Load the dataset
data = pd.read_csv('C:/Users/User/Documents/AAU 7. semester/Machine Learning/Applied Machine Learning mini project 2024-20241115/ML_exam/ai4i2020.csv')

# Display the first 5 rows of the data
print(data.head())
print(data.info())


   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  TWF  \
0                    1551         42.8                0                0    0   
1                    1408         46.3                3                0    0   
2                    1498         49.4                5                0    0   
3                    1433         39.5                7                0    0   
4                    1408         40.0                9                0    0   

   HDF  PWF  OSF  RNF  
0    0    0    0    0  
1    0    0    0    0  
2    0  

In [42]:
# Check if there are missing values in the dataset
if data.isnull().values.any():
    print("Missing values found")
else:
    print("No missing values found")

# Preprocess the data
X = data.drop(columns=['UDI', 'Product ID', 'Type', 'Machine failure'])
y = data['Machine failure']


No missing values found


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Standardize the features

In [44]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Initializing the SVM classifier using both linear and rbf kernels to see which one is best

In [45]:
linear_svm = SVC(kernel='linear')
rbf_svm = SVC(kernel='rbf')

Performing k-fold cross-validation instead of splitting the dataset before

In [46]:
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [47]:
# Cross-validation for linear kernel
linear_cv_scores = cross_val_score(linear_svm, X_train, y_train, cv=kf)
print(f"Linear Kernel - Cross-validation scores: {linear_cv_scores}")
print(f"Linear Kernel - Mean cross-validation score: {linear_cv_scores.mean()}")

# Cross-validation for RBF kernel
rbf_cv_scores = cross_val_score(rbf_svm, X_train, y_train, cv=kf)
print(f"RBF Kernel - Cross-validation scores: {rbf_cv_scores}")
print(f"RBF Kernel - Mean cross-validation score: {rbf_cv_scores.mean()}")

Linear Kernel - Cross-validation scores: [0.999375 0.99875  0.99875  0.999375 0.999375]
Linear Kernel - Mean cross-validation score: 0.999125
RBF Kernel - Cross-validation scores: [0.999375 0.99875  0.99875  0.999375 0.999375]
RBF Kernel - Mean cross-validation score: 0.999125


As seen in the results, it doesn't matter if it's linear or RBF kernel

Now training the SVM with the linear kernel on the entire dataset and evaluating it on the test set. Using the linear kernel is less heavy to compute

In [48]:
linear_svm.fit(X_train, y_train)

Now for the prediction

In [49]:
y_pred = linear_svm.predict(X_test)


And the evaluation

In [50]:

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.999
Precision: 1.0
Recall: 0.9672131147540983
F1 Score: 0.9833333333333333
ROC AUC: 0.9836065573770492
Confusion Matrix:
[[1939    0]
 [   2   59]]
