In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
import pickle

### Load the data from csv

In [314]:
df = pd.read_csv('turbine_data.csv')

### Examine the proportions of normal to anomalous data entries are in the dataset

In [315]:
df["is_anomaly"].value_counts()

is_anomaly
0    992
1      8
Name: count, dtype: int64

### Resample (bootstrap) the data so that normal and anomalous entries are equally represented

In [316]:
# Subset df to get only the anomalies
anomalous_df = df[df["is_anomaly"] == 1]

In [317]:
# Draw a sample of n=500 anomalous entries
anomalous_df = anomalous_df.sample(500, replace=True)  
# param. replace set to True allows sampling from the same row more than once

In [318]:
# Subset df to get only the normal readings
normal_df = df[df["is_anomaly"] == 0]

In [319]:
# Draw a sample of n=500 normal entries
normal_df = normal_df.sample(500, replace=True)

In [320]:
# Concatenate samples back to df
df = pd.concat([anomalous_df, normal_df])

In [321]:
# Verify the new proportions of anomalous/normal records
df["is_anomaly"].value_counts()

is_anomaly
1    500
0    500
Name: count, dtype: int64

### Train the model

In [322]:
# Divide df into a feature matrix and target variable
X = df.drop(columns="is_anomaly")  # Feature matrix
y = df['is_anomaly']  # Labels (0 for normal, 1 for anomalous)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the model
model = make_pipeline(
    MinMaxScaler(),  # scale the features
    KNeighborsClassifier(n_neighbors=3)
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

## Evaluate the model

In [323]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       104
           1       1.00      1.00      1.00        96

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



Sanity check on a completely new dataset

In [324]:
test_df = pd.read_csv('test_turbine_data.csv')

test_df["prediction"] = model.predict(test_df.drop(columns="is_anomaly"))

In [325]:
# See where the model made wrong predictions
print(test_df[test_df["is_anomaly"]!=test_df["prediction"]].shape)
test_df[test_df["is_anomaly"]!=test_df["prediction"]].head()

(2, 5)


Unnamed: 0,sound_volume,humidity,temperature,is_anomaly,prediction
233,72.14,95.8,11.74,0,1
453,59.27,96.02,17.52,0,1


There are just a few wrongly labeled observations, what allows us to assess the overal performance of the model as very good.

### Save the model to a pickle file

In [326]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)