In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.preprocessing import RobustScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the combined wind farm A CSV into `data`.
# NOTE(review): the path points at a mounted Google Drive folder, so this cell
# only runs where that drive is available — consider a configurable data dir.
file_path='/content/drive/MyDrive/combined_wind_farm_A_data_new.csv'
# Malformed rows are still skipped, but 'warn' reports each skipped line
# instead of discarding it silently, so any data loss shows up in the output.
data=pd.read_csv(file_path, delimiter=',', encoding='utf-8', on_bad_lines='warn')
print("File loaded successfully!")

File loaded successfully!


In [None]:
# Collapse the six raw status codes into a binary label:
#   0, 1, 2 -> 0        3, 4, 5 -> 1
# NOTE(review): this cell was previously described as "0 and 2 -> 0, others -> 1",
# but the mapping also sends code 1 to 0 — confirm which grouping is intended.
# NOTE(review): not idempotent — running it again on already-mapped data turns
# every 1 into 0, so execute it exactly once per load.
status_to_binary = {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1}
binary_status = data['status_type_id'].replace(status_to_binary)
data['status_type_id'] = binary_status

print(binary_status.value_counts())

status_type_id
0    898672
1    298075
Name: count, dtype: int64


In [None]:
# Renaming the columns
# Maps raw column names (keys) to descriptive names (values).
# - The first five entries map a name to itself, so those columns keep their names.
# - Keys ending in _avg/_max/_min/_std map to names ending in " avg"/" max"/" min"/" std",
#   except the current, voltage and grid-frequency entries, whose names carry no suffix.
# - Some descriptive names contain a double space (e.g. "shaft  temperature",
#   "(Drive end)  temperature"); any later column list must spell them identically.
rename_mapping = {
    'time_stamp': 'time_stamp',
    'asset_id': 'asset_id',
    'id': 'id',
    'train_test': 'train_test',
    'status_type_id': 'status_type_id',
    'sensor_0_avg': 'Ambient temperature avg',
    'sensor_1_avg': 'Wind absolute direction avg',
    'sensor_2_avg': 'Wind relative direction avg',
    'wind_speed_3_avg': 'Windspeed avg',
    'wind_speed_4_avg': 'Estimated windspeed avg',
    'wind_speed_3_max': 'Windspeed max',
    'wind_speed_3_min': 'Windspeed min',
    'wind_speed_3_std': 'Windspeed std',
    'sensor_5_avg': 'Pitch angle avg',
    'sensor_5_max': 'Pitch angle max',
    'sensor_5_min': 'Pitch angle min',
    'sensor_5_std': 'Pitch angle std',
    'sensor_6_avg': 'Hub controller temperature avg',
    'sensor_7_avg': 'Top nacelle controller temperature avg',
    'sensor_8_avg': 'Choke coils on the VCS- section temperature avg',
    'sensor_9_avg': 'VCP-board temperature avg',
    'sensor_10_avg': 'VCS cooling water temperature avg',
    'sensor_11_avg': 'Gearbox bearing on high speed shaft  temperature avg',
    'sensor_12_avg': 'Gearbox oil temperature avg',
    'sensor_13_avg': 'Generator bearing 2(Drive end)  temperature avg',
    'sensor_14_avg': 'Generator bearing 1(Non-drive end) temperature avg',
    'sensor_15_avg': 'Generator stator winding phase 1 temperature avg',
    'sensor_16_avg': 'Generator stator winding phase 2 temperature avg',
    'sensor_17_avg': 'Generator stator winding phase 3 temperature avg',
    'sensor_18_avg': 'Generator rpm avg',
    'sensor_18_max': 'Generator rpm max',
    'sensor_18_min': 'Generator rpm min',
    'sensor_18_std': 'Generator rpm std',
    'sensor_19_avg': 'Split ring chamber temperature avg',
    'sensor_20_avg': 'Busbar section temperature avg',
    'sensor_21_avg': 'IGBT-driver on grid side inverter temperature avg',
    'sensor_22_avg': 'Actual Phase displacement avg',
    'sensor_23_avg': 'Averaged current phase 1',
    'sensor_24_avg': 'Averaged current phase 2',
    'sensor_25_avg': 'Averaged current phase 3',
    'sensor_26_avg': 'Grid frequency',
    'reactive_power_27_avg': 'Possible Grid capacitive reactive power avg',
    'reactive_power_27_max': 'Possible Grid capacitive reactive power max',
    'reactive_power_27_min': 'Possible Grid capacitive reactive power min',
    'reactive_power_27_std': 'Possible Grid capacitive reactive power std',
    'reactive_power_28_avg': 'Possible Grid inductive reactive power avg',
    'reactive_power_28_max': 'Possible Grid inductive reactive power max',
    'reactive_power_28_min': 'Possible Grid inductive reactive power min',
    'reactive_power_28_std': 'Possible Grid inductive reactive power std',
    'power_29_avg': 'Possible Grid active power avg',
    'power_29_max': 'Possible Grid active power max',
    'power_29_min': 'Possible Grid active power min',
    'power_29_std': 'Possible Grid active power std',
    'power_30_avg': 'Grid power avg',
    'power_30_max': 'Grid power max',
    'power_30_min': 'Grid power min',
    'power_30_std': 'Grid power std',
    'sensor_31_avg': 'Grid reactive power avg',
    'sensor_31_max': 'Grid reactive power max',
    'sensor_31_min': 'Grid reactive power min',
    'sensor_31_std': 'Grid reactive power std',
    'sensor_32_avg': 'Averaged voltage phase 1',
    'sensor_33_avg': 'Averaged voltage phase 2',
    'sensor_34_avg': 'Averaged voltage phase 3',
    'sensor_35_avg': 'IGBT-driver on rotor side inverter phase 1 temperature avg',
    'sensor_36_avg': 'IGBT-driver on rotor side inverter phase 2 temperature avg',
    'sensor_37_avg': 'IGBT-driver on rotor side inverter phase 3 temperature avg',
    'sensor_38_avg': 'HV transformer phase L1 temperature avg',
    'sensor_39_avg': 'HV transformer phase L2 temperature avg',
    'sensor_40_avg': 'HV transformer phase L3 temperature avg',
    'sensor_41_avg': 'Hydraulic group oil temperature avg',
    'sensor_42_avg': 'Nacelle direction avg',
    'sensor_43_avg': 'Nacelle temperature avg',
    # The sensor_44 .. sensor_51 keys carry no _avg suffix, although their
    # descriptive names end in "avg".
    'sensor_44': 'Generator disconnected active power avg',
    'sensor_45': 'Generator connected in delta active power avg',
    'sensor_46': 'Generator connected in star active power avg',
    'sensor_47': 'Generator disconnected reactive power avg',
    'sensor_48': 'Generator connected in delta reactive power avg',
    'sensor_49': 'Generator connected in star reactive power avg',
    'sensor_50': 'Total active power avg',
    'sensor_51': 'Total reactive power avg',
    'sensor_52_avg': 'Rotor rpm avg',
    'sensor_52_max': 'Rotor rpm max',
    'sensor_52_min': 'Rotor rpm min',
    'sensor_52_std': 'Rotor rpm std',
    'sensor_53_avg': 'Nose cone temperature avg',
}

# Apply the descriptive column names; the renamed frame is bound back to `data`.
data = data.rename(columns=rename_mapping)

# Parse the timestamp column into pandas datetime values.
data['time_stamp'] = pd.to_datetime(data['time_stamp'])

# Cast every column that is still object-typed to pandas' 'string' dtype.
object_columns = data.select_dtypes(include='object').columns
data = data.astype(dict.fromkeys(object_columns, 'string'))

In [None]:
# Handling missing values: discard every row that has a missing value in any
# column, then report the columns that still contain missing values (none are
# expected after the drop).
data = data.dropna(axis=0, how='any')
missing_data = data.isna().sum()
columns_with_missing = missing_data[missing_data.gt(0)]
print("Missing values: \n", columns_with_missing)

Missing values: 
 Series([], dtype: int64)


In [None]:
# Keep only the columns whose name contains none of 'min', 'max' or 'std'.
# NOTE(review): this is a plain substring check, so any column name that merely
# contains one of these letter sequences would be removed as well.
excluded_keywords = ('min', 'max', 'std')
filtered_columns = [
    col for col in data.columns
    if all(keyword not in col for keyword in excluded_keywords)
]

# From the remaining columns, also remove the event/id bookkeeping columns.
data_avg = data[filtered_columns].drop(
    columns=['event_id', 'id', 'event_label', 'event_description', 'train_test']
)

In [None]:
# Columns carried into the anomaly analysis: the timestamp, the two label/id
# columns and the selected sensor averages. Names must match the renamed
# columns exactly (including the double spaces in two of them).
unique_selected_features = [
    'time_stamp',
    'status_type_id',
    'asset_id',
    'Averaged current phase 1',
    'VCP-board temperature avg',
    'Possible Grid inductive reactive power avg',
    'Generator stator winding phase 1 temperature avg',
    'Busbar section temperature avg',
    'Generator bearing 2(Drive end)  temperature avg',
    'Wind relative direction avg',
    'VCS cooling water temperature avg',
    'Possible Grid active power avg',
    'Windspeed avg',
    'Averaged current phase 2',
    'Split ring chamber temperature avg',
    'Estimated windspeed avg',
    'IGBT-driver on grid side inverter temperature avg',
    'Averaged current phase 3',
    'Generator stator winding phase 3 temperature avg',
    'Generator bearing 1(Non-drive end) temperature avg',
    'Gearbox bearing on high speed shaft  temperature avg',
    'Actual Phase displacement avg',
    'Gearbox oil temperature avg',
    'Pitch angle avg',
    'Possible Grid capacitive reactive power avg',
    'Generator stator winding phase 2 temperature avg',
    'Ambient temperature avg',
    'Choke coils on the VCS- section temperature avg',
    'Generator rpm avg',
]

# Restrict the cleaned data to exactly those columns, in the order listed.
unique_data_avg = data[unique_selected_features]


In [None]:
# Remove the label and asset identifier so only the timestamp and sensor columns remain.
# NOTE(review): re-running this cell raises KeyError because the columns are already gone.
unique_data_avg = unique_data_avg.drop(['status_type_id', 'asset_id'], axis=1)

In [None]:
# Independent (deep) copy of unique_data_avg. It is taken before the scaling
# cell runs, so `dataset` keeps the unscaled sensor values.
dataset = unique_data_avg.copy(deep=True)

In [None]:
# Both frames should have identical shapes right after the copy.
print(dataset.shape, unique_data_avg.shape, sep='\n')

(1196727, 27)
(1196727, 27)


In [None]:
# Display the remaining column names (time_stamp plus the selected sensor averages).
unique_data_avg.columns

Index(['time_stamp', 'Averaged current phase 1', 'VCP-board temperature avg',
       'Possible Grid inductive reactive power avg',
       'Generator stator winding phase 1 temperature avg',
       'Busbar section temperature avg',
       'Generator bearing 2(Drive end)  temperature avg',
       'Wind relative direction avg', 'VCS cooling water temperature avg',
       'Possible Grid active power avg', 'Windspeed avg',
       'Averaged current phase 2', 'Split ring chamber temperature avg',
       'Estimated windspeed avg',
       'IGBT-driver on grid side inverter temperature avg',
       'Averaged current phase 3',
       'Generator stator winding phase 3 temperature avg',
       'Generator bearing 1(Non-drive end) temperature avg',
       'Gearbox bearing on high speed shaft  temperature avg',
       'Actual Phase displacement avg', 'Gearbox oil temperature avg',
       'Pitch angle avg', 'Possible Grid capacitive reactive power avg',
       'Generator stator winding phase 2 temperat

**SELECTING ONLY NUMERIC COLUMNS AND SCALING THEM (RobustScaler)**

In [None]:
# Numeric columns are the ones that get scaled; time_stamp is datetime and is left alone.
numerical_cols = unique_data_avg.select_dtypes(include='number').columns

# RobustScaler with default settings (fitted on the full frame).
scaler = RobustScaler()
scaled_values = scaler.fit_transform(unique_data_avg[numerical_cols])

# Write the scaled values back through .loc so the assignment targets the frame itself.
unique_data_avg.loc[:, numerical_cols] = scaled_values

In [None]:
# Scaling rewrites values in place, so neither shape should have changed.
print(dataset.shape, unique_data_avg.shape, sep='\n')

(1196727, 27)
(1196727, 27)


**Z-SCORE ANOMALY DETECTION**

In [None]:
# A row is a z-score anomaly when at least one numeric feature lies more than
# z_threshold standard deviations from that feature's mean.
z_threshold = 5

# Column-wise z-scores of the scaled numeric features.
z_scores = unique_data_avg[numerical_cols].apply(zscore)
exceeds_threshold = z_scores.abs().gt(z_threshold)

# Store the per-row flag on `dataset` (aligned on the shared row index).
dataset.loc[:, 'z_score_anomaly'] = exceeds_threshold.any(axis=1)

# Count the flagged rows.
num_anomalies = dataset['z_score_anomaly'].sum()
print(f"Number of anomalies detected: {num_anomalies}")

Number of anomalies detected: 10990


In [None]:
# `dataset` gained the z_score_anomaly column; unique_data_avg is unchanged.
print(dataset.shape, unique_data_avg.shape, sep='\n')

(1196727, 28)
(1196727, 27)


**ISOLATION FOREST ANOMALY DETECTION**

In [None]:
# Isolation Forest with a fixed seed; contamination=0.01 asks the model to
# label roughly 1% of the rows as outliers.
iso_forest = IsolationForest(contamination=0.01, random_state=42)

# fit_predict labels each row; the comparison below treats -1 as "anomaly".
if_anomalies = iso_forest.fit_predict(unique_data_avg[numerical_cols])
if_anomalies_bool = (if_anomalies == -1)

# Store the boolean flag on `dataset` (same row order as unique_data_avg).
dataset.loc[:, 'if_anomaly'] = if_anomalies_bool

# Count the rows flagged by the model.
num_if_anomalies = dataset['if_anomaly'].sum()
print(f"Number of anomalies detected using Isolation Forest: {num_if_anomalies}")

Number of anomalies detected using Isolation Forest: 11966


In [None]:
# `dataset` gained the if_anomaly column; unique_data_avg is unchanged.
print(dataset.shape, unique_data_avg.shape, sep='\n')

(1196727, 29)
(1196727, 27)


**TIME SERIES ANOMALY DETECTION**

In [None]:
# Rolling-window anomaly detection for every numeric feature.
#
# A reading is flagged when it lies more than `threshold` rolling standard
# deviations above or below the rolling mean of the last `window_size` rows
# (the current row included).
#
# `unique_data_avg` is only read here: it is not re-indexed and no rows are
# dropped, so it keeps the same rows, in the same order, as `dataset`.
# Rows sharing a timestamp are therefore kept rather than discarded.
# NOTE(review): windows follow row order; confirm the file is ordered by
# asset and time so that a window does not mix readings of different assets.

# Define the features to analyze (select all numerical columns)
features = unique_data_avg.select_dtypes(include='number').columns.tolist()

# Set the rolling window size (e.g., for 1 hour with 10-minute intervals)
window_size = 6  # Adjust as needed
# Threshold for anomaly detection, in rolling standard deviations.
# Because the current row is part of its own window, |x - mean| / std can never
# exceed (window_size - 1) / sqrt(window_size) (about 2.04 for 6 rows), so a
# threshold at or above that value would flag nothing.
threshold = 1.9

# Results are stored by position below, which is only valid when both frames
# hold the same rows in the same order.
if len(unique_data_avg) != len(dataset):
    raise ValueError(
        "unique_data_avg and dataset must have the same number of rows "
        f"({len(unique_data_avg)} != {len(dataset)})"
    )

rolling_columns = {}  # new `dataset` columns, keyed by column name
anomaly_flags = {}    # boolean flag array per feature
anomaly_counts = {}   # number of flagged rows per feature

# Loop through each feature to calculate rolling statistics and detect anomalies
for feature in features:
    values = unique_data_avg[feature]
    window = values.rolling(window=window_size)
    rolling_avg = window.mean()
    rolling_std = window.std()

    # Rows without a full window have NaN statistics; comparisons against NaN
    # are False, so those rows are never flagged.
    flagged = (
        (values > rolling_avg + threshold * rolling_std) |
        (values < rolling_avg - threshold * rolling_std)
    ).to_numpy()

    # Plain arrays (not Series) are stored on purpose: assigning a Series whose
    # index differs from dataset's would align on labels and yield NaN columns.
    # The rolling statistics are expressed in the scaled units of unique_data_avg.
    rolling_columns[f'{feature}_rolling_avg'] = rolling_avg.to_numpy()
    rolling_columns[f'{feature}_rolling_std'] = rolling_std.to_numpy()
    rolling_columns[f'anomaly_{feature}'] = flagged

    anomaly_flags[feature] = flagged
    anomaly_counts[feature] = int(flagged.sum())

# One boolean column per feature, indexed like `dataset`
anomalies = pd.DataFrame(anomaly_flags, index=dataset.index)

# Attach all new columns with a single concat. Columns left over from an earlier
# run of this cell are dropped first, so re-running does not duplicate them.
dataset = pd.concat(
    [
        dataset.drop(columns=list(rolling_columns), errors='ignore'),
        pd.DataFrame(rolling_columns, index=dataset.index),
    ],
    axis=1,
)

# Print the updated dataset shape
print("Updated dataset shape:", dataset.shape)

# Print the number of anomalies detected for each feature at the end
print("Number of anomalies detected for each feature:")
for feature, count in anomaly_counts.items():
    print(f"{feature}: {count}")


Updated dataset shape: (1196727, 107)
Number of anomalies detected for each feature:
Averaged current phase 1: 18925
VCP-board temperature avg: 26959
Possible Grid inductive reactive power avg: 25530
Generator stator winding phase 1 temperature avg: 16272
Busbar section temperature avg: 24759
Generator bearing 2(Drive end)  temperature avg: 22801
Wind relative direction avg: 5877
VCS cooling water temperature avg: 20870
Possible Grid active power avg: 16624
Windspeed avg: 5805
Averaged current phase 2: 18436
Split ring chamber temperature avg: 25649
Estimated windspeed avg: 5973
IGBT-driver on grid side inverter temperature avg: 19903
Averaged current phase 3: 18622
Generator stator winding phase 3 temperature avg: 16512
Generator bearing 1(Non-drive end) temperature avg: 20766
Gearbox bearing on high speed shaft  temperature avg: 17974
Actual Phase displacement avg: 16911
Gearbox oil temperature avg: 24280
Pitch angle avg: 18750
Possible Grid capacitive reactive power avg: 24805
Gener

In [None]:
# Rolling-window anomaly flags (0/1) per numeric feature, written into
# `dataset` by position.
# NOTE(review): the positional write at the end requires unique_data_avg to hold
# exactly as many rows as `dataset`, in the same order — verify before running.

# If the current index contains repeated labels, move it back into a regular
# column and continue with a fresh integer index.
# NOTE(review): when the moved index is numeric it becomes a numeric column and
# is then picked up as a feature below — confirm that is intended.
index_has_duplicates = not unique_data_avg.index.is_unique
if index_has_duplicates:
    unique_data_avg = unique_data_avg.reset_index(drop=False)

# Features to analyze: every numeric column
features = unique_data_avg.select_dtypes(include='number').columns.tolist()

# Rolling window size (e.g., for 1 hour with 10-minute intervals) and the
# threshold in rolling standard deviations
window_size = 6  # Adjust as needed
threshold = 1.9  # Threshold for anomaly detection

# Per-feature flags and per-feature counts
anomalies = pd.DataFrame(index=unique_data_avg.index)
anomaly_counts = {}

for feature in features:
    series = unique_data_avg[feature]
    roller = series.rolling(window=window_size, min_periods=1)

    # min_periods=1 yields a mean from the first row on; the standard deviation
    # of a single observation is NaN, so gaps are filled forward, then backward.
    rolling_avg = roller.mean().ffill().bfill()
    rolling_std = roller.std().ffill().bfill()

    upper_bound = rolling_avg + threshold * rolling_std
    lower_bound = rolling_avg - threshold * rolling_std

    # Flag readings outside the [lower_bound, upper_bound] band
    anomalies[feature] = (series > upper_bound) | (series < lower_bound)

    # Record how many rows were flagged for this feature
    num_anomalies = anomalies[feature].sum()
    anomaly_counts[feature] = num_anomalies

# Store the flags as integers (0 or 1)
anomalies = anomalies.fillna(0).astype(int)

# Copy the flags into `dataset` as raw arrays, bypassing index alignment
for feature in features:
    dataset[f'anomaly_{feature}'] = anomalies[feature].to_numpy()

# Print the updated dataset shape
print("Updated dataset shape:", dataset.shape)

# Print the number of anomalies detected for each feature at the end
print("Number of anomalies detected for each feature:")
for feature, count in anomaly_counts.items():
    print(f"{feature}: {count}")

Updated dataset shape: (1196727, 55)
Number of anomalies detected for each feature:
Averaged current phase 1: 55618
VCP-board temperature avg: 80716
Possible Grid inductive reactive power avg: 77169
Generator stator winding phase 1 temperature avg: 48795
Busbar section temperature avg: 73059
Generator bearing 2(Drive end)  temperature avg: 68167
Wind relative direction avg: 17799
VCS cooling water temperature avg: 60822
Possible Grid active power avg: 48874
Windspeed avg: 17463
Averaged current phase 2: 54361
Split ring chamber temperature avg: 76036
Estimated windspeed avg: 17815
IGBT-driver on grid side inverter temperature avg: 57859
Averaged current phase 3: 54948
Generator stator winding phase 3 temperature avg: 49564
Generator bearing 1(Non-drive end) temperature avg: 61313
Gearbox bearing on high speed shaft  temperature avg: 54454
Actual Phase displacement avg: 50834
Gearbox oil temperature avg: 73418
Pitch angle avg: 55485
Possible Grid capacitive reactive power avg: 74667
Gen

In [None]:
# `dataset` now carries the per-feature anomaly columns; compare with unique_data_avg.
print(dataset.shape, unique_data_avg.shape, sep='\n')

(1196727, 55)
(1196727, 27)


In [None]:
# Audit: make sure none of the anomaly flag columns contains missing values.

# Row-level detectors
anomaly_methods = ['z_score_anomaly', 'if_anomaly']
# Per-feature rolling-window flags (column names start with 'anomaly_')
time_series_anomalies = [col for col in dataset.columns if col.startswith('anomaly_')]

# Every anomaly column, detectors first
all_anomaly_columns = [*anomaly_methods, *time_series_anomalies]

# Missing-value count per anomaly column
nan_counts = dataset[all_anomaly_columns].isna().sum()

# Show only the columns that actually contain NaNs
print("Sum of NaN values for each anomaly column:")
print(nan_counts[nan_counts.gt(0)])


Sum of NaN values for each anomaly column:
Series([], dtype: int64)


In [None]:
# Agreement score per feature: for every rolling-window flag column, add the
# two row-level flags (z-score and Isolation Forest). The result ranges from
# 0 (no detector fired) to 3 (all three fired).

# Row-level detectors and per-feature rolling-window flag columns
anomaly_methods = ['z_score_anomaly', 'if_anomaly']
ts_prefix = 'anomaly_'
time_series_anomalies = [col for col in dataset.columns if col.startswith(ts_prefix)]

# Ensure all anomaly columns are binary (0/1)
for col in anomaly_methods + time_series_anomalies:
    dataset[col] = dataset[col].astype(int)

# The row-level part of the score is identical for every feature, so compute it once
row_level_votes = dataset['z_score_anomaly'] + dataset['if_anomaly']

# Create a dictionary to hold the new columns
new_columns = {}

for feature in time_series_anomalies:
    # Strip only the leading prefix; the rest of the name is kept untouched
    feature_name = feature[len(ts_prefix):]
    new_columns[f'{feature_name}_anomaly_comparison'] = dataset[feature] + row_level_votes

# Add all new columns in one operation. Comparison columns left over from an
# earlier run are dropped first; otherwise re-running this cell would append
# duplicate column names and dataset[col] would return a frame, not a Series.
dataset = pd.concat(
    [
        dataset.drop(columns=list(new_columns), errors='ignore'),
        pd.DataFrame(new_columns),
    ],
    axis=1,
)

# Print the outputs for the new anomaly comparison columns
for col in new_columns.keys():
    print(f"\nColumn: {col}")
    print(dataset[col].value_counts())


Column: Averaged current phase 1_anomaly_comparison
Averaged current phase 1_anomaly_comparison
0    1122187
1      70563
2       3920
3         57
Name: count, dtype: int64

Column: VCP-board temperature avg_anomaly_comparison
VCP-board temperature avg_anomaly_comparison
0    1096602
1      96679
2       3345
3        101
Name: count, dtype: int64

Column: Possible Grid inductive reactive power avg_anomaly_comparison
Possible Grid inductive reactive power avg_anomaly_comparison
0    1099768
1      93800
2       3152
3          7
Name: count, dtype: int64

Column: Generator stator winding phase 1 temperature avg_anomaly_comparison
Generator stator winding phase 1 temperature avg_anomaly_comparison
0    1128453
1      64840
2       3391
3         43
Name: count, dtype: int64

Column: Busbar section temperature avg_anomaly_comparison
Busbar section temperature avg_anomaly_comparison
0    1104419
1      88703
2       3503
3        102
Name: count, dtype: int64

Column: Generator bearing 

In [None]:
# Display every column of `dataset`: original features, detector flags and comparison scores.
dataset.columns

Index(['time_stamp', 'Averaged current phase 1', 'VCP-board temperature avg',
       'Possible Grid inductive reactive power avg',
       'Generator stator winding phase 1 temperature avg',
       'Busbar section temperature avg',
       'Generator bearing 2(Drive end)  temperature avg',
       'Wind relative direction avg', 'VCS cooling water temperature avg',
       'Possible Grid active power avg', 'Windspeed avg',
       'Averaged current phase 2', 'Split ring chamber temperature avg',
       'Estimated windspeed avg',
       'IGBT-driver on grid side inverter temperature avg',
       'Averaged current phase 3',
       'Generator stator winding phase 3 temperature avg',
       'Generator bearing 1(Non-drive end) temperature avg',
       'Gearbox bearing on high speed shaft  temperature avg',
       'Actual Phase displacement avg', 'Gearbox oil temperature avg',
       'Pitch angle avg', 'Possible Grid capacitive reactive power avg',
       'Generator stator winding phase 2 temperat

In [None]:
# Final (rows, columns) of the enriched dataset.
dataset.shape

(1196727, 81)

In [None]:
# (rows, columns) of the scaled feature frame, for comparison with `dataset`.
unique_data_avg.shape

(1196727, 27)