In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

# Settings: number of readings and their 10-minute sampling grid
n_samples = 1000
timestamp = pd.date_range(start='2025-04-19', periods=n_samples, freq='10min')

# Simulate normal sensor data.
# The global NumPy seed is fixed right before the draws, and the draws are
# kept in this exact order, so the generated values are reproducible.
np.random.seed(42)
pH = np.random.normal(loc=7.2, scale=0.05, size=n_samples)             # Stable around 7.2
temperature = np.random.normal(loc=37, scale=0.5, size=n_samples)       # Stable around 37°C
gas_flow = np.random.normal(loc=5.0, scale=0.3, size=n_samples)         # Around 5 L/min
ch4_percent = np.random.normal(loc=62, scale=2, size=n_samples)         # 62% CH4
feeding_rate = np.full(shape=n_samples, fill_value=200)                 # Constant feeding

# Create DataFrame: one row per timestamp, one column per simulated sensor
data = pd.DataFrame({
    'Timestamp': timestamp,
    'pH': pH,
    'Temperature': temperature,
    'GasFlowRate': gas_flow,
    'CH4_Percent': ch4_percent,
    'FeedingRate': feeding_rate
})

# Save the clean baseline. The path is handed to pandas directly instead of a
# text handle opened without newline='': pandas then controls line endings
# itself, which avoids blank rows between records on Windows.
data.to_csv('data.csv', index=False)

# Preview the frame; this has to be the last expression of the cell to render.
data.head()

In [69]:
# Inject synthetic faults into randomly chosen, distinct rows of `data`.
# The draws use NumPy's global RNG, so the chosen rows and magnitudes depend
# on the RNG state left by the cells executed before this one.
n_anomalies = 10
anomaly_indices = np.random.choice(n_samples, n_anomalies, replace=False)

# Partition the chosen rows between the three fault types. Every draw size
# below is taken from its slice, so changing n_anomalies cannot produce a
# length mismatch (the gas-flow group simply receives whatever is left).
ph_rows = anomaly_indices[:3]
temperature_rows = anomaly_indices[3:6]
gas_flow_rows = anomaly_indices[6:]

# Acidification event: pH drops by 0.8-1.2
data.loc[ph_rows, 'pH'] -= np.random.uniform(0.8, 1.2, size=len(ph_rows))

# Temperature shock: temperature drops by 5-8
data.loc[temperature_rows, 'Temperature'] -= np.random.uniform(5, 8, size=len(temperature_rows))

# Gas flow crash: flow rate drops by 2-3
data.loc[gas_flow_rows, 'GasFlowRate'] -= np.random.uniform(2, 3, size=len(gas_flow_rows))

# Tag the perturbed rows (1) against untouched rows (0)
data['is_anomaly'] = 0
data.loc[anomaly_indices, 'is_anomaly'] = 1



In [70]:
# First five rows, now including the is_anomaly tag column
data.head()

Unnamed: 0,Timestamp,pH,Temperature,GasFlowRate,CH4_Percent,FeedingRate,is_anomaly
0,2025-04-19 00:00:00,7.224836,37.699678,4.797447,58.184385,200,0
1,2025-04-19 00:10:00,7.193087,37.462317,4.956644,60.27923,200,0
2,2025-04-19 00:20:00,7.232384,37.029815,4.762274,61.172789,200,0
3,2025-04-19 00:30:00,7.276151,36.676532,4.907612,65.775375,200,0
4,2025-04-19 00:40:00,7.188292,37.349112,4.431916,63.113106,200,0


In [71]:
# Show the first five rows that were tagged as injected anomalies
data.loc[data['is_anomaly'].eq(1)].head()

Unnamed: 0,Timestamp,pH,Temperature,GasFlowRate,CH4_Percent,FeedingRate,is_anomaly
22,2025-04-19 03:40:00,7.203376,30.697395,5.408198,61.177147,200,1
337,2025-04-21 08:10:00,7.183931,36.848765,2.074181,61.702061,200,1
407,2025-04-21 19:50:00,6.080601,37.750667,5.24892,62.252356,200,1
431,2025-04-21 23:50:00,5.93396,37.407869,4.694101,64.692452,200,1
592,2025-04-23 02:40:00,7.223949,37.21828,2.225023,66.607278,200,1


In [72]:
# Remove the injected-anomaly tag column before modelling.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns='is_anomaly', errors='ignore')

In [73]:
# Preview the first rows to confirm the is_anomaly column is gone
data.head()

Unnamed: 0,Timestamp,pH,Temperature,GasFlowRate,CH4_Percent,FeedingRate
0,2025-04-19 00:00:00,7.224836,37.699678,4.797447,58.184385,200
1,2025-04-19 00:10:00,7.193087,37.462317,4.956644,60.27923,200
2,2025-04-19 00:20:00,7.232384,37.029815,4.762274,61.172789,200
3,2025-04-19 00:30:00,7.276151,36.676532,4.907612,65.775375,200
4,2025-04-19 00:40:00,7.188292,37.349112,4.431916,63.113106,200


In [74]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Isolation forest with 200 trees and a 10% contamination setting.
# NOTE(review): none of the later cells visible here use `model`; the forest
# that is actually fitted is `iso_forest`, and `scaler` is re-created before
# its fit is kept. Confirm whether this cell is still needed.
model = IsolationForest(
    contamination=0.1,
    n_estimators=200,
    max_samples=0.8,
    max_features=1.0,
    n_jobs=-1,
    random_state=42,
)

# Standardizer for the sensor columns
scaler = StandardScaler()

In [75]:
# Summarize column dtypes and non-null counts before scaling
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Timestamp    1000 non-null   datetime64[ns]
 1   pH           1000 non-null   float64       
 2   Temperature  1000 non-null   float64       
 3   GasFlowRate  1000 non-null   float64       
 4   CH4_Percent  1000 non-null   float64       
 5   FeedingRate  1000 non-null   int32         
dtypes: datetime64[ns](1), float64(4), int32(1)
memory usage: 43.1 KB


In [76]:
# Sensor columns to standardize (the Timestamp column is left out)
cols_to_scale_ = [
    'pH',
    'Temperature',
    'GasFlowRate',
    'CH4_Percent',
    'FeedingRate',
]

# Exploratory look at the standardized values: the result is only displayed,
# it is not assigned back to `data`.
scaler.fit_transform(data.loc[:, cols_to_scale_])


array([[ 0.3440343 ,  1.09916892, -0.56876765, -1.84010685,  0.        ],
       [-0.05903892,  0.71760007, -0.10047397, -0.81984254,  0.        ],
       [ 0.4398702 ,  0.02233317, -0.67223044, -0.38464745,  0.        ],
       ...,
       [ 0.43552468, -0.7344238 , -1.0687622 , -0.29375906,  0.        ],
       [-0.33384559, -0.15666509,  0.32196233,  1.61900145,  0.        ],
       [ 0.39219438, -0.6243293 , -0.10995209,  0.36953064,  0.        ]])

In [77]:
# `data` still holds the raw (unscaled) readings: the scaled array above was not stored
data.head()

Unnamed: 0,Timestamp,pH,Temperature,GasFlowRate,CH4_Percent,FeedingRate
0,2025-04-19 00:00:00,7.224836,37.699678,4.797447,58.184385,200
1,2025-04-19 00:10:00,7.193087,37.462317,4.956644,60.27923,200
2,2025-04-19 00:20:00,7.232384,37.029815,4.762274,61.172789,200
3,2025-04-19 00:30:00,7.276151,36.676532,4.907612,65.775375,200
4,2025-04-19 00:40:00,7.188292,37.349112,4.431916,63.113106,200


In [78]:
# Feature matrix for the detector: the five sensor columns, Timestamp excluded
X = data.loc[:, ['pH', 'Temperature', 'GasFlowRate', 'CH4_Percent', 'FeedingRate']]

In [79]:
# Fresh scaler: learn per-column mean/std from X, then standardize X with them
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [80]:
# Detector that produces the saved labels.
# contamination=0.01 means about 1% of rows are expected to be anomalies
# (the label counts shown further down are 10 out of 1000).
iso_forest = IsolationForest(
    contamination=0.01,
    n_estimators=100,
    max_samples='auto',
    max_features=1.0,
    n_jobs=-1,
    random_state=42,
)

# Fit on the standardized features; kept as the cell's last expression
iso_forest.fit(X_scaled)

In [81]:
# Label every row with the fitted forest: 1 = normal, -1 = anomaly
# (these are the two values remapped to 0/1 before saving)
predictions = iso_forest.predict(X_scaled)


In [82]:
# Wrap the label array in a one-column DataFrame so the labels can be tallied
dd=pd.DataFrame(predictions)

In [83]:
# Count how many rows received each label (1 vs -1)
dd.value_counts()

0 
 1    990
-1     10
Name: count, dtype: int64

In [84]:
# Store the detector output as a 0/1 flag: sklearn's -1 becomes 1 (anomaly),
# sklearn's 1 becomes 0 (normal).
data['Anomaly'] = pd.Series(predictions, index=data.index).map({1: 0, -1: 1})

# Write the readings together with the flag column and confirm on stdout
data.to_csv("sensor_data_with_anomalies.csv", index=False)
print(" File saved as 'sensor_data_with_anomalies.csv'")

 File saved as 'sensor_data_with_anomalies.csv'
