# LAB 4 - Anomaly Detection & NLP (Oil & Gas Operations)
**Lab:** 4 of 4  
**Last updated:** 2025-12-26

## Goal
A) Detect anomalies in sensor time-series using Isolation Forest  
B) Apply NLP to incident/maintenance reports (keywords + sentiment + optional classifier)

## Recommended datasets
- NAB (time series): https://github.com/numenta/NAB  
- Kaggle incident reports (text): https://www.kaggle.com/datasets (search: incident reports / safety reports / maintenance logs)

If the dataset files are not found locally, the notebook falls back to synthetic data, so every cell can still run offline.

## 1) Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest

## Part A - Time-series anomaly detection

## 2) Load time-series (NAB CSV or synthetic)

In [None]:
from pathlib import Path

# Location of the optional NAB CSV. The loading code below checks whether this
# file exists and switches to the synthetic generator when it does not.
NAB_PATH = Path("data/nab/realKnownCause/nyc_taxi.csv")  # example NAB file

def make_synthetic_timeseries(n=2000, seed=9):
    """Build a reproducible hourly sensor series with injected spikes.

    The signal is a sine wave (amplitude 5 around a level of 100, twelve full
    cycles across the series) plus Gaussian noise with sigma 1.5. Twelve
    distinct positions, each at least 200 samples away from both ends, are
    then shifted by a draw from N(18, 6) to act as anomalies.

    Args:
        n: Number of hourly samples. Must be at least 412 so that twelve
            distinct interior positions exist.
        seed: Seed for ``np.random.default_rng``; equal seeds give equal output.

    Returns:
        DataFrame with columns ``timestamp`` (hourly, starting 2024-01-01)
        and ``sensor_value``.

    Raises:
        ValueError: Raised by ``rng.choice`` when ``n`` is too small to pick
            twelve distinct interior positions.
    """
    rng = np.random.default_rng(seed)
    # Lower-case "h" is the hourly alias; upper-case "H" is deprecated in
    # pandas 2.2 and triggers a FutureWarning there.
    t = pd.date_range("2024-01-01", periods=n, freq="h")
    baseline = 100 + 5 * np.sin(np.linspace(0, 24 * np.pi, n)) + rng.normal(0, 1.5, n)
    # Keep the spikes away from the edges so each has context on both sides.
    anomalies_idx = rng.choice(np.arange(200, n - 200), size=12, replace=False)
    series = baseline.copy()
    series[anomalies_idx] += rng.normal(18, 6, size=len(anomalies_idx))
    return pd.DataFrame({"timestamp": t, "sensor_value": series})

# Use the real NAB file when it is present; otherwise generate data so the
# rest of the notebook still runs.
if not NAB_PATH.exists():
    ts = make_synthetic_timeseries()
    print("Using synthetic time-series. shape:", ts.shape)
else:
    ts = pd.read_csv(NAB_PATH)
    # Lower-case the headers so the column checks below ignore case.
    ts.columns = [name.lower() for name in ts.columns]

    # The two renames are applied one after the other on purpose: the second
    # check must see the result of the first.
    if "timestamp" not in ts.columns:
        ts = ts.rename(columns={ts.columns[0]:"timestamp"})
    has_value = "value" in ts.columns
    has_sensor_value = "sensor_value" in ts.columns
    if has_value and not has_sensor_value:
        ts = ts.rename(columns={"value":"sensor_value"})

    # Unparseable timestamps become NaT and are dropped with other gaps.
    ts["timestamp"] = pd.to_datetime(ts["timestamp"], errors="coerce")
    ts = ts.dropna(subset=["timestamp","sensor_value"])
    ts = ts.sort_values("timestamp")
    print("Loaded NAB:", NAB_PATH, "shape:", ts.shape)

ts.head()

## 3) Plot the time-series

In [None]:
# Overview of the raw signal before any feature engineering.
fig, ax = plt.subplots(figsize=(12,3))
ax.plot(ts["timestamp"], ts["sensor_value"])
ax.set_title("Sensor time-series")
ax.set_xlabel("timestamp")
ax.set_ylabel("sensor_value")
plt.show()

## 4) Create rolling features

In [None]:
# Rolling statistics over a 24-sample window (at least 6 samples required)
# describe the local level and spread around each reading.
ts_feat = ts.copy()
window = ts_feat["sensor_value"].rolling(24, min_periods=6)
ts_feat["rolling_mean"] = window.mean()
ts_feat["rolling_std"] = window.std()

deviation = ts_feat["sensor_value"] - ts_feat["rolling_mean"]
spread = ts_feat["rolling_std"]
# A perfectly flat window has zero spread; dividing by it would give inf/NaN
# instead of a usable score. Such rows are scored 0 (no deviation). Rows whose
# spread is still NaN (warm-up) stay NaN here and are filled below.
ts_feat["z_proxy"] = (deviation / spread.where(spread > 0)).mask(spread == 0, 0.0)

# The first rows have fewer than min_periods samples and are NaN: copy the
# first valid values backwards, then forward-fill anything left over.
# DataFrame.bfill()/ffill() replace fillna(method=...), deprecated in pandas 2.1.
ts_feat = ts_feat.bfill().ffill()
ts_feat.head()

## 5) Isolation Forest anomaly detection

In [None]:
# Model input: the raw reading plus the three rolling features.
feature_columns = ["sensor_value","rolling_mean","rolling_std","z_proxy"]
X = ts_feat[feature_columns]

# fit() returns the estimator itself, so `iso` is the fitted model.
iso = IsolationForest(n_estimators=300, contamination=0.01, random_state=42).fit(X)

# Rows for which predict() returns -1 are flagged as 1; all others as 0.
outlier_mask = iso.predict(X) == -1
ts_feat["is_anomaly"] = outlier_mask.astype(int)
ts_feat["is_anomaly"].value_counts()

## 6) Plot detected anomalies

In [None]:
# Draw the full signal, then overlay the rows flagged as anomalies.
fig, ax = plt.subplots(figsize=(12,3))
ax.plot(ts_feat["timestamp"], ts_feat["sensor_value"], label="sensor_value")
anoms = ts_feat.loc[ts_feat["is_anomaly"] == 1]
ax.scatter(anoms["timestamp"], anoms["sensor_value"], s=25, label="anomaly")
ax.set_title("Detected anomalies")
ax.legend()
plt.show()

## Part B - NLP on incident reports

## 7) Load incident text (CSV or synthetic)

In [None]:
from pathlib import Path
import re
from collections import Counter

# Optional CSV of incident reports. Later cells read its 'report_text' column
# and also group and classify by a 'severity' column, so both must be present.
INCIDENT_PATH = Path("data/incidents.csv")  # must contain 'report_text' and 'severity'

def make_synthetic_incidents(n=250, seed=11):
    """Generate a reproducible table of made-up incident reports.

    Each row combines a templated sentence, with a random unit number from
    1 to 24 substituted for ``{v}``, and a severity label. The label is drawn
    independently of the text with weights LOW 0.6, MEDIUM 0.3, HIGH 0.1.

    Args:
        n: Number of reports to generate.
        seed: Seed for ``np.random.default_rng``; equal seeds give equal output.

    Returns:
        DataFrame with columns ``incident_id`` (1..n), ``severity`` and
        ``report_text``.
    """
    rng = np.random.default_rng(seed)
    severities = ["LOW","MEDIUM","HIGH"]
    severity_weights = [0.6,0.3,0.1]
    templates = [
        "Pressure spike detected near valve station {v}. Operator performed controlled shutdown.",
        "Minor leak suspected at joint {v}. Maintenance team dispatched. No injuries reported.",
        "Routine inspection completed at station {v}. No abnormalities detected.",
        "High vibration recorded on pump {v}. Recommended bearing replacement.",
        "Unexpected flow drop recorded. Potential blockage near segment {v}.",
        "Emergency stop triggered due to sensor fault at {v}. System restored after recalibration."
    ]

    def draw_report(incident_id):
        # The draw order (unit number, template, severity) determines the
        # random stream, so it must not be rearranged.
        unit = rng.integers(1, 25)
        sentence = rng.choice(templates).format(v=unit)
        level = rng.choice(severities, p=severity_weights)
        return {"incident_id": incident_id, "severity": level, "report_text": sentence}

    return pd.DataFrame([draw_report(number) for number in range(1, n + 1)])

# Use the incident CSV when it is present; otherwise generate reports so the
# NLP cells still have data to work on.
if not INCIDENT_PATH.exists():
    incidents = make_synthetic_incidents()
    print("Using synthetic incidents. shape:", incidents.shape)
else:
    incidents = pd.read_csv(INCIDENT_PATH)
    print("Loaded:", INCIDENT_PATH, "shape:", incidents.shape)

def basic_clean(text):
    """Normalise a report into lower-case, space-separated alphanumeric text.

    The value is converted with ``str``, lower-cased, every character other
    than ``a-z``, ``0-9`` or whitespace is replaced by a space (so punctuation
    and non-ASCII letters are removed), and runs of whitespace are collapsed.

    Args:
        text: Raw report text. Non-string values are stringified first.

    Returns:
        The cleaned string. ``None`` and float NaN (e.g. an empty cell in a
        loaded CSV) yield ``""`` rather than the literal words "none"/"nan",
        which would otherwise be counted as real keywords.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    spaced = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", spaced).strip()

# Keep the raw report and store the normalised version next to it.
incidents["clean_text"] = incidents["report_text"].map(basic_clean)
incidents.head()

## 8) Keyword frequency

In [None]:
# Flatten every cleaned report into one token list, then count occurrences.
words = [token for report in incidents["clean_text"] for token in report.split()]
freq = Counter(words)
top_words = freq.most_common(20)
pd.DataFrame(top_words, columns=["word","count"])

## 9) Simple sentiment scoring (offline baseline)

In [None]:
# Hand-picked cue words for the rule-based baseline. They are matched against
# whitespace-separated tokens exactly as written (lower-case).
positive = {"routine","completed","restored","no","normal","controlled"}
negative = {"leak","fault","emergency","unexpected","spike","drop","blockage","vibration"}

def simple_sentiment(text):
    """Score ``text`` as positive cue-word hits minus negative cue-word hits.

    Args:
        text: Already-cleaned report text; it is split on whitespace and each
            token is compared to the ``positive`` and ``negative`` sets.

    Returns:
        Integer score: above zero means more positive cues, below zero more
        negative cues, zero for a tie or no matches (including empty text).
    """
    # Each token contributes +1, -1 or 0; a bool minus a bool is an int.
    return sum((token in positive) - (token in negative) for token in text.split())

# Score every cleaned report, then summarise the scores per severity level.
incidents["sentiment_score"] = incidents["clean_text"].map(simple_sentiment)
scores_by_severity = incidents.groupby("severity")["sentiment_score"]
scores_by_severity.agg(["count","mean","min","max"])

## 10) Optional: TF-IDF + Logistic Regression classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Predict the severity label from the cleaned report text.
X = incidents["clean_text"]
y = incidents["severity"]

# A stratified split needs at least two rows per class. When a severity level
# is rarer than that (possible with a user-supplied CSV), fall back to a plain
# random split instead of failing.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=stratify_labels
)

# Unigram and bigram TF-IDF features (terms seen in fewer than two training
# reports are dropped) feeding a logistic regression.
clf = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2)),
    ("lr", LogisticRegression(max_iter=200))
])

clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# zero_division=0 reports 0 for a class that is never predicted, without the
# undefined-metric warning; the reported numbers are unchanged.
print(classification_report(y_test, pred, zero_division=0))

## Wrap-up / Operational value
- Detect abnormal behavior earlier (sensor anomalies)
- Prioritize events using text analytics (incident reports)
- Combine structured + unstructured data for better decisions

## Checkpoint questions
1) Why add rolling features for anomaly detection?  
2) How would you tune contamination for a real operations environment?  
3) What are limitations of rule-based sentiment approaches?