# Day 26: ChatOps-Driven Automated Remediation

Demo: Provide log file(s), predict remediation, execute the command dynamically, and generate a Slack-ready summary using Ollama Llama 3.

In [5]:
# Import libraries
import subprocess
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
import os
from slack_sdk import WebClient

# Labelled examples for the remediation classifier. Each entry keeps a log
# template next to the remediation action expected for it; the two parallel
# lists consumed by the training code are derived from these pairs.
_TRAINING_EXAMPLES = (
    ("Pod crashloop detected in deployment <target>", "restart pods"),
    ("High memory usage in service <target>", "scale pods"),
    ("Node not ready: <target>", "cordon node"),
    ("Disk full warning on <target>", "cleanup disk"),
    ("Network latency spike in <target> pods", "restart network pods"),
    ("Pod <target> in namespace <ns> is not healthy", "restart pods"),
    ("Pod event: Reason: ImagePullBackOff", "restart pods"),
    ("Pod event: Warning Failed to pull image", "restart pods"),
    ("Pod event: Warning Error: ErrImagePull", "restart pods"),
    ("Pod event: Warning Error: ImagePullBackOff", "restart pods"),
    ("Deployment <target> in namespace <ns> has unavailable replicas", "scale pods"),
    ("Node <target> is not ready", "cordon node"),
    ("Service <target> in namespace <ns> has no endpoints", "restart network pods"),
)
training_logs = [log for log, _ in _TRAINING_EXAMPLES]
remediation_labels = [label for _, label in _TRAINING_EXAMPLES]

# Fit the text classifier used by predict_remediation: TF-IDF features are
# learned from the training logs and a logistic-regression model is fitted
# on them against the remediation labels.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(training_logs)
clf = LogisticRegression().fit(X_train, remediation_labels)

# Predict remediation from log text (regex rules first, classifier second)
def predict_remediation(log_text):
    """Return the remediation action for a log line.

    Known failure signatures are matched first with case-insensitive regex
    rules, checked in the order listed; the first rule that matches decides
    the action. Only when no rule matches is the log line vectorised and
    handed to the trained classifier.

    Args:
        log_text: The log line to classify.

    Returns:
        The remediation action label as a string.
    """
    rules = (
        ("fix image", r'ImagePullBackOff|ErrImagePull|Failed to pull image|invalid image|not found'),
        ("restart pods", r'crashloop|CrashLoopBackOff'),
        ("restart pods", r'unhealthy|not healthy'),
        ("scale pods", r'unavailable replicas'),
        ("scale pods", r'High memory usage|OOMKilled'),
        ("cordon node", r'Node not ready|NotReady'),
        ("cleanup disk", r'Disk full warning|No space left'),
        ("restart network pods", r'Network latency spike|NetworkUnavailable'),
        ("restart network pods", r'no endpoints'),
    )
    rule_hit = next(
        (label for label, regex in rules if re.search(regex, log_text, re.IGNORECASE)),
        None,
    )
    if rule_hit is not None:
        return rule_hit
    # No rule applied: let the classifier decide.
    features = vectorizer.transform([log_text])
    return clf.predict(features)[0]

# Extract target from log text using simple pattern matching
def extract_target(log_text, remediation):
    """Extract the name of the resource a log line refers to.

    Patterns are tried in priority order and the first match wins:
    an ``app=<name>`` label, ``deployment <name>``, ``Pod <name>``,
    ``service <name>`` and finally ``node <name>`` / ``node-<name>``.
    The keywords are matched case-sensitively.

    Args:
        log_text: The log line to inspect.
        remediation: The predicted remediation action. It is not used by
            the extraction and is accepted only so existing callers keep
            working.

    Returns:
        The extracted name as a string, or None when no pattern matches.
    """
    # Extract app label if present (from log line: app=frontend-app)
    app_match = re.search(r'app=([\w-]+)', log_text)
    if app_match:
        return app_match.group(1)
    # Try deployment name
    deploy_match = re.search(r'deployment ([\w-]+)', log_text)
    if deploy_match:
        return deploy_match.group(1)
    # Try pod name and map it back to the owning workload
    pod_match = re.search(r'Pod ([\w-]+)', log_text)
    if pod_match:
        pod_name = pod_match.group(1)
        # Pods created by a Deployment are named
        # <deployment>-<replicaset hash>-<pod suffix>. Removing only the last
        # segment would leave the ReplicaSet name, which is not a valid
        # deployment/<name> target, so both generated segments are removed.
        # The character class is the alphabet Kubernetes draws generated
        # name suffixes from (no vowels, no 0/1/3), which keeps ordinary
        # words such as "worker" from being mistaken for a hash.
        generated = r'[bcdfghjklmnpqrstvwxz2456789]'
        owner, replaced = re.subn(
            rf'-{generated}{{5,10}}-{generated}{{5}}$', '', pod_name
        )
        if replaced:
            return owner
        # Otherwise keep the previous behaviour: drop one trailing segment.
        return re.sub(r'-[a-z0-9]+$', '', pod_name)
    svc_match = re.search(r'service ([\w-]+)', log_text)
    if svc_match:
        return svc_match.group(1)
    node_match = re.search(r'node[- ]([\w-]+)', log_text)
    if node_match:
        return node_match.group(1)
    return None

# Map remediation actions to shell commands dynamically
def run_remediation(action, target, *, replicas=5, timeout=60):
    """Run the command that implements a remediation action.

    Args:
        action: Remediation label. Supported values are "fix image",
            "restart pods", "restart network pods", "scale pods",
            "cordon node" and "cleanup disk".
        target: Name of the resource to act on. It is derived from log
            text, so it is treated as untrusted input.
        replicas: Replica count used by "scale pods".
        timeout: Seconds to wait for the command before giving up.

    Returns:
        A string: the command's stripped stdout on success, its stripped
        stderr on a non-zero exit code, a manual-intervention hint for
        "fix image", an "Error executing remediation: ..." message when the
        command cannot be run or times out, or
        "Unknown remediation action or target." when the action is not
        supported or the target is missing or unsafe.
    """
    # A target beginning with "-" would be parsed by kubectl as a flag
    # (e.g. `kubectl cordon --all`), so it is refused like a missing target.
    if not target or target.startswith("-"):
        return "Unknown remediation action or target."

    if action == "fix image":
        # Suggest the correct image fix (manual intervention required)
        return f"Manual intervention required: Update deployment '{target}' to use a valid image instead of 'doesnotexist:latest'. Example: kubectl set image deployment/{target} <container>=<valid-image>"
    if action in ("restart pods", "restart network pods"):
        cmd = ["kubectl", "rollout", "restart", f"deployment/{target}"]
    elif action == "scale pods":
        cmd = ["kubectl", "scale", f"deployment/{target}", f"--replicas={replicas}"]
    elif action == "cordon node":
        cmd = ["kubectl", "cordon", target]
    elif action == "cleanup disk":
        # Dry run: echo prints the cleanup command instead of executing it.
        cmd = ["echo", f"rm -rf /var/lib/{target}/tmp/*"]
    else:
        return "Unknown remediation action or target."

    try:
        # The timeout keeps a hung command from blocking the whole workflow.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        return f"Error executing remediation: {e}"
    return result.stdout.strip() if result.returncode == 0 else result.stderr.strip()

# LLM confirmation log using Ollama Llama 3
def llm_confirmation_log(log_text, action, target, remediation_output, *, timeout=120):
    """Ask the local Ollama server for a Slack-ready incident summary.

    The function posts a prompt built from the log line, the remediation
    and its output to ``http://localhost:11434/api/generate`` using the
    ``llama3`` model with streaming disabled.

    Args:
        log_text: The log line that triggered the remediation.
        action: The remediation action that was chosen.
        target: The resource the action was applied to.
        remediation_output: Output returned by the remediation step.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The "response" field of the JSON reply, or
        "No response from Llama 3." when that field is absent. If the
        request fails, times out, returns an HTTP error status or a body
        that is not valid JSON, a plain-text summary assembled from the
        arguments is returned instead, so that the remediation that was
        already carried out can still be reported.
    """
    prompt = (
        f"You are a DevOps assistant. Given the following log and remediation, summarise for Slack:\n"
        f"Log: {log_text}\n"
        f"Remediation: {action} on {target}\n"
        f"Output: {remediation_output}\n"
        f"Explain what was detected, what was done, and the result."
    )
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "llama3",
                "prompt": prompt,
                "stream": False
            },
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json().get("response", "No response from Llama 3.")
    except (requests.RequestException, ValueError) as exc:
        return (
            f"LLM summary unavailable ({exc}).\n"
            f"Log: {log_text}\n"
            f"Remediation: {action} on {target}\n"
            f"Output: {remediation_output}"
        )

# Slack settings are read from environment variables; load_dotenv() is
# called first so that values from a .env file can be picked up as well.
from dotenv import load_dotenv

load_dotenv()
SLACK_BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
# Falls back to "general" when SLACK_CHANNEL_ID is not set.
SLACK_CHANNEL_ID = os.environ.get("SLACK_CHANNEL_ID", "general")
slack_client = WebClient(token=SLACK_BOT_TOKEN)

def send_slack_notification(message):
    """Post ``message`` to the configured Slack channel.

    Delivery is best-effort: any failure is printed and not re-raised.

    Args:
        message: Text to post.
    """
    try:
        slack_client.chat_postMessage(channel=SLACK_CHANNEL_ID, text=message)
    except Exception as err:
        print(f"[Slack] Failed to send notification: {err}")
    else:
        print(f"[Slack] Notification sent to channel {SLACK_CHANNEL_ID}")

# Demo: Provide log file(s) and run full workflow
def process_log_file(log_file_path):
    """Run the remediation workflow for every entry in a log file.

    Each non-empty line that does not start with "---" is processed in
    turn: a remediation is predicted, a target is extracted, the
    remediation is run and an LLM summary is requested. Progress is printed
    with a "[Backend]" prefix. After all lines are processed the summaries
    are joined with blank lines and sent as one Slack notification; nothing
    is sent when no line produced a summary.

    Args:
        log_file_path: Path of the log file to read.
    """
    # Decode explicitly so the result does not depend on the platform's
    # locale; undecodable bytes are replaced rather than aborting the run.
    with open(log_file_path, "r", encoding="utf-8", errors="replace") as f:
        log_lines = f.readlines()
    summary_messages = []
    for raw_line in log_lines:
        log_text = raw_line.strip()
        # Skip blank lines and "---" separator lines.
        if not log_text or log_text.startswith("---"):
            continue
        print(f"[Backend] Read log: {log_text}")
        action = predict_remediation(log_text)
        print(f"[Backend] Predicted remediation: {action}")
        target = extract_target(log_text, action)
        print(f"[Backend] Extracted target: {target}")
        output = run_remediation(action, target)
        print(f"[Backend] Remediation command output: {output}")
        confirmation = llm_confirmation_log(log_text, action, target, output)
        # Printed once; the summary was previously echoed a second time.
        print(f"[Backend] LLM confirmation log: {confirmation}")
        summary_messages.append(confirmation)
    # Send Slack notification with the summary of all actions
    if summary_messages:
        slack_message = "\n\n".join(summary_messages)
        send_slack_notification(slack_message)

# Example usage: run the full workflow against a log file.
# Replace 'incident.log' with the path to your own log file.
process_log_file('incident.log')

[Backend] Read log: Pod frontend-app-57f45d48b7-tv28q in namespace default (app=frontend-app) has error:       Reason:       ImagePullBackOff
[Backend] Predicted remediation: fix image
[Backend] Extracted target: frontend-app
[Backend] Remediation command output: Manual intervention required: Update deployment 'frontend-app' to use a valid image instead of 'doesnotexist:latest'. Example: kubectl set image deployment/frontend-app <container>=<valid-image>
[Backend] LLM confirmation log: Here's a summary for Slack:

**Issue Alert**

* **Detection:** The frontend-app pod in namespace `default` experienced an "ImagePullBackOff" error.
* **Remediation:** We need to update the deployment 'frontend-app' to use a valid image instead of `doesnotexist:latest`.
* **Action Required:** Manual intervention is needed to fix this issue. To resolve, we'll need to run the command: `kubectl set image deployment/frontend-app <container>=<valid-image>`
* **Result:** The pod will be updated to use the new v