In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with an actionable message. Assigning os.getenv()'s None result
# back into os.environ would raise an opaque TypeError when the key is
# missing, and is a no-op when the key is present.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [None]:
from openai import OpenAI

# Build the OpenAI client with the API key read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
from openai import OpenAI

# Smoke test: send a single prompt to confirm the API key and model are usable.
# Reuses the `client` built in the previous cell instead of constructing an
# identical second instance.
response = client.chat.completions.create(
    model="gpt-5",
    messages=[{"role": "user", "content": "Hello GPT-5, are you working?"}],
)

print(response.choices[0].message.content)

Hi! Yes, I’m here and ready to help. What can I do for you today?


In [None]:
# Sample server log used throughout the notebook. The empty first and last
# entries reproduce the leading and trailing newline of the text block.
log_data = "\n".join(
    [
        "",
        "2025-09-18 02:10:21 INFO Starting server on port 8080",
        "2025-09-18 02:10:25 WARNING High memory usage detected: 85%",
        "2025-09-18 02:10:27 ERROR Database connection failed",
        "2025-09-18 02:10:31 INFO Retrying database connection",
        "2025-09-18 02:10:35 ERROR Database connection failed",
        "2025-09-18 02:10:40 CRITICAL Service unavailable due to repeated DB failures",
        "",
    ]
)

# Persist the sample so later cells can read it back like a real log file.
with open("system_logs.txt", "w") as log_file:
    log_file.write(log_data)

In [None]:
# Read the sample log file back into `logs`; later prompts embed this text.
with open("system_logs.txt") as log_file:
    logs = log_file.read()

# print() separates its arguments with one space, so the log text starts
# after a single space on the line below the header.
print("Raw logs:\n", logs)

Raw logs:
 
2025-09-18 02:10:21 INFO Starting server on port 8080
2025-09-18 02:10:25 WARNING High memory usage detected: 85%
2025-09-18 02:10:27 ERROR Database connection failed
2025-09-18 02:10:31 INFO Retrying database connection
2025-09-18 02:10:35 ERROR Database connection failed
2025-09-18 02:10:40 CRITICAL Service unavailable due to repeated DB failures



In [None]:
# Ask the model for a plain-language explanation of the text held in `logs`.
system_message = {
    "role": "system",
    "content": "You are an expert IT assistant who helps analyze server logs.",
}
user_message = {
    "role": "user",
    "content": f"Here are some system logs:\n{logs}\n\nPlease explain in simple terms what is happening.",
}

response = client.chat.completions.create(
    model="gpt-5",
    messages=[system_message, user_message],
)

explanation = response.choices[0].message.content
print(explanation)

- The server started up on port 8080.
- Right after starting, it reported high memory usage (85%).
- The application tried to connect to its database but failed.
- It attempted a retry, but the database connection failed again.
- After repeated failures, the service declared itself unavailable because it can’t operate without a working database connection.

In short: the service came up, was under memory pressure, couldn’t reach its database despite retrying, and shut itself off to avoid running in a broken state.


In [None]:
# Severity-classification prompt, spelled out line by line so every newline
# sent to the model is explicit. `logs` is interpolated on the final line.
classification_prompt = (
    "\n"
    "You are an expert Site Reliability Engineer.\n"
    "Read the following logs and classify the incident severity as:\n"
    "- P1 (Critical): service down or customer impact\n"
    "- P2 (High): service degraded, limited impact\n"
    "- P3 (Low): minor issue, warning only\n"
    "\n"
    "Explain rationale behind final classification.\n"
    "\n"
    "Logs:\n"
    f"{logs}\n"
)

classification_messages = [{"role": "user", "content": classification_prompt}]
response = client.chat.completions.create(
    messages=classification_messages,
    model="gpt-5",
)

classification_text = response.choices[0].message.content
print(classification_text)

Severity: P1 (Critical)

Rationale:
- The logs show repeated database connection failures without recovery.
- The final entry explicitly states "CRITICAL Service unavailable due to repeated DB failures," indicating the service is unavailable.
- Service unavailability implies customer-impacting outage, meeting P1 criteria.


In [None]:
# Prompt requesting a short plain-English summary plus root-cause hints for `logs`.
summary_prompt = f"""
You are an expert Incident Response Assistant.
Read the following logs and provide:
1. A short summary (2-3 sentences) in plain English.
2. Possible root cause hints (bullet points).

Logs:
{logs}
"""

summary_messages = [{"role": "user", "content": summary_prompt}]
response = client.chat.completions.create(
    messages=summary_messages,
    model="gpt-5",
)

incident_summary = response.choices[0].message.content
print(incident_summary)

1) Summary:
- The service started on port 8080, then immediately reported high memory usage (85%).
- Attempts to connect to the database failed twice in quick succession, leading to the service becoming unavailable due to repeated DB failures.

2) Possible root cause hints:
- Database endpoint down or unreachable (DB outage, maintenance, crash).
- Misconfigured DB connection settings or secrets (host/port, DNS name, credentials, SSL/TLS parameters).
- Network/DNS issues between app and DB (firewall/security group rules, routing, DNS resolution failure).
- DB connection exhaustion or limits reached (max_connections, pool misconfiguration, connection leak).
- High memory pressure on the app host causing resource starvation or OOM-affected operations.
- Recent deployment or config change impacting connectivity or authentication.
- TLS/certificate issues (expired certs, CA trust not configured, server now requiring SSL).
- Timeouts too aggressive for current latency, causing connection att

In [None]:
import smtplib
import os
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart


def send_email_alert(message: str) -> None:
    """Email ``message`` to the incident response address via Gmail SMTP.

    Configuration is read from the environment:

    * ``SENDER_EMAIL`` - account used to log in and as the ``From`` address.
    * ``RECEIVER_EMAIL`` - destination address.
    * ``EMAIL_APP_PASSWORD`` - password used for the SMTP login.

    Alerting is best-effort: any failure is reported on stdout with an
    ``[ERROR]`` prefix instead of being raised, and ``[EMAIL SENT]`` is
    printed only after the message has been handed to the server.

    Args:
        message: Plain-text body of the alert.
    """
    import ssl  # local import: only needed for the verified TLS context below

    sender_email = os.getenv("SENDER_EMAIL")
    receiver_email = os.getenv("RECEIVER_EMAIL")
    app_password = os.getenv("EMAIL_APP_PASSWORD")  # Use Google App Password

    # Report missing configuration up front instead of opening a connection
    # that is guaranteed to fail at login.
    settings = {
        "SENDER_EMAIL": sender_email,
        "RECEIVER_EMAIL": receiver_email,
        "EMAIL_APP_PASSWORD": app_password,
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        print(
            "[ERROR] Failed to send email: missing environment variable(s): "
            + ", ".join(missing)
        )
        return

    subject = "Incident Alert from GPT-5"

    # Build the email
    msg = MIMEMultipart()
    msg["From"] = sender_email
    msg["To"] = receiver_email
    msg["Subject"] = subject
    msg.attach(MIMEText(message, "plain"))

    # Send the email
    try:
        # The context manager closes the connection even when starttls/login/
        # sendmail raises; the timeout keeps an unreachable server from
        # hanging the notebook.
        with smtplib.SMTP("smtp.gmail.com", 587, timeout=30) as server:
            # Verify the server certificate before sending credentials.
            server.starttls(context=ssl.create_default_context())
            server.login(sender_email, app_password)
            server.sendmail(sender_email, receiver_email, msg.as_string())
    except Exception as e:
        print("[ERROR] Failed to send email:", e)
        return

    print("[EMAIL SENT] " + message)

In [None]:
# JSON schema for the single argument accepted by send_email_alert.
send_email_alert_parameters = {
    "type": "object",
    "properties": {"message": {"type": "string"}},
    "required": ["message"],
}

# Function description advertised to the model for tool calling.
send_email_alert_spec = {
    "name": "send_email_alert",
    "description": "Send an email alert to the incident response team",
    "parameters": send_email_alert_parameters,
}

# Tool list passed to chat.completions.create(tools=...).
tools = [{"type": "function", "function": send_email_alert_spec}]

In [None]:
# Short incident excerpt: two DB connection errors followed by a CRITICAL outage line.
incident_logs = (
    "\n"
    "2025-09-18 02:10:27 ERROR Database connection failed\n"
    "2025-09-18 02:10:35 ERROR Database connection failed\n"
    "2025-09-18 02:10:40 CRITICAL Service unavailable due to repeated DB failures\n"
)

monitor_messages = [
    {"role": "system", "content": "You are an AI that monitors incidents."},
    {"role": "user", "content": f"Analyze these logs:\n{incident_logs}"},
]

# tool_choice="auto" leaves it to the model whether to request send_email_alert.
response = client.chat.completions.create(
    model="gpt-5",
    messages=monitor_messages,
    tool_choice="auto",
    tools=tools,
)

# Print the whole response object (not just the message text).
print(response)

ChatCompletion(id='chatcmpl-CKr7HpNS3iBumO67Zast0R60xtvGU', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here’s a concise incident analysis based on the provided logs.\n\nSummary\n- The application experienced repeated database connection failures culminating in a critical service outage.\n- Timeframe: 2025-09-18 02:10:27 to 02:10:40 (13 seconds from first error to outage).\n- Impact: Service unavailable due to inability to reach the database; likely full outage for the affected service.\n\nTimeline\n- 02:10:27 ERROR: Database connection failed\n- 02:10:35 ERROR: Database connection failed\n- 02:10:40 CRITICAL: Service unavailable due to repeated DB failures\n\nSeverity\n- SEV-1 (customer-impacting outage)\n\nMost likely causes (ranked)\n1) Database unreachable (DB host down, failover in progress, maintenance, crash/restart)\n2) Network issues between app and DB (security group/firewall change, routing/DNS problem, TLS/cert issues

In [None]:
def escalation_workflow(severity: str, summary: str) -> str:
    """Run the escalation step that matches an incident severity.

    Args:
        severity: Severity label. "P1", "P2" and "P3" are recognised;
            surrounding whitespace and letter case are ignored so labels
            parsed from free-form model text (e.g. "p1" or "P1\\n") still
            match.
        summary: One-line incident description included in any email sent.

    Returns:
        A status string describing the action taken. P2 and P1 call
        ``send_email_alert``; P3 and unrecognised severities send nothing.
    """
    # Normalise only real strings; anything else falls through to UNKNOWN.
    level = severity.strip().upper() if isinstance(severity, str) else severity

    if level == "P3":
        return "[NO ACTION] Logged only."

    if level == "P2":
        # Email ops team
        send_email_alert(f"[P2 Incident] {summary}")
        return "[ACTION] Emailed ops team."

    if level == "P1":
        # Email ops team + escalate to manager
        send_email_alert(f"[P1 CRITICAL] {summary} | Escalating to manager.")
        # (Optional: call calendar tool to schedule war room)
        return "[ACTION] Emailed ops + manager. War room escalation triggered."

    return "[UNKNOWN] No matching workflow."


import re

# Sample incident used to drive the escalation workflow end to end.
incident_logs = """
2025-09-18 02:10:27 ERROR Database connection failed
2025-09-18 02:10:35 ERROR Database connection failed
2025-09-18 02:10:40 CRITICAL Service unavailable due to repeated DB failures
"""

classification_prompt = f"""
You are an expert incident classifier.
Classify the severity as P1, P2, or P3 and give a 1-line summary.

Logs:
{incident_logs}
"""

response = client.chat.completions.create(
    model="gpt-5", messages=[{"role": "user", "content": classification_prompt}]
)

# The message content may be None; normalise so the parsing below is safe.
ai_output = response.choices[0].message.content or ""
print("GPT-5 Output:", ai_output)

# Drive the workflow from what the model actually returned rather than from
# hard-coded demo values, so a P2/P3 classification is not escalated as P1.
# An unparseable reply maps to "UNKNOWN", which the workflow treats as no-op.
severity_match = re.search(r"\bP[1-3]\b", ai_output, flags=re.IGNORECASE)
severity = severity_match.group(0).upper() if severity_match else "UNKNOWN"

# Prefer an explicit "Summary: ..." line; otherwise fall back to the full reply.
summary_match = re.search(
    r"^\s*summary\s*:\s*(.+)$", ai_output, flags=re.IGNORECASE | re.MULTILINE
)
summary = summary_match.group(1).strip() if summary_match else ai_output.strip()

# Run workflow
result = escalation_workflow(severity, summary)
print(result)

GPT-5 Output: Severity: P1
Summary: Service outage caused by repeated database connection failures.
[EMAIL SENT] [P1 CRITICAL] Database unavailable due to repeated failures | Escalating to manager.
[ACTION] Emailed ops + manager. War room escalation triggered.


In [None]:
# Create a fake "large" log with repeating patterns
large_logs = (
    "\n".join(
        [
            f"2025-09-18 02:{i:02d}:00 ERROR Database connection failed"
            for i in range(100)
        ]
    )
    + "\n2025-09-18 03:00:00 CRITICAL Service unavailable"
)

with open("large_logs.txt", "w") as f:
    f.write(large_logs)

print("Log file size:", len(large_logs.splitlines()), "lines")

Log file size: 101 lines


In [None]:
# Load the generated file. Note that this rebinds `logs` (and `summary_prompt`
# below), replacing the values assigned by earlier cells.
with open("large_logs.txt") as large_log_file:
    logs = large_log_file.read()

summary_prompt = f"""
You are an expert incident responder.
Read the following logs and provide:
1. A 3-sentence summary in plain English.
2. The final incident severity (P1, P2, P3).
3. Root cause hints in bullet points.

Logs:
{logs}
"""

large_log_messages = [{"role": "user", "content": summary_prompt}]
response = client.chat.completions.create(
    messages=large_log_messages,
    model="gpt-5",  # large context model
)

large_log_report = response.choices[0].message.content
print(large_log_report)

1) Summary:
- Starting at 02:00, the application repeatedly failed to connect to the database for about an hour.
- At 03:00, the issue escalated to a full service outage (CRITICAL: Service unavailable), indicating customer-facing impact.
- The logs include malformed timestamps (02:60–02:99), suggesting a logging/clock or ingestion anomaly in addition to the database issue.

2) Final incident severity: P1

3) Root cause hints:
- Database unavailable: host/cluster outage, maintenance, failover stuck, or storage/CPU exhaustion on the DB.
- Network/access: firewall/security group change, route/DNS issue, VPC peering or load balancer change blocking DB access.
- Authentication/TLS: expired/rotated DB credentials or certificates not deployed to the app.
- Connection exhaustion: app-side connection leak or pool misconfiguration hitting max_connections from ~02:00 onward.
- Recent change: config/deployment at or just before 02:00 affecting DB endpoint, ports, or credentials.
- Observability an

In [None]:
import base64
from openai import OpenAI

# Client used for the image request below, built from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Image to analyze. NOTE(review): absolute path, presumably a Colab upload location; adjust when running elsewhere.
image_path = "/content/memory_graph.png"


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    # b64encode yields bytes; decode so the result can be embedded in a string.
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


import mimetypes

base64_image = encode_image(image_path)

# Build a well-formed data URL: the encoding token must be "base64", and the
# media type should match the file (image/png for a .png) rather than a fixed
# image/jpeg. Falls back to image/jpeg when the type cannot be guessed.
image_mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
image_data_url = f"data:{image_mime_type};base64,{base64_image}"

response = client.chat.completions.create(
    model="gpt-5",  # multimodal model
    messages=[
        {"role": "system", "content": "You are a monitoring assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this graph and explain what's happening.",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_data_url},
                },
            ],
        },
    ],
)


print(response.choices[0].message.content)

- Total RAM is ~14 GB and it’s essentially all in use the whole time, split between “used” (app memory) and “cached/buffer” (reclaimable file cache). That’s normal on Linux—free RAM gets filled with cache.

- Around 05:42 there’s a step-up of “used” memory by ~600–800 MB (likely a service start or workload ramp). The kernel trims a bit of cache and also evicts a small amount of cold pages to swap.

- Swap rises only to ~19 MB and then stays flat—this is tiny and not a sign of real memory pressure.

- Just before 06:00 the red swap line drops abruptly to 0. That usually means one of:
  - the swapped-out pages were touched and paged back in, or
  - swap was toggled (swapoff/swapon) or the workload ended, freeing those pages.
  The change is too small to noticeably move the stacked bars.

Bottom line: brief increase in working set caused a small amount of swapping; the system remained stable and not memory-starved. If you want to confirm what happened at ~06:00, check for a cron/systemd a