# LLM with RCA after Anomaly Detection Using RAG
## Project Overview
This project aims to perform root cause analysis (RCA) using a large language model (LLM) with dynamic Retrieve-and-Generate (RAG) based on AMF, SMF, and UPF telecom metrics. The workflow includes anomaly detection and correlation analysis across the datasets, leveraging system logs for RCA insights.

In [1]:
# Install required dependencies (if not already installed)
# %pip install -r requirements.txt

In [2]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader, TextLoader
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser, Document 
from langchain.text_splitter import CharacterTextSplitter
from datetime import datetime

# Load environment variables
load_dotenv()

False

In [3]:
# Function to load the LLM
def get_llm(model='gpt-4'):
    openai_api_key = os.getenv('OPENAI_API_KEY')
    if not openai_api_key:
        openai_api_key = input("Please enter your OpenAI API key: ")
        os.environ["OPENAI_API_KEY"] = openai_api_key
    return ChatOpenAI(temperature=0, model_name=model)

llm = get_llm()

In [4]:
# Load and preprocess telecom metrics (AMF, SMF, UPF)
def load_metrics(filenames):
    datasets = {}
    for name, file in filenames.items():
        df = pd.read_csv(file)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.sort_values('timestamp', inplace=True)
        datasets[name] = df
    return datasets

metric_files = {
    'amf': 'data/amf_metrics.csv',
    'smf': 'data/smf_metrics.csv',
    'upf': 'data/upf_metrics.csv'
}

metrics_data = load_metrics(metric_files)

In [5]:
# Detect anomalies using z-score
def detect_anomalies(datasets, threshold=3):
    anomalies = {}
    for name, df in datasets.items():
        numeric_data = df.drop(columns=['timestamp'])
        anomaly_flags = (np.abs((numeric_data - numeric_data.mean()) / numeric_data.std()) > threshold)
        anomalies[name] = pd.concat([df[['timestamp']], anomaly_flags], axis=1)
    return anomalies

anomalies = detect_anomalies(metrics_data)
print(anomalies['amf'].head())
print(anomalies['smf'].head())  
print(anomalies['upf'].head())

                   timestamp  http_connectivity  cpu_utilization  \
0 2025-01-13 10:23:31.133476              False            False   
1 2025-01-13 10:24:31.133476              False            False   
2 2025-01-13 10:25:31.133476              False            False   
3 2025-01-13 10:26:31.133476              False            False   
4 2025-01-13 10:27:31.133476              False            False   

   memory_utilization  registration_rate  session_setup_rate  \
0               False              False               False   
1               False              False               False   
2               False              False               False   
3               False              False               False   
4               False              False               False   

   authentication_success_rate  n1n2_message_rate  registration_success_rate  \
0                        False              False                      False   
1                        False              Fa

In [6]:
# Add a description column to each dataset
def add_description_column(datasets):
    """
    Add a description column to each dataset for embedding purposes.

    Args:
        datasets (dict): A dictionary of dataset names and DataFrames.

    Returns:
        dict: The updated dictionary with description columns added.
    """
    for name, df in datasets.items():
        df['description'] = df.apply(
            lambda row: f"{row['timestamp']} - " +
                        " | ".join([f"{col}: {row[col]}" for col in df.columns if col != 'timestamp']),
            axis=1
        )
    return datasets

# Add descriptions to datasets
metrics_data = add_description_column(metrics_data)

In [7]:
# Merge anomalies for correlation analysis
def correlate_anomalies(anomalies):
    combined = anomalies['amf']
    for name, df in anomalies.items():
        if name != 'amf':
            combined = pd.merge(combined, df, on='timestamp', suffixes=(f'_{name}_amf', f'_{name}'))
    combined['combined_anomaly'] = combined.drop(columns=['timestamp']).any(axis=1)
    return combined[combined['combined_anomaly']]

correlated_anomalies = correlate_anomalies(anomalies)
print(correlated_anomalies.head())

                     timestamp  http_connectivity  cpu_utilization_smf_amf  \
181 2025-01-13 13:24:31.133476               True                    False   
182 2025-01-13 13:25:31.133476               True                    False   
183 2025-01-13 13:26:31.133476               True                    False   

     memory_utilization_smf_amf  registration_rate  session_setup_rate  \
181                       False              False               False   
182                       False              False               False   
183                       False              False               False   

     authentication_success_rate  n1n2_message_rate  \
181                        False              False   
182                        False              False   
183                        False              False   

     registration_success_rate  slice_selection_success_rate  ...  \
181                      False                         False  ...   
182                      False 

In [8]:
# Process metrics for RAG
def process_metrics(datasets, column_to_embed):
    """
    Process multiple metrics datasets to create individual vector stores for RAG.

    Args:
        datasets (dict): A dictionary of dataset names and DataFrames.
        column_to_embed (str): The column containing text or descriptions to embed.

    Returns:
        dict: A dictionary of vector stores for each dataset.
    """
    vector_stores = {}

    for name, df in datasets.items():
        if column_to_embed not in df.columns:
            raise ValueError(f"Column '{column_to_embed}' not found in the {name} DataFrame.")

        # Convert the column to a list of Document objects
        documents = [Document(page_content=row[column_to_embed]) for _, row in df.iterrows()]

        # Split the documents into chunks (if applicable)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_documents(documents)

        # Embed the chunks
        embeddings = OpenAIEmbeddings()
        vector_stores[name] = FAISS.from_documents(texts, embeddings)

    return vector_stores

# Example usage
# Assume `metrics_data` is a dictionary containing the AMF, SMF, and UPF DataFrames.
# Each DataFrame should have a column 'description' with text to embed.
vector_stores = process_metrics(metrics_data, column_to_embed='description')

In [9]:
# Load alerts from JSON
def load_alerts(file_path):
    """
    Load alerts from a JSON file.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        dict: Parsed alert data.
    """
    with open(file_path, 'r') as f:
        return json.load(f)

# Load alerts from the `data/` folder
alerts = load_alerts('data/alerts.json')

In [10]:
# Analyze anomalies using LLM
def analyze_root_cause(llm, vector_stores, anomalies):
    """
    Analyze the root cause of anomalies using LLM and vector stores.

    Args:
        llm (ChatOpenAI): The LLM object for analysis.
        vector_stores (dict): A dictionary of vector stores for AMF, SMF, and UPF.
        anomalies (pd.DataFrame): The DataFrame containing correlated anomalies.

    Returns:
        str: The root cause analysis generated by the LLM.
    """
    prompt_template = """
    Analyze the following anomalies in the metrics and provide a root cause analysis based on the system logs:

    Anomalies:
    {anomalies}

    Relevant AMF metrics information:
    {amf_metrics}

    Relevant SMF metrics information:
    {smf_metrics}

    Relevant UPF metrics information:
    {upf_metrics}

    Provide a detailed root cause analysis.
    """
    
    prompt = PromptTemplate(
        input_variables=["anomalies", "amf_metrics", "smf_metrics", "upf_metrics"],
        template=prompt_template
    )

    # Function to fetch relevant metrics information from vector stores
    def fetch_relevant_metrics(vector_store, anomalies):
        results = []
        for anomaly in anomalies.to_dict(orient="records"):
            results.extend(vector_store.similarity_search(str(anomaly), k=2))
        return "\n".join([res.page_content for res in results])

    # Fetch relevant metrics information
    amf_metrics = fetch_relevant_metrics(vector_stores['amf'], anomalies)
    smf_metrics = fetch_relevant_metrics(vector_stores['smf'], anomalies)
    upf_metrics = fetch_relevant_metrics(vector_stores['upf'], anomalies)

    # Combine data into the chain
    chain = ({
        "anomalies": RunnablePassthrough(),
        "amf_metrics": lambda x: amf_metrics,
        "smf_metrics": lambda x: smf_metrics,
        "upf_metrics": lambda x: upf_metrics
    } | prompt | llm | StrOutputParser())

    return chain.invoke(str(anomalies))

# Example usage
root_cause_analysis = analyze_root_cause(llm, vector_stores, correlated_anomalies)
print(root_cause_analysis)


Based on the provided anomalies and system logs, the root cause of the anomalies seems to be related to the 'cpu_utilization_smf_amf' metric. This metric is showing as 'False' during the timestamps where anomalies are detected. 

The 'cpu_utilization_smf_amf' metric is likely indicating the CPU utilization of the Session Management Function (SMF) and Access and Mobility Management Function (AMF) components of the system. A 'False' value for this metric could indicate that the CPU utilization is exceeding a certain threshold or that there is an issue with the CPU utilization monitoring itself.

Looking at the relevant AMF and SMF metrics information, we can see that the CPU utilization values are fluctuating. For example, the CPU utilization for AMF at '2025-01-13 10:57:31.133476' is 37.088859726476, but at '2025-01-13 10:30:31.133476' it is 39.3344585640864. Similarly, for SMF, the CPU utilization at '2025-01-14 01:22:31.133476' is 35.751351937668645, but at '2025-01-14 03:06:31.133476

In [11]:
# Enhanced RCA with alerts
def analyze_with_alerts(llm, vector_stores, anomalies, alerts):
    """
    Enhance RCA with alerts information.

    Args:
        llm (ChatOpenAI): The LLM for analysis.
        vector_stores (dict): Vector stores for AMF, SMF, and UPF metrics.
        anomalies (pd.DataFrame): DataFrame containing anomalies.
        alerts (dict): Parsed alert data from alerts.json.

    Returns:
        str: Detailed RCA combining anomalies and alert information.
    """
    # Parse alerts
    alert_details = "\n".join([
        f"Type: {alert['type']}, Severity: {alert['severity']}, Component: {alert['component']}, "
        f"Start: {alert['start_time']}, End: {alert['end_time']}, Description: {alert['description']}"
        for alert in alerts['alerts']
    ])

    # Prepare prompt
    prompt_template = """
    Analyze the following anomalies in the metrics and provide a root cause analysis based on the system logs and alerts:

    Anomalies:
    {anomalies}

    Alerts:
    {alerts}

    Relevant AMF metrics information:
    {amf_metrics}

    Relevant SMF metrics information:
    {smf_metrics}

    Relevant UPF metrics information:
    {upf_metrics}

    Provide a detailed root cause analysis.
    """

    prompt = PromptTemplate(
        input_variables=["anomalies", "alerts", "amf_metrics", "smf_metrics", "upf_metrics"],
        template=prompt_template
    )

    # Fetch relevant metrics
    def fetch_metrics(vector_store, anomalies):
        results = []
        for anomaly in anomalies.to_dict(orient="records"):
            results.extend(vector_store.similarity_search(str(anomaly), k=2))
        return "\n".join([res.page_content for res in results])

    amf_metrics = fetch_metrics(vector_stores['amf'], anomalies)
    smf_metrics = fetch_metrics(vector_stores['smf'], anomalies)
    upf_metrics = fetch_metrics(vector_stores['upf'], anomalies)

    # Combine into chain
    chain = ({
        "anomalies": RunnablePassthrough(),
        "alerts": lambda x: alert_details,
        "amf_metrics": lambda x: amf_metrics,
        "smf_metrics": lambda x: smf_metrics,
        "upf_metrics": lambda x: upf_metrics
    } | prompt | llm | StrOutputParser())

    return chain.invoke(str(anomalies))

# Example Usage
rca_with_alerts = analyze_with_alerts(llm, vector_stores, correlated_anomalies, alerts)
print(rca_with_alerts)


Based on the provided system logs, alerts, and metrics, the anomalies seem to be related to the following issues:

1. SMF_CPU_OVERLOAD_CASCADE: This alert indicates that the Session Management Function (SMF) and Access and Mobility Management Function (AMF) components are experiencing a CPU overload. This is a critical issue as it can lead to performance degradation and service disruption. The root cause could be due to an increase in the number of sessions or a sudden surge in network traffic. The CPU utilization metrics for both SMF and AMF show high values, confirming the overload.

2. UPF_DATAPATH_DEGRADATION: This alert indicates that the User Plane Function (UPF) and SMF components are experiencing datapath degradation. This is a major issue as it can affect the data flow in the network. The root cause could be due to issues in the network infrastructure such as faulty hardware or network congestion. The packet_processing_rate and tunnel_establishment_rate metrics for UPF show hi