In [1]:
import hashlib
import json
import os
import pickle

import numpy as np
import pandas as pd
import torch
from colorama import Style, Fore, Back

import warnings
from sklearn.exceptions import InconsistentVersionWarning
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import logging
# Module-level logger named after this module. The cells visible here log
# through the root-level `logging.info` / `logging.warning` helpers instead.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level. The format string wraps
# "LEVEL:timestamp" in bright green and the message in blue using colorama
# escape codes, resetting the style after each part.
logging.basicConfig(
    format=f'{Style.BRIGHT}{Fore.GREEN}%(levelname)s:%(asctime)s{Style.RESET_ALL} {Fore.BLUE}%(message)s{Style.RESET_ALL}', 
    level=logging.INFO
)

In [36]:
from langchain_community.document_loaders  import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma 
from langchain_huggingface import HuggingFaceEmbeddings 
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from transformers import pipeline
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.output_parsers import StrOutputParser

In [15]:
# STEP 1 ~ Read the raw log file into LangChain documents and preview it.
loader = TextLoader("5G_logs_v2.txt")
docs = loader.load()
logging.info(f'Size of docx: {len(docs)}')
logging.info(f'\n{docs[0].page_content}')

# STEP 2 ~ Cut the documents into overlapping chunks and preview the first one.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # upper bound on characters per chunk
    chunk_overlap=200,     # characters shared by two consecutive chunks
    add_start_index=True,  # record each chunk's start offset in the original document
)
all_splits = text_splitter.split_documents(docs)
logging.info(f'Size of splits: {len(all_splits)}')
logging.info(f'\n{all_splits[0].page_content}')

[1m[32mINFO:2025-01-11 06:04:18,794[0m [34mSize of docx: 1[0m
[1m[32mINFO:2025-01-11 06:04:18,800[0m [34m
[DEBUG] Everything is working properly.
[INFO] Is there a reason for this happening.
[ERROR] Very dangerous.[0m
[1m[32mINFO:2025-01-11 06:04:18,803[0m [34mSize of splits: 1[0m
[1m[32mINFO:2025-01-11 06:04:18,804[0m [34m
[DEBUG] Everything is working properly.
[INFO] Is there a reason for this happening.
[ERROR] Very dangerous.[0m


In [16]:
# Embedding model used to index the log chunks: runs on CPU and returns
# normalised vectors.
model_embed_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_embed_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Build the Chroma store only when its directory is missing; an existing
# directory is opened as-is and `all_splits` is not re-indexed into it.
vector_db_dir = "./5Gdb"
vectorstore = None
if not os.path.exists(vector_db_dir):
    logging.warning('Creating vector-store from scratch.')
    vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=hf,
        persist_directory=vector_db_dir,
    )
else:
    vectorstore = Chroma(
        embedding_function=hf,
        persist_directory=vector_db_dir,
    )
logging.info(vectorstore)

[1m[32mINFO:2025-01-11 06:07:18,727[0m [34mLoad pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2[0m
[1m[32mINFO:2025-01-11 06:08:49,599[0m [34mAnonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.[0m
[1m[32mINFO:2025-01-11 06:08:50,930[0m [34m<langchain_chroma.vectorstores.Chroma object at 0x7f964a7d4470>[0m


In [19]:
# Sanity check: ask the store for the single most similar chunk to a sample
# query and log its text.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)
retrieved_docs = retriever.invoke("Display all [DEBUG] logs.")
logging.info(f"\n{retrieved_docs[0].page_content}")

[1m[32mINFO:2025-01-11 06:12:17,110[0m [34m
[DEBUG] Everything is working properly.
[INFO] Is there a reason for this happening.
[ERROR] Very dangerous.[0m


In [30]:
# Grading prompt: the model is asked to label one log line as BAD, NEUTRAL or
# GOOD. The template text is kept verbatim so the text sent to the model does
# not change.
template0 = """You are an AI assistant you job is to grade logs into three categories,
BAD, NEUTRAL, GOOD.
LOG --> {log}
GRADE -->"""
# `input_variables` is the PromptTemplate field that names the template's
# placeholders. The previous `input=` keyword is not that field, so the
# variable list was never actually declared by this call.
prompt = PromptTemplate(input_variables = ["log"], template = template0)

In [31]:
def QA(query):
    """Wrap a raw log line in the mapping expected by the grading prompt.

    Retrieval through `retriever` is currently bypassed: the query itself is
    used as the log text rather than the joined content of retrieved chunks.

    Args:
        query: The log line to grade.

    Returns:
        A dict with the single key ``"log"`` holding ``query`` unchanged.
    """
    return {"log": query}

# model_id = "meta-llama/Llama-3.1-8B-Instruct" -> 16GB model (slow as model cannot be fitted in RAM)
# model_id = "zackli4ai/llama-3.2-1b-instruct-qlora-int4-eo8" -> (4GB but bad results)
# model_id = "meta-llama/Llama-3.2-1B-Instruct" -> (2GB best one)
# Loaded text-generation pipelines, keyed by model directory. Building a
# pipeline loads the model weights, so it must not happen once per log line.
_LLM_PIPELINES = {}

def llm_resp(response,
             model_name = "Llama3.2-1B-Instruct/",
             additional_path = "snapshots/9213176726f574b556790deb65791e0c5aa438b6/"):
    """Run a locally stored Llama model on a formatted prompt.

    Args:
        response: Prompt value produced by the previous chain step; only its
            ``.text`` attribute is read.
        model_name: Model sub-directory (with trailing slash) under the local
            ``meta-llama`` folder.
        additional_path: Snapshot sub-directory (with trailing slash) inside
            ``model_name``.

    Returns:
        The ``generated_text`` field of the first pipeline result.
        NOTE(review): with the pipeline's default settings this presumably
        still contains the prompt text itself -- confirm before parsing it.
    """
    local_dir = "/mnt/d/Desktop/HuggingFaceModels/meta-llama/"
    model_path = local_dir + model_name + additional_path
    generator = _LLM_PIPELINES.get(model_path)
    if generator is None:
        # First use of this model: load it once and keep it for later calls.
        generator = pipeline("text-generation",
                             model = model_path,
                             model_kwargs = {"torch_dtype": torch.bfloat16},
                             device_map = "auto",
                             max_length = 128
                            )
        _LLM_PIPELINES[model_path] = generator
    return generator(response.text)[0]['generated_text']

In [43]:
def parse_file(rag_chain, filename, cache_name):
    """Grade every line of a log file, reusing verdicts stored in a JSON cache.

    Each line of ``filename`` (trailing newline included) is looked up in the
    cache; on a miss it is passed to ``rag_chain.invoke`` and the result is
    stored. The line and its verdict are logged either way.

    Args:
        rag_chain: Object exposing ``invoke(str)``; the returned verdict must
            be JSON-serialisable.
        filename: Path of the text file to grade, one log entry per line.
        cache_name: Path of the JSON cache file. It maps a key to a list of
            ``[sentence, verdict]`` pairs and is rewritten once processing
            stops, whether it finished or failed part-way.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    cache = {}
    try:
        with open(cache_name, "r") as cfile:
            cache = json.load(cfile)
    except FileNotFoundError:
        pass  # First run: there is no cache yet.
    except (OSError, json.JSONDecodeError) as err:
        logging.warning(f"Ignoring unreadable cache '{cache_name}': {err}")
    if not isinstance(cache, dict):
        cache = {}

    # Index existing entries by sentence. Older cache files were keyed with the
    # built-in hash(), which is salted per interpreter process for strings, so
    # their keys cannot be recomputed; matching on the sentence still finds them.
    verdict_by_sentence = {}
    for entries in cache.values():
        for cached_sentence, cached_verdict in entries:
            verdict_by_sentence.setdefault(cached_sentence, cached_verdict)

    with open(filename, "r") as file:
        try:
            for sentence in file:
                if sentence in verdict_by_sentence:
                    verdict = verdict_by_sentence[sentence]
                else:
                    verdict = rag_chain.invoke(f"{sentence}")
                    # SHA-256 gives the same key in every process.
                    hash_key = hashlib.sha256(sentence.encode("utf-8")).hexdigest()
                    cache.setdefault(hash_key, []).append([sentence, verdict])
                    verdict_by_sentence[sentence] = verdict
                logging.info(f"\n{sentence}")
                logging.info(f"\n{verdict}")
                logging.info('=' * 35)
        finally:
            # Persist whatever was graded, even if a later line failed, so
            # completed model calls are not repeated on the next run.
            with open(cache_name, "w") as cfile:
                json.dump(cache, cfile, indent = 5)

## In-house Llama Model.

In [None]:
# Local pipeline: wrap the log line, fill the grading prompt, then run the
# on-disk Llama model on it.
rag_chain = QA | prompt | llm_resp
parse_file(rag_chain, "5G_logs_v1.txt", "cache_autoregressive")

## Different Models.

In [50]:
def StopHallucinations(response):
    """Return the text before the first newline of ``response``.

    Everything the model wrote after its first line is discarded.
    """
    first_line, _, _ = response.partition("\n")
    return first_line

# model_id = "openai-community/gpt2"
model_id = "HuggingFaceH4/zephyr-7b-beta"
llm = HuggingFaceEndpoint(repo_id = model_id, temperature = 0.1)
rag_chain = (
    QA
    | prompt
    | llm
    | StrOutputParser()
    | StopHallucinations
)

# parse_file matches cached verdicts on the log line alone, so a cache file
# shared between models would return another model's verdicts. Derive the
# cache file name from the model id to keep each model's results separate.
cache_name = "cache_autoreg_online_" + model_id.replace("/", "_")
parse_file(rag_chain, "5G_logs_v1.txt", cache_name)

[1m[32mINFO:2025-01-12 07:21:17,569[0m [34m
[DEBUG] Everything is working properly.
[0m
[1m[32mINFO:2025-01-12 07:21:17,573[0m [34m
 GOOD[0m
[1m[32mINFO:2025-01-12 07:21:17,874[0m [34m
[INFO] Is there a reason for this happening.
[0m
[1m[32mINFO:2025-01-12 07:21:17,875[0m [34m
 [NEUTRAL] No clear reason is apparent.[0m
[1m[32mINFO:2025-01-12 07:21:18,155[0m [34m
[0m
[1m[32mINFO:2025-01-12 07:21:18,157[0m [34m
 BAD[0m
[1m[32mINFO:2025-01-12 07:21:18,160[0m [34m
[DEBUG] Everything is working properly.
[0m
[1m[32mINFO:2025-01-12 07:21:18,161[0m [34m
 GOOD[0m


In [51]:
# Duplicate of the identically named helper defined in an earlier cell; kept
# so this cell still runs on its own.
def StopHallucinations(response):
    """Return the text before the first newline of ``response``."""
    head, _, _ = response.partition("\n")
    return head

model_id = "microsoft/Phi-3.5-mini-instruct"
llm = HuggingFaceEndpoint(repo_id = model_id, temperature = 0.1)
rag_chain = (
    QA
    | prompt
    | llm
    | StrOutputParser()
    | StopHallucinations
)

# parse_file matches cached verdicts on the log line alone. Reusing the
# "cache_autoreg_online" file from the zephyr run would therefore return that
# model's verdicts here, so the cache file name is derived from the model id.
cache_name = "cache_autoreg_online_" + model_id.replace("/", "_")
parse_file(rag_chain, "5G_logs_v1.txt", cache_name)

[1m[32mINFO:2025-01-12 07:21:51,451[0m [34m
[DEBUG] Everything is working properly.
[0m
[1m[32mINFO:2025-01-12 07:21:51,452[0m [34m
 GOOD[0m
[1m[32mINFO:2025-01-12 07:21:52,243[0m [34m
[INFO] Is there a reason for this happening.
[0m
[1m[32mINFO:2025-01-12 07:21:52,245[0m [34m
 NEUTRAL[0m
[1m[32mINFO:2025-01-12 07:21:57,400[0m [34m
[0m
[1m[32mINFO:2025-01-12 07:21:57,401[0m [34m
 BAD[0m
[1m[32mINFO:2025-01-12 07:21:57,403[0m [34m
[DEBUG] Everything is working properly.
[0m
[1m[32mINFO:2025-01-12 07:21:57,405[0m [34m
 GOOD[0m
