In [1]:
# Install third-party dependencies into the *kernel's* environment (%pip, not !pip).
# geoip2 and faiss-cpu are pinned to the versions recorded in this cell's output;
# sentence_transformers is left unpinned because its resolved version is not visible there.
%pip install geoip2==4.8.0
%pip install faiss-cpu==1.8.0.post1
%pip install sentence_transformers

Collecting geoip2
  Downloading geoip2-4.8.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting maxminddb<3.0.0,>=2.5.1 (from geoip2)
  Downloading maxminddb-2.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Downloading geoip2-4.8.0-py2.py3-none-any.whl (27 kB)
Downloading maxminddb-2.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: maxminddb, geoip2
Successfully installed geoip2-4.8.0 maxminddb-2.6.2
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack

In [2]:
import random
import os
import time
import json
import geoip2.database
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pyparsing import Word, nums, alphas, Combine, Suppress, QuotedString, Regex, oneOf, restOfLine
from datetime import datetime, timedelta
from shlex import join



  from tqdm.autonotebook import tqdm, trange


In [3]:
## -- GENERATE LOGS -- ##
# 127.0.0.1 - - [10/Oct/2024:13:55:36 +0000] "GET /index.html HTTP/1.1" 200 1024 "-" "Mozilla/5.0"
#{"time": "17/May/2015:08:05:32 +0000", "remote_ip": "93.180.71.3","request": "GET /downloads/product_1 HTTP/1.1", "response": 304, "bytes": 0, "agent": "Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.21)"}

def random_ip():
   """Return a random dotted-quad IPv4 string; each octet is drawn from 0..255 inclusive."""
   octets = []
   for _ in range(4):
      octets.append(str(random.randint(0, 255)))
   return ".".join(octets)

def random_date():
   """Return a random timestamp from the last 30 days, formatted like an access-log time.

   The result looks like ``21/Jul/2024:10:35:33 +0000``.

   The moment is chosen as a single random number of seconds (0 .. 30 days) *before*
   the current UTC time, so the result is never in the future and the literal
   ``+0000`` suffix is truthful regardless of the machine's local timezone.
   """
   # Local import: the file-level import only brings in `datetime` and `timedelta`.
   from datetime import timezone

   window_seconds = 30 * 86400
   age = timedelta(seconds=random.randint(0, window_seconds))
   moment = datetime.now(timezone.utc) - age
   return moment.strftime('%d/%b/%Y:%H:%M:%S +0000')

def random_method():
   """Pick one HTTP method uniformly at random."""
   verbs = ('GET', 'POST', 'PUT', 'DELETE', 'HEAD')
   return random.choice(verbs)

def random_url():
   """Pick one request path uniformly at random from a fixed set."""
   paths = ('/index.html', '/about', '/contact', '/products', '/api/v1/data')
   return random.choice(paths)

def random_status():
   """Pick one HTTP status code (int) uniformly at random from a fixed set."""
   codes = (200, 201, 400, 401, 403, 404, 500, 502, 503, 504, 505, 506)
   return random.choice(codes)

def random_size():
   """Return a random response size, an int between 100 and 5000 inclusive."""
   smallest, largest = 100, 5000
   return random.randint(smallest, largest)

def random_agent():
   """Pick one user-agent token uniformly at random from a fixed set."""
   agents = ('Mozilla/5.0', 'Chrome/90.0.4430.93', 'Safari/537.36', 'Edge/90.0.818.62')
   return random.choice(agents)

def generate_fake_logs(num_logs):
   """Build ``num_logs`` synthetic access-log lines, each wrapped in curly braces.

   Every line has the shape
   ``{<ip> - - [<timestamp>] "<method> <url> HTTP/1.1" <status> <size> - <agent>}``
   with each field drawn from the corresponding ``random_*`` helper.

   Returns:
      A list of ``num_logs`` strings.
   """
   return [
      "{"
      + f'{random_ip()} - - [{random_date()}] "{random_method()} {random_url()} HTTP/1.1" '
      + f'{random_status()} {random_size()} - {random_agent()}'
      + "}"
      for _ in range(num_logs)
   ]


In [4]:
## -- MANIPULATE LOGS -- ##

#{62.149.159.54 - - [21/Jul/2024:10:35:33 +0000] "POST /contact HTTP/1.1" 502 4400 - Edge/90.0.818.62}

# Shared geoip2 database reader, opened once when this cell runs. It is the default
# `geolocData` argument of enrich_metadata() and is also passed explicitly by generate().
# NOTE(review): the hard-coded path looks like a mounted Google Drive in Colab — confirm
# the drive is mounted and the .mmdb file exists before running this cell.
GEOLOC_DATA = geoip2.database.Reader('/content/drive/MyDrive/RAG/GeoLite2-City.mmdb')

def define_log_pattern():
   """Build the pyparsing grammar for one access-log line and publish it as ``logPattern``.

   Expected line shape (a generate_fake_logs line with the braces stripped):
   ``<ip> - - [<dd/Mon/yyyy:HH:MM:SS +zzzz>] "<request>" <status> <size> - <user agent>``

   Named results exposed by the grammar:
   ``ip``, ``timestamp``, ``request``, ``status_code``, ``response_size``, ``user_agent``.

   Side effects:
      Assigns the grammar to the module-level global ``logPattern`` (read by parse_log).

   Returns:
      The same grammar object, so callers can use it without going through the global.
   """
   global logPattern

   ipAddress = Combine(Word(nums) + '.' + Word(nums) + '.' + Word(nums) + '.' + Word(nums))
   dash = Suppress('-')
   # The brackets and the space before the UTC offset are suppressed, so the combined
   # token has no space in it (enrich_metadata parses it with a space-less '%S%z' format).
   timeStamp = Combine(Suppress('[') + Word(nums) + '/' + Word(alphas) + '/' + Word(nums) + ':' +
                    Word(nums) + ':' + Word(nums) + ':' + Word(nums) + Suppress(' ') + Word('+-', exact=1) +
                    Word(nums) + Suppress(']'))
   # The request ("METHOD URL PROTOCOL") is captured whole as one quoted string;
   # it is not broken down into separate method/url/protocol tokens.
   request = QuotedString('"')
   statusCode = Word(nums)
   responseSize = Word(nums)
   userAgent = restOfLine

   logPattern = (
      ipAddress("ip") +
      dash +
      dash +
      timeStamp("timestamp") +
      request("request") +
      statusCode("status_code") +
      responseSize("response_size") +
      dash +
      userAgent("user_agent")
   )
   return logPattern

def parse_log(logEntry):
   """Parse one access-log line with the module-level ``logPattern`` grammar.

   Args:
      logEntry: A single log line (without the surrounding curly braces).

   Returns:
      The pyparsing result for the line; fields are available by the names set in
      define_log_pattern (``ip``, ``timestamp``, ``request``, ``status_code``,
      ``response_size``, ``user_agent``).

   Raises:
      Whatever ``logPattern.parseString`` raises when the line does not match.
   """
   # Build the grammar on first use so this also works on a fresh kernel where
   # define_log_pattern() has not been called yet (previously a NameError).
   if 'logPattern' not in globals():
      define_log_pattern()
   return logPattern.parseString(logEntry)

def enrich_metadata(parsedLogEntry, geolocData=GEOLOC_DATA):
   """Derive extra descriptive fields for a parsed log entry.

   Args:
      parsedLogEntry: Parsed entry exposing ``ip``, ``status_code`` and ``timestamp``
         attributes (as produced by parse_log).
      geolocData: Object with a ``city(ip)`` lookup method; defaults to the shared
         ``GEOLOC_DATA`` reader.

   Returns:
      dict with keys ``country``, ``city``, ``errorCategory`` and ``timeOfDay``.
      Geolocation is best-effort: any lookup failure, or a missing country/city
      name, yields ``'Unknown'``.

   Raises:
      ValueError: if ``status_code`` is not an integer string or ``timestamp`` does
         not match ``'%d/%b/%Y:%H:%M:%S%z'``.
   """
   metadata = {"country": '', "city": '', "errorCategory": '', "timeOfDay": ''}

   ## Geolocation
   try:
      response = geolocData.city(parsedLogEntry.ip)
      # A successful lookup can still carry no name for the country/city;
      # fall back so the literal "None" never ends up in the enriched text.
      metadata['country'] = response.country.name or 'Unknown'
      metadata['city'] = response.city.name or 'Unknown'
   except Exception:
      # Deliberate best-effort (random IPs are frequently absent from the database),
      # but no longer swallows KeyboardInterrupt/SystemExit like a bare `except:`.
      metadata['country'] = 'Unknown'
      metadata['city'] = 'Unknown'

   ## Error Categories
   status = int(parsedLogEntry.status_code)
   if 400 <= status < 500:
      metadata['errorCategory'] = "Client Error"
   elif 500 <= status < 600:
      metadata['errorCategory'] = "Server Error"
   else:
      metadata['errorCategory'] = "Other"

   ## Time of Day
   # The parsed timestamp has no space before the offset, hence '%S%z'.
   hour = datetime.strptime(parsedLogEntry.timestamp, '%d/%b/%Y:%H:%M:%S%z').hour
   if 5 <= hour < 12:
      metadata['timeOfDay'] = "Morning"
   elif 12 <= hour < 17:
      metadata['timeOfDay'] = "Afternoon"
   elif 17 <= hour < 21:
      metadata['timeOfDay'] = "Evening"
   else:
      metadata['timeOfDay'] = "Night"
   return metadata

def integrate_log_as_txt(parsedLogEntry, metadata):
   """Render a parsed log entry plus its metadata as one English sentence.

   Args:
      parsedLogEntry: Mapping-like object indexed by ``ip``, ``timestamp``,
         ``user_agent``, ``request``, ``status_code`` and ``response_size``.
      metadata: dict with ``city``, ``country``, ``timeOfDay`` and ``errorCategory``.

   Returns:
      The sentence as a single string (it ends with a period followed by a space).
   """
   origin = f"Request from IP {parsedLogEntry['ip']} in {metadata['city']}, {metadata['country']} "
   moment = f"on {parsedLogEntry['timestamp']} {metadata['timeOfDay']} using this agent {parsedLogEntry['user_agent']} "
   action = f"for {parsedLogEntry['request']} returned status {parsedLogEntry['status_code']} "
   outcome = f"({metadata['errorCategory']}) with response size {parsedLogEntry['response_size']}. "
   return origin + moment + action + outcome


In [5]:
## -- CREATE VECTORS -- ##
def create_vector(text):
   """Embed ``text`` with the ``sentence-transformers/all-MiniLM-L6-v2`` model.

   The model is loaded on the first call and cached on the function object
   (``create_vector._model``), so later calls reuse it. Previously the model was
   re-instantiated on every call, which dominates the cost when embedding
   thousands of log lines in a loop.

   Args:
      text: The text to embed (passed straight to ``model.encode``).

   Returns:
      Whatever ``model.encode(text)`` returns for the given input.
   """
   model = getattr(create_vector, "_model", None)
   if model is None:
      model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
      create_vector._model = model
   return model.encode(text)


In [6]:
## -- STORE VECTORS -- ##

def save_indexs(vectors, index_path="/content/drive/MyDrive/RAG/index_file.index"):
   """Build a flat L2 FAISS index from ``vectors`` and write it to disk.

   Args:
      vectors: Non-empty sequence of equal-length numeric vectors (one per row).
      index_path: Destination file for the serialized index. Defaults to the
         location this notebook has always written to.

   Raises:
      ValueError: if ``vectors`` is empty or does not form a 2-D
         ``(n_vectors, dimension)`` array with ``dimension > 0``.
   """
   vectors = np.array(vectors).astype('float32')
   # Fail early with a clear message instead of an opaque IndexError on shape[1].
   if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
      raise ValueError(
         f"vectors must be a non-empty 2-D array of shape (n_vectors, dimension); got shape {vectors.shape}"
      )
   dimension = vectors.shape[1]
   index = faiss.IndexFlatL2(dimension)
   index.add(vectors)
   faiss.write_index(index, index_path)


In [7]:
def generate(quantity):
    """Run the full pipeline for ``quantity`` synthetic log lines.

    Steps: build the parsing grammar, generate fake logs, then for every line
    parse it, enrich it with metadata, render it as text and embed it. The
    embeddings are saved through save_indexs(), and the rendered texts plus their
    vectors are dumped to ``enriched_logs.json`` as a list of
    ``{"id", "log", "vector"}`` records (the vector is stored as the string form
    of its list).
    """
    define_log_pattern()
    rawLogs = generate_fake_logs(quantity)

    sentences = []
    embeddings = []
    for rawLine in rawLogs:
        # Generated lines are wrapped in braces; the grammar expects them removed.
        parsed = parse_log(rawLine.strip("{}"))
        extra = enrich_metadata(parsed, GEOLOC_DATA)
        sentence = integrate_log_as_txt(parsed, extra)
        embedding = create_vector(sentence)
        sentences.append(sentence)
        embeddings.append(embedding)

    save_indexs(embeddings)

    with open('/content/drive/MyDrive/RAG/enriched_logs.json', 'w') as file:
        records = [
            {
                "id": position,
                "log": sentence,
                "vector": str(embedding.tolist())
            }
            for position, (sentence, embedding) in enumerate(zip(sentences, embeddings))
        ]
        json.dump(records, file, indent=4)


In [None]:
# Run the whole pipeline (generate, parse, enrich, embed, save) for 10,000 synthetic log lines.
generate(10000)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]