In [None]:
import re
import random

# Generate fake log lines
def generate_logs(n):
    """Return a list of ``n`` synthetic log lines.

    Each line contains the ``NewOrdSOR`` marker and an order ID made of
    ``S``, ten random decimal digits, and the ``OROL01`` suffix.
    """
    # Lazily produce one 10-digit run per requested line; the generator is
    # consumed in order, so the random draws happen once per line.
    digit_runs = (''.join(random.choices('0123456789', k=10)) for _ in range(n))
    return [f"... NewOrdSOR ... S{digits}OROL01 ... more text ..." for digits in digit_runs]

# Extract pattern: S[0-9]{10}OROL01
def extract_ids(logs):
    """Return every order ID found in ``logs``, in order of appearance.

    An ID is ``S`` followed by exactly ten digits and the literal ``OROL01``.
    A line may contribute zero, one, or several IDs; duplicates are kept.
    """
    id_regex = re.compile(r"S\d{10}OROL01")
    found = []
    for entry in logs:
        # The pattern has no capture groups, so findall yields whole matches.
        found.extend(id_regex.findall(entry))
    return found

# --- Run ---
logs = generate_logs(20)

# Persist the generated lines, one per row, each terminated by a newline
with open("../data/logs.txt", "w") as f:
    f.writelines(line + "\n" for line in logs)

print("Log lines written to logs.txt")


In [2]:
# With repetitions
import re
import random

# Generate a pool of unique IDs
def generate_ids(num_unique):
    """Return ``num_unique`` distinct random order IDs.

    Each ID is ``S`` followed by ten random decimal digits and the suffix
    ``OROL01``. Independent random draws can collide, so candidates that
    were already produced are rejected and redrawn; the returned list is
    therefore guaranteed to contain no duplicates.

    Args:
        num_unique: Number of distinct IDs to generate. Values <= 0 yield
            an empty list.

    Returns:
        A list of ``num_unique`` distinct ID strings, in generation order.

    Raises:
        ValueError: If more IDs are requested than the 10-digit space
            (10**10 combinations) can provide.
    """
    if num_unique > 10 ** 10:
        raise ValueError("cannot generate more than 10**10 unique 10-digit IDs")

    seen = set()
    ids = []
    while len(ids) < num_unique:
        candidate = f"S{''.join(random.choices('0123456789', k=10))}OROL01"
        # Skip collisions so the pool really is unique.
        if candidate not in seen:
            seen.add(candidate)
            ids.append(candidate)
    return ids

# Generate log lines with some repeated IDs
def generate_logs(total_lines, num_unique_ids):
    """Return ``total_lines`` synthetic log lines drawing IDs from a fixed pool.

    A pool of ``num_unique_ids`` IDs is created first; each log line then
    embeds one ID picked at random from that pool, so IDs may repeat.
    """
    id_pool = generate_ids(num_unique_ids)
    # random.choice samples with replacement, which is what makes IDs recur.
    picks = (random.choice(id_pool) for _ in range(total_lines))
    return [f"... NewOrdSOR ... {selected_id} ... more text ..." for selected_id in picks]

# Extract IDs from logs
def extract_ids(logs):
    """Scan each log line and return all order IDs found, in encounter order.

    IDs have the shape ``S`` + ten digits + ``OROL01``. Repeated IDs are
    returned once per occurrence.
    """
    id_regex = re.compile(r"S\d{10}OROL01")
    collected = []
    for entry in logs:
        for hit in id_regex.finditer(entry):
            collected.append(hit.group())
    return collected

# --- Run ---
logs = generate_logs(total_lines=50, num_unique_ids=10)

# Persist the generated lines, one per row, each terminated by a newline
with open("logs.txt", mode="w") as f:
    f.writelines(line + "\n" for line in logs)

print("Log lines with repeated IDs written to logs.txt")



Log lines with repeated IDs written to logs.txt


# grep
1. Count occurrences of each unique ID
   ```
   grep "NewOrdSOR" logs.txt | sed -nE 's/.*\b(S[0-9]{10}OROL01)\b.*/\1/p' | sort | uniq -c
   ```

2. Count occurrences of each unique ID, sorted by count in descending order
   ```
   grep "NewOrdSOR" logs.txt | sed -nE 's/.*\b(S[0-9]{10}OROL01)\b.*/\1/p' | sort | uniq -c | sort -nr
   ```

   Explanation: `uniq -c` prefixes each unique line with its occurrence count, for example:
   ```
   10 S0001112223OROL01
   ```

   So sort -nr means:

   -n: compare lines by the numeric value at the start of each line (here, the count produced by `uniq -c`).

   -r: reverse the order (i.e., largest counts first)

   sort -nr does not try to interpret " S5493196488OROL01" as a number: numeric parsing stops at the first non-numeric character, so only the leading count is used as the numeric key.


##  The sed part

```
sed -nE 's/.*\b(S[0-9]{10}OROL01)\b.*/\1/p'
```

Component	    Meaning
-n	            Suppresses default output (print nothing unless told explicitly)
-E	            Enables extended regex (so you can use +, {} without backslashes)
's/.../.../p'	s/// is the substitute command; p means “print the result if a substitution occurred”


Inside the substitution:

s/.*\b(S[0-9]{10}OROL01)\b.*/\1/p

Parts:
Regex Part	        Explanation
.*	                Match anything at the beginning of the line (greedy)
\b	                Word boundary – ensures we're at a word boundary before the ID
(S[0-9]{10}OROL01)	Capturing group that matches IDs like S0000125206OROL01
\b	                Another word boundary after the ID
.*	                Match anything after the ID
/\1/	            Replace the full line with just the match (\1 = first capture group)
p	                Print the result (only lines where the substitution happened)


