In [23]:
import pandas as pd
import re
import json
import os

In [None]:
# Smoke-test input (first shard) and the file its results are written to.
input_file = "D:/v1.0/train-json/nq-train-00.jsonl"
output_file = "D:/v1.0/Categorias/nq-train-00-test.jsonl"
os.makedirs(os.path.split(output_file)[0], exist_ok=True)

# Page title: value of the "title" query parameter of the URL.
pattern_title = re.compile(r"[?&]title=([^&]+)")

# Category lists: contents of the first <UL> that follows each catlinks id.
# DOTALL lets ".*?" run across line breaks in the HTML.
pattern_normal = re.compile(
    r'id="mw-normal-catlinks".*?<UL>(.*?)</UL>',
    re.DOTALL,
)
pattern_oculta = re.compile(
    r'id="mw-hidden-catlinks".*?<UL>(.*?)</UL>',
    re.DOTALL,
)

# Matches only the two fields of interest in the raw JSON text:
# group 1 = body of the "document_html" string (escape sequences kept as-is),
# group 2 = body of the "document_url" string.
pattern_html_url = re.compile(
    "|".join((
        r'"document_html"\s*:\s*"((?:\\.|[^"\\])*)"',
        r'"document_url"\s*:\s*"([^"]+)"',
    ))
)

def extract_html_url(json_line):
    """Pull ``document_html`` and ``document_url`` out of a raw JSON line.

    The line is scanned with the module-level ``pattern_html_url`` regex
    instead of being parsed as a whole.

    Args:
        json_line: One line of JSON text.

    Returns:
        A ``(document_html, document_url)`` tuple of strings. Each element is
        ``""`` when the field is absent or empty. ``document_html`` has its
        JSON escape sequences decoded; ``document_url`` is returned exactly as
        it appears in the line. If a field matches more than once, the last
        non-empty match wins.

    Raises:
        ValueError: If the captured ``document_html`` text is not a valid
            JSON string body (``json.JSONDecodeError`` is a subclass).
    """
    document_html = ""
    document_url = ""

    for match in pattern_html_url.finditer(json_line):
        html_raw, url_raw = match.groups()
        if html_raw:
            # Decode the capture as a JSON string literal. The previous
            # bytes(...).decode("unicode_escape") read the UTF-8 bytes as
            # Latin-1, turning raw non-ASCII text into mojibake and leaving
            # \uD83D\uDE00-style surrogate pairs unpaired.
            # strict=False tolerates raw control characters in the capture.
            document_html = json.loads(f'"{html_raw}"', strict=False)
        elif url_raw:
            document_url = url_raw

    return document_html, document_url

def _extract_categories(html, pattern):
    """Return the link texts found in the category list matched by ``pattern``.

    Args:
        html: Page HTML as a string.
        pattern: Compiled regex whose group 1 captures the list's inner HTML.

    Returns:
        A list of link texts with newlines replaced by spaces, tabs removed
        and surrounding whitespace stripped; empty when ``pattern`` does not
        match.
    """
    section = pattern.search(html)
    if not section:
        return []
    names = re.findall(r'>([^<>]+)</A>', section.group(1))
    return [name.replace('\n', ' ').replace('\t', '').strip() for name in names]


# Smoke test: process only the first 10 lines of the input file and write one
# JSON object per record to the output file.
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    for i, linha in enumerate(f_in):
        if i >= 10:  # limit the run to the first 10 lines
            break
        if not linha.strip():
            continue  # a blank line would make json.loads raise

        data = json.loads(linha)
        example_id = data.get("example_id", "")

        # Read the fields from the already-parsed record: a second scan of the
        # raw line is redundant, and json.loads has already decoded every JSON
        # escape (including non-ASCII text) correctly.
        html = data.get("document_html") or ""
        url = data.get("document_url") or ""

        title_match = pattern_title.search(url)
        title = title_match.group(1) if title_match else "N/A"

        # Visible and hidden categories.
        categorias = _extract_categories(html, pattern_normal)
        categorias_ocultas = _extract_categories(html, pattern_oculta)

        output_data = {
            "example_id": example_id,
            "title": title,
            "categorias": categorias,
            "categorias_ocultas": categorias_ocultas
        }
        f_out.write(json.dumps(output_data, ensure_ascii=False) + "\n")

print(f"Teste concluído. Saída salva em {output_file}")


Teste concluído. Saída salva em D:/v1.0/Categorias/nq-train-00-test.jsonl


In [None]:
# Regex patterns.
# Page title: value of the "title" query parameter of the URL.
pattern_title = re.compile(r"[?&]title=([^&]+)")

# Category lists: contents of the first <UL> that follows each catlinks id.
# DOTALL lets ".*?" run across line breaks in the HTML.
pattern_normal = re.compile(
    r'id="mw-normal-catlinks".*?<UL>(.*?)</UL>',
    re.DOTALL,
)
pattern_oculta = re.compile(
    r'id="mw-hidden-catlinks".*?<UL>(.*?)</UL>',
    re.DOTALL,
)

# Matches only the two large fields in the raw JSON text:
# group 1 = body of the "document_html" string (escape sequences kept as-is),
# group 2 = body of the "document_url" string.
pattern_html_url = re.compile(
    "|".join((
        r'"document_html"\s*:\s*"((?:\\.|[^"\\])*)"',
        r'"document_url"\s*:\s*"([^"]+)"',
    ))
)

# Input and output folders; the output folder is created if missing.
input_folder = "D:/v1.0/train-json"
output_folder = "D:/v1.0/Categorias"
os.makedirs(output_folder, exist_ok=True)

def extract_html_url(json_line):
    """Pull ``document_html`` and ``document_url`` out of a raw JSON line.

    The line is scanned with the module-level ``pattern_html_url`` regex
    instead of being parsed as a whole.

    Args:
        json_line: One line of JSON text.

    Returns:
        A ``(document_html, document_url)`` tuple of strings. Each element is
        ``""`` when the field is absent or empty. ``document_html`` has its
        JSON escape sequences decoded; ``document_url`` is returned exactly as
        it appears in the line. If a field matches more than once, the last
        non-empty match wins.

    Raises:
        ValueError: If the captured ``document_html`` text is not a valid
            JSON string body (``json.JSONDecodeError`` is a subclass).
    """
    document_html = ""
    document_url = ""

    for match in pattern_html_url.finditer(json_line):
        html_raw, url_raw = match.groups()
        if html_raw:
            # Decode the capture as a JSON string literal. The previous
            # bytes(...).decode("unicode_escape") read the UTF-8 bytes as
            # Latin-1, turning raw non-ASCII text into mojibake and leaving
            # \uD83D\uDE00-style surrogate pairs unpaired.
            # strict=False tolerates raw control characters in the capture.
            document_html = json.loads(f'"{html_raw}"', strict=False)
        elif url_raw:
            document_url = url_raw

    return document_html, document_url

def _extract_categories(html, pattern):
    """Return the link texts found in the category list matched by ``pattern``.

    Args:
        html: Page HTML as a string.
        pattern: Compiled regex whose group 1 captures the list's inner HTML.

    Returns:
        A list of link texts with newlines replaced by spaces, tabs removed
        and surrounding whitespace stripped; empty when ``pattern`` does not
        match.
    """
    section = pattern.search(html)
    if not section:
        return []
    names = re.findall(r'>([^<>]+)</A>', section.group(1))
    return [name.replace('\n', ' ').replace('\t', '').strip() for name in names]


# Iterate over the 50 input files (nq-train-00 .. nq-train-49), writing one
# output file with the same name per input file.
for i in range(50):
    input_file = os.path.join(input_folder, f"nq-train-{i:02d}.jsonl")
    output_file = os.path.join(output_folder, f"nq-train-{i:02d}.jsonl")

    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        for linha in f_in:
            if not linha.strip():
                continue  # a blank line would make json.loads raise

            data = json.loads(linha)
            example_id = data.get("example_id", "")

            # Read the fields from the already-parsed record: a second scan of
            # the raw line is redundant, and json.loads has already decoded
            # every JSON escape (including non-ASCII text) correctly.
            html = data.get("document_html") or ""
            url = data.get("document_url") or ""

            # Title from the URL's "title" query parameter.
            title_match = pattern_title.search(url)
            title = title_match.group(1) if title_match else "N/A"

            # Visible and hidden categories.
            categorias = _extract_categories(html, pattern_normal)
            categorias_ocultas = _extract_categories(html, pattern_oculta)

            # Build the output record.
            output_data = {
                "example_id": example_id,
                "title": title,
                "categorias": categorias,
                "categorias_ocultas": categorias_ocultas
            }

            f_out.write(json.dumps(output_data, ensure_ascii=False) + "\n")

    print(f"Arquivo processado: {input_file} → {output_file}")
# Wall time recorded for an earlier full run of this cell: 309m 51.6s

Arquivo processado: D:/v1.0/train-json\nq-train-00.jsonl → D:/v1.0/Categorias\nq-train-00.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-01.jsonl → D:/v1.0/Categorias\nq-train-01.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-02.jsonl → D:/v1.0/Categorias\nq-train-02.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-03.jsonl → D:/v1.0/Categorias\nq-train-03.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-04.jsonl → D:/v1.0/Categorias\nq-train-04.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-05.jsonl → D:/v1.0/Categorias\nq-train-05.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-06.jsonl → D:/v1.0/Categorias\nq-train-06.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-07.jsonl → D:/v1.0/Categorias\nq-train-07.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-08.jsonl → D:/v1.0/Categorias\nq-train-08.jsonl
Arquivo processado: D:/v1.0/train-json\nq-train-09.jsonl → D:/v1.0/Categorias\nq-train-09.jsonl
Arquivo processado: D:/v1.0/train-json\n

In [None]:
import glob

# Concatenate the per-shard outputs into a single JSONL file.
# The pattern is limited to two-digit shard names (nq-train-NN.jsonl): the
# broader "nq-train-*.jsonl" also matches nq-train-00-test.jsonl, which the
# smoke-test cell writes into the same folder, duplicating its records in the
# merged file. It would also match nq-train-all.jsonl itself if this cell were
# run from that folder.
arquivos = sorted(glob.glob("D:/v1.0/Categorias/nq-train-[0-9][0-9].jsonl"))

with open("nq-train-all.jsonl", "w", encoding="utf-8") as saida:
    for caminho in arquivos:
        with open(caminho, "r", encoding="utf-8") as f:
            # Copy line by line so no shard is loaded into memory whole.
            for linha in f:
                saida.write(linha)