In [None]:
import os
import re
import email
import tempfile
from email.header import decode_header
from email.utils import parseaddr
from bs4 import BeautifulSoup
from xhtml2pdf import pisa

def decode_email_header(hdr):
    """Decode an RFC 2047 encoded header value into a plain string.

    Args:
        hdr: Raw header value (for example the result of
            ``msg.get('Subject')``). ``None`` or an empty value yields ``""``.

    Returns:
        The decoded text. Byte chunks are decoded with the charset reported
        by ``decode_header`` (UTF-8 when none is reported, or when the
        reported label is unknown to Python); undecodable bytes are replaced.
    """
    if not hdr:
        return ""
    out = []
    for txt, enc in decode_header(hdr):
        if isinstance(txt, bytes):
            try:
                out.append(txt.decode(enc or 'utf-8', errors='replace'))
            except LookupError:
                # The charset label is not a codec Python knows; degrade to
                # UTF-8 instead of aborting the whole conversion.
                out.append(txt.decode('utf-8', errors='replace'))
        else:
            out.append(txt)
    # decode_header() keeps the whitespace that belongs to unencoded chunks,
    # so joining with a separator would insert spurious extra spaces.
    return "".join(out)

def extract_full_html(eml_path):
    """Build a standalone HTML document from an ``.eml`` file.

    The document consists of a small header table (subject, sender,
    recipient, date) followed by the message body. The first ``text/html``
    part that is not an attachment is used as the body; when the message has
    no such part, the plain-text body is wrapped in a ``<pre>`` element.

    Args:
        eml_path: Path of the ``.eml`` file to read.

    Returns:
        The complete HTML document as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    # 1) Open and parse the EML; the context manager closes the handle.
    with open(eml_path, 'rb') as fh:
        msg = email.message_from_binary_file(fh)

    def display_name(header_name):
        # Prefer the display name and fall back to the bare address.
        name, addr = parseaddr(msg.get(header_name, ''))
        return decode_email_header(name or addr)

    subject = decode_email_header(msg.get('Subject', '(Sin asunto)'))
    frm = display_name('From')
    to = display_name('To')
    date = msg.get('Date', '')

    # 2) Extract the HTML body, or the plain-text body as a fallback.
    body_html, body_txt = "", ""
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        # Compare the parsed disposition value rather than searching the raw
        # header, so case differences and file names cannot confuse the test.
        if part.get_content_disposition() == 'attachment':
            continue
        ct = part.get_content_type()
        if ct not in ('text/html', 'text/plain'):
            continue
        payload = part.get_payload(decode=True) or b""
        charset = part.get_content_charset() or 'utf-8'
        try:
            text = payload.decode(charset, errors='replace')
        except LookupError:
            # Declared charset is not a codec Python knows.
            text = payload.decode('utf-8', errors='replace')
        if ct == 'text/html':
            if not body_html:
                body_html = text
        elif not body_html:
            body_txt = text

    if not body_html:
        # Plain text is data, not markup: escape it before embedding.
        body_html = (
            "<pre style='font-family: monospace'>"
            f"{escape(body_txt, quote=False)}</pre>"
        )

    # 3) BeautifulSoup is used only to normalise malformed tags.
    soup = BeautifulSoup(body_html, 'html.parser')
    cleaned_body = str(soup)

    # 4) Header table with subject, sender, recipient and date. The values
    #    come from the message, so they are escaped before interpolation.
    header = f"""
      <table style="width:100%; border:none; margin-bottom:1em;">
        <tr>
          <td><strong>{escape(subject)}</strong></td>
          <td style="text-align:right; font-size:0.8em;">De: {escape(frm)}<br>Para: {escape(to)}<br>{escape(str(date))}</td>
        </tr>
      </table>
      <hr/>
    """

    # 5) Assemble the complete HTML document.
    full_html = f"""<!DOCTYPE html>
    <html>
      <head>
        <meta charset="utf-8">
        <style>
          /* Propias del mail (puedes añadir aquí más global styles) */
          body {{ font-family: Arial, sans-serif; font-size: 12pt; line-height:1.4; }}
          h1, h2, h3 {{ color: #333; }}
          table {{ border-collapse: collapse; }}
          img {{ max-width: 100%; height: auto; }}
        </style>
      </head>
      <body>
        {header}
        {cleaned_body}
      </body>
    </html>"""
    return full_html

# Matches the two places where CSS can live inside an HTML string: the
# contents of a <style> element (groups 1-3) and a quoted style="..."
# attribute value (groups 4-5).
_CSS_CONTEXT_RE = re.compile(
    r'(<style\b[^>]*>)(.*?)(</style\s*>)'
    r'|((?<![\w-])style\s*=\s*)("[^"]*"|\'[^\']*\')',
    re.IGNORECASE | re.DOTALL,
)

# The keyword "initial" as a standalone token, i.e. not a fragment of a
# longer hyphenated identifier such as "initial-letter".
_INITIAL_KEYWORD_RE = re.compile(r'(?<![\w-])initial(?![\w-])', re.IGNORECASE)


def _strip_initial_keyword(css):
    """Remove ``initial`` keywords from a CSS fragment and merge ``;;`` runs."""
    css = _INITIAL_KEYWORD_RE.sub('', css)
    return re.sub(r';{2,}', ';', css)


def clean_css_issues(html_str):
    """Remove the CSS ``initial`` keyword, which Pisa/reportlab cannot parse.

    Only CSS is touched: the contents of ``<style>`` elements and quoted
    ``style`` attribute values. Visible text, URLs and other attributes that
    happen to contain the word "initial" are left unchanged.

    Args:
        html_str: HTML document or fragment.

    Returns:
        The HTML with ``initial`` keywords removed from its CSS and any
        resulting runs of semicolons collapsed to a single one.
    """
    def _clean(match):
        if match.group(1) is not None:
            # <style> element: rewrite only what sits between the tags.
            return (
                match.group(1)
                + _strip_initial_keyword(match.group(2))
                + match.group(3)
            )
        # style="..." attribute: keep the name, rewrite the quoted value.
        return match.group(4) + _strip_initial_keyword(match.group(5))

    return _CSS_CONTEXT_RE.sub(_clean, html_str)

def html_to_pdf(html_str, out_path):
    """Render an HTML string into a PDF file with xhtml2pdf.

    Args:
        html_str: HTML document to render; it is passed through
            ``clean_css_issues`` first.
        out_path: Path of the PDF file to create (opened in ``"wb"`` mode).

    Returns:
        The ``err`` attribute of the object returned by ``pisa.CreatePDF``.
        Callers in this file treat ``0`` as success.
    """
    sanitized = clean_css_issues(html_str)
    with open(out_path, "wb") as pdf_file:
        status = pisa.CreatePDF(sanitized, dest=pdf_file)
    return status.err

def process_folder(folder):
    """Convert every ``.eml`` file directly inside *folder* to a PDF.

    Each PDF is written next to its source with the same base name. One
    status line per file is printed; an exception raised while handling a
    file is reported and does not stop the remaining files.

    Args:
        folder: Directory to scan (not recursive). The ``.eml`` extension is
            matched case-insensitively.
    """
    eml_names = []
    for entry in os.listdir(folder):
        if entry.lower().endswith('.eml'):
            eml_names.append(entry)
    print(f"→ Encontrados {len(eml_names)} .eml")

    for name in eml_names:
        eml_path = os.path.join(folder, name)
        # Swap the 4-character ".eml" suffix for ".pdf".
        pdf_path = os.path.join(folder, name[:-4] + '.pdf')
        try:
            error_count = html_to_pdf(extract_full_html(eml_path), pdf_path)
            if error_count != 0:
                print(f"❌ Error Pisa ({error_count}): {name}")
            else:
                print(f"✅ {name} → {os.path.basename(pdf_path)}")
        except Exception as exc:
            print(f"❌ Excepción procesando {name}: {exc}")

# Script entry point: convert the .eml files found in the hard-coded folder.
# NOTE(review): the path below is specific to one machine; adjust it before
# running this elsewhere.
if __name__ == "__main__":
    carpeta = r"C:\Users\osmarrincon\Downloads\ACTIVIDAD 12\PQRS"
    process_folder(carpeta)

# Claude

In [None]:
import os
import email
import re
from email.header import decode_header
from email.utils import parseaddr
import tempfile
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from xhtml2pdf import pisa

# Instalar con:
# pip install beautifulsoup4 reportlab xhtml2pdf

def decode_email_header(header):
    """Decode an RFC 2047 encoded e-mail header into a plain string.

    Args:
        header: Raw header value; ``None`` or an empty value yields ``""``.

    Returns:
        The decoded text. Byte chunks are decoded with the charset reported
        by ``decode_header`` (UTF-8 when none is reported, or when the
        reported label is unknown to Python); undecodable bytes are replaced.
    """
    if not header:
        return ""
    header_parts = []
    for content, encoding in decode_header(header):
        if not isinstance(content, bytes):
            header_parts.append(str(content))
            continue
        try:
            header_parts.append(content.decode(encoding or 'utf-8', errors='replace'))
        except LookupError:
            # The charset label is not a codec Python knows; degrade to
            # UTF-8 instead of failing the whole message.
            header_parts.append(content.decode('utf-8', errors='replace'))
    # decode_header() keeps the whitespace that belongs to unencoded chunks,
    # so joining with a separator would insert spurious extra spaces.
    return "".join(header_parts)

def convert_html_to_pdf(html_content, output_path):
    """Render HTML into a PDF file using xhtml2pdf.

    Args:
        html_content: HTML document to render.
        output_path: Path of the PDF file to create (opened in ``"wb"`` mode).

    Returns:
        ``True`` when the object returned by ``pisa.CreatePDF`` has
        ``err == 0``, ``False`` otherwise.
    """
    with open(output_path, "wb") as pdf_file:
        status = pisa.CreatePDF(html_content, dest=pdf_file)
    no_errors = status.err == 0
    return no_errors

def extract_email_content(eml_path):
    """Read an ``.eml`` file and render it as a standalone HTML document.

    The document shows subject, sender, recipient and date followed by the
    message body. An HTML part is used as the body when the message has one;
    otherwise the plain-text part is escaped and its newlines are turned
    into ``<br>``. Parts marked as attachments or carrying a file name are
    ignored.

    Args:
        eml_path: Path of the ``.eml`` file to read.

    Returns:
        A ``(email_html, subject)`` tuple: the HTML document and the decoded
        (unescaped) subject.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    with open(eml_path, 'rb') as f:
        msg = email.message_from_binary_file(f)

    subject = decode_email_header(msg.get('Subject', 'Sin asunto'))

    def describe_address(header_name):
        # "Name <addr>" when a display name exists, otherwise just the address.
        name, addr = parseaddr(msg.get(header_name, ''))
        if name:
            name = decode_email_header(name)
        return f"{name} <{addr}>" if name else addr

    from_info = describe_address('From')
    to_info = describe_address('To')
    date = msg.get('Date', '')

    def decode_part(part):
        # get_payload(decode=True) may return None; treat that as empty.
        payload = part.get_payload(decode=True) or b""
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='replace')
        except LookupError:
            # Declared charset is not a codec Python knows.
            return payload.decode('utf-8', errors='replace')

    body_html = ""
    body_text = ""

    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        content_type = part.get_content_type()
        content_disposition = str(part.get('Content-Disposition', '')).lower()

        # Only inline parts without a file name belong to the message body.
        if 'attachment' in content_disposition or part.get_filename():
            continue
        if content_type == 'text/html':
            body_html = decode_part(part)
        elif content_type == 'text/plain' and not body_html:
            body_text = decode_part(part)

    # Without an HTML part, turn the plain text into HTML.
    if not body_html and body_text:
        # Built outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        escaped_text = escape(body_text, quote=False).replace('\n', '<br>')
        body_html = f"<p>{escaped_text}</p>"
    elif not body_html:
        body_html = "<p>El correo no contiene contenido legible.</p>"

    # Header values come from the message, so escape them before embedding.
    safe_subject = escape(subject)
    safe_from = escape(from_info)
    safe_to = escape(to_info)
    safe_date = escape(str(date))

    # Complete HTML document with basic styling.
    email_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{safe_subject}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .header {{ border-bottom: 1px solid #ddd; padding-bottom: 10px; margin-bottom: 20px; }}
            .header-field {{ margin-bottom: 5px; }}
            .label {{ font-weight: bold; }}
        </style>
    </head>
    <body>
        <div class="header">
            <div class="header-field"><span class="label">Asunto:</span> {safe_subject}</div>
            <div class="header-field"><span class="label">De:</span> {safe_from}</div>
            <div class="header-field"><span class="label">Para:</span> {safe_to}</div>
            <div class="header-field"><span class="label">Fecha:</span> {safe_date}</div>
        </div>

        <div class="content">
            {body_html}
        </div>
    </body>
    </html>
    """

    return email_html, subject

def convert_eml_to_pdf(eml_path, output_dir=None):
    """Convert one EML file to a PDF.

    Args:
        eml_path: Path of the ``.eml`` file to convert.
        output_dir: Directory for the PDF. Defaults to the directory of
            ``eml_path``.

    Returns:
        The path of the generated PDF, or ``None`` when the conversion
        failed; failures are printed rather than raised.
    """
    if output_dir is None:
        output_dir = os.path.dirname(eml_path)

    try:
        # The subject is returned too but is not needed for the file name.
        html_content, _subject = extract_email_content(eml_path)

        # Strip only a trailing ".eml", in any letter case; a ".eml" that
        # appears elsewhere in the name is kept.
        base_filename = os.path.basename(eml_path)
        if base_filename.lower().endswith('.eml'):
            base_filename = base_filename[:-len('.eml')]
        pdf_path = os.path.join(output_dir, f"{base_filename}.pdf")

        if convert_html_to_pdf(html_content, pdf_path):
            print(f"Convertido: {eml_path} -> {pdf_path}")
            return pdf_path
        print(f"Error al generar PDF para {eml_path}")
        return None
    except Exception as e:
        # Batch boundary: report the failure and let the caller continue.
        print(f"Error al procesar {eml_path}: {e}")
        return None

def process_directory(directory_path):
    """Convert every ``.eml`` file directly inside a directory to PDF.

    Prints a message and returns early when the directory does not exist or
    holds no ``.eml`` files (extension matched case-insensitively).
    Otherwise each file is passed to ``convert_eml_to_pdf`` and a summary
    with success and error counts is printed.

    Args:
        directory_path: Directory to scan (not recursive); the PDFs are
            written into the same directory.
    """
    if not os.path.exists(directory_path):
        print(f"El directorio {directory_path} no existe.")
        return

    eml_files = []
    for entry in os.listdir(directory_path):
        if entry.lower().endswith('.eml'):
            eml_files.append(entry)

    if len(eml_files) == 0:
        print(f"No se encontraron archivos .eml en {directory_path}")
        return

    total = len(eml_files)
    print(f"Encontrados {total} archivos .eml")

    converted = 0
    failed = 0
    for eml_file in eml_files:
        source_path = os.path.join(directory_path, eml_file)
        try:
            # A falsy result means the conversion reported a failure.
            succeeded = bool(convert_eml_to_pdf(source_path, directory_path))
        except Exception as e:
            print(f"Error al procesar {eml_file}: {e}")
            succeeded = False
        if succeeded:
            converted += 1
        else:
            failed += 1

    print("\nResumen:")
    print(f"Total archivos procesados: {total}")
    print(f"Conversiones exitosas: {converted}")
    print(f"Errores: {failed}")

# Script entry point: convert the .eml files found in the hard-coded folder.
# NOTE(review): the path below is specific to one machine; adjust it before
# running this elsewhere.
if __name__ == "__main__":
    # Directory that holds the .eml files
    directory_path = r"C:\Users\osmarrincon\Downloads\ACTIVIDAD 12\TRASLADO EPS"
    process_directory(directory_path)