In [None]:
from glob import glob
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

# Raw HTML pages to parse. glob() yields matches in an arbitrary,
# filesystem-dependent order, so sort them to make the processing order
# (and therefore the order of rows in the output file) reproducible.
html_filenames = sorted(glob('../data/raw_html/*.html'))

In [None]:
def read_html(file):
    """Open an HTML file and return it parsed as a BeautifulSoup object.

    The file at path ``file`` is read as UTF-8 text and parsed with the
    'html.parser' backend.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        return BeautifulSoup(handle, 'html.parser')

In [None]:
def get_cards(html):
    """Return the <a> tags whose 'data-testid' attribute contains 'card-product'.

    Tags are returned in the order ``html.find_all('a')`` yields them; anchors
    without a 'data-testid' attribute are ignored.
    """
    def is_product_card(anchor):
        # The attribute must exist before its value can be inspected.
        if 'data-testid' not in anchor.attrs:
            return False
        return 'card-product' in anchor['data-testid']

    return list(filter(is_product_card, html.find_all('a')))

In [None]:
# Patterns used to locate the pieces of a product card. The first three are
# matched against CSS class names, the last one against text inside the card.
_PRICE_CLASS = re.compile(".*amount__large__price.*")
_FOOTER_CLASS = re.compile(".*product_cardProduct__footerInfo.*")
_SUBTITLE_CLASS = re.compile(".*Product__subtitle.*")
_HOT_SALE_TEXT = re.compile("Precio imbatible")

_DEFAULT_OUTPUT = '../data/dataset_autos_V2.jsonl'


def _parse_price(card):
    """Return the card's price as an int, or None if it is missing or not numeric."""
    span_price = card.find(class_=_PRICE_CLASS)
    # `.string` is None when the tag does not wrap exactly one text node.
    if span_price is None or span_price.string is None:
        return None
    try:
        return int(span_price.string.replace(',', '').strip())
    except ValueError:
        return None


def _parse_sucursal(card):
    """Return the branch name (text before the first '•' in the footer), or None."""
    footer = card.find(class_=_FOOTER_CLASS)
    if footer is None or footer.string is None:
        return None
    return footer.string.split('•')[0].strip()


def _parse_subtitle(subtitle):
    """Split a subtitle tag into (year, km, details, shift).

    The expected text looks like "2019 • 71,021 km • 2.0 EX AUTO • Automático".
    If the text is missing or any field cannot be parsed, all four values are
    returned as None.
    """
    unknown = (None, None, None, None)
    text = subtitle.string
    if text is None:
        return unknown
    parts = text.split('•')
    try:
        year = int(parts[0].strip())
        km = int(parts[1].lower().replace('km', '').replace(',', '').strip())
        details = parts[2].strip()
        shift = parts[3].strip()
    except (ValueError, IndexError):
        return unknown
    return year, km, details, shift


def _parse_card(card):
    """Build the output record for one product card, or None to skip the card.

    A card is skipped when it has no usable price or no subtitle element.
    Every field is computed from this card alone, so a missing footer yields
    ``sucursal = None`` instead of reusing a value from a previous card.
    """
    price = _parse_price(card)
    if price is None:
        return None

    subtitle = card.find(class_=_SUBTITLE_CLASS)
    if subtitle is None:
        return None
    year, km, details, shift = _parse_subtitle(subtitle)

    # The id is the last token of 'data-testid' once dashes are treated as spaces.
    card_id = card['data-testid'].replace('-', ' ').split()[-1]
    # 1 when the card contains the "Precio imbatible" banner text, else 0.
    hot_sale_flag = 1 if card.find(string=_HOT_SALE_TEXT) else 0

    return {
        "id": card_id,
        # .get() keeps a card without an href (slug None) instead of raising.
        "slug": card.get('href'),
        "sucursal": _parse_sucursal(card),
        "price": price,
        "year": year,
        "km": km,
        "caja": shift,
        "oferta": hot_sale_flag,
        "details": details,
    }


def main(output_path=_DEFAULT_OUTPUT, filenames=None):
    """Extract every product card from the HTML files into a JSON Lines file.

    Args:
        output_path: Destination file; it is overwritten. Defaults to
            '../data/dataset_autos_V2.jsonl'.
        filenames: Iterable of HTML file paths to process. Defaults to the
            module-level ``html_filenames`` list.

    One JSON object is written per card that has a parseable price and a
    subtitle element. The number of cards found in each file is printed.
    """
    if filenames is None:
        filenames = html_filenames

    with open(output_path, 'w', encoding='utf-8') as f:
        for file in filenames:
            html = read_html(file)
            cards = get_cards(html)
            print(len(cards), f"tarjetas encontradas en el html: {file}")

            for card in cards:
                record = _parse_card(card)
                if record is not None:
                    f.write(json.dumps(record) + '\n')
                


In [None]:
# Entry point: run the extraction only when this module is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()