In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import requests
import pandas as pd
from datetime import datetime
import os
import time

In [2]:
# Start a Chrome session. `driver` is the notebook-level WebDriver that
# get_table_data() reads as a global, so this cell must run before it.
driver = webdriver.Chrome()

In [3]:
# Load the page in the browser; get_table_data() later scrapes whatever
# table the driver is currently showing.
driver.get('https://rpachallengeocr.azurewebsites.net')

In [4]:
def get_table_data():
    """Scrape every page of the table currently shown by the global ``driver``.

    On each page all ``table tbody tr`` rows are read; rows with fewer than
    four ``td`` cells are skipped. Pagination then follows the ``a.next`` link
    until that link is absent or its ``class`` attribute contains
    ``disabled``.

    Returns:
        list[dict]: one dict per row with the keys ``'ID da Fatura'``,
        ``'Número da Fatura'`` and ``'Data da Fatura'`` (text of the first
        three cells) and ``'URL da Fatura'`` (``href`` of the first link in
        the fourth cell, or ``None`` when that cell contains no link).

    Raises:
        Any exception raised by the driver while reading the page or clicking
        the pagination link propagates to the caller, so a failed click can
        no longer end the scrape silently with a partial result.
    """
    records = []
    while True:
        for row in driver.find_elements(By.CSS_SELECTOR, 'table tbody tr'):
            cells = row.find_elements(By.TAG_NAME, 'td')
            if len(cells) < 4:  # only rows with at least 4 columns are read
                continue
            links = cells[3].find_elements(By.TAG_NAME, 'a')
            records.append({
                'ID da Fatura': cells[0].text,
                'Número da Fatura': cells[1].text,
                'Data da Fatura': cells[2].text,
                'URL da Fatura': links[0].get_attribute('href') if links else None,
            })

        # find_elements() returns an empty list when nothing matches, so a
        # missing "next page" link is detected without catching exceptions.
        next_links = driver.find_elements(By.CSS_SELECTOR, 'a.next')
        if not next_links:
            break  # no pagination link: this was the only/last page
        next_link = next_links[0]
        # get_attribute() may return None; treat that as "no classes".
        if 'disabled' in (next_link.get_attribute('class') or ''):
            break  # link is disabled: last page reached
        next_link.click()
        time.sleep(2)  # give the next page a moment to render

    return records

In [5]:
def download_invoices(data, download_path, timeout=30):
    """Download invoice files for the given rows into ``download_path``.

    A row is downloaded only when its ``'URL da Fatura'`` is truthy and its
    ``'Data da Fatura'`` (format ``%d-%m-%Y``) is not later than the current
    local time. Each file is saved as ``<ID da Fatura>.jpg``.

    Args:
        data: iterable of dicts with the keys ``'ID da Fatura'``,
            ``'Data da Fatura'`` and ``'URL da Fatura'``.
        download_path: target directory; created (with parents) if missing.
        timeout: seconds passed to ``requests.get`` so a stalled server
            cannot block the run indefinitely.

    Raises:
        ValueError: if a date string does not match ``%d-%m-%Y``.
        requests.HTTPError: if the server answers with an error status; the
            file is not written in that case.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(download_path, exist_ok=True)

    now = datetime.now()  # one reference time for the whole batch
    for item in data:
        url = item['URL da Fatura']
        if not url:
            continue
        if datetime.strptime(item['Data da Fatura'], '%d-%m-%Y') > now:
            continue

        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an HTTP error page as a .jpg file.
        response.raise_for_status()

        file_name = os.path.join(download_path, f"{item['ID da Fatura']}.jpg")
        with open(file_name, 'wb') as f:
            f.write(response.content)

In [None]:
def filter_data(data, cutoff_date):
    """Return the rows whose invoice date is on or before ``cutoff_date``.

    Args:
        data: iterable of dicts carrying a ``'Data da Fatura'`` string in
            ``%d-%m-%Y`` format.
        cutoff_date: inclusive upper bound, as a ``%d-%m-%Y`` string.

    Returns:
        list: the matching rows, in their original order.

    Raises:
        ValueError: if ``cutoff_date`` or a row date does not match the format.
    """
    date_format = '%d-%m-%Y'
    limit = datetime.strptime(cutoff_date, date_format)

    kept = []
    for invoice in data:
        invoice_date = datetime.strptime(invoice['Data da Fatura'], date_format)
        if invoice_date <= limit:
            kept.append(invoice)
    return kept

In [6]:
def save_to_csv(data, csv_path):
    """Write ``data`` to ``csv_path`` as CSV, without the DataFrame index.

    Args:
        data: anything ``pandas.DataFrame`` accepts (here: a list of dicts,
            one per invoice row).
        csv_path: destination file path.
    """
    pd.DataFrame(data).to_csv(csv_path, index=False)

In [7]:
# Scrape all table pages from the open browser into a list of row dicts.
data = get_table_data()

In [8]:
# Keep only rows dated on or before 01-08-2024 (day-month-year, inclusive).
filtered_data = filter_data(data, '01-08-2024')

In [9]:
# Save the invoice files for the filtered rows into the local 'faturas'
# directory (rows without a URL or dated in the future are skipped there).
download_invoices(filtered_data, 'faturas')

In [None]:
# Persist the filtered rows (including their URLs) to 'faturas.csv'.
save_to_csv(filtered_data, 'faturas.csv')