In [None]:
# Combining Milvus Data Upload and Web News Ingestion Scripts

# Required libraries installation
!pip install -q einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0 requests

In [None]:
# Importing necessary libraries
import os
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

In [None]:
# Base parameters for Milvus connection.
# Each value can be overridden with an environment variable of the same name, so
# real credentials do not have to be saved inside the notebook. The fallbacks are
# the values this notebook was originally written with.
MILVUS_HOST = os.environ.get("MILVUS_HOST", "vectordb-milvus.milvus.svc.cluster.local")
MILVUS_PORT = int(os.environ.get("MILVUS_PORT", "19530"))  # kept as an int, as before
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME", "root")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "Milvus")
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION", "ba_gov_noticias")

In [None]:
# News pages to ingest; every entry lives under the same /noticias/ path
news_urls = [
    f"https://www.ba.gov.br/noticias/{article_path}"
    for article_path in (
        "366144/governo-da-bahia-decreta-luto-oficial-de-2-dias-pelas-mortes-ocorridas-em-acidente-na-br116",
        "366130/governador-jeronimo-rodrigues-investe-mais-de-r-63-milhoes-em-seguranca-educacao-e-infraestrutura-em-paratinga",
        "366122/governo-do-estado-garante-realizacao-do-por-do-som-com-daniela-mercury-na-barra",
    )
]

In [None]:
# Function to fetch and process news articles
def fetch_news_articles(urls, timeout=10):
    """Download each URL and keep the pages that answered with HTTP 200.

    Args:
        urls: Iterable of page URLs to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            it a server that never answers would block the notebook forever.

    Returns:
        A list of ``{"url": ..., "content": ...}`` dicts, one per successful
        request, in the same order as ``urls``. ``content`` is the response
        body exactly as returned by ``response.text``. URLs that fail are
        reported with ``print`` and left out of the result.
    """
    articles = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Best effort: one unreachable page must not abort the whole batch.
            print(f"Error fetching {url}: {e}")
            continue

        if response.status_code == 200:
            # NOTE(review): this stores the page body unmodified, so any HTML
            # markup in it is passed on to whatever consumes "content".
            articles.append({"url": url, "content": response.text})
        else:
            print(f"Failed to fetch {url}: Status Code {response.status_code}")
    return articles

In [None]:
# Download the configured pages; failed URLs are simply absent from the result
news_articles = fetch_news_articles(news_urls)

if not news_articles:
    print("No news articles to process.")
else:
    print(f"Fetched {len(news_articles)} articles.")

    # Embedding function handed to the vector store
    embeddings = HuggingFaceEmbeddings(show_progress=True)

    # Open the target collection.
    # NOTE(review): drop_old=True presumably replaces an existing collection of
    # the same name on every run - confirm against the Milvus vector store docs.
    milvus_connection = {
        "host": MILVUS_HOST,
        "port": MILVUS_PORT,
        "user": MILVUS_USERNAME,
        "password": MILVUS_PASSWORD,
    }
    db = Milvus(
        embedding_function=embeddings,
        connection_args=milvus_connection,
        collection_name=MILVUS_COLLECTION,
        metadata_field="metadata",
        text_field="page_content",
        auto_id=True,
        drop_old=True,
    )

    # Split every article into overlapping chunks, tagging each with its source URL
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
    article_texts = []
    article_sources = []
    for article in news_articles:
        article_texts.append(article["content"])
        article_sources.append({"source": article["url"]})
    chunks = text_splitter.create_documents(article_texts, metadatas=article_sources)

    # Store the chunks in the collection
    db.add_documents(chunks)
    print("News articles added to Milvus successfully!")
