In [3]:
import requests

FLY_API_URL = "https://ivedras-topic-api.fly.dev/predict"

response = requests.post(
        FLY_API_URL,
        json={"text": 'Buraco na estrada'},
        timeout=60
    )
response.raise_for_status()
response.json()

{'label_id': 5, 'confidence': 0.14835049211978912}

In [2]:
text = "Há um buraco enormena estrada da Avenida 5 de Outubro"


In [18]:
import re

# Simple regex pattern to detect addresses like: Rua do Comércio 10, 2560-100 Torres Vedras
pattern = r'\b(?:Rua|Avenida|Travessa|Largo|Estrada|Praceta)\s+(?:[\w\s]+?),?\s*\d{1,4}(?:[A-Z]?)?,?\s*\d{4}-\d{3}\s+[\w\s]+'

matches = re.findall(pattern, text, re.IGNORECASE)
matches

[]

In [3]:
import spacy

nlp = spacy.load("pt_core_news_lg")
doc = nlp(text)

addresses = [ent.text for ent in doc.ents if ent.label_ in ["LOC", "GPE"]]

addresses

['Avenida 5 de Outubro']

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("lfcc/bert-portuguese-ner")
model = AutoModelForTokenClassification.from_pretrained("lfcc/bert-portuguese-ner")

# NER pipeline
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

entities = ner(text)
print(entities)
for entity in entities:
    if entity['entity_group'] in ['Local', 'ORG', 'PER']:  # Adjust as needed
        print(entity)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'Local', 'score': 0.32134798, 'word': 'Avenida', 'start': 33, 'end': 40}, {'entity_group': 'Organizacao', 'score': 0.37256727, 'word': '5', 'start': 41, 'end': 42}, {'entity_group': 'Local', 'score': 0.54296064, 'word': 'de', 'start': 43, 'end': 45}, {'entity_group': 'Organizacao', 'score': 0.3874629, 'word': 'Outubro', 'start': 46, 'end': 53}]
{'entity_group': 'Local', 'score': 0.32134798, 'word': 'Avenida', 'start': 33, 'end': 40}
{'entity_group': 'Local', 'score': 0.54296064, 'word': 'de', 'start': 43, 'end': 45}


In [5]:
import pandas as pd
from difflib import SequenceMatcher
import numpy as np


text = "Há um buraco enorme na estrada, da rua euvaristo silva, na urbanizacao do hilariao"


# Load the CSV with Torres Vedras addresses
addresses_df = pd.read_csv("moradas_torres_vedras.csv")

# Function to calculate similarity between two strings
def calculate_similarity(str1, str2):
    """Calculate similarity ratio between two strings"""
    # Convert to strings and handle NaN values
    str1 = str(str1) if pd.notna(str1) else ""
    str2 = str(str2) if pd.notna(str2) else ""
    return SequenceMatcher(None, str1.lower(), str2.lower()).ratio()

# Function to find the most similar address from the database
def find_similar_address(extracted_address, addresses_df, threshold=0.6):
    """
    Find the most similar address from the database
    
    Args:
        extracted_address: The address extracted by NER
        addresses_df: DataFrame with addresses from CSV
        threshold: Minimum similarity ratio (0-1)
    
    Returns:
        tuple: (best_match, similarity_score) or (None, 0) if no match above threshold
    """
    best_match = None
    best_score = 0
    
    for idx, row in addresses_df.iterrows():
        db_address = row['address']
        # Skip NaN values in the database
        if pd.isna(db_address):
            continue
        similarity = calculate_similarity(extracted_address, db_address)
        
        if similarity > best_score:
            best_score = similarity
            best_match = db_address
    
    if best_score >= threshold:
        return best_match, best_score
    else:
        return None, best_score

# Test with the extracted entities from previous cells
print("Testing similarity with extracted entities:")
print("-" * 50)

# Get entities from the NER model
print(text)
entities = ner(text)
extracted_addresses = []

for entity in entities:
    if entity['entity_group'] in ['Local', 'ORG', 'PER']:
        extracted_addresses.append(entity['word'])

print(entities)


doc = nlp(text)
spacy_addresses = [ent.text for ent in doc.ents if ent.label_ in ["LOC", "GPE"]]


# Combine all extracted addresses
all_extracted = list(set(extracted_addresses + spacy_addresses))

print(f"Total addresses to check: {len(all_extracted)}")
print(f"Addresses in database: {len(addresses_df)}")

# Check each extracted address
for addr in all_extracted:
    if addr.strip():  # Skip empty strings
        best_match, score = find_similar_address(addr, addresses_df, threshold=0.3)
        
        print(f"\nExtracted: '{addr}'")
        if best_match:
            print(f"Best match: '{best_match}' (similarity: {score:.2%})")
            print(f"Coordinates: {addresses_df[addresses_df['address'] == best_match][['latitude', 'longitude']].iloc[0].to_dict()}")
        else:
            print(f"No good match found (best score: {score:.2%})")



Testing similarity with extracted entities:
--------------------------------------------------
Há um buraco enorme na estrada, da rua euvaristo silva, na urbanizacao do hilariao
[{'entity_group': 'Pessoa', 'score': 0.7485047, 'word': 'euvaristo silva', 'start': 39, 'end': 54}, {'entity_group': 'Local', 'score': 0.6089806, 'word': 'hi', 'start': 74, 'end': 76}, {'entity_group': 'Local', 'score': 0.5012844, 'word': '##lar', 'start': 76, 'end': 79}]
Total addresses to check: 4
Addresses in database: 5400

Extracted: '##lar'
No good match found (best score: 10.34%)

Extracted: 'urbanizacao do hilariao'
Best match: 'Urbanização do Vale da Azenha
2560-510 SILVEIRA
Silveira, Torres Vedras, Lisboa' (similarity: 39.22%)
Coordinates: {'latitude': 'GPS: 39.114514', 'longitude': '-9.366964'}

Extracted: 'rua euvaristo silva'
Best match: 'Rua Eduardo Sacristão
2560-544 SILVEIRA
Silveira, Torres Vedras, Lisboa' (similarity: 40.00%)
Coordinates: {'latitude': 'GPS: 39.112200', 'longitude': '-9.363663'