In [11]:
import re
import unicodedata
import pandas as pd

from datetime import datetime
from pathlib import Path

In [31]:
DATA_PATH = Path.cwd().parent / "data"

# Data Preprocessing 

In [34]:
def clean_text(text: str) -> str:
    """Clean and normalize French text."""
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters but keep French accents
    text = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ0-9\s.,/-]', ' ', text)
    
    # Normalize whitespace
    text = ' '.join(text.split())
    
    return text


def preprocess_dataset(input_file: str) -> pd.DataFrame:
    
    df = pd.read_csv(input_file)
    
    df = df.dropna(subset=['Text'], how='all')  
    df = df[df['Text'].str.strip() != '']
    
    # Clean the text column
    df['text'] = df['Text'].fillna('').apply(clean_text)
    
    # Standardize date format in gold_date column (assuming DD/MM/YYYY format)
    df['gold_date'] = pd.to_datetime(df['Gold published date'], format='%d/%m/%Y', errors='coerce')
    
    # Clean URLs
    df['URL'] = df['URL'].str.strip().str.lower()
    
    return df


def extract_date_from_url(url: str):
    """
    Extract the most complete and likely publication date from a URL.
    Prioritizes:
    1. Full dates (day/month/year) in filename
    2. Full dates in path
    3. Partial dates (year/month)
    """
    if not url or not isinstance(url, str):
        return None
    
    url = url.lower().strip()
    
    # First, look for dates in the filename (after last slash)
    filename = url.split('/')[-1]
    
    # Try to find a full date in the filename first
    filename_patterns = [
        # dd.mm.yyyy
        r'(?P<day>[0-3]\d)[.](?P<month>[01]\d)[.](?P<year>20\d{2})',
        # yyyy.mm.dd
        r'(?P<year>20\d{2})[.](?P<month>[01]\d)[.](?P<day>[0-3]\d)',
        # yyyy/mm/dd or yyyy-mm-dd
        r'(?P<year>20\d{2})[/-](?P<month>0[1-9]|1[0-2])[/-](?P<day>0[1-9]|[12]\d|3[01])',
        # dd/mm/yyyy or dd-mm-yyyy
        r'(?P<day>0[1-9]|[12]\d|3[01])[/-](?P<month>0[1-9]|1[0-2])[/-](?P<year>20\d{2})',
        # yyyy/mm
        r'(?P<year>20\d{2})[/-](?P<month>0[1-9]|1[0-2])',
        # Special case for year only
        r'/(?P<year>20\d{2})/'
    ]
    
    # Check filename first
    for pattern in filename_patterns:
        match = re.search(pattern, filename)
        if match:
            try:
                date_parts = match.groupdict()
                year = int(date_parts['year'])
                month = int(date_parts['month'])
                day = int(date_parts['day'])
                if 1 <= month <= 12 and 1 <= day <= 31:
                    return datetime(year, month, day)
            except (ValueError, KeyError):
                continue
    
    # If no valid date in filename, check the full URL
    full_patterns = [
        # Full dates first
        r'(?P<day>[0-3]\d)[.-](?P<month>[01]\d)[.-](?P<year>20\d{2})',
        r'(?P<year>20\d{2})[.-](?P<month>[01]\d)[.-](?P<day>[0-3]\d)',
        r'(?P<day>[0-3]\d)/(?P<month>[01]\d)/(?P<year>20\d{2})',
        r'(?P<year>20\d{2})/(?P<month>[01]\d)/(?P<day>[0-3]\d)',
        # Partial dates last
        r'(?P<year>20\d{2})[/-](?P<month>[01]\d)',
        r'/(?P<year>20\d{2})/'
    ]
    
    for pattern in full_patterns:
        match = re.search(pattern, url)
        if match:
            try:
                date_parts = match.groupdict()
                year = int(date_parts['year'])
                month = int(date_parts.get('month', 1))
                day = int(date_parts.get('day', 1))
                if 1 <= month <= 12 and 1 <= day <= 31:
                    return datetime(year, month, day)
            except (ValueError, KeyError):
                continue
    
    return None

In [35]:
df = preprocess_dataset(DATA_PATH / "raw" / "annotated_data.csv")
df = df.rename(columns={'URL': 'url'})
df['extracted_url_date'] = df['url'].apply(extract_date_from_url)
df

Unnamed: 0,Text,Gold published date,url,text,gold_date,extracted_url_date
20,VILLE DE SEVRAN\n\nDépartement de la\nSeine-Sa...,14/02/2023,,ville de sevran de partement de la seine-saint...,2023-02-14,NaT
21,* REPUBLIQUE FRANÇAISE\n\nDEPARTEMENT\ndu\nVAL...,17/02/2023,,republique franc aise departement du val d ois...,2023-02-17,NaT
22,PROCES VERBAL DU CONSEIL MUNICIPAL\nSÉANCE DU ...,10/02/2023,,proces verbal du conseil municipal se ance du ...,2023-02-10,NaT
23,Envoyé en préfecture le 10/03/2023\nReçu en pr...,10/03/2023,,envoye en pre fecture le 10/03/2023 rec u en p...,2023-03-10,NaT
24,CONVOCATION\n\nLe vingt-quatre janvier deux mi...,30/01/2023,,convocation le vingt-quatre janvier deux mille...,2023-01-30,NaT
...,...,...,...,...,...,...
495,PROJET DE RAPPORT\nD’ORIENTATIONS BUDGETAIRES\...,15/02/2024,https://www.estuaire-sillon.fr/fileadmin/media...,projet de rapport d orientations budgetaires 2...,2024-02-15,2024-01-01
496,Dépârtement de la COTE-D'OR\nCanton de TALANT\...,24/01/2024,https://plombieres-les-dijon.fr/wp-content/upl...,de pa rtement de la cote-d or canton de talant...,2024-01-24,2024-01-24
497,Spécial n° 10 de janvier 2024\nn° 2024 01 10\n...,09/01/2024,https://www.orne.gouv.fr/contenu/telechargemen...,spe cial n 10 de janvier 2024 n 2024 01 10 mar...,2024-01-09,NaT
498,RECUEIL DES ACTES\nADMINISTRATIFS SPÉCIAL\nN°8...,22/11/2022,https://www.vosges.gouv.fr/contenu/telechargem...,recueil des actes administratifs spe cial n 88...,2022-11-22,NaT


In [36]:
df.to_csv(DATA_PATH / "processed" / "processed_dates.csv", index=False)