## Date Extraction Using Llama LLM with Ollama: A Prompt Engineering Approach

In [1]:
from datasets import load_dataset
import ollama
import re
from ollama import chat

In [None]:
# Load the dataset
dataset = load_dataset("maribr/publication_dates_fr")
train_ds = dataset['train']

In [3]:
# Convert the train dataset to a DataFrame
df = train_ds.to_pandas()
print(df.head())

                                                Text Gold published date  \
0  PROCES-VERBAL DE LA REUNION PUBLIQUE\nDU CONSE...          16/01/2023   
1  CONSEIL COMMUNAUTAIRE DU\n25 JANVIER 2023\nPRO...          25/01/2023   
2  Date de mise en ligne de\nl’acte : 02/ 02/2023...          02/02/2023   
3  Envoyé en préfecture le 26/01/2023\nReçu en pr...          26/01/2023   
4       \nFait à Bourg-en-Bresse, le 23 janvier 2023          16/01/2023   

                                                 url  
0  http://www.ville-saint-ay.fr/userfile/fichier-...  
1  https://www.gatine-racan.fr/wp-content/uploads...  
2  https://www.ville-mazeres.fr/IMG/pdf/2023_1_1.pdf  
3  https://www.fier-et-usses.com/cms_viewFile.php...  
4  https://www.grandbourg.fr/cms_viewFile.php?idt...  


In [4]:
print(df['Text'][5])

SEANCE DU 27 FEVRIER 2023


In [5]:
def preprocess_french_text(text):
    """
    Extracts the 5 words before each French-style date in the text and returns the cleaned text.
    For None or empty inputs, returns a message indicating no input was found.
    """
    if text is None or text.strip() == "":  # Check for None or empty/whitespace strings
        return "no input is found, output none"

    text = text.lower()  # Convert text to lowercase

    # French month names (case-insensitive)
    months = r"(janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)"

    # Regex pattern for French dates (with case-insensitive flag for month names)
    date_pattern = fr"(\b\d{{1,2}}(?:er)?\s{months}\s\d{{4}})|(\b\d{{4}}-\d{{2}}-\d{{2}}\b)|(\b\d{{1,2}}/\d{{1,2}}/\d{{4}}\b)"

    # Find all matches
    matches = re.finditer(date_pattern, text, re.IGNORECASE)

    extracted_segments = []

    for match in matches:
        date = match.group(0)  # The date string
        start_idx = match.start()  # Start index of the date in the text

        # Extract 5 words before the date
        before = text[:start_idx].split()[-5:]  # Get the last 5 words before the date
        context = " ".join(before) + " " + date

        extracted_segments.append(context)

    # Join extracted contexts with newline or return None if no matches
    return "\n".join(extracted_segments) if extracted_segments else None

# Process the DataFrame
df['preprocessed_text'] = df['Text'].apply(preprocess_french_text)

In [6]:
print(df['preprocessed_text'][5])

seance du 27 fevrier 2023


In [7]:
responses = []

for text in df['preprocessed_text']:
    messages = [
        {
            'role': 'system',
            'content': (
                "you are an useful data assistant"
                "Your task is to identify the most probable publication date of online articles and output it in the format DD/MM/YYYY. "
                "If no date is found, return 'None' without any additional text, formatting, or explanation."
                "Do not output code, tags, or additional explanations. Only provide the date in the required format or 'None'."
                "Always output only one date."
                "example of date: 01/01/2022"
                "example : Fait à Bourg-en-Bresse, le 23 janvier 2023 output : 23/01/2023"
                "example : input : None output : None"
                "If multiple dates are present, choose the most probable publication date."
                "If the input is invalid or does not contain a valid publication date, output 'None'"
                "Avoid including any formatting, code, or explanations in the output."
            )
        },
        {
            'role': 'user',
            'content': text
        }
    ]
    response = chat(model='llama3.2', messages=messages)
    # put the extracted date in a new column
    responses.append(response['message']['content'])

In [8]:
print(responses[:5])

['16/01/2023', '25/01/2023', '13/01/2023', '26/01/2023', '23/01/2023']


In [9]:
# put the extracted date in a new column
df['extracted_date'] = responses

In [10]:
# Calculate accuracy
correct_predictions = (df['Gold published date'] == df['extracted_date']).sum()
total_predictions = len(df)
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy:.2%}")

Accuracy: 51.00%


In [11]:
print(correct_predictions)

255
