Bibliothèques à installer

In [None]:
!pip install pandas
!pip install scikit-learn
!pip install openpyxl
!pip install unidecode fuzzywuzzy[speedup]


Code pour trier les résultats

In [None]:
import json
import re


# Path of the raw extraction output that this cell sorts.
EXTRACTED_JSON = "extracted_contract_info.json"

# === Read the whole file as UTF-8 text and parse it as JSON ===
with open(EXTRACTED_JSON, encoding="utf-8") as source:
    data = json.loads(source.read())

# === Extract numeric part of ID for sorting ===
def extract_numeric_id(entry):
    """Return the sort key of an entry: the first run of digits in its identifier.

    The identifier is taken from ``entry["id"]``; when that is missing, ``None``
    or an empty string, ``entry["file_name"]`` is used instead.  Non-string
    identifiers (for example an integer id) are converted with ``str`` so they
    are handled the same way as textual ones.

    Args:
        entry: Mapping with an optional "id" and/or "file_name" key.

    Returns:
        The first group of digits as an ``int``, or ``float("inf")`` when the
        identifier holds no digit, so that such entries sort last.
    """
    raw_id = entry.get("id")
    # Only a truly absent id falls back to the file name; an id of 0 is valid.
    if raw_id is None or raw_id == "":
        raw_id = entry.get("file_name")
    id_str = "" if raw_id is None else str(raw_id)
    # Dropping ".pdf" before the search keeps the original behaviour of joining
    # digits that sit on both sides of the extension.
    id_str = id_str.replace(".pdf", "").strip().lower()
    match = re.search(r'\d+', id_str)
    return int(match.group()) if match else float("inf")  # sort non-numeric IDs last

# === Order a copy of the entries by their numeric ID (stable sort) ===
sorted_data = list(data)
sorted_data.sort(key=extract_numeric_id)

# === Persist the ordered entries as readable UTF-8 JSON ===
with open("sorted_by_numeric_id.json", "w", encoding="utf-8") as sink:
    sink.write(json.dumps(sorted_data, ensure_ascii=False, indent=2))

print(f"Sorted numerically by ID and saved {len(sorted_data)} entries to 'sorted_by_numeric_id.json'")


Evaluation and comparison 

In [None]:
import json
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from fuzzywuzzy import fuzz
import unidecode


# Inputs of the evaluation: the sorted extraction output and the labelled workbook.
EXTRACTED_JSON = "sorted_by_numeric_id.json"
GROUND_TRUTH_XLSX = "contracts_labelized_example.xlsx"

with open(EXTRACTED_JSON, encoding="utf-8") as source:
    extracted_data = json.loads(source.read())

extracted_df = pd.DataFrame(extracted_data)
ground_truth_df = pd.read_excel(GROUND_TRUTH_XLSX)

# Put the join key in the same textual form on both sides
# (string, surrounding whitespace removed, lower case).
# NOTE(review): astype(str) turns a missing id into the literal "nan" -
# confirm that both sources always carry an id.
for frame in (extracted_df, ground_truth_df):
    frame["id"] = frame["id"].astype(str).str.strip().str.lower()

# Prefix the extracted fields so they do not collide with the ground-truth
# columns of the same name once the two frames are merged.
prediction_columns = {
    "titre_contrat": "pred_titre_contrat",
    "pays_cible": "pred_pays_cible",
    "societe_exploitation": "pred_societe_exploitation"
}
extracted_df = extracted_df.rename(columns=prediction_columns)

# Keep only the contracts whose id appears in both the ground truth and the
# extraction output.
merged_df = pd.merge(ground_truth_df, extracted_df, on="id", how="inner")
print(f"Found {len(merged_df)} matched rows")

if merged_df.empty:
    print("No matching IDs found.")
    # Stop here by raising SystemExit directly: inside a Jupyter kernel `exit()`
    # requests a kernel shutdown instead of raising, so the remaining code
    # would still run on the empty frame.
    raise SystemExit

def normalize_text(s):
    """Normalise text for comparison: ASCII transliteration, lower case, trimmed.

    Args:
        s: A string, a list, or any other value.

    Returns:
        * for a string: its transliterated, lower-cased, stripped form;
        * for a list: the normalised form of its string items (items that are
          not strings are dropped);
        * for anything else: the value unchanged.
    """
    if isinstance(s, str):
        # Transliterate first: unidecode can emit upper-case letters and
        # trailing spaces of its own, which lower()/strip() must then clean up.
        return unidecode.unidecode(s).lower().strip()
    if isinstance(s, list):
        return [normalize_text(i) for i in s if isinstance(i, str)]
    return s

# Columns compared later on: each ground-truth field and its "pred_" counterpart.
text_columns = [
    "titre_contrat", "pred_titre_contrat",
    "societe_exploitation", "pred_societe_exploitation",
    "pays_cible", "pred_pays_cible",
]

# Normalise them one by one so both sides of every comparison share one form.
for col in text_columns:
    normalized_column = merged_df[col].apply(normalize_text)
    merged_df[col] = normalized_column


def fuzzy_match(str1, str2, threshold=85):
    """Tell whether two strings are similar enough to count as a match.

    Args:
        str1: First value to compare.
        str2: Second value to compare.
        threshold: Minimum ``fuzz.token_set_ratio`` score accepted as a match.

    Returns:
        True when both values are strings and their score reaches the
        threshold; False otherwise, including when either value is not a string.
    """
    both_are_text = isinstance(str1, str) and isinstance(str2, str)
    if both_are_text:
        return fuzz.token_set_ratio(str1, str2) >= threshold
    return False

def _row_matcher(true_col, pred_col):
    """Build a row-wise predicate comparing one ground-truth column to its prediction."""
    def _matches(row):
        return fuzzy_match(row[true_col], row[pred_col])
    return _matches


# One boolean column per field: does the prediction fuzzily match the label?
merged_df["titre_contrat_match"] = merged_df.apply(
    _row_matcher("titre_contrat", "pred_titre_contrat"), axis=1
)
merged_df["societe_exploitation_match"] = merged_df.apply(
    _row_matcher("societe_exploitation", "pred_societe_exploitation"), axis=1
)

# The mean of a boolean column is the share of matching rows.
titre_accuracy = merged_df["titre_contrat_match"].mean()
societe_accuracy = merged_df["societe_exploitation_match"].mean()

print(f"Accuracy (titre_contrat) with fuzzy matching: {titre_accuracy:.2f}")
print(f"Accuracy (societe_exploitation) with fuzzy matching: {societe_accuracy:.2f}")

# Alias -> canonical spelling for country names (keys are already normalised:
# lower case, no accents).  Names without an entry are used as they are.
# NOTE(review): "rdc" usually abbreviates "republique democratique du congo",
# which is a different country from "republique du congo" - confirm that this
# mapping matches the labelling convention of the ground truth.
country_map = {
    "republique d'afrique du sud": "afrique du sud",
    "iles vierges britanniques": "british virgin island",
    "british virgin island": "british virgin island",
    "rdc": "republique du congo",
    "republique du congo": "republique du congo",
}


def map_countries(lst):
    """Replace every country name of a list by its canonical spelling.

    Args:
        lst: List of normalised country names.

    Returns:
        A new list in which names found in ``country_map`` are replaced by
        their mapped value and other names are kept unchanged.  Anything that
        is not a list yields an empty list.
    """
    if isinstance(lst, list):
        canonical = []
        for name in lst:
            canonical.append(country_map.get(name, name))
        return canonical
    return []

def to_list(x):
    """Turn a cell value into a clean list of labels.

    Args:
        x: A comma-separated string, a list, or any other value.

    Returns:
        * for a string: its comma-separated parts, stripped;
        * for a list: its string items, stripped (other items are dropped);
        * for anything else (e.g. NaN or None): an empty list.
        In every case blank entries are removed, so a value such as ``[""]``
        yields an empty list instead of a list holding an empty label.
    """
    if isinstance(x, str):
        candidates = x.split(",")
    elif isinstance(x, list):
        candidates = [item for item in x if isinstance(item, str)]
    else:
        return []
    stripped = (item.strip() for item in candidates)
    # Same filtering for both input kinds: whitespace-only entries are not labels.
    return [item for item in stripped if item]

def _as_country_labels(value):
    """Convert one cell into its list of normalised, canonical country names."""
    return map_countries(normalize_text(to_list(value)))


merged_df["true_pays"] = merged_df["pays_cible"].apply(_as_country_labels)
merged_df["pred_pays"] = merged_df["pred_pays_cible"].apply(_as_country_labels)

# Only rows with at least one country on both sides take part in the country
# evaluation; an empty list is falsy, hence bool() as the test.
# NOTE(review): rows with an empty prediction are left out here, so they do
# not lower the reported scores - confirm that this is intended.
has_true = merged_df["true_pays"].apply(bool)
has_pred = merged_df["pred_pays"].apply(bool)
valid_df = merged_df[has_true & has_pred]

# Multi-label evaluation of the predicted countries against the ground truth.
if valid_df.empty:
    print("\nNo valid rows with non-empty 'pays_cible' and 'pred_pays_cible'.")
else:
    # Fit on the union of true and predicted labels: a binarizer fitted on the
    # true labels alone drops predicted countries it has never seen, so those
    # wrong predictions would not be counted as false positives.
    mlb = MultiLabelBinarizer()
    mlb.fit(list(valid_df["true_pays"]) + list(valid_df["pred_pays"]))
    y_true = mlb.transform(valid_df["true_pays"])
    y_pred = mlb.transform(valid_df["pred_pays"])

    try:
        # zero_division=0 reports 0 instead of warning for labels without
        # any true or predicted sample.
        report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0)
        print("\nClassification Report for 'pays_cible':")
        print(report)
    except ValueError as e:
        print("\nError generating classification report:", str(e))
