# PoC Technique — WiseAnalyse

# Step 1 - Data Engineering

In [None]:
%pip install pandas
%pip install sentence_transformers

In [None]:
import pandas as pd
from datetime import datetime
import re

In [None]:
# Load the ARIA export; the file is semicolon-separated.
aria = pd.read_csv("aria.csv", sep=";")
# Bare expression: rendered as the cell output when run in a notebook.
aria

In [None]:
# Load the EMARS records from their JSON export.
emars = pd.read_json("emars.json")
# Bare expression: rendered as the cell output when run in a notebook.
emars

Le schéma EMARS est déjà très complet, nous allons donc adapter ARIA à ce schéma.

## Adaptation au format EMARS

### Normalisation du secteur

In [None]:
# Normalize industry sectors between sources.
# Keys are the raw labels as they appear in the source data; values are the
# shared category names. Labels that describe the same sector must map to the
# same value, otherwise the merged dataset keeps them as distinct sectors.

industry_mapping = {
    'Oil refining': 'Oil Refining',
    'Raffinage de p√©trole': 'Oil Refining',
    'petroleum storage': 'Oil Storage',
    'Stockage hydrocarbures': 'Oil Storage',
    'Chemical manufacturing': 'Chemicals',
    'Chimie': 'Chemicals',
    # Both steel labels share one category (they previously diverged).
    'Steel production': 'Steel',
    'Sid√©rurgie': 'Steel',
    'Petrochemical': 'Petrochemical',
    'PETROCHIMIE': 'Petrochemical',
}

# Coverage check: report every raw sector label that has no entry in
# industry_mapping (candidates to add, possibly with the help of an LLM).

keys = set(industry_mapping)

aria_unmapped = set(aria["Activite"]) - keys
is_aria_subset = not aria_unmapped
if aria_unmapped:
    print("ARIA misses mapping for", aria_unmapped)

emars_unmapped = set(emars["industry_sector"]) - keys
is_emars_subset = not emars_unmapped
if emars_unmapped:
    print("EMARS misses mapping for", emars_unmapped)

def map_activite_to_sector(source_activite):
    """Return the normalized sector name for a raw sector label.

    Args:
        source_activite: Sector label as found in a source dataset.

    Returns:
        The value stored in ``industry_mapping`` for the label, or the label
        itself when it has no entry.
    """
    # Fall back to the input instead of raising KeyError: labels without a
    # mapping stay visible in the data, and values that are already
    # normalized pass through, so re-running the notebook cell does not crash.
    return industry_mapping.get(source_activite, source_activite)

# Rewrite the sector column of each source through map_activite_to_sector
# (ARIA keeps it in "Activite", EMARS in "industry_sector").
aria["Activite"] = aria["Activite"].apply(map_activite_to_sector)
emars["industry_sector"] = emars["industry_sector"].apply(map_activite_to_sector)


### Normalisation des dates

In [None]:
def parse_date_to_iso(source_date):
    """Convert a date string to the form ``YYYY-MM-DDT00:00:00Z``.

    Two input layouts are recognized: ``DD/MM/YYYY`` (chosen when the text
    contains "/") and ``YYYY-MM-DD`` (chosen when it contains "-").

    Args:
        source_date: Value to convert.

    Returns:
        The formatted string on success. Anything that is not a string, has
        neither separator, or does not parse with the selected layout is
        returned unchanged.
    """
    if not isinstance(source_date, str):
        return source_date

    if "/" in source_date:
        date_format = "%d/%m/%Y"
    elif "-" in source_date:
        date_format = "%Y-%m-%d"
    else:
        return source_date

    # Only a parse failure is expected here; catching ValueError alone keeps
    # KeyboardInterrupt and genuine programming errors visible.
    try:
        parsed = datetime.strptime(source_date, date_format)
    except ValueError:
        return source_date
    return parsed.strftime("%Y-%m-%dT00:00:00Z")
    
# Run parse_date_to_iso over the date column of each source.
aria["Date_Accident"] = aria["Date_Accident"].apply(parse_date_to_iso)
emars["occurrence_date"] = emars["occurrence_date"].apply(parse_date_to_iso)

### Extraire les casualties

In [None]:
# Debug preview: print how each "Consequences" cell breaks down on " - ".
for x in aria["Consequences"]:
    print(x.split(" - "))

def extract_casualities(line):
    """Extract fatality and injury counts from a "Consequences" cell.

    The text is split on " - " and every segment is examined on its own.
    A segment containing "Pas" is ignored. Otherwise the first integer found
    in the segment is stored as the injury count when the segment contains
    the injury keyword, and as the fatality count when it contains "mort".
    If several segments carry the same keyword, the last one wins.

    Args:
        line: Raw cell text. Any non-string value (for instance the NaN that
            pandas uses for an empty cell) is treated as "no information".

    Returns:
        dict with integer "fatalities" and "injuries", both defaulting to 0.
    """
    res = {
        "fatalities": 0,
        "injuries": 0,
    }
    if not isinstance(line, str):
        return res

    # (keyword searched in the segment, key updated in the result)
    keyword_to_field = (
        ("bless√©", "injuries"),
        ("mort", "fatalities"),
    )
    for message in line.split(" - "):
        if "Pas" in message:
            continue
        numbers = re.findall(r'\d+', message)
        if not numbers:
            continue
        for keyword, field in keyword_to_field:
            if keyword in message:
                res[field] = int(numbers[0])
    return res

# Store the dict returned by extract_casualities in a new "casualties" column.
aria["casualties"] = aria["Consequences"].apply(extract_casualities)
# Bare expression: rendered as the cell output when run in a notebook.
aria

### Extraire root_causes

In [None]:
# Debug preview: print every raw "Origine" value.
for x in aria["Origine"]:
    print(x)

def extract_root_causes(line):
    """Split an "Origine" cell into a list of root causes.

    Both " + " and " - " act as separators.

    Args:
        line: Raw cell text, an already-split list, or a missing value.

    Returns:
        list: the separated fragments for a string; the list itself when the
        value was already split; an empty list for any other value.
    """
    if isinstance(line, list):
        # Already processed, e.g. the notebook cell was run a second time.
        return line
    if not isinstance(line, str):
        # Missing cell (pandas yields NaN for empty cells): no known cause.
        return []
    return line.replace(" + ", " - ").split(" - ")

# Replace each "Origine" cell with the value returned by extract_root_causes.
aria["Origine"] = aria["Origine"].apply(extract_root_causes)

### Mapping des noms de colonnes

In [None]:
# Every ARIA record gets the country code "FR".
aria["country"] = "FR"

# ARIA column name -> column name used in the merged table.
aria_column_names = {
    "N¬∞ARIA": "report_id",
    "Resume": "description",
    "Date_Accident": "occurrence_date",
    "Activite": "industry_sector",
    "Origine": "root_causes",
}
aria_renamed = aria.rename(columns=aria_column_names)
aria_renamed

# Grouping

Dans le cadre du POC, nous allons partir du principe que :
- Le pays suffit à localiser l'incident
- Les substances et les impacts environnementaux ne sont pas importants
- ARIA ne contient pas de "Lessons_learned", mais nous gardons quand même celles présentes dans EMARS

In [None]:
# Columns that are not carried over into the merged table.
aria_dropped_columns = ["Commune", "Dept", "Consequences"]
aria_cleaned = aria_renamed.drop(columns=aria_dropped_columns)

# Add the remaining columns: two left empty, plus a provenance tag.
for column, value in (
    ("lessons_learned", None),
    ("seveso_tier", None),
    ("Source", "ARIA"),
):
    aria_cleaned[column] = value
aria_cleaned

In [None]:
# Drop the EMARS columns that are not kept in the merged table.
emars_cleaned = emars.drop(["region", "substance_involved", "event_type", "environmental_impact"], axis=1)
# Upper-case the tier labels. The .str accessor leaves missing values as
# missing, whereas apply(str.upper) raises TypeError on the first one.
emars_cleaned["seveso_tier"] = emars_cleaned["seveso_tier"].str.upper()
# Provenance tag for the merged table.
emars_cleaned["Source"] = "EMARS"
emars_cleaned

In [None]:
# Stack both sources. ignore_index gives every row its own label; without it
# the EMARS and ARIA labels both start at 0, and any label-aligned assignment
# on the result touches rows from both sources at once.
df = pd.concat([emars_cleaned, aria_cleaned], ignore_index=True)

# Expand each row's own "casualties" record into flat columns. Expanding
# aria['casualties'] alone would align on row labels and write ARIA counts
# onto the EMARS rows.
# NOTE(review): assumes EMARS stores casualties as a dict with "fatalities"
# and "injuries" keys, like the ARIA records - confirm against emars.json.
casualties = pd.DataFrame(
    [entry if isinstance(entry, dict) else {} for entry in df["casualties"]],
    index=df.index,
).reindex(columns=["fatalities", "injuries"])
for column in ("fatalities", "injuries"):
    extracted = casualties[column]
    if column in df.columns:
        # Keep a flat value a source already provides where no record exists.
        extracted = extracted.combine_first(df[column])
    df[column] = extracted
df = df.drop('casualties', axis=1)
df

### Pour aller plus loin, on pourrait :
- Traduire le français en anglais
- Catégoriser les root_causes, comme pour les industry_sector
- Retrouver les substances et les impacts environnementaux dans la description des événements ARIA via LLM

# Step 3 - Prototype RAG

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Snapshot of the merged dataset's column names, taken before any
# RAG-specific column is added.
input_columns = df.columns

def prepare_rag_data(df):
    """Return a copy of ``df`` with a ``rag_text`` column used for embedding.

    ``rag_text`` joins industry_sector, occurrence_date, country and
    description with " | " and ends with a trailing " | ". The columns
    root_causes, lessons_learned and Source are currently not part of the text.

    Args:
        df: DataFrame with the columns industry_sector, occurrence_date,
            country and description.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    df = df.copy()

    def text_or_empty(column):
        # One missing operand would turn the whole concatenation into NaN,
        # and that NaN would then be handed to the encoder as a "text".
        return df[column].fillna('')

    df['rag_text'] = (
        text_or_empty('industry_sector') + " | " +
        df['occurrence_date'].astype(str) + " | " +
        text_or_empty('country') + " | " +
        text_or_empty('description') + " | "
    )
    return df

def parse_client(df):
    """Adapt client incident records to the columns used by the RAG.

    Sets country to "FR" and industry_sector to "Petrochemical" for every
    row, derives occurrence_date from the "date" column through
    parse_date_to_iso, and copies root_cause_preliminary into root_causes.

    Args:
        df: Client records; must contain "date" and "root_cause_preliminary".

    Returns:
        The same DataFrame object, modified in place.
    """
    df["country"] = "FR"
    df['industry_sector'] = "Petrochemical"
    df["occurrence_date"] = df["date"].apply(parse_date_to_iso)
    df["root_causes"] = df["root_cause_preliminary"]
    # Default both counters the same way, so a missing "injuries" column is
    # reported as 0 like "fatalities" rather than ending up as None.
    for count_column in ("fatalities", "injuries"):
        if count_column not in df.columns:
            df[count_column] = 0
    return df

class HybridRAG:
    """Embedding-based search over public incident reports and client documents.

    Both datasets are turned into one text per row (``rag_text``), encoded
    with a sentence-transformers model, and stored in a single embedding
    matrix: public rows first, client rows after them.
    """

    def __init__(self, df_public, client_docs_path,
                 model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """Load the client documents and encode both datasets.

        Args:
            df_public: Merged public dataset.
            client_docs_path: Path of the client JSON file.
            model_name: Name passed to SentenceTransformer.
        """
        self.df_public = prepare_rag_data(df_public)
        self.model = SentenceTransformer(model_name)
        self.df_client = pd.read_json(client_docs_path)
        self.df_client['Source'] = 'CLIENT_INTERNE'
        self.df_client = parse_client(self.df_client)

        # Align on the columns of the dataset actually passed in, rather than
        # on a module-level snapshot that may describe another DataFrame.
        for col in df_public.columns:
            if col not in self.df_client.columns:
                print("Missing col in client data :", col)
                self.df_client[col] = None

        self.df_client = prepare_rag_data(self.df_client)

        all_texts = self.df_public['rag_text'].tolist() + self.df_client['rag_text'].tolist()
        self.all_embeddings = self.model.encode(all_texts, show_progress_bar=True)

        # Row positions below this value belong to the public dataset.
        self.public_count = len(self.df_public)

    def search(self, query, k=5):
        """Return the ``k`` rows most similar to ``query``, best first.

        Args:
            query: Free-text question.
            k: Maximum number of results.

        Returns:
            list of dict, one per hit, with the keys score (string with three
            decimals), source_type ("PUBLIC" or "CLIENT"), report_id, date,
            country, industry, description, root_causes, fatalities,
            injuries and source.
        """
        if k <= 0:
            return []

        query_emb = self.model.encode([query])
        similarities = cosine_similarity(query_emb, self.all_embeddings)[0]

        # Highest similarity first; only the k best are needed.
        top_indices = np.argsort(similarities)[::-1][:k]

        results = []
        for idx in top_indices:
            if idx < self.public_count:  # public dataset
                row = self.df_public.iloc[idx]
                source_type = "PUBLIC"
            else:  # client documents, stored after the public rows
                row = self.df_client.iloc[idx - self.public_count]
                source_type = "CLIENT"

            results.append({
                'score': f"{similarities[idx]:.3f}",
                'source_type': source_type,
                'report_id': row['report_id'],
                'date': row['occurrence_date'],
                'country': row['country'],
                'industry': row['industry_sector'],
                'description': row['description'],
                'root_causes': row['root_causes'],
                'fatalities': row['fatalities'],
                'injuries': row['injuries'],
                'source': row['Source'],
            })

        return results

def demo_hybrid_rag(df_public):
    """Build a HybridRAG on ``df_public`` and print the hits for sample questions.

    The client documents are read from "client-internal-incidents.json".
    For each question the five best results are printed with their score,
    source type, country, sector, date, description and casualty counts.
    """
    engine = HybridRAG(df_public, "client-internal-incidents.json")

    sample_questions = [
        "Quels accidents impliquant des fuites de gaz ont eu lieu dans le secteur petrochimique ?",
    ]
    separator = "-" * 80

    for number, question in enumerate(sample_questions, start=1):
        print(f"\nüìä QUESTION {number}: {question}")
        print(separator)

        hits = engine.search(question, k=5)

        for rank, hit in enumerate(hits, start=1):
            score = hit['score']
            origin = hit['source_type']
            print(f"{rank:2d}. [{score}] <{origin}>")
            print(f"    {hit['country']} | {hit['industry']} | {hit['date']}")
            print(f"    {hit['description']}...")
            print(f"       {hit['fatalities']} morts, {hit['injuries']} bless√©s")
            print()

# Run the demo on the merged public dataset `df`.
demo_hybrid_rag(df)