In [None]:
import pandas as pd
from transformers import pipeline
import os

print("Initializing the legal-triplet-extractor model...")

# Read the access token from the environment so the secret is never stored in
# the notebook or its saved outputs. Set it before launching Jupyter, e.g.
#   export HF_TOKEN=hf_xxx
# When the variable is unset this is None, which is what `pipeline` receives
# when no token argument is given at all.
HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN")

# NOTE(review): the recorded run of this cell warns that 'score.weight' and
# 'score.bias' of GemmaForTokenClassification were newly initialised, i.e. the
# checkpoint ships no token-classification head. Predictions from the "ner"
# task may therefore come from untrained weights -- confirm that "ner" is the
# task this checkpoint was published for before relying on the output.
try:
    # Build the extraction pipeline; "simple" aggregation merges sub-word
    # tokens into grouped entities.
    triplet_extractor = pipeline(
        "ner",
        model="Exploration-Lab/legal-triplet-extractor",
        aggregation_strategy="simple",
        token=HUGGING_FACE_TOKEN,
    )
    print("Model initialized successfully!")
except Exception as e:
    print(f"Error initializing model: {e}")
    print("Please ensure your Hugging Face token is correct and has 'read' access to the model.")
    print("Also, confirm you have an active internet connection and the model name is correct.")
    # Re-raise instead of calling exit(): exit() stops the notebook kernel and
    # hides the traceback, while a raised exception halts "Run All" at this
    # cell and keeps the full error visible.
    raise

  from scipy.sparse import csr_matrix, issparse


Initializing the legal-triplet-extractor model...


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of GemmaForTokenClassification were not initialized from the model checkpoint at Exploration-Lab/legal-triplet-extractor and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda:0


Model initialized successfully!


In [None]:
# Source directory holding the per-case CSV files, and the directory that
# receives the processed copies.
input_folder = '../7 Dataset/prem_vs_conc_csv_files/all'
output_folder = './output_triplet_extraction'

# Create the destination (with any missing parents) only when nothing exists at
# that path yet; when it is already there, stay silent.
_output_folder_exists = os.path.exists(output_folder)
if not _output_folder_exists:
    os.makedirs(output_folder)
    print(f"Created output folder: {output_folder}")




Created output folder: ./output_triplet_extraction


In [None]:
# Run the triplet extractor over the 'text' column of every CSV in
# `input_folder` and write a copy of each file, with the results in a new
# 'dhananjay_events' column, to `output_folder` as "processed_<name>".
print(f"Starting to process CSV files from: {input_folder}")

# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and therefore the log) identical from run to run.
csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))

if not csv_files:
    print(f"No CSV files found in the input folder: {input_folder}. Please check the path and contents.")
else:
    for csv_file in csv_files:
        input_file_path = os.path.join(input_folder, csv_file)
        output_file_path = os.path.join(output_folder, f"processed_{csv_file}")

        print(f"\nProcessing file: {csv_file}")
        try:
            df = pd.read_csv(input_file_path)
            print(f"Loaded {len(df)} rows from {csv_file}")

            # Report a missing input column explicitly instead of letting the
            # KeyError surface as an opaque "unexpected error: 'text'".
            if 'text' not in df.columns:
                print(f"Skipping {csv_file}: no 'text' column found.")
                continue

            # Collect one result list per row and attach the column once at the
            # end, rather than writing list values into the frame cell by cell.
            # NOTE(review): the recorded run warns that the pipeline is being
            # called sequentially on GPU; batching could be faster but would
            # change the per-row error isolation below, so it is left as is.
            events_per_row = []
            for index, text_to_process in df['text'].items():
                if pd.isna(text_to_process):
                    print(f"Skipping row {index} in {csv_file} due to missing text.")
                    events_per_row.append([])
                    continue

                try:
                    events_per_row.append(triplet_extractor(text_to_process))
                except Exception as e:
                    # A failure on one row must not abort the rest of the file;
                    # record an empty result for that row and carry on.
                    print(f"Error extracting triplets for row {index} in {csv_file}: {e}")
                    events_per_row.append([])

            # dtype=object keeps each row's list as a single cell value.
            df['dhananjay_events'] = pd.Series(events_per_row, index=df.index, dtype=object)

            # NOTE(review): to_csv presumably stores each list as its string
            # form -- confirm downstream readers can parse that representation.
            df.to_csv(output_file_path, index=False)
            print(f"Processed and saved results to: {output_file_path}")

        except FileNotFoundError:
            print(f"Error: The file {input_file_path} was not found.")
        except pd.errors.EmptyDataError:
            print(f"Error: The file {input_file_path} is empty.")
        except Exception as e:
            print(f"An unexpected error occurred while processing {csv_file}: {e}")

print("\nAll CSV files processed!")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Starting to process CSV files from: ../7 Dataset/prem_vs_conc_csv_files/all

Processing file: R2021_World Duty Free v. Commission.csv
Loaded 147 rows from R2021_World Duty Free v. Commission.csv


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed and saved results to: ./output_triplet_extraction/processed_R2021_World Duty Free v. Commission.csv

Processing file: A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission.csv
Loaded 33 rows from A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission.csv
Processed and saved results to: ./output_triplet_extraction/processed_A2017_Ellinikos Chrysos AE Metalleion kai Viomichanias Chrysou v European Commission.csv

Processing file: R2013_Telefónica SA v European Commission.csv
Loaded 37 rows from R2013_Telefónica SA v European Commission.csv
Processed and saved results to: ./output_triplet_extraction/processed_R2013_Telefónica SA v European Commission.csv

Processing file: A2017_European Commission v Italian Republic_DT.csv
Loaded 52 rows from A2017_European Commission v Italian Republic_DT.csv
Processed and saved results to: ./output_triplet_extraction/processed_A2017_European Commission v Italian Republic_DT.csv

Proc