# Annotation C3M (e.g. on the LEDGAR dataset)
---
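Goal of the notebook: annotate each text of the dataset with the presence or absence of each concept, using a language model (C3M annotation).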

Inputs of the notebook:
- Dataset as pandas dataframe : with a column "text" and a "label" column
- Concepts list 

Output of the notebook:
- dataframe and dataloader of the augmented dataset with concepts


In [None]:
import sys
sys.path.append('../../run_experiments/')
sys.path.append('../../run_experiments/scripts')
sys.path.append('../../run_experiments/models')
sys.path.append('../../run_experiments/data')


import time
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset, Subset


# model for CBM
# import function for getting PLM and tokenizer
from models.utils import load_model_and_tokenizer

# import config 
from config_ledgar import Config

# library for managing memory RAM
import gc



In [None]:
#code for autoreload script associated with jupyter notebook
%load_ext autoreload
%autoreload 2

In [None]:
# Release PyTorch's cached GPU memory and run the Python garbage collector
# before the annotation model is loaded in the next cell.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Load the language model and tokenizer used to annotate the concepts.
import os

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Read the Hugging Face access token from the environment so that no secret
# has to be committed with the notebook; the placeholder is kept as fallback.
hf_token = os.environ.get("HF_TOKEN", "xxxxxxxxxxxxxxxx")

# `token` is the supported replacement for the deprecated `use_auth_token`.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it", token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",  # let accelerate place the weights on the available devices
    token=hf_token,
)


In [None]:
# Pick the compute device and report how many CUDA devices are visible.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(torch.cuda.device_count())

# The model is loaded with `device_map="auto"`, so its weights have already
# been placed on the available devices. Moving it again with `.to()` would
# undo that placement (or fail when some modules are offloaded), so only move
# models that carry no device map.
if getattr(model, "hf_device_map", None) is None:
    model.to(device)
model

In [None]:
#C3M

In [None]:
# Candidate concepts for each clause category, one concept per line.
Amendments_concepts = [
    'Modification rights',
    'Amendment procedures',
    'Notice requirements',
    'Approval mechanisms',
    'Integration with original agreement',
    'Format requirements',
    'Severability of amendments',
    'Retroactive application',
    'Waiver limitations',
    'Amendment thresholds',
    'Unilateral amendment rights',
    'Amendment restrictions',
    'Prior versions validity',
    'Amendment documentation',
    'Version control mechanisms',
    'Material change provisions',
]

Terminations_concepts = [
    'Termination rights',
    'Notice periods',
    'Termination for cause',
    'Termination for convenience',
    'Cure periods',
    'Effect of termination',
    'Wind-down procedures',
    'Return of property/information',
    'Early termination penalties',
    'Mutual termination provisions',
    'Partial termination rights',
    'Force majeure termination',
    'Change of control provisions',
    'Bankruptcy/insolvency triggers',
    'Performance-based termination',
    'Regulatory/legal change termination',
    'Exit assistance',
    'Termination certification requirements',
    'Post-termination restrictions',
    'Transition obligations',
]

Survival_concepts = [
    'Post-termination obligations',
    'Duration of surviving terms',
    'Identification of specific clauses',
    'Confidentiality persistence',
    'IP rights retention',
    'Indemnification continuation',
    'Limitation of liability',
    'Dispute resolution mechanisms',
    'Payment obligations survival',
    'Non-compete/non-solicitation persistence',
    'Representations/warranties survival',
    'Record-keeping requirements',
    'Audit rights continuation',
    'Escrow arrangements',
    'Insurance obligations',
    'Remedies availability post-termination',
    'Data protection obligations',
    'Perpetual rights',
    'Transitional licenses',
    'Legal compliance requirements',
]

Terms_concepts = [
    'Duration specifications',
    'Commencement date',
    'Expiration conditions',
    'Renewal mechanisms',
    'Term length',
    'Condition precedents',
    'Milestone-based periods',
    'Extension options',
    'Initial term vs. renewal term distinctions',
    'Notice requirements for non-renewal',
    'Evergreen provisions',
    'Term modification triggers',
    'Retroactive effective dates',
    'Trial/probation periods',
    'Minimum commitment periods',
    'Maximum term limitations',
    'Regulatory term constraints',
    'Performance-based extensions',
    'Phase-in schedules',
    'Renegotiation periods',
    'Term acceleration provisions',
    'Rolling term provisions',
]


In [None]:
# Map each clause category name to its list of concepts.
concepts_legdar = {
    'Amendments': Amendments_concepts,
    'Survival': Survival_concepts,
    'Terms': Terms_concepts,
    'Terminations': Terminations_concepts,
}

In [None]:
# Flatten the per-category concepts into one list (Amendments, Survival,
# Terms, Terminations, in that order) and show how many there are.
all_concepts_ledgar_list = [
    *Amendments_concepts,
    *Survival_concepts,
    *Terms_concepts,
    *Terminations_concepts,
]
len(all_concepts_ledgar_list)

In [None]:
# Instantiate the experiment configuration and copy its settings into
# notebook-level variables.
config = Config()

# Model, maximum sequence length and batch size
model_name, dim = config.model_name, config.dim
max_len, batch_size = config.max_len, config.batch_size

# Remaining settings read from the config
lambda_XtoC, is_aux_logits = config.lambda_XtoC, config.is_aux_logits
num_labels, num_epochs = config.num_labels, config.num_epochs
num_each_concept_classes = config.num_each_concept_classes
# data_type = config.data_type

# NOTE: this rebinds `device`, replacing the value derived from
# torch.cuda.is_available() earlier in the notebook.
device, SAVE_PATH = config.device, config.SAVE_PATH



In [None]:
# Build the data loaders and dataframes for the current configuration.
from data_ledgar import prepare_ledgar_data

# NOTE(review): the loaders are unpacked as train/test/val while the
# dataframes are unpacked as train/val/test; confirm this matches the return
# order of prepare_ledgar_data.
(
    train_loader,
    test_loader,
    val_loader,
    train_df,
    val_df,
    test_df,
) = prepare_ledgar_data(config)

In [None]:
# Show how many training rows carry each label.
train_df["label"].value_counts()

# annotation

In [None]:
from annotation_C3M import get_annotation

In [None]:
# Read the label mapping stored in the concepts folder and display it.
import os
import json

label_mapping_path = os.path.join(config.SAVE_PATH_CONCEPTS, "label_mapping.json")
with open(label_mapping_path, 'r') as mapping_file:
    labelling_json = json.load(mapping_file)
labelling_json

In [None]:
# Keep only the text and label columns of the test split, drop rows with
# missing values and renumber the remaining rows from 0.
# `reset_index` returns a new dataframe, so its result must be assigned;
# calling it without assignment leaves the original index in place.
df_final = test_df[["text", "label"]].dropna().reset_index(drop=True)

# Save the final dataframe
df_final.to_csv(f"{config.SAVE_PATH_CONCEPTS}/ledgar_test_saved.csv", index=False)


In [None]:
# Reload the saved test split and tag every row with the 'eval' state.
saved_test_csv = f'{config.SAVE_PATH_CONCEPTS}/ledgar_test_saved.csv'
df_final = pd.read_csv(saved_test_csv).assign(state='eval')
df_final

In [None]:
# Annotate every evaluation text with every concept, one concept at a time.
# For each concept a new column named after the concept is added, and the
# dataframe is written to disk after each concept so that a partial run
# still leaves a usable file.
df_final_C3M = df_final[df_final.state == 'eval'].copy()
annotated_csv_path = f'{config.SAVE_PATH_CONCEPTS}/df_with_topics_v4_C3M_test.csv'

for c in all_concepts_ledgar_list:
    print(c)
    # Raw model responses for this concept, in row order.
    concept_annotation_0 = [
        get_annotation(t, c, model, tokenizer)
        for t in tqdm(df_final_C3M.text, desc=f"Processing {c}")
    ]

    # Encode the responses: those mentioning 'detected' become 1, otherwise
    # those mentioning 'missing' become 0, and anything else is kept as the
    # raw response. `na=False` makes non-string responses count as no match.
    # The column is built as a standalone object Series and assigned in one
    # step, because chained assignment (`df[c][mask] = value`) is not
    # guaranteed to write back and is a no-op under pandas Copy-on-Write.
    raw_responses = pd.Series(
        concept_annotation_0, index=df_final_C3M.index, dtype=object
    )
    is_detected = raw_responses.str.contains('detected', na=False)
    is_missing = raw_responses.str.contains('missing', na=False) & ~is_detected

    encoded = raw_responses.copy()
    encoded.loc[is_detected] = 1
    encoded.loc[is_missing] = 0
    df_final_C3M[c] = encoded

    df_final_C3M.to_csv(annotated_csv_path)

In [None]:
# Display the raw annotations collected for the last concept processed by the
# annotation loop (the list is re-created at the start of each concept).
concept_annotation_0