In [None]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import numpy as np
import torch

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # makes performance deterministic


set_seed()

In [None]:
%%capture
!pip install gliner

In [None]:
IS_KAGGLE = False
if IS_KAGGLE:
  base_fpath = "/kaggle/input/refugee-prwp/"
else:
  base_fpath = "/content/drive/MyDrive/colab-artifacts/refugee_data/INPUT_FIN/"

In [None]:
import json
refugee_synthetic_fpath = f"{base_fpath}/REFUGEE_DATA_SYNTHETIC_MERGED.json"

with open(refugee_synthetic_fpath, "r") as f:
    rf_data = json.load(f)

In [None]:
len(rf_data)

2217

In [None]:
rf_data[0]

{'text': 'The analysis focuses on the impact of logging activities on forest cover loss. To assess this, I utilize satellite imagery and ground truth data from the Global Forest Watch dataset, which provides comprehensive insights into deforestation rates over the last decade. Additionally, I incorporate a binary variable indicating whether a region has implemented conservation policies, which significantly influences forest preservation efforts. The findings suggest that areas with active conservation measures show a marked reduction in deforestation rates compared to those without such policies.',
 'entities': [{'text': 'Global Forest Watch', 'label': 'named dataset'}]}

In [None]:
import re
import spacy
def tokenize_text(text):
    """Tokenize the input text into a list of tokens."""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

def create_patterns(example):
    patterns = []
    for entity in example["entities"]:
        patterns.append({
            "label": entity["label"],
            "pattern": entity["text"]
        })
    return patterns

def adjust_entity_positions(text, tokens, entities):
    """Adjust entity positions based on tokenized text."""
    token_positions = []
    current_pos = 0
    for token in tokens:
        start_pos = text.find(token, current_pos)
        end_pos = start_pos + len(token) - 1
        token_positions.append((start_pos, end_pos))
        current_pos = end_pos + 1

    adjusted_entities = []
    for entity in entities:
        entity_start = text.find(entity["text"])
        entity_end = entity_start + len(entity["text"]) - 1
        start_token = next(i for i, (start, end) in enumerate(token_positions) if start <= entity_start <= end)
        end_token = next(i for i, (start, end) in enumerate(token_positions) if start <= entity_end <= end)
        adjusted_entities.append([start_token, end_token, entity["label"]])

    return adjusted_entities

def process_synthetic_data(example, use_spacy=True, debug=False):
    if use_spacy:
        nlp = spacy.blank("en")
        patterns = create_patterns(example)
        ruler = nlp.add_pipe("entity_ruler")
        ruler.add_patterns(patterns)

        # Process the text
        doc = nlp(example["text"])

        # Preparing the output format
        if debug:
            tokenized_text = [f"{i}: {token.text}" for i, token in enumerate(doc)]
        else:
            tokenized_text = [token.text for token in doc]

        ner = []
        for ent in doc.ents:
            start = ent.start
            end = ent.end - 1  # Adjusting end index to be inclusive, not +1
            ner.append([start, end, ent.label_])
    else:
        tokens = tokenize_text(example["text"])

        # Preparing the output format
        if debug:
            tokenized_text = [f"{i}: {token}" for i, token in enumerate(tokens)]
        else:
            tokenized_text = tokens

        # Adjust entity positions based on tokenized text
        ner = adjust_entity_positions(example["text"], tokens, example["entities"])

    return {"tokenized_text": tokenized_text, "ner": ner}

In [None]:
output = process_synthetic_data(rf_data[0], debug=True, use_spacy=False)
print(output)

{'tokenized_text': ['0: The', '1: analysis', '2: focuses', '3: on', '4: the', '5: impact', '6: of', '7: logging', '8: activities', '9: on', '10: forest', '11: cover', '12: loss', '13: .', '14: To', '15: assess', '16: this', '17: ,', '18: I', '19: utilize', '20: satellite', '21: imagery', '22: and', '23: ground', '24: truth', '25: data', '26: from', '27: the', '28: Global', '29: Forest', '30: Watch', '31: dataset', '32: ,', '33: which', '34: provides', '35: comprehensive', '36: insights', '37: into', '38: deforestation', '39: rates', '40: over', '41: the', '42: last', '43: decade', '44: .', '45: Additionally', '46: ,', '47: I', '48: incorporate', '49: a', '50: binary', '51: variable', '52: indicating', '53: whether', '54: a', '55: region', '56: has', '57: implemented', '58: conservation', '59: policies', '60: ,', '61: which', '62: significantly', '63: influences', '64: forest', '65: preservation', '66: efforts', '67: .', '68: The', '69: findings', '70: suggest', '71: that', '72: areas',

In [None]:
execute_pool = False

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
import concurrent.futures
from tqdm.auto import tqdm
import json
def process_synthetic_data_parallel(dt):
    return process_synthetic_data(dt, debug=False, use_spacy=True)
if execute_pool:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        outputs_parallel = list(tqdm(executor.map(process_synthetic_data_parallel, rf_data), total=len(rf_data), desc="Processing data in parallel"))
    print("\nUsing parallel processing with tqdm:")
    # Save the output to a JSON file
    with open(f'REFUGEE_SYNTHETIC_PROCESSED.json', 'w') as f:
        json.dump(outputs_parallel, f)
else:

    # Load the JSON file
    fpath_training_data = f"{base_fpath}/REFUGEE_SYNTHETIC_PROCESSED.json"
    with open(fpath_training_data, 'r') as f:
        outputs_parallel = json.load(f)

    # Print the loaded data
    print(outputs_parallel[0])

{'tokenized_text': ['The', 'analysis', 'focuses', 'on', 'the', 'impact', 'of', 'logging', 'activities', 'on', 'forest', 'cover', 'loss', '.', 'To', 'assess', 'this', ',', 'I', 'utilize', 'satellite', 'imagery', 'and', 'ground', 'truth', 'data', 'from', 'the', 'Global', 'Forest', 'Watch', 'dataset', ',', 'which', 'provides', 'comprehensive', 'insights', 'into', 'deforestation', 'rates', 'over', 'the', 'last', 'decade', '.', 'Additionally', ',', 'I', 'incorporate', 'a', 'binary', 'variable', 'indicating', 'whether', 'a', 'region', 'has', 'implemented', 'conservation', 'policies', ',', 'which', 'significantly', 'influences', 'forest', 'preservation', 'efforts', '.', 'The', 'findings', 'suggest', 'that', 'areas', 'with', 'active', 'conservation', 'measures', 'show', 'a', 'marked', 'reduction', 'in', 'deforestation', 'rates', 'compared', 'to', 'those', 'without', 'such', 'policies', '.'], 'ner': [[28, 30, 'named dataset']]}


In [None]:
len(outputs_parallel)

2217

In [None]:
from collections import Counter

label_counter = Counter()

for item in outputs_parallel:
    for ner in item.get('ner', []):
        label_counter[ner[2]] += 1

# Print sorted by frequency
for label, count in label_counter.most_common():
    print(f"{label}: {count}")

named dataset: 4675
unnamed dataset: 634
vague dataset: 316
citation data source: 88
document data source: 37


In [None]:
len(outputs_parallel)

2217

In [None]:
import json
from datetime import datetime
from pathlib import Path
from sklearn.model_selection import train_test_split

In [None]:
def save_data(data, file_path, overwrite):
    """Save data to a file, handling overwriting based on user preference."""
    path = Path(file_path)
    assets_dir = path.parent
    if not assets_dir.exists():
        assets_dir.mkdir(parents=True, exist_ok=True)

    if not overwrite and path.exists():
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        file_path = f"{path.stem}_{timestamp}{path.suffix}"

    with open(file_path, 'w') as f:
        json.dump(data, f)

    print(f"Data saved to {file_path}")
def convert(training_data, project_path="", train_split=0.8, eval_split=0.2, test_split=0.0,
            train_file="train.json", eval_file="eval.json", test_file="test.json",
            overwrite=True):
    """Process data and split into training, validation, and testing datasets."""
    #training_data = [process_example(example) for example in data]

    # Handle the data splitting
    if test_split > 0:
        train_val, test = train_test_split(training_data, test_size=test_split, random_state=42)
        save_data(test, Path(project_path, 'assets', test_file), overwrite)
    else:
        train_val = training_data

    eval_size = eval_split / (1 - test_split)  # Adjust eval size based on the remaining data
    train, val = train_test_split(train_val, test_size=eval_size, random_state=42)

    # Save the data
    save_data(train, Path(project_path, 'assets', train_file), overwrite)
    save_data(val, Path(project_path, 'assets', eval_file), overwrite)

    return training_data

In [None]:
# Convert and split the data into training, validation, and testing datasets
training_data = convert(outputs_parallel, project_path='', train_split=0.9, eval_split=0.1, test_split=0.0,
                        train_file='train.json', eval_file='eval.json', test_file='test.json', overwrite=True)
import pandas as pd
pd.read_json("assets/eval.json")[:5]

Data saved to assets/train.json
Data saved to assets/eval.json


Unnamed: 0,tokenized_text,ner
0,"[This, report, provides, an, overview, of, the...","[[10, 17, named dataset], [53, 53, named datas..."
1,"[Panel, A, :, Tax, Revenue, Panel, B, :, Expen...","[[121, 123, named dataset], [145, 147, named d..."
2,"[In, the, context, of, international, aid, ,, ...","[[25, 26, named dataset], [78, 79, named datas..."
3,"[In, the, realm, of, public, finance, ,, the, ...","[[27, 30, unnamed dataset], [51, 51, named dat..."
4,"[As, climate, change, continues, to, be, a, pr...","[[12, 15, named dataset]]"


In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import torch
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
data_model_id = "knowledgator/gliner-multitask-v1.0"
#data_model_id = "urchade/gliner_base"
model = GLiNER.from_pretrained(data_model_id)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

# Baseline Model Performance (Zero-Shot)

In [None]:
test_path = f"{base_fpath}/REFUGEE_TEST_DATA_HOLDOUT.json"
test_data = pd.read_json(test_path)

In [None]:
from collections import Counter

label_counter = Counter()

for item in test_data.to_dict(orient='records'):
    for ner in item.get('ner', []):
        label_counter[ner[2]] += 1

# Print sorted by frequency
for label, count in label_counter.most_common():
    print(f"{label}: {count}")
# Apply lowercase to the label part (last element of each inner list)
def lowercase_labels(entity_list):
    return [[start, end, label.lower()] for start, end, label in entity_list]

named dataset: 143
unnamed dataset: 23
vague dataset: 12
citation data source: 3


In [None]:
test_data = test_data.to_dict(orient='records')

In [None]:
baseline_labels = ['dataset', 'data']

In [None]:
test_preds_dt = []
for item in tqdm(test_data, desc="Annotated Evaluation"):
  text = " ".join(item['tokenized_text'])
  pred_dt = model.predict_entities(text, baseline_labels, flat_ner=True, threshold=0.5)
  test_preds_dt.append(pred_dt)

Annotated Evaluation:   0%|          | 0/77 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Helper function to get "|" pipe separated predictions and ground truth

In [None]:
def transform_ground_truth(gt_dict, allowed_labels=None):
    """
    Transforms a ground truth dictionary with 'tokenized_text' and 'ner' fields
    into a pipe-delimited string of extracted entity phrases, filtering by allowed labels.

    Parameters:
        gt_dict (dict): A dictionary with 'tokenized_text' and 'ner' (list of [start, end, label]).
        allowed_labels (set or list, optional): Labels to include, e.g., {"named dataset", "unnamed dataset"}.

    Returns:
        str: A string of extracted phrases separated by '|'.
    """
    tokens = gt_dict['tokenized_text']
    ner_entries = gt_dict['ner']
    phrases = []

    # Default to include all labels if not specified
    if allowed_labels is None:
        allowed_labels = {"named dataset", "unnamed dataset", "vague dataset"}

    for entry in ner_entries:
        start, end, label = entry
        if label in allowed_labels:
            phrase = " ".join(tokens[start:end+1])
            phrases.append(phrase)

    return "|".join(phrases)


def transform_predictions(preds):
    """
    Transforms a list of GLiNER prediction dictionaries into a
    pipe-delimited string of the extracted text phrases.

    Parameters:
        preds (list): A list of dictionaries, where each dictionary
                      represents a prediction with keys such as 'start' and 'text'.

    Returns:
        str: A string with each predicted phrase separated by a '|'.
    """
    # Sort predictions by the 'start' index to maintain order
    sorted_preds = sorted(preds, key=lambda x: x['start'])
    # Extract the text for each prediction
    phrases = [pred['text'] for pred in sorted_preds]
    # Join the phrases with a pipe symbol
    return "|".join(phrases)

In [None]:
test_predictions_dt = [transform_predictions(ep) for ep in test_preds_dt]
test_actual_dt = [transform_ground_truth(td) for td in test_data]

### Metrics (fBeta Score Calculation)

In [None]:
def get_precision_recall(tp, fp, fn):
    precision = tp / (tp+fp)
    recall = tp / (tp + fn)
    return precision, recall

def fbeta_score(precision, recall, beta):
    fbeta = (1+(beta*beta))*((precision*recall)/( (beta*beta*precision) + recall))
    return fbeta

def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def coleridge_initiative_jaccard(ground_truth, prediction, verbose=True):
    gts = ground_truth.split('|')
    pds = sorted(prediction.split('|'))
    if verbose:
        print("Ground truth : " , gts)
        print("Prediction : ", pds)

    js_scores = []
    cf_matrix = []

    #### Counting True Positives (TP) and False Positives (FP)

    for pd in pds:
        score = -1
        for gt in gts:
            js = jaccard(pd, gt)
            if js > score:
                score = js
        if score >= 0.5:
            js_scores.append(score)
            cf_matrix.append("TP")
        else:
            js_scores.append(score)
            cf_matrix.append("FP")


    #### Counting False Negatives (FN)

    for gt in gts:
        score = -1
        for pd in pds:
            js = jaccard(gt, pd)
            if js > score:
                score = js
        if score == 0:
            js_scores.append(score)
            cf_matrix.append("FN")

    return js_scores, " ".join(cf_matrix)
def evaluate_metrics(ground_truth, predictions):
  # Calculate overall counts for TP, FP, and FN across all samples
  overall_tp = 0
  overall_fp = 0
  overall_fn = 0

  # You can also optionally collect sample level metrics if needed
  for gt, pred in zip(ground_truth, predictions):
      js_scores, cf_matrix_str = coleridge_initiative_jaccard(gt, pred, verbose=False)
      cf_list = cf_matrix_str.split()
      tp = cf_list.count("TP")
      fp = cf_list.count("FP")
      fn = cf_list.count("FN")
      overall_tp += tp
      overall_fp += fp
      overall_fn += fn
      # You can print or log individual sample metrics here if desired
      # print("Sample TP:", tp, "FP:", fp, "FN:", fn)

  # Now, calculate overall precision, recall, and F1 score
  precision, recall = get_precision_recall(overall_tp, overall_fp, overall_fn)
  f1_score = fbeta_score(precision, recall, beta=1)

  return {
      "tp": overall_tp,
      "fp": overall_fp,
      "fn": overall_fn,
      "precision": precision,
      "recall": recall,
      "f1_score": f1_score
  }

In [None]:
evaluate_metrics(test_actual_dt, test_predictions_dt)

{'tp': 121,
 'fp': 17,
 'fn': 52,
 'precision': 0.8768115942028986,
 'recall': 0.6994219653179191,
 'f1_score': 0.7781350482315113}

# Pre-fine tuning using synthetic data

In [None]:
def load_json_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The specified file does not exist: {file_path}")
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

In [None]:
# use it for better performance, it mimics original implementation but it's less memory efficient
data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)
import warnings
warnings.filterwarnings("ignore")
train_dataset = load_json_data("assets/train.json")
test_dataset = load_json_data("assets/eval.json")

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# calculate number of epochs
num_steps = 500
batch_size = 8
data_size = len(train_dataset)
num_batches = data_size // batch_size
num_epochs = max(1, num_steps // num_batches)

training_args = TrainingArguments(
    output_dir="models",
    learning_rate=1e-5,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="cosine", #for prefinetuning
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    focal_loss_alpha=0.75,
    focal_loss_gamma=2,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    save_steps = 250,
    save_total_limit=10,
    dataloader_num_workers = 0,
    use_cpu = False,
    report_to="none",
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss,Validation Loss
500,20.2314,264.913971


TrainOutput(global_step=500, training_loss=20.231400390625, metrics={'train_runtime': 363.8482, 'train_samples_per_second': 10.966, 'train_steps_per_second': 1.374, 'total_flos': 0.0, 'train_loss': 20.231400390625, 'epoch': 2.0})

## Calculate performance for Pre-finetuning using Synthetic Data

In [None]:
labels = list(label_counter.keys())

In [None]:
test_preds_all = []
test_preds_dt = []
for item in tqdm(test_data, desc="Annotated Evaluation"):
  text = " ".join(item['tokenized_text'])
  pred_dt = model.predict_entities(text, ['named dataset', 'unnamed dataset', 'vague dataset'], flat_ner=True, threshold=0.5)
  pred_all = model.predict_entities(text, labels, flat_ner=True, threshold=0.5)
  test_preds_all.append(pred_all)
  test_preds_dt.append(pred_dt)
test_predictions_all = [transform_predictions(ep) for ep in test_preds_all]
test_predictions_dt = [transform_predictions(ep) for ep in test_preds_dt]

Annotated Evaluation:   0%|          | 0/77 [00:00<?, ?it/s]

In [None]:
test_actual_all = [transform_ground_truth(td, labels) for td in test_data]
test_actual_dt = [transform_ground_truth(td) for td in test_data]

In [None]:
test_actual_all[:5]

['HEIS|HEIS|HEIS',
 'Household , Income and Labour Dynamics in Australia ( HILDA )',
 'humanitarian profile data|IOM ’ s DTM',
 'Economic Freedom of the World ( EFW )|Polity IV dataset|Freedom House|Database of Political Institutions ( DPI )|JuriGlobe',
 'phone survey|Washington Group Short Set on Disability']

In [None]:
test_predictions_all[:5]

['HEIS|HEIS|HEIS',
 'Household , Income and Labour Dynamics in Australia ( HILDA ) survey',
 'humanitarian profile data|IOM ’ s DTM',
 'Economic Freedom of the World ( EFW ) index|Polity IV dataset|Freedom House|Database of Political Institutions ( DPI )|JuriGlobe',
 'implementing organization ’ s data|phone survey|Washington Group Short Set on Disability']

In [None]:
evaluate_metrics(test_actual_all, test_predictions_all)

{'tp': 176,
 'fp': 35,
 'fn': 23,
 'precision': 0.8341232227488151,
 'recall': 0.8844221105527639,
 'f1_score': 0.8585365853658536}

In [None]:
evaluate_metrics(test_actual_dt, test_predictions_dt)

{'tp': 175,
 'fp': 32,
 'fn': 22,
 'precision': 0.8454106280193237,
 'recall': 0.8883248730964467,
 'f1_score': 0.8663366336633662}

In [None]:
n_data = []
for x in test_actual_dt:
    n_data.append(len(set(x.split("|"))))

In [None]:
sum(n_data)

161

In [None]:
model_prefinetuned = "/content/drive/MyDrive/colab-artifacts/refugee_gliner_models/gliner_multitask_prefinetuned"

model.save_pretrained(model_prefinetuned)

In [None]:
from huggingface_hub import login
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [None]:
model.push_to_hub("rafmacalaba/gliner_multitask_prefinetuned_refugee-v1")

spm.model:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/rafmacalaba/gliner_multitask_prefinetuned_refugee-v1/commit/c92cfa94214e48d89da5a8e5a6b6acd1c7565d12', commit_message='Push model using huggingface_hub.', commit_description='', oid='c92cfa94214e48d89da5a8e5a6b6acd1c7565d12', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rafmacalaba/gliner_multitask_prefinetuned_refugee-v1', endpoint='https://huggingface.co', repo_type='model', repo_id='rafmacalaba/gliner_multitask_prefinetuned_refugee-v1'), pr_revision=None, pr_num=None)

In [None]:
del model

In [None]:
model = GLiNER.from_pretrained(model_prefinetuned, load_tokenizer=True, local_files_only=True)

config.json not found in /content/drive/MyDrive/colab-artifacts/refugee_gliner_models/gliner_multitask_prefinetuned


In [None]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# !rm -r /content/drive/MyDrive/colab-artifacts/refugee_gliner/models

# Fine Tune on Manually Annotated Rich Dataset

In [None]:
import json
ANNOTATED_DATA_TRAIN_PATH = f"{base_fpath}/REFUGEE_DATA_TRAIN.json"

with open(ANNOTATED_DATA_TRAIN_PATH, "r") as f:
    train_data = json.load(f)

In [None]:
train_data = pd.DataFrame(train_data)

In [None]:
len(train_data)

783

In [None]:
from collections import Counter

label_counter = Counter()

for item in train_data.to_dict(orient='records'):
    for ner in item.get('ner', []):
        label_counter[ner[2]] += 1

# Print sorted by frequency
for label, count in label_counter.most_common():
    print(f"{label}: {count}")

named dataset: 1497
unnamed dataset: 240
vague dataset: 72
citation data source: 25
document data source: 19


In [None]:
train_data = train_data.to_dict(orient='records')

In [None]:
# Convert and split the data into training, validation, and testing datasets
training_data = convert(train_data, project_path='', train_split=0.9, eval_split=0.1, test_split=0.0,
                        train_file='train_annot.json', eval_file='eval_annot.json', test_file='test.json', overwrite=True)
import pandas as pd
pd.read_json("assets/eval_annot.json")[:5]
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

Data saved to assets/train_annot.json
Data saved to assets/eval_annot.json


In [None]:
# use it for better performance, it mimics original implementation but it's less memory efficient
data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)
import warnings
warnings.filterwarnings("ignore")
train_dataset = load_json_data("assets/train_annot.json")
test_dataset = load_json_data("assets/eval_annot.json")

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# calculate number of epochs
num_steps = 500
batch_size = 4
data_size = len(train_dataset)
num_batches = data_size // batch_size
num_epochs = max(1, num_steps // num_batches)
#num_epochs = 4
training_args = TrainingArguments(
    output_dir="models",
    learning_rate=5e-6,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear", #fine tuning
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    focal_loss_alpha=0.75,
    focal_loss_gamma=2,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    save_steps = 250,
    save_total_limit=10,
    dataloader_num_workers = 0,
    use_cpu = False,
    report_to="none",
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

trainer.train()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss,Validation Loss


TrainOutput(global_step=352, training_loss=5.714207042347301, metrics={'train_runtime': 216.7043, 'train_samples_per_second': 6.497, 'train_steps_per_second': 1.624, 'total_flos': 0.0, 'train_loss': 5.714207042347301, 'epoch': 2.0})

In [None]:
test_preds_all = []
test_preds_dt = []
for item in tqdm(test_data, desc="Annotated Evaluation"):
  text = " ".join(item['tokenized_text'])
  pred_dt = model.predict_entities(text, ['named dataset', 'unnamed dataset', 'vague dataset'], flat_ner=True, threshold=0.5)
  pred_all = model.predict_entities(text, labels, flat_ner=True, threshold=0.5)
  test_preds_all.append(pred_all)
  test_preds_dt.append(pred_dt)
test_predictions_all = [transform_predictions(ep) for ep in test_preds_all]
test_predictions_dt = [transform_predictions(ep) for ep in test_preds_dt]

Annotated Evaluation:   0%|          | 0/77 [00:00<?, ?it/s]

In [None]:
evaluate_metrics(test_actual_all, test_predictions_all)

{'tp': 182,
 'fp': 41,
 'fn': 7,
 'precision': 0.8161434977578476,
 'recall': 0.9629629629629629,
 'f1_score': 0.883495145631068}

In [None]:
evaluate_metrics(test_actual_dt, test_predictions_dt)

{'tp': 176,
 'fp': 23,
 'fn': 10,
 'precision': 0.8844221105527639,
 'recall': 0.946236559139785,
 'f1_score': 0.9142857142857144}

In [None]:
model.push_to_hub("rafmacalaba/gliner_multitask_finetuned_refugee-v1")

spm.model:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/rafmacalaba/gliner_multitask_finetuned_refugee-v1/commit/860be6b70998b9ea9cb29b4fbfd0cbd79b359be2', commit_message='Push model using huggingface_hub.', commit_description='', oid='860be6b70998b9ea9cb29b4fbfd0cbd79b359be2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rafmacalaba/gliner_multitask_finetuned_refugee-v1', endpoint='https://huggingface.co', repo_type='model', repo_id='rafmacalaba/gliner_multitask_finetuned_refugee-v1'), pr_revision=None, pr_num=None)

In [None]:
test_actual_dt[:5]

['HEIS|HEIS|HEIS',
 'Household , Income and Labour Dynamics in Australia ( HILDA )',
 'humanitarian profile data|IOM ’ s DTM',
 'Economic Freedom of the World ( EFW )|Polity IV dataset|Freedom House|Database of Political Institutions ( DPI )|JuriGlobe',
 'phone survey|Washington Group Short Set on Disability']

In [None]:
test_predictions_dt[:5]

['HEIS|HEIS|HEIS',
 'Household , Income and Labour Dynamics in Australia ( HILDA ) survey',
 'humanitarian profile data|IOM ’ s DTM',
 'Economic Freedom of the World ( EFW ) index|Polity IV dataset|Freedom House|Database of Political Institutions ( DPI )|JuriGlobe',
 'phone survey|Washington Group Short Set on Disability']

In [None]:
test_actual_all[:5]

['HEIS|HEIS|HEIS',
 'Household , Income and Labour Dynamics in Australia ( HILDA )',
 'humanitarian profile data|IOM ’ s DTM',
 'Economic Freedom of the World ( EFW )|Polity IV dataset|Freedom House|Database of Political Institutions ( DPI )|JuriGlobe',
 'phone survey|Washington Group Short Set on Disability']

In [None]:
test_predictions_all[:5]

['HEIS|HEIS|HEIS',
 'Household , Income and Labour Dynamics in Australia ( HILDA ) survey',
 'humanitarian profile data|IOM ’ s DTM',
 'Economic Freedom of the World ( EFW ) index|Polity IV dataset|Freedom House|Database of Political Institutions ( DPI )|JuriGlobe',
 'implementing organization ’ s data|phone survey|Washington Group Short Set on Disability']

In [None]:
test_df = pd.DataFrame(test_data)

In [None]:
test_df

Unnamed: 0,tokenized_text,ner,validated
0,"[32, Appendix, A, ., Identifying, Afghan, Hous...","[[8, 8, named dataset], [8, 8, named dataset],...",True
1,"[8, analysis, with, data, from, the, Household...","[[6, 16, named dataset]]",True
2,"[,, but, data, are, not, comprehensive, and, n...","[[14, 16, unnamed dataset], [37, 40, named dat...",True
3,"[7, Figure, 1, ., Economic, Freedom, Score, ,,...","[[29, 36, named dataset], [44, 46, named datas...",True
4,"[β0, +, β1Ti, +, ϵi, ., Robust, standard, erro...","[[79, 80, vague dataset], [138, 143, named dat...",True
...,...,...,...
72,"[version, of, “, are, immigrants, good, or, ba...","[[33, 39, unnamed dataset], [49, 57, named dat...",True
73,"[regression, estimates, measuring, the, joint,...","[[78, 81, named dataset], [83, 88, named datas...",True
74,"[8, Finally, ,, we, also, apply, text, mining,...","[[111, 115, unnamed dataset], [154, 156, unnam...",True
75,"[15, Figure, 7, ., Evolution, in, the, Share, ...","[[23, 23, named dataset]]",True


In [None]:
test_df['validated'] = False
test_df['entities'] = test_preds_all

In [None]:
test_df.iloc[0]['entities']

[{'start': 49,
  'end': 53,
  'text': 'HEIS',
  'label': 'named dataset',
  'score': 0.7732076048851013},
 {'start': 88,
  'end': 92,
  'text': 'HEIS',
  'label': 'named dataset',
  'score': 0.7744329571723938},
 {'start': 424,
  'end': 428,
  'text': 'HEIS',
  'label': 'named dataset',
  'score': 0.6559640765190125}]

In [None]:
test_df

Unnamed: 0,tokenized_text,ner,validated,entities
0,"[32, Appendix, A, ., Identifying, Afghan, Hous...","[[8, 8, named dataset], [8, 8, named dataset],...",False,"[{'start': 49, 'end': 53, 'text': 'HEIS', 'lab..."
1,"[8, analysis, with, data, from, the, Household...","[[6, 16, named dataset]]",False,"[{'start': 30, 'end': 98, 'text': 'Household ,..."
2,"[,, but, data, are, not, comprehensive, and, n...","[[14, 16, unnamed dataset], [37, 40, named dat...",False,"[{'start': 75, 'end': 100, 'text': 'humanitari..."
3,"[7, Figure, 1, ., Economic, Freedom, Score, ,,...","[[29, 36, named dataset], [44, 46, named datas...",False,"[{'start': 137, 'end': 180, 'text': 'Economic ..."
4,"[β0, +, β1Ti, +, ϵi, ., Robust, standard, erro...","[[79, 80, vague dataset], [138, 143, named dat...",False,"[{'start': 247, 'end': 281, 'text': 'implement..."
...,...,...,...,...
72,"[version, of, “, are, immigrants, good, or, ba...","[[33, 39, unnamed dataset], [49, 57, named dat...",False,"[{'start': 241, 'end': 308, 'text': 'Developin..."
73,"[regression, estimates, measuring, the, joint,...","[[78, 81, named dataset], [83, 88, named datas...",False,"[{'start': 345, 'end': 373, 'text': '1928 Gree..."
74,"[8, Finally, ,, we, also, apply, text, mining,...","[[111, 115, unnamed dataset], [154, 156, unnam...",False,"[{'start': 638, 'end': 677, 'text': 'publicly ..."
75,"[15, Figure, 7, ., Evolution, in, the, Share, ...","[[23, 23, named dataset]]",False,"[{'start': 123, 'end': 128, 'text': 'SRHCS', '..."


In [None]:
test_df.to_csv(f"{base_fpath}/PREDICTIONS_REFUGEE_TEST_HOLDOUT.csv", index=False)