In [None]:
# Colab-only: mount Google Drive so the dataset files referenced under
# /content/drive/MyDrive in the cells below are readable.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import glob

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps the
# order of the concatenated examples -- and therefore the positional 90/10 train/eval
# split made further down -- identical from run to run.
synth_paths = sorted(glob.glob("/content/drive/MyDrive/colab-artifacts/refugee_data/REFUGEE_RE/*.json"))

In [None]:
import os
import json
def load_json_data(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The specified file does not exist: {file_path}")
    # Decode explicitly as UTF-8 instead of relying on the platform's locale default,
    # so non-ASCII text in the examples is read the same way on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [None]:
# Concatenate the contents of every synthetic JSON file into one flat list.
# Each loaded value is passed to list.extend, so every file is expected to hold
# a JSON array of examples.
synthetic_refugee_re = []
for synth in synth_paths:
  synthetic_refugee_re.extend(load_json_data(synth))

In [None]:
len(synthetic_refugee_re)

816

In [None]:
synthetic_refugee_re[0]

{'text': "Recent studies on ocean currents have shown that the data from the Oceanographic Data Center indicates significant changes in marine biodiversity. The analysis of the 'Global Ocean Temperature Dataset' reveals that temperature fluctuations are impacting marine species distribution. In particular, the Pacific Ocean has been affected, with researchers noting shifts in species populations. The Oceanographic Data Center has published findings in 2021, highlighting these changes. Furthermore, surveys conducted on marine life in the region have shown alarming trends in species decline, particularly in the last decade.",
 'entities': [{'text': 'Oceanographic Data Center', 'label': 'named dataset'},
  {'text': 'Global Ocean Temperature Dataset',
   'label': 'Oceanographic Data Center <> description'},
  {'text': 'Pacific Ocean', 'label': 'Oceanographic Data Center <> geography'},
  {'text': 'Oceanographic Data Center',
   'label': 'Oceanographic Data Center <> publisher'},
  {'text':

In [None]:
import json

def convert_my_data_for_gliner(dataset):
    """Split mixed entity/relation annotations into separate entity and relation lists.

    Every input sample is a dict with a ``'text'`` string and an ``'entities'`` list of
    ``{'text': ..., 'label': ...}`` items. An item whose label has the form
    ``"<head text> <> <relation type>"`` encodes a relation whose tail is the item's
    own text; every other item is a plain entity.

    Args:
        dataset: Iterable of sample dicts as described above.

    Returns:
        A list with one dict per sample containing:
          * ``'text'``      -- the unchanged sample text;
          * ``'entities'``  -- dicts with ``'text'``, ``'label'`` and the character
            offsets ``'start'`` / ``'end'`` (end exclusive) of the FIRST occurrence
            of the entity text; entities whose text is not found are dropped;
          * ``'relations'`` -- dicts with ``'head'``, ``'tail'`` and ``'label'``.
    """
    out = []
    for sample in dataset:
        text = sample['text']
        ents, rels = [], []

        for item in sample['entities']:
            lbl = item['label']
            ent_text = item['text']
            if ' <>' in lbl:
                # Relation item. Prefer the full ' <> ' delimiter; fall back to ' <>'
                # so a label missing the trailing space (e.g. "Head <>rel") is still
                # parsed instead of raising ValueError on tuple unpacking.
                delimiter = ' <> ' if ' <> ' in lbl else ' <>'
                head_text, _, rel_type = lbl.partition(delimiter)
                rels.append({
                    'head': head_text,
                    'tail': ent_text,
                    'label': rel_type
                })
            else:
                # Entity item: anchor it at the first occurrence of its text.
                start = text.find(ent_text)
                if start == -1:
                    # The annotated text does not occur verbatim; skip it (best effort).
                    continue
                ents.append({
                    'text': ent_text,
                    'label': lbl,
                    'start': start,
                    'end': start + len(ent_text)
                })

        out.append({
            'text': text,
            'entities': ents,
            'relations': rels
        })
    return out

In [None]:
# NOTE: this rebinds `synthetic_refugee_re` from the raw records to the converted ones,
# so the cell is not idempotent. Running it a second time re-converts records whose
# entity labels no longer carry " <> " relations and produces empty 'relations' lists.
synthetic_refugee_re = convert_my_data_for_gliner(synthetic_refugee_re)

In [None]:
len(synthetic_refugee_re)

816

In [None]:
synthetic_refugee_re[0]

{'text': "Recent studies on ocean currents have shown that the data from the Oceanographic Data Center indicates significant changes in marine biodiversity. The analysis of the 'Global Ocean Temperature Dataset' reveals that temperature fluctuations are impacting marine species distribution. In particular, the Pacific Ocean has been affected, with researchers noting shifts in species populations. The Oceanographic Data Center has published findings in 2021, highlighting these changes. Furthermore, surveys conducted on marine life in the region have shown alarming trends in species decline, particularly in the last decade.",
 'entities': [{'text': 'Oceanographic Data Center',
   'label': 'named dataset',
   'start': 67,
   'end': 92},
  {'text': 'surveys conducted on marine life in the region',
   'label': 'unnamed dataset',
   'start': 493,
   'end': 539}],
 'relations': [{'head': 'Oceanographic Data Center',
   'tail': 'Global Ocean Temperature Dataset',
   'label': 'description'},
  

In [None]:
%%capture
# Environment setup: install GLiNER/accelerate and upgrade the Hugging Face libraries.
# Versions are not pinned, so results may differ as new releases appear.
!pip install gliner accelerate
!pip install --upgrade datasets huggingface_hub

In [None]:
synthetic_refugee_re[0]

{'text': "Recent studies on ocean currents have shown that the data from the Oceanographic Data Center indicates significant changes in marine biodiversity. The analysis of the 'Global Ocean Temperature Dataset' reveals that temperature fluctuations are impacting marine species distribution. In particular, the Pacific Ocean has been affected, with researchers noting shifts in species populations. The Oceanographic Data Center has published findings in 2021, highlighting these changes. Furthermore, surveys conducted on marine life in the region have shown alarming trends in species decline, particularly in the last decade.",
 'entities': [{'text': 'Oceanographic Data Center',
   'label': 'named dataset',
   'start': 67,
   'end': 92},
  {'text': 'surveys conducted on marine life in the region',
   'label': 'unnamed dataset',
   'start': 493,
   'end': 539}],
 'relations': [{'head': 'Oceanographic Data Center',
   'tail': 'Global Ocean Temperature Dataset',
   'label': 'description'},
  

In [None]:
import re

def tokenize_text(text):
    """Split ``text`` into tokens and report where each token sits in the string.

    A token is either a run of word characters (optionally chained with single
    ``-`` or ``_`` characters) or any single non-whitespace character.

    Returns:
        ``(tokens, spans)`` -- two parallel lists; ``spans[i]`` is the
        ``(start, end)`` character offset pair (end exclusive) of ``tokens[i]``.
    """
    tokens, spans = [], []
    for found in re.finditer(r"\w+(?:[-_]\w+)*|\S", text):
        tokens.append(found.group(0))
        spans.append(found.span())
    return tokens, spans

# ─── 2) Convert raw examples to token-level format

def convert_to_token_level(examples):
    """Convert character-offset annotations into token-index annotations.

    Args:
        examples: Iterable of dicts with a ``"text"`` string, an optional
            ``"entities"`` list (items with ``"text"``, ``"label"``, ``"start"``,
            ``"end"``; end exclusive) and an optional ``"relations"`` list (items with
            ``"head"``, ``"tail"``, ``"label"``; head/tail are mention texts).

    Returns:
        A list with one dict per example:
          * ``"tokenized_text"`` -- tokens produced by ``tokenize_text``;
          * ``"ner"`` -- ``[start_token, end_token, label]`` per entity (end inclusive);
          * ``"re"``  -- ``[head_start, head_end, tail_start, tail_end, label]`` per
            relation (token indices, ends inclusive).
        Entities and relations whose mentions cannot be mapped onto tokens are skipped.
    """
    token_level = []
    for ex in examples:
        text = ex["text"]
        tokens, offsets = tokenize_text(text)

        def char_to_token(cidx):
            """Index of the token covering character ``cidx``, or None."""
            for tidx, (s, e) in enumerate(offsets):
                if s <= cidx < e:
                    return tidx
            return None

        def token_span(start_char, end_char):
            """Inclusive (first, last) token indices, or None if a boundary hits no token."""
            first = char_to_token(start_char)
            last = char_to_token(end_char - 1)
            if first is None or last is None:
                return None
            return first, last

        # NER spans; also remember the span of each entity text for relation lookup.
        ner = []
        mention_spans = {}
        for ent in ex.get("entities", []):
            span = token_span(ent["start"], ent["end"])
            if span is None:
                continue
            ner.append([span[0], span[1], ent["label"]])
            mention_spans.setdefault(ent["text"], span)

        def resolve(mention):
            """Token span of a relation head/tail, located in the text when it is not an entity."""
            if mention not in mention_spans:
                # Relation tails (and some heads) are usually NOT listed among the
                # entities; looking only at entity texts would silently drop those
                # relations, so fall back to the first occurrence in the text.
                start = text.find(mention)
                mention_spans[mention] = (
                    token_span(start, start + len(mention)) if start >= 0 else None
                )
            return mention_spans[mention]

        re_triples = []
        for rel in ex.get("relations", []):
            head_span = resolve(rel["head"])
            tail_span = resolve(rel["tail"])
            if head_span is None or tail_span is None:
                continue
            re_triples.append([*head_span, *tail_span, rel["label"]])

        token_level.append({
            "tokenized_text": tokens,
            "ner": ner,
            "re": re_triples
        })
    return token_level

# Convert the synthetic examples to the token-level format (tokens + token-index spans).
token_level_data = convert_to_token_level(synthetic_refugee_re)

In [None]:
token_level_data[0]

{'tokenized_text': ['Recent',
  'studies',
  'on',
  'ocean',
  'currents',
  'have',
  'shown',
  'that',
  'the',
  'data',
  'from',
  'the',
  'Oceanographic',
  'Data',
  'Center',
  'indicates',
  'significant',
  'changes',
  'in',
  'marine',
  'biodiversity',
  '.',
  'The',
  'analysis',
  'of',
  'the',
  "'",
  'Global',
  'Ocean',
  'Temperature',
  'Dataset',
  "'",
  'reveals',
  'that',
  'temperature',
  'fluctuations',
  'are',
  'impacting',
  'marine',
  'species',
  'distribution',
  '.',
  'In',
  'particular',
  ',',
  'the',
  'Pacific',
  'Ocean',
  'has',
  'been',
  'affected',
  ',',
  'with',
  'researchers',
  'noting',
  'shifts',
  'in',
  'species',
  'populations',
  '.',
  'The',
  'Oceanographic',
  'Data',
  'Center',
  'has',
  'published',
  'findings',
  'in',
  '2021',
  ',',
  'highlighting',
  'these',
  'changes',
  '.',
  'Furthermore',
  ',',
  'surveys',
  'conducted',
  'on',
  'marine',
  'life',
  'in',
  'the',
  'region',
  'have',
  

In [None]:
# Positional 90/10 split with no shuffling: the first 90% of the examples go to
# training and the remainder to evaluation.
os.makedirs("assets", exist_ok=True)
split_idx = int(0.9 * len(token_level_data))

# Save as JSON array, not JSONL
with open("assets/train.json", "w") as ft:
    json.dump(token_level_data[:split_idx], ft)
with open("assets/eval.json", "w") as fe:
    json.dump(token_level_data[split_idx:], fe)

In [None]:

# 📌 STEP 4: Fine-Tune GLiNER Multitask (Joint NER + RE)
import torch
from gliner import GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollator

# Device and model setup: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Start from the pretrained multitask checkpoint and move it to the chosen device.
model = GLiNER.from_pretrained('knowledgator/gliner-multitask-v1.0').to(device)
# The saved examples carry 'tokenized_text', 'ner', and 're'.
# NOTE(review): confirm that the collator actually consumes 're' -- at inference time
# this notebook queries relations through predict_entities using
# "<head> <> <relation>" strings as entity labels.
data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)

# Load the train/eval splits written to assets/ earlier in the notebook.
train_dataset = load_json_data('assets/train.json')
eval_dataset = load_json_data('assets/eval.json')

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/18.6k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.64M [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

In [None]:
train_dataset[0]

{'tokenized_text': ['Recent',
  'studies',
  'on',
  'ocean',
  'currents',
  'have',
  'shown',
  'that',
  'the',
  'data',
  'from',
  'the',
  'Oceanographic',
  'Data',
  'Center',
  'indicates',
  'significant',
  'changes',
  'in',
  'marine',
  'biodiversity',
  '.',
  'The',
  'analysis',
  'of',
  'the',
  "'",
  'Global',
  'Ocean',
  'Temperature',
  'Dataset',
  "'",
  'reveals',
  'that',
  'temperature',
  'fluctuations',
  'are',
  'impacting',
  'marine',
  'species',
  'distribution',
  '.',
  'In',
  'particular',
  ',',
  'the',
  'Pacific',
  'Ocean',
  'has',
  'been',
  'affected',
  ',',
  'with',
  'researchers',
  'noting',
  'shifts',
  'in',
  'species',
  'populations',
  '.',
  'The',
  'Oceanographic',
  'Data',
  'Center',
  'has',
  'published',
  'findings',
  'in',
  '2021',
  ',',
  'highlighting',
  'these',
  'changes',
  '.',
  'Furthermore',
  ',',
  'surveys',
  'conducted',
  'on',
  'marine',
  'life',
  'in',
  'the',
  'region',
  'have',
  

In [None]:
# Derive the number of epochs from a target number of optimizer steps.
num_steps = 500
batch_size = 8
data_size = len(train_dataset)
# max(1, ...) keeps the division below from raising ZeroDivisionError when the
# dataset holds fewer examples than one batch.
num_batches = max(1, data_size // batch_size)
num_epochs = max(1, num_steps // num_batches)

training_args = TrainingArguments(
    output_dir="models",
    learning_rate=1e-5,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="cosine", #for prefinetuning
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    focal_loss_alpha=0.75,
    focal_loss_gamma=2,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    # With eval_strategy="steps" and no explicit interval, logging and evaluation use
    # the default 500-step interval, which a run of roughly 500 steps never reaches --
    # the loss table then stays empty. Log and evaluate every 50 steps instead.
    logging_steps=50,
    eval_steps=50,
    save_steps = 250,
    save_total_limit=10,
    dataloader_num_workers = 0,
    use_cpu = False,
    report_to="none",
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss,Validation Loss


TrainOutput(global_step=460, training_loss=18.423414147418477, metrics={'train_runtime': 321.5538, 'train_samples_per_second': 11.413, 'train_steps_per_second': 1.431, 'total_flos': 0.0, 'train_loss': 18.423414147418477, 'epoch': 5.0})

In [None]:
# ─── New Sample Text ─────────────────────────────────────────────────────────
second_sample_text = (
'''9 and the Syrian crisis , the UNHCR and the World Food Program ( WFP ) have been conducting a variety of surveys as well as extensive home visits that allowed researchers to analyze refugee conditions as had never been done before . The paper uses two data sets : the Jordan proGres registration system ( PG for short ) as of December 2014 and the Jordan Home Visits survey , round II data ( HV for short ) collected between November 2013 and September 2014 . Both data sets were provided by the UNHCR in the context of the joint World Bank-UNHCR study on the welfare of Syrian refugees ( Verme et al . , 2016 ) . These comprehensive data sets have the distinct advantage that they can be linked by a common identification number . We can therefore trace the same individuals and households across the two sources of data . The proGres registration system is what we consider the “ census ” of refugees . This data set has no information on consumption but contains socio-economic characteristics for all registered individuals and households . Variables available in the PG data include , among others'''
)

# ─── 1) NER on the new text ────────────────────────────────────────────────────
entity_labels = ["named dataset", "unnamed dataset", "vague dataset"]

ner_preds2 = model.predict_entities(
    second_sample_text,
    labels=entity_labels,
    threshold=0.5,
    flat_ner=True,
    multi_label=True,
)

print("🔍 NER on second sample:")
for p in ner_preds2:
    print(f" • {p['text']!r} → {p['label']} (score {p['score']:.2f})")

# ─── 2) Build RE prompts ──────────────────────────────────────────────────────
relation_types = ["description", "geography", "publisher", "publication year", "reference year"]
# De-duplicate the predicted heads while keeping first-seen order. A set would order the
# strings differently from one kernel to the next (string hashing is randomized), which
# changes the order of the labels handed to the model.
heads2 = list(dict.fromkeys(p['text'] for p in ner_preds2))

# One "<head> <> <relation>" label per (head, relation type) pair.
re_labels2 = [f"{head} <> {rel}" for head in heads2 for rel in relation_types]

# ─── 3) Relation Extraction ──────────────────────────────────────────────────
# Relations are queried as entity labels; when no head was detected there is nothing
# to ask for, so skip the model call instead of passing an empty label list.
if re_labels2:
    re_preds2 = model.predict_entities(
        second_sample_text,
        labels=re_labels2,
        threshold=0.5,
        flat_ner=False,
        multi_label=True,
    )
else:
    re_preds2 = []

print("\n🔗 RE on second sample:")
for p in re_preds2:
    # The relation type is the last component, so split from the right: a head text
    # that itself contains " <> " must not break the two-value unpacking.
    head, rel = p["label"].rsplit(" <> ", 1)
    tail = p["text"]
    print(f" • ({head!r}) —[{rel}]→ ({tail!r})  (score {p['score']:.2f})")


🔍 NER on second sample:
 • 'proGres registration system' → named dataset (score 0.66)
 • 'Home Visits survey' → named dataset (score 0.53)

🔗 RE on second sample:
 • ('proGres registration system') —[geography]→ ('Jordan')  (score 0.63)
 • ('Home Visits survey') —[geography]→ ('Jordan')  (score 0.50)
 • ('Home Visits survey') —[description]→ ('round II')  (score 0.58)
 • ('Home Visits survey') —[description]→ ('round II data')  (score 0.51)
 • ('proGres registration system') —[description]→ ('census')  (score 0.53)


## Fine-tune on annotated data

In [None]:
import pandas as pd
# CSV with the annotated examples; the following cells use its 'text' and 'entities' columns.
train_data_path = "/content/drive/MyDrive/colab-artifacts/refugee_data/REFUGEE_RE/150idx_train_data.csv"

train_data = pd.read_csv(train_data_path)

In [None]:
train_data = train_data[['text', 'entities']].to_dict(orient='records')

In [None]:
import ast

# The CSV stores every 'entities' list as the text of a Python literal; parse it back
# into a real list. Values that are no longer strings (already parsed) are passed
# through unchanged, so re-running this cell does not crash in ast.literal_eval.
train_data = [
    { **entry,
      "entities": (
          ast.literal_eval(entry["entities"])
          if isinstance(entry["entities"], str)
          else entry["entities"]
      )
    }
    for entry in train_data
]

In [None]:
train_data_converted = convert_my_data_for_gliner(train_data)

In [None]:
train_data_converted[0]

{'text': 'displacement has been decreasing slowly in conjunction with a softening of conﬂict intensity and violence . 8 Data from RUV also suggests that the cumulative population of forced migrants is balanced in terms of gender ( 51 percent women ) and that most are of working age . Colombia ’ s forced migrants are also young . In particular , 39 percent of forcefully displaced individuals were 15 years old or younger at the time of displacement , this percentage is disproportionately larger than this age group within the population of Colombia as a whole ( 28 percent ) . Indeed , 15 . 5 percent of forced migrants were younger than 5 years of age at the time of migration . Households also tend to be bigger as several members of the extended family tend to live together to save on housing costs ( Unidad para la Atenci ´ on y Reparaci ´ on de V ´ ıctimas , 2013 ) . Previous studies , using surveys given to migrants who were forcefully displaced , also report that this population has low

In [None]:
def convert_to_token_level(examples):
    """Turn character-level entity/relation annotations into token-index annotations.

    For every example the text is tokenized with ``tokenize_text``; each mention
    (entity text, relation head, relation tail) is then mapped to an inclusive
    ``(first_token, last_token)`` pair. Entity mentions use the ``start``/``end``
    offsets of the first entity carrying that text; any other mention is located
    at its first occurrence in the text. Mentions that cannot be mapped are dropped.

    Returns:
        A list of dicts with ``"tokenized_text"``, ``"ner"``
        (``[start_tok, end_tok, label]``) and ``"re"``
        (``[head_start, head_end, tail_start, tail_end, label]``).
    """
    converted = []
    for ex in examples:
        text = ex["text"]
        tokens, offsets = tokenize_text(text)
        entities = ex.get("entities", [])
        relations = ex.get("relations", [])

        def char_to_token(cidx):
            # Index of the first token whose [start, end) range covers cidx, else None.
            return next(
                (tidx for tidx, (s, e) in enumerate(offsets) if s <= cidx < e),
                None,
            )

        # First annotated entity for each distinct surface text.
        first_entity = {}
        for ent in entities:
            first_entity.setdefault(ent["text"], ent)

        # Every text that needs a token span: entity texts plus relation heads/tails.
        mentions = set(first_entity)
        for rel in relations:
            mentions.update((rel["head"], rel["tail"]))

        # Mention text -> inclusive (first_token, last_token).
        token_spans = {}
        for mention in mentions:
            anchor = first_entity.get(mention)
            if anchor:
                start_char, end_char = anchor["start"], anchor["end"]
            else:
                # Not an annotated entity: fall back to the first textual occurrence.
                start_char = text.find(mention)
                if start_char < 0:
                    continue
                end_char = start_char + len(mention)
            first_tok = char_to_token(start_char)
            last_tok = char_to_token(end_char - 1)
            if first_tok is not None and last_tok is not None:
                token_spans[mention] = (first_tok, last_tok)

        # NER rows come only from the original entities, in their original order.
        ner = [
            [*token_spans[ent["text"]], ent["label"]]
            for ent in entities
            if ent["text"] in token_spans
        ]

        # RE rows need both endpoints resolved.
        re_triples = []
        for rel in relations:
            head_span = token_spans.get(rel["head"])
            tail_span = token_spans.get(rel["tail"])
            if head_span is None or tail_span is None:
                continue
            re_triples.append(
                [head_span[0], head_span[1], tail_span[0], tail_span[1], rel["label"]]
            )

        converted.append({
            "tokenized_text": tokens,
            "ner": ner,
            "re": re_triples
        })
    return converted

In [None]:
train_data_token_level = convert_to_token_level(train_data_converted)

In [None]:
train_data_token_level[0]

{'tokenized_text': ['displacement',
  'has',
  'been',
  'decreasing',
  'slowly',
  'in',
  'conjunction',
  'with',
  'a',
  'softening',
  'of',
  'conﬂict',
  'intensity',
  'and',
  'violence',
  '.',
  '8',
  'Data',
  'from',
  'RUV',
  'also',
  'suggests',
  'that',
  'the',
  'cumulative',
  'population',
  'of',
  'forced',
  'migrants',
  'is',
  'balanced',
  'in',
  'terms',
  'of',
  'gender',
  '(',
  '51',
  'percent',
  'women',
  ')',
  'and',
  'that',
  'most',
  'are',
  'of',
  'working',
  'age',
  '.',
  'Colombia',
  '’',
  's',
  'forced',
  'migrants',
  'are',
  'also',
  'young',
  '.',
  'In',
  'particular',
  ',',
  '39',
  'percent',
  'of',
  'forcefully',
  'displaced',
  'individuals',
  'were',
  '15',
  'years',
  'old',
  'or',
  'younger',
  'at',
  'the',
  'time',
  'of',
  'displacement',
  ',',
  'this',
  'percentage',
  'is',
  'disproportionately',
  'larger',
  'than',
  'this',
  'age',
  'group',
  'within',
  'the',
  'population',
  

In [None]:
# Positional 90/10 split with no shuffling: the first 90% of the annotated examples
# go to training and the remainder to evaluation.
os.makedirs("assets", exist_ok=True)
split_idx = int(0.9 * len(train_data_token_level))

# Save as JSON array, not JSONL
with open("assets/train_data.json", "w") as ft:
    json.dump(train_data_token_level[:split_idx], ft)
with open("assets/eval_data.json", "w") as fe:
    json.dump(train_data_token_level[split_idx:], fe)
# 📌 STEP 4: Fine-Tune GLiNER Multitask (Joint NER + RE)
import torch
from gliner import GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollator

# Device and model setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The checkpoint reload below is disabled, so this stage reuses the `model` object
# already in the kernel (i.e. it continues from the earlier training run). The cell
# therefore fails on a fresh kernel unless the earlier model-setup cell has been run.
#model = GLiNER.from_pretrained('knowledgator/gliner-multitask-v1.0').to(device)
# The saved examples carry 'tokenized_text', 'ner', and 're'.
# NOTE(review): confirm that the collator actually consumes 're'.
data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)

# Load the annotated train/eval splits written just above.
train_dataset = load_json_data('assets/train_data.json')
eval_dataset = load_json_data('assets/eval_data.json')

In [None]:
# Derive the number of epochs from a target number of optimizer steps.
num_steps = 500
batch_size = 4
data_size = len(train_dataset)
# max(1, ...) keeps the division below from raising ZeroDivisionError when the
# dataset holds fewer examples than one batch.
num_batches = max(1, data_size // batch_size)
num_epochs = max(1, num_steps // num_batches)

training_args = TrainingArguments(
    output_dir="models",
    learning_rate=5e-6,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear", #fine tuning
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    focal_loss_alpha=0.75,
    focal_loss_gamma=2,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    # With eval_strategy="steps" and no explicit interval, logging and evaluation use
    # the default 500-step interval, which a run of roughly 500 steps never reaches --
    # the loss table then stays empty. Log and evaluate every 50 steps instead.
    logging_steps=50,
    eval_steps=50,
    save_steps = 250,
    save_total_limit=10,
    dataloader_num_workers = 0,
    use_cpu = False,
    report_to="none",
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=496, training_loss=1.4109761638026084, metrics={'train_runtime': 271.3819, 'train_samples_per_second': 7.193, 'train_steps_per_second': 1.828, 'total_flos': 0.0, 'train_loss': 1.4109761638026084, 'epoch': 16.0})

In [None]:
# ─── New Sample Text ─────────────────────────────────────────────────────────
second_sample_text = (
'''9 and the Syrian crisis , the UNHCR and the World Food Program ( WFP ) have been conducting a variety of surveys as well as extensive home visits that allowed researchers to analyze refugee conditions as had never been done before . The paper uses two data sets : the Jordan proGres registration system ( PG for short ) as of December 2014 and the Jordan Home Visits survey , round II data ( HV for short ) collected between November 2013 and September 2014 . Both data sets were provided by the UNHCR in the context of the joint World Bank-UNHCR study on the welfare of Syrian refugees ( Verme et al . , 2016 ) . These comprehensive data sets have the distinct advantage that they can be linked by a common identification number . We can therefore trace the same individuals and households across the two sources of data . The proGres registration system is what we consider the “ census ” of refugees . This data set has no information on consumption but contains socio-economic characteristics for all registered individuals and households . Variables available in the PG data include , among others'''
)

# ─── 1) NER on the new text ────────────────────────────────────────────────────
entity_labels = ["named dataset", "unnamed dataset", "vague dataset"]

ner_preds2 = model.predict_entities(
    second_sample_text,
    labels=entity_labels,
    threshold=0.5,
    flat_ner=True,
    multi_label=False,
)

print("🔍 NER on second sample:")
for p in ner_preds2:
    print(f" • {p['text']!r} → {p['label']} (score {p['score']:.2f})")

# ─── 2) Build RE prompts ──────────────────────────────────────────────────────
relation_types = ["description", "geography", "publisher", "publication year", "reference year", "version", "acronym"]
# De-duplicate the predicted heads while keeping first-seen order. A set would order the
# strings differently from one kernel to the next (string hashing is randomized), which
# changes the order of the labels handed to the model.
heads2 = list(dict.fromkeys(p['text'] for p in ner_preds2))

# One "<head> <> <relation>" label per (head, relation type) pair.
re_labels2 = [f"{head} <> {rel}" for head in heads2 for rel in relation_types]

# ─── 3) Relation Extraction ──────────────────────────────────────────────────
# Relations are queried as entity labels; when no head was detected there is nothing
# to ask for, so skip the model call instead of passing an empty label list.
if re_labels2:
    re_preds2 = model.predict_entities(
        second_sample_text,
        labels=re_labels2,
        threshold=0.5,
        flat_ner=False,
        multi_label=True,
    )
else:
    re_preds2 = []

print("\n🔗 RE on second sample:")
for p in re_preds2:
    # The relation type is the last component, so split from the right: a head text
    # that itself contains " <> " must not break the two-value unpacking.
    head, rel = p["label"].rsplit(" <> ", 1)
    tail = p["text"]
    print(f" • ({head!r}) —[{rel}]→ ({tail!r})  (score {p['score']:.2f})")


🔍 NER on second sample:
 • 'proGres' → named dataset (score 0.99)
 • 'PG' → named dataset (score 0.97)
 • 'Home Visits survey' → named dataset (score 0.99)

🔗 RE on second sample:
 • ('proGres') —[geography]→ ('Jordan')  (score 0.74)
 • ('PG') —[geography]→ ('Jordan')  (score 0.64)
 • ('Home Visits survey') —[geography]→ ('Jordan')  (score 0.54)
 • ('PG') —[acronym]→ ('proGres')  (score 0.80)
 • ('PG') —[acronym]→ ('PG')  (score 0.66)
 • ('Home Visits survey') —[version]→ ('round II')  (score 0.75)
 • ('Home Visits survey') —[acronym]→ ('HV')  (score 0.87)
