In [None]:
# The code below takes parallel sentences as input (the Samanantar dataset in this case), performs preprocessing, filtering, and bidirectional alignment, and generates a BIO-tagged NER dataset.
# Inputs, alignment algorithm and process, filtering are all modular and open to customization.
# Feel free to reach out (Utsav B21).

# Install required packages and clone awesome-align
!pip install transformers
!git clone https://github.com/neulab/awesome-align.git
%cd awesome-align
!pip install -e .

import os
import zipfile
import json
import random
import concurrent.futures
from pathlib import Path
from collections import defaultdict
from functools import reduce
import operator

import torch
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer

# ----- PARAMETERS & PATHS -----
# Working directory that holds the dataset ZIP, the extracted corpus, the
# awesome-align input/output files and the final JSON output.
DATA_DIR = Path("/home/user/112101050_Utsav/btp/samanantar_align/samanantar_data")
# parents=True: create the intermediate directories too, so the notebook also
# runs on a machine where the directory tree does not exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)
ZIP_FILE = DATA_DIR / "samanantar.zip"
EXTRACT_DIR = DATA_DIR / "extracted"

# Languages to process (display name -> language code used in the
# "en-<code>" directory and "train.<code>" file names).
LANGUAGE_CODES = {
    # "hindi": "hi",
    "assamese": "as",
    "gujarati": "gu",
    # "marathi": "mr",
    "punjabi": "pa"
}

# Filtering parameters
MIN_TOKENS = 1          # minimum number of tokens per sentence
MAX_TOKENS = 100        # maximum number of tokens per sentence

# ----- DEVICE SETUP -----
device_name = "GPU" if torch.cuda.is_available() else "CPU"
print(f"Using device: {device_name} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'})")
# Device index passed to the NER pipeline: 0 = first GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
if device != -1:
    torch.cuda.set_device(device)

# ----- INITIALIZE TOKENIZER -----
# Multilingual BERT tokenizer, used later to measure sentence length in subwords.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


# ----- INITIALIZE NER PIPELINE -----
def create_ner_pipeline(device: int = -1):
    """
    Build the English token-classification pipeline.

    `device` is the device index handed to the pipeline (-1 means CPU).
    Half precision is requested only when a non-CPU device is selected.
    No aggregation is applied, so the pipeline returns raw per-token results.
    """
    ner_checkpoint = "dslim/bert-base-NER"
    on_accelerator = device != -1
    dtype = torch.float16 if on_accelerator else None
    return pipeline(
        "ner",
        model=ner_checkpoint,
        tokenizer=ner_checkpoint,
        device=device,
        batch_size=16,
        aggregation_strategy="none",
        torch_dtype=dtype,
    )
ner_pipeline = create_ner_pipeline(device=device)


# ----- VALIDATE AND EXTRACT DATASET ZIP -----
def validate_and_extract(zip_path, extract_path):
    """
    Check the dataset archive and unpack it into `extract_path`.

    Args:
        zip_path: pathlib.Path of the ZIP archive.
        extract_path: pathlib.Path of the destination directory. If it already
            exists, extraction is skipped entirely (the archive is still
            validated).

    Raises:
        FileNotFoundError: `zip_path` does not exist.
        ValueError: the archive cannot be read, or a member fails its CRC check.
        Exception: any error raised while extracting a member is propagated
            instead of being silently dropped.
    """
    if not zip_path.exists():
        raise FileNotFoundError(f"ZIP file not found at {zip_path}.")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            bad_file = zip_ref.testzip()
    except Exception as e:
        raise ValueError(f"Invalid ZIP file: {e}") from None
    # Raised outside the try block so the message is not wrapped a second time.
    if bad_file:
        raise ValueError(f"Corrupted file in ZIP: {bad_file}")

    if not extract_path.exists():
        extract_path.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:

            def extract_member(member):
                try:
                    zip_ref.extract(member, extract_path)
                except FileExistsError:
                    # Two workers raced to create the same parent directory;
                    # it exists now, so a single retry succeeds.
                    zip_ref.extract(member, extract_path)

            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = [executor.submit(extract_member, member)
                           for member in zip_ref.namelist()]
                # result() re-raises any worker exception; without this a
                # failed member would go unnoticed.
                for future in concurrent.futures.as_completed(futures):
                    future.result()
        print("Extraction complete.")

#validate_and_extract(ZIP_FILE, EXTRACT_DIR)

# ----- READ SENTENCE FILES -----
def process_language_file(en_file, lang_file):
    """
    Read a parallel corpus stored as two text files, one sentence per line.

    When both files have the same number of lines they are treated as
    line-aligned: a pair is dropped if either side is blank, so a blank line
    on one side can no longer shift every later sentence against its
    translation. When the raw line counts differ, blank lines are removed
    from each file independently and the remaining counts must match.

    Args:
        en_file: path of the English file.
        lang_file: path of the target-language file.

    Returns:
        (english_sentences, language_sentences): two equally long lists of
        stripped, non-empty strings.

    Raises:
        ValueError: the two files do not yield the same number of sentences.
    """
    with open(en_file, 'r', encoding='utf-8') as en_f, open(lang_file, 'r', encoding='utf-8') as lang_f:
        en_lines = [line.strip() for line in en_f]
        lang_lines = [line.strip() for line in lang_f]

    if len(en_lines) == len(lang_lines):
        # Line-aligned files: filter pairwise to keep translations together.
        english_sentences = []
        language_sentences = []
        for en_text, lang_text in zip(en_lines, lang_lines):
            if en_text and lang_text:
                english_sentences.append(en_text)
                language_sentences.append(lang_text)
        return english_sentences, language_sentences

    # Raw line counts differ (e.g. stray blank lines): fall back to filtering
    # each side on its own and insist that the sentence counts agree.
    english_sentences = [text for text in en_lines if text]
    language_sentences = [text for text in lang_lines if text]
    if len(english_sentences) != len(language_sentences):
        raise ValueError("Mismatch in sentence counts.")
    return english_sentences, language_sentences

# ----- MERGE ENTITIES -----
def merge_entities(entities, sentence):
    """
    Collapse per-token NER hits into entity spans.

    Hits are visited in order of their 'start' offset. A hit is folded into
    the previous span when both have the same entity type (the tag with any
    prefix up to the last '-' removed) and the text between them in
    `sentence` is empty or whitespace only. Returns a list of dicts with the
    keys 'start', 'end' and 'ent_type'.
    """
    spans = []
    ordered_hits = sorted(entities, key=lambda hit: hit['start'])
    for hit in ordered_hits:
        # Text after the last '-' is the type; a tag without '-' is used whole.
        ent_type = hit['entity'].rpartition('-')[2]
        hit_start = hit['start']
        hit_end = hit['end']

        last = spans[-1] if spans else None
        extends_last = (
            last is not None
            and last['ent_type'] == ent_type
            and not sentence[last['end']:hit_start].strip()
        )
        if extends_last:
            last['end'] = max(last['end'], hit_end)
        else:
            spans.append({'start': hit_start, 'end': hit_end, 'ent_type': ent_type})
    return spans

# ----- STEP 1: FILTER BY ENTITY (FIRST FILTER) -----
def filter_by_entity(english_sentences, language_sentences):
    """
    First filter: keep every sentence pair whose English side contains at
    least one named entity, plus a random 1% of the pairs without any entity
    (at least one such pair, if any exist).

    NER is run on the English sentences in batches of 64. The surviving pairs
    are returned in their original order as (english, target) lists.
    """
    assert len(english_sentences) == len(language_sentences)

    total = len(english_sentences)
    chunk_size = 64
    flagged = [False] * total

    # Run NER chunk by chunk and mark the sentences with at least one entity.
    for offset in tqdm(range(0, total, chunk_size), desc="First Filter: Checking NER"):
        chunk = english_sentences[offset:offset + chunk_size]
        for pos, hits in enumerate(ner_pipeline(chunk)):
            if merge_entities(hits, chunk[pos]):
                flagged[offset + pos] = True

    entity_indices = []
    non_entity_indices = []
    for pos, found in enumerate(flagged):
        (entity_indices if found else non_entity_indices).append(pos)

    # All entity sentences stay; only a 1% random sample of the rest does.
    random.shuffle(non_entity_indices)
    if non_entity_indices:
        sample_size = max(1, int(0.01 * len(non_entity_indices)))
    else:
        sample_size = 0
    kept_positions = sorted(entity_indices + non_entity_indices[:sample_size])

    filtered_en = [english_sentences[pos] for pos in kept_positions]
    filtered_lang = [language_sentences[pos] for pos in kept_positions]

    print(f"Total sentences: {len(english_sentences)}")
    print(f"  With entity: {len(entity_indices)}")
    print(f"  Without entity (sampled 1%): {sample_size} of {len(non_entity_indices)}")
    print(f"  => after first filter: {len(filtered_en)}")

    return filtered_en, filtered_lang

# ----- RUN ALIGNMENT & KEEP TOP 30% (SECOND FILTER) -----
def run_alignment(english_sentences, language_sentences):
    """
    Runs awesome-align forward & backward. Computes alignment-based scores
    (normalized by the number of words in each pair). Sorts by these scores
    and keeps top 30%.
    """
    batch_size = 256 if device != -1 else 128
    forward_input = "\n".join([f"{eng} ||| {lang}" for eng, lang in zip(english_sentences, language_sentences)])
    forward_input_path = f"{DATA_DIR}/forward_input.txt"
    forward_output_path = f"{DATA_DIR}/forward_output.txt"

    with open(forward_input_path, "w", encoding="utf-8") as f:
        f.write(forward_input)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0" if device != -1 else "-1"

    # Forward alignment
    !python3 run_align.py \
      --model_name_or_path bert-base-multilingual-cased \
      --data_file "{forward_input_path}" \
      --output_file "{forward_output_path}" \
      --output_prob_file "{DATA_DIR}/forward_probabilities.txt" \
      --output_word_file "{DATA_DIR}/forward_word_pairs.txt" \
      --extraction 'softmax' \
      --batch_size {batch_size} \
      --cache_dir "{DATA_DIR}/cache"

    # Backward alignment
    backward_input = "\n".join([f"{lang} ||| {eng}" for eng, lang in zip(english_sentences, language_sentences)])
    backward_input_path = f"{DATA_DIR}/backward_input.txt"
    backward_output_path = f"{DATA_DIR}/backward_output.txt"

    with open(backward_input_path, "w", encoding="utf-8") as f:
        f.write(backward_input)

    !python3 run_align.py \
      --model_name_or_path bert-base-multilingual-cased \
      --data_file "{backward_input_path}" \
      --output_file "{backward_output_path}" \
      --output_prob_file "{DATA_DIR}/backward_probabilities.txt" \
      --output_word_file "{DATA_DIR}/backward_word_pairs.txt" \
      --extraction 'softmax' \
      --batch_size {batch_size} \
      --cache_dir "{DATA_DIR}/cache"

    def read_alignments(path, expected_length):
        alignments = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                pairs = set()
                if line.strip():
                    for pair in line.strip().split():
                        try:
                            e, l = map(int, pair.split('-'))
                            pairs.add((e, l))
                        except:
                            continue
                alignments.append(pairs)
        # Ensure length matches
        while len(alignments) < expected_length:
            alignments.append(set())
        return alignments[:expected_length]

    forward_alignments = read_alignments(forward_output_path, len(english_sentences))
    backward_alignments = read_alignments(backward_output_path, len(english_sentences))

    # Read probabilities, but use "average" normalized by total words to match the second filter requirement
    def read_probabilities(prob_file, eng_sents, lang_sents):
        scores = []
        with open(prob_file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                numbers = list(map(float, line.strip().split()))
                # Normalized by total words in both sentences (or just English, if desired)
                total_words = len(eng_sents[i].split()) + len(lang_sents[i].split())
                if total_words == 0:
                    score = 0
                else:
                    score = sum(numbers) / float(total_words)
                scores.append(score)
        # Ensure scores length matches
        while len(scores) < len(eng_sents):
            scores.append(0)
        return scores

    forward_scores = read_probabilities(f"{DATA_DIR}/forward_probabilities.txt",
                                        english_sentences, language_sentences)

    # Sort by forward scores, pick top 30%
    threshold_index = int(len(forward_scores) * 0.3)
    # Indices sorted by forward_scores in descending order
    sorted_indices = sorted(range(len(forward_scores)), key=lambda i: forward_scores[i], reverse=True)
    top_indices = sorted_indices[:threshold_index]

    # Keep all top 30% sentences regardless of alignment intersection
    final_english = [english_sentences[i] for i in top_indices]
    final_lang = [language_sentences[i] for i in top_indices]
    final_alignments = [{"alignment": forward_alignments[i] & backward_alignments[i]} for i in top_indices]

    print(f"  => after second filter (top 30%): {len(final_english)}")

    return final_english, final_lang, final_alignments

def get_word_spans(sentence):
    """
    Split `sentence` on whitespace and locate every word in the original string.

    Returns a list of (word, (start, end)) tuples where
    sentence[start:end] == word. Words are located with a forward search, so
    the offsets stay correct for any separator str.split() recognises (tabs,
    non-breaking spaces, ...), not only for plain spaces.
    """
    words = []
    cursor = 0
    for word in sentence.split():
        # The next word is the first match at or after the end of the
        # previous one, because only whitespace can lie in between.
        start = sentence.index(word, cursor)
        end = start + len(word)
        words.append((word, (start, end)))
        cursor = end
    return words

# ----- TOKENIZATION UTILITY -----
def tokenize_with_offsets(sentence, tokenizer):
    """
    Tokenize `sentence` without special tokens.

    Returns (tokens, offsets): the token strings for the produced ids and the
    tokenizer's "offset_mapping" entry for the same encoding.
    """
    encoded = tokenizer(sentence, return_offsets_mapping=True, add_special_tokens=False)
    token_strings = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
    return token_strings, encoded["offset_mapping"]

# ----- TRANSFER TAGS (FINAL STEP) -----
def _tag_english_words(word_spans, entities):
    """Give each English word a BIO tag from the entity spans it overlaps."""
    tags = ['O'] * len(word_spans)
    for entity in entities:
        ent_type = entity['ent_type']
        covered = [
            i for i, (_, (w_start, w_end)) in enumerate(word_spans)
            if max(w_start, entity['start']) < min(w_end, entity['end'])
        ]
        if covered:
            tags[covered[0]] = f"B-{ent_type}"
            for j in covered[1:]:
                tags[j] = f"I-{ent_type}"
    return tags


def _project_tags(english_tags, num_target_words, alignment):
    """Copy each English word's tag onto the target words aligned with it."""
    aligned_english = defaultdict(list)
    for e_idx, l_idx in alignment:
        if 0 <= e_idx < len(english_tags) and 0 <= l_idx < num_target_words:
            aligned_english[l_idx].append(e_idx)

    target_tags = ['O'] * num_target_words
    for l_idx, e_indices in aligned_english.items():
        # Sorted so the choice does not depend on set iteration order.
        for e_idx in sorted(e_indices):
            if english_tags[e_idx] != 'O':
                target_tags[l_idx] = english_tags[e_idx]
                break
    return target_tags


def _repair_bio(tags):
    """Return a valid BIO sequence: an I- tag that does not continue an entity of its type becomes B-."""
    repaired = []
    current_ent = None
    for tag in tags:
        prefix, sep, ent_type = tag.partition('-')
        if not sep or prefix not in ('B', 'I'):
            # 'O' or anything that is not a well-formed B-/I- tag.
            repaired.append('O')
            current_ent = None
            continue
        if prefix == 'I' and current_ent != ent_type:
            tag = f'B-{ent_type}'
        repaired.append(tag)
        current_ent = ent_type
    return repaired


def transfer_tags(english_sentences, language_sentences, alignments):
    """
    Project English NER tags onto the target sentences through word alignments.

    The alignment indices produced by awesome-align refer to whitespace-
    separated WORDS of each sentence, so both the English tags and the
    projected target tags are built at word level (indexing subword tokens
    with these indices would attach tags to the wrong positions).

    Args:
        english_sentences: list of English sentences.
        language_sentences: list of target-language sentences, same order.
        alignments: list of {"alignment": set of (eng_word_idx,
            lang_word_idx)} dicts, one per sentence pair.

    Returns:
        (curated_tagged_dataset, sample_data): a list with one
        [(word, tag), ...] list per kept target sentence, and a dict
        describing the first kept pair ('english', 'target',
        'english_tokens', 'target_final'), or {} when nothing was kept.
        'english_tokens' holds word-level (word, tag) pairs.

    A pair is skipped when the English subword count or the target word
    count falls outside [MIN_TOKENS, MAX_TOKENS], or when the target
    sentence has no words.
    """
    curated_tagged_dataset = []
    sample_data = {}

    ner_results = []
    batch_size = 64
    for batch_start in tqdm(range(0, len(english_sentences), batch_size),
                          desc="Final Tag Transfer: NER on English"):
        batch = english_sentences[batch_start:batch_start+batch_size]
        ner_results.extend(ner_pipeline(batch))

    for sent_idx, (eng_sent, lang_sent, align_info) in tqdm(
            enumerate(zip(english_sentences, language_sentences, alignments)),
            total=len(english_sentences),
            desc="Transferring tags"
        ):
        # Length filter on the English side, measured in BERT subwords.
        eng_tokens, _ = tokenize_with_offsets(eng_sent, bert_tokenizer)
        if not (MIN_TOKENS <= len(eng_tokens) <= MAX_TOKENS):
            continue

        lang_word_spans = get_word_spans(lang_sent)
        if not lang_word_spans:
            continue
        # Length filter on the target side, measured in words.
        if not (MIN_TOKENS <= len(lang_word_spans) <= MAX_TOKENS):
            continue

        # Word-level BIO tags for the English sentence.
        eng_word_spans = get_word_spans(eng_sent)
        english_entities = merge_entities(ner_results[sent_idx], eng_sent)
        eng_word_tags = _tag_english_words(eng_word_spans, english_entities)

        # Project through the word alignment, then enforce BIO validity.
        projected = _project_tags(eng_word_tags, len(lang_word_spans), align_info["alignment"])
        word_level_tokens = [word for word, _ in lang_word_spans]
        word_level_tags = _repair_bio(projected)

        # Store sample data
        if not sample_data:
            sample_data = {
                'english': eng_sent,
                'target': lang_sent,
                'english_tokens': list(zip((word for word, _ in eng_word_spans), eng_word_tags)),
                'target_final': list(zip(word_level_tokens, word_level_tags))
            }

        curated_tagged_dataset.append(list(zip(word_level_tokens, word_level_tags)))

    return curated_tagged_dataset, sample_data

# ----- PROCESS ALL LANGUAGES -----
def process_all_languages():
    """
    Run the whole pipeline for every language in LANGUAGE_CODES.

    For each language: read the parallel files, apply the entity filter,
    apply the alignment filter, transfer the tags, print one verification
    sample, and write the tagged sentences to
    DATA_DIR/new_bio_tagged_samanantar/<language>_ner.json.
    Languages whose input files are missing are skipped with a message.
    """
    for lang, lang_code in LANGUAGE_CODES.items():
        pair_dir = EXTRACT_DIR / "v2" / f"en-{lang_code}"
        en_file = pair_dir / "train.en"
        lang_file = pair_dir / f"train.{lang_code}"
        if not (en_file.exists() and lang_file.exists()):
            print(f"Skipping {lang} - files not found")
            continue

        print(f"\nProcessing {lang}...")

        # 1) Read all sentences
        source_all, target_all = process_language_file(en_file, lang_file)

        # 2) First filter: every sentence with an entity + 1% of the rest
        source_ner, target_ner = filter_by_entity(source_all, target_all)

        # 3) Second filter: best 30% by alignment score
        source_top, target_top, alignments = run_alignment(source_ner, target_ner)

        # 4) Transfer tags
        tagged_sentences, sample = transfer_tags(source_top, target_top, alignments)

        # Show a verification sample
        print(f"\n--- {lang.upper()} VERIFICATION SAMPLE ---")
        report_rows = (
            ("English Sentence:", 'english'),
            ("English Tokens & Tags:", 'english_tokens'),
            ("Target Sentence:", 'target'),
            ("Target Final Tags:", 'target_final'),
        )
        for label, key in report_rows:
            print(label, sample.get(key, 'N/A'))
        print("-" * 50)

        # 5) Save to JSON
        json_dataset = [
            {
                "tokens": [token for token, _ in sentence],
                "ner_tags": [tag for _, tag in sentence],
            }
            for sentence in tagged_sentences
        ]

        output_dir = DATA_DIR / "new_bio_tagged_samanantar"
        output_dir.mkdir(exist_ok=True)
        output_file = output_dir / f"{lang}_ner.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(json_dataset, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(json_dataset)} tagged sentences to {output_file}")


process_all_languages()

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
fatal: destination path 'awesome-align' already exists and is not an empty directory.
/home/user/112101050_Utsav/btp/samanantar_align/awesome-align
Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///home/user/112101050_Utsav/btp/samanantar_align/awesome-align
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: awesome-align
  Attempting uninstall: awesome-align
    Found existing installation: awesome-align 0.1.7
    Uninstalling awesome-align-0.1.7:
      Successfully uninstalled awesome-align-0.1.7
  Running setup.py develop for awesome-align
Successfully installed awesome-align



2025-03-16 03:28:50.824843: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-16 03:28:50.875043: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: GPU (NVIDIA RTX A4000)




KeyboardInterrupt: 

In [None]:
# This is the same code as above, but it uses a different alignment model, which we fine-tuned on 400 manually annotated Punjabi sentences.

# Below are the terminal commands used for the fine-tuning.

'''
# (already in virtualenv)

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt

export TRAIN_FILE=/home/user/112101050_Utsav/btp/align_train_hm/enpa.src-tgt
export TRAIN_GOLD_FILE=/home/user/112101050_Utsav/btp/align_train_hm/enpa.gold
export OUTPUT_DIR=/home/user/112101050_Utsav/btp/align_train_hm/awesome_output

CUDA_VISIBLE_DEVICES=0 awesome-train \
    --output_dir=$OUTPUT_DIR \
    --model_name_or_path=bert-base-multilingual-cased \
    --extraction softmax \
    --do_train \
    --train_so \
    --train_data_file=$TRAIN_FILE \
    --train_gold_file=$TRAIN_GOLD_FILE \
    --per_gpu_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --num_train_epochs 5 \
    --learning_rate 1e-4 \
    --save_steps 200
'''

# Install required packages and clone awesome-align
!pip install transformers
!git clone https://github.com/neulab/awesome-align.git
%cd awesome-align
!pip install -e .

import os
import zipfile
import json
import random
import concurrent.futures
from pathlib import Path
from collections import defaultdict
from functools import reduce
import operator

import torch
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer

# ----- PARAMETERS & PATHS -----
# Working directory that holds the dataset ZIP, the extracted corpus, the
# awesome-align input/output files and the final JSON output.
DATA_DIR = Path("/home/user/112101050_Utsav/btp/samanantar_align/samanantar_data")
# parents=True: create the intermediate directories too, so the notebook also
# runs on a machine where the directory tree does not exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)
ZIP_FILE = DATA_DIR / "samanantar.zip"
EXTRACT_DIR = DATA_DIR / "extracted"

# Languages to process (display name -> language code used in the
# "en-<code>" directory and "train.<code>" file names).
LANGUAGE_CODES = {
    # "hindi": "hi",
    # "assamese": "as",
    # "gujarati": "gu",
    # "marathi": "mr",
    "punjabi": "pa"
}

# Filtering parameters
MIN_TOKENS = 1          # minimum number of tokens per sentence
MAX_TOKENS = 100        # maximum number of tokens per sentence

# ----- DEVICE SETUP -----
device_name = "GPU" if torch.cuda.is_available() else "CPU"
print(f"Using device: {device_name} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'})")
# Device index passed to the NER pipeline: 0 = first GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
if device != -1:
    torch.cuda.set_device(device)

# ----- INITIALIZE TOKENIZER -----
# Multilingual BERT tokenizer, used later to measure sentence length in subwords.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


# ----- INITIALIZE NER PIPELINE -----
def create_ner_pipeline(device: int = -1):
    """
    Build the English token-classification pipeline.

    `device` is the device index handed to the pipeline (-1 means CPU).
    Half precision is requested only when a non-CPU device is selected.
    No aggregation is applied, so the pipeline returns raw per-token results.
    """
    ner_checkpoint = "dslim/bert-base-NER"
    on_accelerator = device != -1
    dtype = torch.float16 if on_accelerator else None
    return pipeline(
        "ner",
        model=ner_checkpoint,
        tokenizer=ner_checkpoint,
        device=device,
        batch_size=16,
        aggregation_strategy="none",
        torch_dtype=dtype,
    )
ner_pipeline = create_ner_pipeline(device=device)


# ----- VALIDATE AND EXTRACT DATASET ZIP -----
def validate_and_extract(zip_path, extract_path):
    """
    Check the dataset archive and unpack it into `extract_path`.

    Args:
        zip_path: pathlib.Path of the ZIP archive.
        extract_path: pathlib.Path of the destination directory. If it already
            exists, extraction is skipped entirely (the archive is still
            validated).

    Raises:
        FileNotFoundError: `zip_path` does not exist.
        ValueError: the archive cannot be read, or a member fails its CRC check.
        Exception: any error raised while extracting a member is propagated
            instead of being silently dropped.
    """
    if not zip_path.exists():
        raise FileNotFoundError(f"ZIP file not found at {zip_path}.")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            bad_file = zip_ref.testzip()
    except Exception as e:
        raise ValueError(f"Invalid ZIP file: {e}") from None
    # Raised outside the try block so the message is not wrapped a second time.
    if bad_file:
        raise ValueError(f"Corrupted file in ZIP: {bad_file}")

    if not extract_path.exists():
        extract_path.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:

            def extract_member(member):
                try:
                    zip_ref.extract(member, extract_path)
                except FileExistsError:
                    # Two workers raced to create the same parent directory;
                    # it exists now, so a single retry succeeds.
                    zip_ref.extract(member, extract_path)

            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = [executor.submit(extract_member, member)
                           for member in zip_ref.namelist()]
                # result() re-raises any worker exception; without this a
                # failed member would go unnoticed.
                for future in concurrent.futures.as_completed(futures):
                    future.result()
        print("Extraction complete.")

#validate_and_extract(ZIP_FILE, EXTRACT_DIR)

# ----- READ SENTENCE FILES -----
def process_language_file(en_file, lang_file):
    """
    Read a parallel corpus stored as two text files, one sentence per line.

    When both files have the same number of lines they are treated as
    line-aligned: a pair is dropped if either side is blank, so a blank line
    on one side can no longer shift every later sentence against its
    translation. When the raw line counts differ, blank lines are removed
    from each file independently and the remaining counts must match.

    Args:
        en_file: path of the English file.
        lang_file: path of the target-language file.

    Returns:
        (english_sentences, language_sentences): two equally long lists of
        stripped, non-empty strings.

    Raises:
        ValueError: the two files do not yield the same number of sentences.
    """
    with open(en_file, 'r', encoding='utf-8') as en_f, open(lang_file, 'r', encoding='utf-8') as lang_f:
        en_lines = [line.strip() for line in en_f]
        lang_lines = [line.strip() for line in lang_f]

    if len(en_lines) == len(lang_lines):
        # Line-aligned files: filter pairwise to keep translations together.
        english_sentences = []
        language_sentences = []
        for en_text, lang_text in zip(en_lines, lang_lines):
            if en_text and lang_text:
                english_sentences.append(en_text)
                language_sentences.append(lang_text)
        return english_sentences, language_sentences

    # Raw line counts differ (e.g. stray blank lines): fall back to filtering
    # each side on its own and insist that the sentence counts agree.
    english_sentences = [text for text in en_lines if text]
    language_sentences = [text for text in lang_lines if text]
    if len(english_sentences) != len(language_sentences):
        raise ValueError("Mismatch in sentence counts.")
    return english_sentences, language_sentences

# ----- MERGE ENTITIES -----
def merge_entities(entities, sentence):
    """
    Collapse per-token NER hits into entity spans.

    Hits are visited in order of their 'start' offset. A hit is folded into
    the previous span when both have the same entity type (the tag with any
    prefix up to the last '-' removed) and the text between them in
    `sentence` is empty or whitespace only. Returns a list of dicts with the
    keys 'start', 'end' and 'ent_type'.
    """
    spans = []
    ordered_hits = sorted(entities, key=lambda hit: hit['start'])
    for hit in ordered_hits:
        # Text after the last '-' is the type; a tag without '-' is used whole.
        ent_type = hit['entity'].rpartition('-')[2]
        hit_start = hit['start']
        hit_end = hit['end']

        last = spans[-1] if spans else None
        extends_last = (
            last is not None
            and last['ent_type'] == ent_type
            and not sentence[last['end']:hit_start].strip()
        )
        if extends_last:
            last['end'] = max(last['end'], hit_end)
        else:
            spans.append({'start': hit_start, 'end': hit_end, 'ent_type': ent_type})
    return spans

# ----- STEP 1: FILTER BY ENTITY (FIRST FILTER) -----
def filter_by_entity(english_sentences, language_sentences):
    """
    First filter: keep every sentence pair whose English side contains at
    least one named entity, plus a random 1% of the pairs without any entity
    (at least one such pair, if any exist).

    NER is run on the English sentences in batches of 64. The surviving pairs
    are returned in their original order as (english, target) lists.
    """
    assert len(english_sentences) == len(language_sentences)

    total = len(english_sentences)
    chunk_size = 64
    flagged = [False] * total

    # Run NER chunk by chunk and mark the sentences with at least one entity.
    for offset in tqdm(range(0, total, chunk_size), desc="First Filter: Checking NER"):
        chunk = english_sentences[offset:offset + chunk_size]
        for pos, hits in enumerate(ner_pipeline(chunk)):
            if merge_entities(hits, chunk[pos]):
                flagged[offset + pos] = True

    entity_indices = []
    non_entity_indices = []
    for pos, found in enumerate(flagged):
        (entity_indices if found else non_entity_indices).append(pos)

    # All entity sentences stay; only a 1% random sample of the rest does.
    random.shuffle(non_entity_indices)
    if non_entity_indices:
        sample_size = max(1, int(0.01 * len(non_entity_indices)))
    else:
        sample_size = 0
    kept_positions = sorted(entity_indices + non_entity_indices[:sample_size])

    filtered_en = [english_sentences[pos] for pos in kept_positions]
    filtered_lang = [language_sentences[pos] for pos in kept_positions]

    print(f"Total sentences: {len(english_sentences)}")
    print(f"  With entity: {len(entity_indices)}")
    print(f"  Without entity (sampled 1%): {sample_size} of {len(non_entity_indices)}")
    print(f"  => after first filter: {len(filtered_en)}")

    return filtered_en, filtered_lang

# ----- RUN ALIGNMENT & KEEP TOP 30% (SECOND FILTER) -----
def run_alignment(english_sentences, language_sentences):
    """
    Runs awesome-align forward & backward. Computes alignment-based scores
    (normalized by the number of words in each pair). Sorts by these scores
    and keeps top 30%.
    """
    batch_size = 256 if device != -1 else 128
    forward_input = "\n".join([f"{eng} ||| {lang}" for eng, lang in zip(english_sentences, language_sentences)])
    forward_input_path = f"{DATA_DIR}/forward_input.txt"
    forward_output_path = f"{DATA_DIR}/forward_output.txt"

    with open(forward_input_path, "w", encoding="utf-8") as f:
        f.write(forward_input)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0" if device != -1 else "-1"

    # Forward alignment
    !python3 run_align.py \
      --model_name_or_path "/home/user/112101050_Utsav/btp/align_train_hm/awesome_output" \
      --data_file "{forward_input_path}" \
      --output_file "{forward_output_path}" \
      --output_prob_file "{DATA_DIR}/forward_probabilities.txt" \
      --output_word_file "{DATA_DIR}/forward_word_pairs.txt" \
      --extraction 'softmax' \
      --batch_size {batch_size} \
      --cache_dir "{DATA_DIR}/cache"

    # Backward alignment
    backward_input = "\n".join([f"{lang} ||| {eng}" for eng, lang in zip(english_sentences, language_sentences)])
    backward_input_path = f"{DATA_DIR}/backward_input.txt"
    backward_output_path = f"{DATA_DIR}/backward_output.txt"

    with open(backward_input_path, "w", encoding="utf-8") as f:
        f.write(backward_input)

    !python3 run_align.py \
      --model_name_or_path "/home/user/112101050_Utsav/btp/align_train_hm/awesome_output" \
      --data_file "{backward_input_path}" \
      --output_file "{backward_output_path}" \
      --output_prob_file "{DATA_DIR}/backward_probabilities.txt" \
      --output_word_file "{DATA_DIR}/backward_word_pairs.txt" \
      --extraction 'softmax' \
      --batch_size {batch_size} \
      --cache_dir "{DATA_DIR}/cache"

    def read_alignments(path, expected_length):
        alignments = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                pairs = set()
                if line.strip():
                    for pair in line.strip().split():
                        try:
                            e, l = map(int, pair.split('-'))
                            pairs.add((e, l))
                        except:
                            continue
                alignments.append(pairs)
        # Ensure length matches
        while len(alignments) < expected_length:
            alignments.append(set())
        return alignments[:expected_length]

    forward_alignments = read_alignments(forward_output_path, len(english_sentences))
    backward_alignments = read_alignments(backward_output_path, len(english_sentences))

    # Read probabilities, but use "average" normalized by total words to match the second filter requirement
    def read_probabilities(prob_file, eng_sents, lang_sents):
        scores = []
        with open(prob_file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                numbers = list(map(float, line.strip().split()))
                # Normalized by total words in both sentences (or just English, if desired)
                total_words = len(eng_sents[i].split()) + len(lang_sents[i].split())
                if total_words == 0:
                    score = 0
                else:
                    score = sum(numbers) / float(total_words)
                scores.append(score)
        # Ensure scores length matches
        while len(scores) < len(eng_sents):
            scores.append(0)
        return scores

    forward_scores = read_probabilities(f"{DATA_DIR}/forward_probabilities.txt",
                                        english_sentences, language_sentences)

    # Sort by forward scores, pick top 30%
    threshold_index = int(len(forward_scores) * 0.3)
    # Indices sorted by forward_scores in descending order
    sorted_indices = sorted(range(len(forward_scores)), key=lambda i: forward_scores[i], reverse=True)
    top_indices = sorted_indices[:threshold_index]

    # Keep all top 30% sentences regardless of alignment intersection
    final_english = [english_sentences[i] for i in top_indices]
    final_lang = [language_sentences[i] for i in top_indices]
    final_alignments = [{"alignment": forward_alignments[i] & backward_alignments[i]} for i in top_indices]

    print(f"  => after second filter (top 30%): {len(final_english)}")

    return final_english, final_lang, final_alignments

def get_word_spans(sentence):
    """Split a sentence on whitespace and locate every word in the original text.

    Args:
        sentence: The raw sentence string.

    Returns:
        A list of ``(word, (start, end))`` tuples in sentence order, where
        ``sentence[start:end] == word``. Empty or whitespace-only input gives
        an empty list.
    """
    words = []
    cursor = 0
    for word in sentence.split():
        # str.split() separates on every Unicode whitespace character (tabs,
        # newlines, NBSP, ...), so the same set must be skipped here; skipping
        # only ' ' would shift the span of every following word.
        while cursor < len(sentence) and sentence[cursor].isspace():
            cursor += 1
        end = cursor + len(word)
        words.append((word, (cursor, end)))
        cursor = end
    return words

# ----- TOKENIZATION UTILITY -----
def tokenize_with_offsets(sentence, tokenizer):
    """Tokenize ``sentence`` and return its tokens together with their offsets.

    The tokenizer is called with ``return_offsets_mapping=True`` and
    ``add_special_tokens=False``; the resulting ``input_ids`` are converted
    back to token strings via ``tokenizer.convert_ids_to_tokens``.

    Returns:
        A ``(tokens, offsets)`` pair, where ``offsets`` is the encoding's
        ``offset_mapping`` entry.
    """
    encoded = tokenizer(
        sentence,
        return_offsets_mapping=True,
        add_special_tokens=False,
    )
    subword_tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
    return subword_tokens, encoded["offset_mapping"]

# ----- TRANSFER TAGS (FINAL STEP) -----
def _tag_english_subwords(entities, offsets, num_tokens):
    """Return one BIO tag per English subword for the given entity spans."""
    tags = ['O'] * num_tokens
    for entity in entities:
        ent_type = entity['ent_type']
        ent_start = entity['start']
        ent_end = entity['end']
        # A subword belongs to the entity when their character ranges overlap.
        covered = [
            i for i, (tok_start, tok_end) in enumerate(offsets)
            if max(tok_start, ent_start) < min(tok_end, ent_end)
        ]
        if covered:
            tags[covered[0]] = f"B-{ent_type}"
            for j in covered[1:]:
                tags[j] = f"I-{ent_type}"
    return tags


def _project_tags(alignment_pairs, eng_tags, num_target):
    """Copy English tags onto target positions through (english, target) index pairs."""
    target_tags = ['O'] * num_target
    for e_idx, l_idx in alignment_pairs:
        # Ignore pairs outside either sequence; the lower bound keeps a
        # negative index from silently wrapping around to the end of a list.
        if not (0 <= e_idx < len(eng_tags) and 0 <= l_idx < num_target):
            continue
        # The first aligned English position that carries an entity tag wins.
        if target_tags[l_idx] == 'O' and eng_tags[e_idx] != 'O':
            target_tags[l_idx] = eng_tags[e_idx]
    return target_tags


def _group_subword_tags(word_spans, subword_offsets, subword_tags):
    """Return one tag per word: the first non-O tag among the subwords inside its span."""
    word_tags = []
    for _word, (w_start, w_end) in word_spans:
        word_tags.append(next(
            (
                tag
                for (tok_start, tok_end), tag in zip(subword_offsets, subword_tags)
                if tok_start >= w_start and tok_end <= w_end and tag != 'O'
            ),
            'O',
        ))
    return word_tags


def _enforce_bio(tags):
    """Return a copy of ``tags`` repaired into a strictly valid BIO sequence."""
    fixed = []
    current_ent = None
    for tag in tags:
        prefix, sep, ent_type = tag.partition('-')
        if not sep or prefix not in ('B', 'I'):
            # 'O' and anything malformed both end the running entity.
            fixed.append('O')
            current_ent = None
            continue
        if prefix == 'I' and current_ent != ent_type:
            # An I- tag that does not continue the previous entity starts a new one.
            tag = f'B-{ent_type}'
        fixed.append(tag)
        current_ent = ent_type
    return fixed


def transfer_tags(english_sentences, language_sentences, alignments):
    """Project English NER tags onto the target-language sentences.

    English sentences are tagged in batches with ``ner_pipeline``. For each
    sentence pair the merged entities are turned into BIO tags over the
    English subwords, copied to target subwords through the alignment pairs,
    collapsed to one tag per whitespace-separated target word, and finally
    repaired into a valid BIO sequence.

    Pairs are skipped when the English subword count or the target word count
    falls outside ``[MIN_TOKENS, MAX_TOKENS]``, or when the target sentence
    produces no subwords.

    Args:
        english_sentences: English sentences.
        language_sentences: Target-language sentences, index-aligned with
            ``english_sentences``.
        alignments: One dict per sentence pair whose ``"alignment"`` entry is
            an iterable of ``(english_index, target_index)`` pairs.

    Returns:
        ``(curated_tagged_dataset, sample_data)``: a list holding one list of
        ``(word, tag)`` tuples per kept sentence, and a dict describing the
        first kept sentence (keys ``english``, ``target``, ``english_tokens``,
        ``target_final``); the dict is empty when nothing was kept.
    """
    curated_tagged_dataset = []
    sample_data = {}

    ner_results = []
    batch_size = 64
    for batch_start in tqdm(range(0, len(english_sentences), batch_size),
                            desc="Final Tag Transfer: NER on English"):
        batch = english_sentences[batch_start:batch_start + batch_size]
        ner_results.extend(ner_pipeline(batch))

    sentence_triples = zip(english_sentences, language_sentences, alignments)
    for sent_idx, (eng_sent, lang_sent, align_info) in tqdm(
            enumerate(sentence_triples),
            total=len(english_sentences),
            desc="Transferring tags"
    ):
        # English side: subword tokens and their BIO tags.
        eng_tokens, eng_offsets = tokenize_with_offsets(eng_sent, bert_tokenizer)
        if not (MIN_TOKENS <= len(eng_tokens) <= MAX_TOKENS):
            continue

        english_entities = merge_entities(ner_results[sent_idx], eng_sent)
        bio_tags = _tag_english_subwords(english_entities, eng_offsets, len(eng_tokens))

        # Target side.
        lang_subword_tokens, lang_subword_offsets = tokenize_with_offsets(lang_sent, bert_tokenizer)
        if not lang_subword_tokens:
            continue

        # The word count depends only on the sentence text, so apply the
        # length filter before doing any projection work.
        word_spans = get_word_spans(lang_sent)
        if not (MIN_TOKENS <= len(word_spans) <= MAX_TOKENS):
            continue

        # NOTE(review): the alignment indices are compared against, and applied
        # to, *subword* positions here. awesome-align reports indices of
        # whitespace-separated words in its input — confirm the alignment input
        # was subword-tokenised; otherwise the indices need mapping to subwords.
        lang_subword_tags = _project_tags(
            align_info["alignment"], bio_tags, len(lang_subword_tokens)
        )

        word_level_tokens = [word for word, _span in word_spans]
        word_level_tags = _enforce_bio(
            _group_subword_tags(word_spans, lang_subword_offsets, lang_subword_tags)
        )

        # Keep the first surviving sentence as a verification sample.
        if not sample_data:
            sample_data = {
                'english': eng_sent,
                'target': lang_sent,
                'english_tokens': list(zip(eng_tokens, bio_tags)),
                'target_final': list(zip(word_level_tokens, word_level_tags))
            }

        curated_tagged_dataset.append(list(zip(word_level_tokens, word_level_tags)))

    return curated_tagged_dataset, sample_data

# ----- PROCESS ALL LANGUAGES -----
def process_all_languages():
    """Run the complete pipeline for every language in ``LANGUAGE_CODES``.

    For each language whose ``train.en`` / ``train.<code>`` files exist under
    ``EXTRACT_DIR/v2/en-<code>``: read the parallel sentences, apply the
    entity filter, run the alignment-based filter, transfer the tags, print a
    verification sample and write the tagged sentences to
    ``DATA_DIR/new_bio_tagged_samanantar/<language>_ner.json``.
    Languages with missing input files are reported and skipped.
    """
    sample_fields = (
        ("English Sentence:", 'english'),
        ("English Tokens & Tags:", 'english_tokens'),
        ("Target Sentence:", 'target'),
        ("Target Final Tags:", 'target_final'),
    )

    for lang, lang_code in LANGUAGE_CODES.items():
        pair_dir = EXTRACT_DIR / "v2" / f"en-{lang_code}"
        en_file = pair_dir / "train.en"
        lang_file = pair_dir / f"train.{lang_code}"
        if not (en_file.exists() and lang_file.exists()):
            print(f"Skipping {lang} - files not found")
            continue

        print(f"\nProcessing {lang}...")

        # Step 1: load the parallel corpus.
        english_sentences, language_sentences = process_language_file(en_file, lang_file)

        # Step 2: entity-based filter.
        filtered_english_1, filtered_lang_1 = filter_by_entity(english_sentences, language_sentences)

        # Step 3: alignment-based filter, which also yields the alignments.
        final_english, final_lang, alignments = run_alignment(filtered_english_1, filtered_lang_1)

        # Step 4: project the English tags onto the target language.
        tagged_language_dataset, sample_data = transfer_tags(final_english, final_lang, alignments)

        # Print one sample so the result can be checked by eye.
        print(f"\n--- {lang.upper()} VERIFICATION SAMPLE ---")
        for label, key in sample_fields:
            print(label, sample_data.get(key, 'N/A'))
        print("-" * 50)

        # Step 5: serialise as a list of {"tokens", "ner_tags"} records.
        json_dataset = [
            {
                "tokens": [token for token, _tag in tagged_sentence],
                "ner_tags": [tag for _token, tag in tagged_sentence],
            }
            for tagged_sentence in tagged_language_dataset
        ]

        output_dir = DATA_DIR / "new_bio_tagged_samanantar"
        output_dir.mkdir(exist_ok=True)
        output_file = output_dir / f"{lang}_ner.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(json_dataset, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(json_dataset)} tagged sentences to {output_file}")


# Run the pipeline for every language configured in LANGUAGE_CODES.
process_all_languages()



Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
fatal: destination path 'awesome-align' already exists and is not an empty directory.
/home/user/112101050_Utsav/btp/samanantar_align/awesome-align
Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///home/user/112101050_Utsav/btp/samanantar_align/awesome-align
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: awesome-align
  Attempting uninstall: awesome-align
    Found existing installation: awesome-align 0.1.7
    Uninstalling awesome-align-0.1.7:
      Successfully uninstalled awesome-align-0.1.7
  Running setup.py develop for awesome-align
Successfully installed awesome-align

[

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Processing punjabi...


First Filter: Checking NER: 100%|


Total sentences: 2980383
  With entity: 1474208
  Without entity (sampled 1%): 15061 of 1506175
  => after first filter: 1489269
Loading the dataset...
Extracting: 279040it [45:46, 89.01it/s] Traceback (most recent call last):
  File "/home/user/112101050_Utsav/btp/samanantar_align/awesome-align/run_align.py", line 297, in <module>
    main()
  File "/home/user/112101050_Utsav/btp/samanantar_align/awesome-align/run_align.py", line 294, in main
    word_align(args, model, tokenizer)
  File "/home/user/112101050_Utsav/btp/samanantar_align/awesome-align/run_align.py", line 174, in word_align
    word_aligns_list = model.get_aligned_word(ids_src, ids_tgt, bpe2word_map_src, bpe2word_map_tgt, args.device, 0, 0, align_layer=args.align_layer, extraction=args.extraction, softmax_threshold=args.softmax_threshold, test=True, output_prob=(args.output_prob_file is not None))
  File "/home/user/112101050_Utsav/btp/samanantar_align/awesome-align/awesome_align/modeling.py", line 691, in get_aligned_wo

Final Tag Transfer: NER on Englis
Transferring tags:  16%|▏| 69771/Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
Transferring tags: 100%|█| 446780



--- PUNJABI VERIFICATION SAMPLE ---
English Sentence: Polyclinic Chandigarh
English Tokens & Tags: [('Pol', 'B-ORG'), ('##y', 'I-ORG'), ('##clin', 'I-ORG'), ('##ic', 'I-ORG'), ('Chan', 'I-ORG'), ('##dig', 'I-ORG'), ('##ar', 'I-ORG'), ('##h', 'I-ORG')]
Target Sentence: ਚੰਡੀਗੜ ਹੋਮੀਉਪੈਥਿਕ ਕਲੀਨਿਕ
Target Final Tags: [('ਚੰਡੀਗੜ', 'B-ORG'), ('ਹੋਮੀਉਪੈਥਿਕ', 'O'), ('ਕਲੀਨਿਕ', 'O')]
--------------------------------------------------
Saved 443483 tagged sentences to /home/user/112101050_Utsav/btp/samanantar_align/samanantar_data/new_bio_tagged_samanantar/punjabi_ner.json
