# Key Phrases Extraction

1. Extract Keywords
   1. Remove Stop Words:
      1. Chinese (zh): jieba
      2. Arabic (ar): stopwordsiso, plus the local stopwords-ar.txt list
      3. Hindi (hi): indic-nlp-library
      4. Basque (eu): tokenize - xx_ent_wiki_sm, Stopwords-iso - stopwords-eu.txt
      5. Czech (cs): stopwordsiso
      6. Farsi (fa): Hazm
      7. Other Languages: spaCy model
   2. Recognize NER Entities:
      1. Hugging Face Models:
         1. Arabic (ar): asafaya/bert-base-arabic
         2. Catalan (ca): projecte-aina/roberta-base-ca-v2-cased-ner
         3. Farsi (fa): HooshvareLab/bert-fa-base-uncased-ner-arman
         4. Other Languages: FacebookAI/xlm-roberta-large-finetuned-conll03-english
      3. For Unrecognized Content, Perform Tokenization (Extract Key Nouns if Possible):
         1. Chinese (zh): jieba (tfidf-keywords)
         2. Hindi (hi): indic_tokenize
         3. Arabic (ar): Hugging Face (asafaya/bert-base-arabic)
         4. Czech (cs): Stanza
         5. Farsi (fa): Stanza
         6. Other Languages: spaCy tokenize
2. Acquire External Knowledge.

In [None]:
import gc
import json
import os
import re
import string
import subprocess
import sys
from functools import lru_cache

import jieba.analyse
import nltk
import pandas as pd
import spacy
import stanza
import torch
from hazm import Normalizer, WordTokenizer, stopwords_list
from indicnlp.tokenize import indic_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from stopwordsiso import stopwords
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [None]:
# Optional proxy configuration: route both HTTP and HTTPS traffic through a
# local proxy by exporting the conventional environment variables.
# Jalynn's settings
proxies = {scheme: "http://127.0.0.1:10809" for scheme in ("http", "https")}
for scheme, proxy_url in proxies.items():
    os.environ[f"{scheme}_proxy"] = proxy_url

In [None]:
# Fetch stanza's default Czech package and build the Czech pipeline.
# extract_key_phrases uses this pipeline for its Czech fallback and reads only
# `word.upos` from the result.
stanza.download('cs')
nlp_stanza_czech = stanza.Pipeline('cs')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-17 05:53:30 INFO: Downloaded file to C:\Users\hjy\stanza_resources\resources.json
2025-01-17 05:53:30 INFO: Downloading default packages for language: cs (Czech) ...
2025-01-17 05:53:31 INFO: File exists: C:\Users\hjy\stanza_resources\cs\default.zip
2025-01-17 05:53:31 INFO: Finished downloading models and saved to C:\Users\hjy\stanza_resources
2025-01-17 05:53:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-17 05:53:35 INFO: Downloaded file to C:\Users\hjy\stanza_resources\resources.json
2025-01-17 05:53:35 INFO: Loading these models for language: cs (Czech):
| Processor | Package      |
----------------------------
| tokenize  | pdt          |
| mwt       | pdt          |
| pos       | pdt_nocharlm |
| lemma     | pdt_nocharlm |
| depparse  | pdt_nocharlm |

2025-01-17 05:53:35 INFO: Using device: cpu
2025-01-17 05:53:35 INFO: Loading: tokenize
  return self.fget.__get__(instance, owner)()
2025-01-17 05:53:35 INFO: Loading: mwt
2025-01-17 05:53:36 INFO: Loading: pos
2025-01-17 05:53:37 INFO: Loading: lemma
2025-01-17 05:53:39 INFO: Loading: depparse
2025-01-17 05:53:40 INFO: Done loading processors!


In [None]:
# Fetch stanza's default Persian package and build the Persian pipeline.
# extract_key_phrases uses this pipeline for its Farsi fallback and reads only
# `word.upos` from the result.
stanza.download('fa')
nlp_stanza_fa = stanza.Pipeline('fa')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-17 05:53:45 INFO: Downloaded file to C:\Users\hjy\stanza_resources\resources.json
2025-01-17 05:53:45 INFO: Downloading default packages for language: fa (Persian) ...
2025-01-17 05:53:46 INFO: File exists: C:\Users\hjy\stanza_resources\fa\default.zip
2025-01-17 05:53:47 INFO: Finished downloading models and saved to C:\Users\hjy\stanza_resources
2025-01-17 05:53:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-17 05:53:49 INFO: Downloaded file to C:\Users\hjy\stanza_resources\resources.json
2025-01-17 05:53:50 INFO: Loading these models for language: fa (Persian):
| Processor | Package        |
------------------------------
| tokenize  | perdt          |
| mwt       | perdt          |
| pos       | perdt_charlm   |
| lemma     | perdt_nocharlm |
| depparse  | perdt_charlm   |
| ner       | arman          |

2025-01-17 05:53:50 INFO: Using device: cpu
2025-01-17 05:53:50 INFO: Loading: tokenize
2025-01-17 05:53:50 INFO: Loading: mwt
2025-01-17 05:53:50 INFO: Loading: pos
2025-01-17 05:53:52 INFO: Loading: lemma
2025-01-17 05:53:52 INFO: Loading: depparse
2025-01-17 05:53:53 INFO: Loading: ner
2025-01-17 05:53:55 INFO: Done loading processors!


In [None]:
def clear_model_cache():
    """
    Drop every cached NER pipeline and ask the garbage collector to reclaim it.

    Empties the module-level ``model_cache`` dict in place (so every holder of
    the dict sees it emptied), runs a garbage-collection pass, and prints a
    confirmation message.
    """
    # Clearing the dict is what releases the cache's references. A per-item
    # `del` inside a loop would only unbind the loop variable, so none is used.
    model_cache.clear()
    gc.collect()
    print("The model cache has been cleared and memory has been freed.")

In [None]:
# Lazily loaded NER pipelines, keyed by Hugging Face model id.
model_cache = {}

# Languages with a dedicated checkpoint; every other language uses the fallback.
# NOTE(review): "asafaya/bert-base-arabic" carries no NER suffix in its id -
# confirm the local checkpoint ships a trained token-classification head.
_NER_MODEL_BY_LANGUAGE = {
    "ar": "asafaya/bert-base-arabic",
    "ca": "projecte-aina/roberta-base-ca-v2-cased-ner",
    "fa": "HooshvareLab/bert-fa-base-uncased-ner-arman",
}
_DEFAULT_NER_MODEL = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"

# Everything is read from the local cache directory; nothing is downloaded, so
# no download-related options (such as resume_download) are passed.
_NER_LOAD_OPTIONS = {
    "cache_dir": "../huggingface/",
    "local_files_only": True,
    "ignore_mismatched_sizes": True,
}


def get_ner_pipeline(language):
    """
    Return the Hugging Face "ner" pipeline to use for ``language``.

    The pipeline is built on first use and cached by model id, so languages
    that share the fallback checkpoint reuse the loaded model instead of
    evicting and reloading it on every language switch. When a different
    checkpoint is needed, the cache is cleared first so that only one model is
    held at a time.

    Args:
        language: Language code such as "ar", "ca", "fa" or "en".

    Returns:
        The cached ``transformers`` pipeline object for that language's model.
    """
    model_name = _NER_MODEL_BY_LANGUAGE.get(language, _DEFAULT_NER_MODEL)

    if model_name not in model_cache:
        # Release the previously loaded checkpoint before loading a new one.
        clear_model_cache()

        tokenizer = AutoTokenizer.from_pretrained(model_name, **_NER_LOAD_OPTIONS)
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, **_NER_LOAD_OPTIONS
        )
        model_cache[model_name] = pipeline("ner", model=model, tokenizer=tokenizer)

    return model_cache[model_name]

In [None]:
# Eagerly load the Arabic checkpoint from the local Hugging Face cache and
# expose its tokenizer, model and NER pipeline as module-level names.
_AR_CHECKPOINT = "asafaya/bert-base-arabic"
_ar_load_options = dict(
    cache_dir="../huggingface/",
    local_files_only=True,
    ignore_mismatched_sizes=True,
)

tokenizer_ar = AutoTokenizer.from_pretrained(_AR_CHECKPOINT, **_ar_load_options)
model_ar = AutoModelForTokenClassification.from_pretrained(
    _AR_CHECKPOINT, **_ar_load_options
)
ner_pipeline_ar = pipeline(task="ner", model=model_ar, tokenizer=tokenizer_ar)

In [None]:
# Fetch NLTK's 'punkt' data package. No other visible code in this file calls
# nltk, so this may be a leftover - TODO confirm before removing.
nltk.download('punkt')

# Arabic question mark, comma and semicolon; guillemets; ellipsis; straight
# double quote; and both opening and closing curly quotes (single and double).
# Written with escapes so the class reads the same regardless of bidi rendering.
_ARABIC_PUNCTUATION_RE = re.compile(
    r'[\u061F\u060C\u061B\u00AB\u00BB\u2026"\u201C\u201D\u2018\u2019]'
)


def remove_arabic_punctuation(text):
    """
    Replace Arabic and typographic punctuation marks in ``text`` with spaces.

    Each matched character becomes a single space, so word boundaries are
    preserved for the whitespace split that follows in the caller.

    Args:
        text: Input string.

    Returns:
        The string with every matched punctuation character replaced by " ".
    """
    return _ARABIC_PUNCTUATION_RE.sub(' ', text)


# Languages whose stop words are read from a local text file, one entry per line.
_STOPWORD_FILES = {
    "zh": "../stopwords/stopwords-zh.txt",
    "ar": "../stopwords/stopwords-ar.txt",
    "eu": "../stopwords/stopwords-eu.txt",
}


@lru_cache(maxsize=None)
def _read_stopword_file(path):
    """Read one stop-word file once and keep the result for later calls."""
    with open(path, encoding="utf-8") as f:
        # Blank lines are skipped so the empty string never becomes a stop word.
        return frozenset(word for word in map(str.strip, f) if word)


def load_stopwords(language):
    """
    Return the file-based stop-word set for ``language``.

    Only "zh", "ar" and "eu" have a local file; any other language yields an
    empty set. File contents are cached after the first read, and each call
    returns a fresh ``set`` so callers may modify it freely.

    Args:
        language: Language code.

    Returns:
        A ``set`` of stop words (possibly empty).

    Raises:
        OSError: If the stop-word file for a supported language cannot be opened.
    """
    path = _STOPWORD_FILES.get(language)
    if path is None:
        return set()
    return set(_read_stopword_file(path))

In [None]:
# spaCy pipelines used for languages that have no dedicated branch in
# filter_stop_words.
_STOPWORD_SPACY_MODELS = {
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
    "de": "de_core_news_sm",
    "it": "it_core_news_sm",
    "fi": "fi_core_news_sm",
    "sv": "sv_core_news_sm",
    "ca": "ca_core_news_sm",
}


@lru_cache(maxsize=None)
def _load_spacy_model(model_name):
    """Load a packaged spaCy pipeline once and reuse it on later calls."""
    return spacy.load(model_name)


@lru_cache(maxsize=None)
def _blank_spacy_tokenizer(language):
    """Build a tokenizer-only spaCy pipeline once per language."""
    return spacy.blank(language)


@lru_cache(maxsize=None)
def _iso_stopwords(language):
    """Return the stopwordsiso list for ``language`` as an immutable set."""
    return frozenset(stopwords(language))


@lru_cache(maxsize=None)
def _hazm_resources():
    """Build the hazm normalizer, tokenizer and stop-word set once."""
    return Normalizer(), WordTokenizer(), frozenset(stopwords_list())


def _load_basque_tokenizer():
    """Load xx_ent_wiki_sm, attempting one download if it is not installed."""
    try:
        return _load_spacy_model("xx_ent_wiki_sm")
    except OSError:
        # Use the running interpreter so the package lands in this environment.
        # The exit status is not checked; a failed download surfaces as the
        # OSError raised by the retry below.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "xx_ent_wiki_sm"]
        )
        return _load_spacy_model("xx_ent_wiki_sm")


# filter stop words
def filter_stop_words(text: str, language: str):
    """
    Remove stop words from ``text`` using a language-specific toolchain.

    - zh: jieba tokens minus the local stop-word file and ASCII punctuation.
    - ar: whitespace tokens minus stopwordsiso and the local stop-word file,
      after Arabic punctuation has been replaced by spaces.
    - hi: indic-nlp tokens minus stopwordsiso and ASCII punctuation.
    - eu: xx_ent_wiki_sm tokens minus the local stop-word file.
    - cs: blank spaCy tokens minus stopwordsiso.
    - fa: hazm-normalized tokens minus hazm's stop-word list.
    - others: spaCy tokens that are not stop words or punctuation AND are
      tagged NOUN or PROPN (so this branch also drops other parts of speech).

    Tokenizers, models and stop-word lists are loaded once and reused.

    Args:
        text: Raw input text.
        language: Language code.

    Returns:
        The surviving tokens joined by single spaces.

    Raises:
        ValueError: If no spaCy model is configured for ``language`` or the
            configured model is not installed.
    """
    if language == "zh":
        file_stopwords = load_stopwords(language)
        kept = [
            token for token in jieba.cut(text)
            if token.strip()
            and token not in file_stopwords
            and token not in string.punctuation
        ]
        return " ".join(kept)

    if language == "ar":
        # Remove Arabic punctuation
        cleaned = remove_arabic_punctuation(text)
        combined_stopwords = _iso_stopwords("ar") | load_stopwords(language)
        return " ".join(
            token for token in cleaned.split() if token not in combined_stopwords
        )

    if language == "hi":
        hindi_stopwords = _iso_stopwords("hi")
        tokens = indic_tokenize.trivial_tokenize(text, lang='hi')
        return " ".join(
            token for token in tokens
            if token not in hindi_stopwords and token not in string.punctuation
        )

    if language == "eu":
        file_stopwords = load_stopwords(language)
        nlp_basque = _load_basque_tokenizer()
        return " ".join(
            token.text for token in nlp_basque(text)
            if token.text.lower() not in file_stopwords
        )

    if language == "cs":
        czech_stopwords = _iso_stopwords("cs")
        nlp_czech = _blank_spacy_tokenizer("cs")
        return " ".join(
            token.text for token in nlp_czech(text)
            if token.text.lower() not in czech_stopwords
        )

    if language == "fa":
        normalizer_fa, tokenizer_fa, stopwords_fa = _hazm_resources()
        tokens = tokenizer_fa.tokenize(normalizer_fa.normalize(text))
        return " ".join(token for token in tokens if token not in stopwords_fa)

    model_name = _STOPWORD_SPACY_MODELS.get(language)
    if model_name is None:
        # Fail with a clear message instead of passing None to spacy.load.
        raise ValueError(
            f"Unsupported language {language!r}: no spaCy model is configured for it."
        )

    try:
        nlp = _load_spacy_model(model_name)
    except OSError as err:
        raise ValueError(f"Make sure you have installed {model_name} model！") from err

    kept = [
        token.text for token in nlp(text)
        if not token.is_stop
        and not token.is_punct
        and token.pos_ in {"NOUN", "PROPN"}
    ]
    return " ".join(kept)

In [None]:
def _join_subword_tokens(tokens):
    """Rebuild the surface text of one entity from its sub-word tokens."""
    pieces = []
    previous_end = None
    for token in tokens:
        raw = token['word']
        # "▁" (SentencePiece) and "Ġ" (byte-level BPE) both mark the start of
        # a new word; "##" (WordPiece) marks a continuation of the previous one.
        piece = raw.replace("▁", " ").replace("Ġ", " ").replace("##", "")
        start = token.get('start')
        # WordPiece gives new words no marker at all, so fall back to the
        # character offsets: a gap since the previous token means a new word.
        follows_gap = (
            bool(pieces)
            and not raw.startswith("##")
            and not piece.startswith(" ")
            and start is not None
            and previous_end is not None
            and start > previous_end
        )
        if follows_gap:
            piece = " " + piece
        pieces.append(piece)
        previous_end = token.get('end')
    # Collapse whitespace so stacked markers never produce double spaces.
    return " ".join("".join(pieces).split())


def _build_merged_entity(entity_type, tokens):
    """Combine the tokens of one entity into a single result dict."""
    return {
        "entity": entity_type,
        "word": _join_subword_tokens(tokens),
        # average confidence
        "score": sum(token['score'] for token in tokens) / len(tokens),
        "start": tokens[0]['start'],
        "end": tokens[-1]['end'],
    }


def merge_subwords_preserve_spaces(ner_results, text, language='en'):
    """
    Merge token-level NER results into whole entities.

    Consecutive results with the same entity type (after the "B-"/"I-" prefix
    is removed) are combined into one entity. For the languages in the split
    set below, a token containing the SentencePiece marker "▁" additionally
    starts a new entity. The B-/I- distinction itself does not split entities.

    Args:
        ner_results: Iterable of dicts with "word", "entity", "score",
            "start" and "end" keys, in text order.
        text: The text the results were computed on (kept for interface
            compatibility; not read).
        language: Language code controlling the word-boundary split.

    Returns:
        A list of dicts with "entity", "word" (markers removed, words
        separated by single spaces), "score" (mean token score), "start" and
        "end".
    """
    split_on_word_start = language in {'zh', 'ar', 'hi', 'cs', 'eu', 'ca', 'fa'}

    merged_entities = []
    current_entity = None
    current_tokens = []

    for result in ner_results:
        entity_type = result['entity'].replace("B-", "").replace("I-", "")

        starts_new_entity = (
            current_entity is None
            or entity_type != current_entity
            or (split_on_word_start and '▁' in result['word'])
        )

        if starts_new_entity:
            if current_tokens:
                merged_entities.append(
                    _build_merged_entity(current_entity, current_tokens)
                )
            current_entity = entity_type
            current_tokens = [result]
        else:
            current_tokens.append(result)

    # Flush the entity that was still open when the input ended.
    if current_tokens:
        merged_entities.append(_build_merged_entity(current_entity, current_tokens))

    return merged_entities


In [None]:
# spaCy pipelines used by the noun fallback for languages that have no
# dedicated fallback branch in extract_key_phrases.
_FALLBACK_SPACY_MODELS = {
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
    "de": "de_core_news_sm",
    "it": "it_core_news_sm",
    "fi": "fi_core_news_sm",
    "sv": "sv_core_news_sm",
    "ca": "ca_core_news_sm",
}


@lru_cache(maxsize=None)
def _cached_spacy_pipeline(model_name):
    """Load a packaged spaCy pipeline once and reuse it on later calls."""
    return spacy.load(model_name)


def _key_phrase_result(ner_entities, additional_phrases=()):
    """Build the result dict returned by extract_key_phrases."""
    return {
        "NER_entities": ner_entities,
        "Additional_phrases": list(additional_phrases),
    }


def _only_punctuation(tokens):
    """Return True when every token is found in ``string.punctuation``."""
    return all(token in string.punctuation for token in tokens)


def _top_tfidf_terms(tokens, top_k=10):
    """Return up to ``top_k`` terms of ``tokens`` ranked by TF-IDF weight."""
    vectorizer = TfidfVectorizer()
    try:
        # Only one document is fitted, so the IDF factor is the same for every
        # term and the ranking follows in-text frequency.
        matrix = vectorizer.fit_transform([" ".join(tokens)])
    except ValueError:
        # The vectorizer found no usable term (for example only one-character
        # tokens). Fall back to the distinct tokens instead of crashing.
        return list(dict.fromkeys(tokens))
    feature_names = vectorizer.get_feature_names_out()
    scores = matrix.toarray()[0]
    return [feature_names[i] for i in scores.argsort()[-top_k:][::-1]]


def _stanza_nouns(stanza_pipeline, text):
    """Return the NOUN/PROPN word texts that a stanza pipeline finds in ``text``."""
    doc = stanza_pipeline(text)
    return [
        word.text
        for sent in doc.sentences
        for word in sent.words
        if word.upos in ("NOUN", "PROPN")
    ]


# key phrase extracted function
def extract_key_phrases(text: str, language: str = "en") -> dict:
    """
    Extract key phrases from ``text``.

    Steps:
    - filter stop words
    - run the Hugging Face NER pipeline on the filtered text and merge
      sub-word tokens into entities
    - if no entity was found, fall back to a language-specific extractor
      (nouns where a tagger is available)

    Args:
        text: Raw input text.
        language: Language code; defaults to "en".

    Returns:
        A dict with "NER_entities" (list of entity strings) and
        "Additional_phrases" (list of fallback phrases, which is empty
        whenever any NER entity was found).

    Raises:
        ValueError: If the spaCy fallback is needed but no model is configured
            or installed for ``language``.
    """
    # filter stop words
    filtered_text = filter_stop_words(text, language)

    # Nothing left to analyse: skip model loading and inference entirely.
    if not filtered_text.strip():
        return _key_phrase_result([])

    # extract NER
    ner_pipeline = get_ner_pipeline(language)
    ner_results = ner_pipeline(filtered_text)
    # NOTE(review): `language` is not forwarded here, so the merge step always
    # runs with its default and never applies its per-language word split.
    # Forwarding it would change the entities produced - confirm the intent.
    merged_entities = merge_subwords_preserve_spaces(ner_results, filtered_text)
    ner_entities = [entity['word'] for entity in merged_entities]

    # if NER entity has been extracted, then return
    if ner_entities:
        return _key_phrase_result(ner_entities)

    # if NER cannot be extracted
    # for zh, using jieba
    if language == "zh":
        tfidf_keywords = jieba.analyse.extract_tags(
            filtered_text, topK=2, withWeight=False
        )
        return _key_phrase_result(ner_entities, tfidf_keywords)

    # for hi, using indic-nlp-library extract token key phrase
    if language == "hi":
        tokens = list(indic_tokenize.trivial_tokenize(filtered_text))
        if not tokens or _only_punctuation(tokens):
            return _key_phrase_result(ner_entities)
        return _key_phrase_result(ner_entities, tokens)

    if language == "ar":
        # Only the tokenizer is needed here; no model forward pass is run.
        token_ids = tokenizer_ar(filtered_text)["input_ids"]
        tokens = tokenizer_ar.convert_ids_to_tokens(token_ids)
        # isalpha() drops special tokens such as "[CLS]" and "##" continuation
        # pieces, leaving whole words and word-initial pieces.
        word_tokens = [token for token in tokens if token.isalpha()]
        if len(word_tokens) < 2:
            return _key_phrase_result(ner_entities, word_tokens)
        return _key_phrase_result(ner_entities, _top_tfidf_terms(word_tokens))

    if language == "eu":
        # Load the pipeline locally: the one created inside filter_stop_words
        # is not visible from this function.
        nlp_basque = _cached_spacy_pipeline("xx_ent_wiki_sm")
        tokens = [ent.text for ent in nlp_basque(filtered_text).ents]
        if not tokens:
            return _key_phrase_result(ner_entities)
        return _key_phrase_result(ner_entities, _top_tfidf_terms(tokens))

    if language in ("cs", "fa"):
        stanza_pipeline = nlp_stanza_czech if language == "cs" else nlp_stanza_fa
        tokens = _stanza_nouns(stanza_pipeline, filtered_text)
        if not tokens or _only_punctuation(tokens):
            return _key_phrase_result(ner_entities)
        return _key_phrase_result(ner_entities, _top_tfidf_terms(tokens))

    # for other languages, using spaCy extract key phrase
    model_name = _FALLBACK_SPACY_MODELS.get(language)
    if model_name is None:
        raise ValueError(
            f"Unsupported language {language!r}: no spaCy model is configured for it."
        )
    try:
        nlp = _cached_spacy_pipeline(model_name)
    except OSError as err:
        # Name the model that is actually required (e.g. en_core_web_sm).
        raise ValueError(f"Make sure installed {model_name} model！") from err

    # Order-preserving de-duplication keeps the output reproducible.
    tokens = list(dict.fromkeys(
        token.text for token in nlp(filtered_text)
        if token.pos_ in {"NOUN", "PROPN"}
    ))

    if not tokens or _only_punctuation(tokens):
        return _key_phrase_result(ner_entities)

    if len(tokens) < 2:
        return _key_phrase_result(ner_entities, tokens)

    return _key_phrase_result(ner_entities, _top_tfidf_terms(tokens))

## Apply on the Dataset

In [None]:
# Directory scanned for input *.jsonl files.
input_dir = '../data/val/val/'
# Directory that receives one annotated file per input file.
output_dir = '../data/detect_val/extract_m2/'

# Create the output directory (and any missing parents); a no-op if it exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Annotate every *.jsonl file in input_dir with a "keywords" list and write the
# result to output_dir under the same file name. Files are processed in sorted
# order so that runs are reproducible.
for filename in tqdm(sorted(os.listdir(input_dir))):
    if not filename.endswith('.jsonl'):
        continue

    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Tolerate blank lines (for example a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            # `or` also covers keys that are present but null.
            text = data.get('model_input') or ''
            language = (data.get('lang') or 'en').lower()

            key_phrases_result = extract_key_phrases(text, language)
            ner_entities = key_phrases_result.get("NER_entities", [])
            additional_phrases = key_phrases_result.get("Additional_phrases", [])

            # De-duplicate while keeping first-seen order, so the written
            # keyword order is stable between runs.
            combined_keywords = list(dict.fromkeys([*ner_entities, *additional_phrases]))

            data['keywords'] = combined_keywords

            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')

print("Keywords extraction and save completed!")