In [1]:
from transformers import pipeline
import pandas as pd

class NERExtractor:
    """Wrap a Hugging Face ``ner`` pipeline and reshape its output.

    The pipeline is created once in ``__init__`` and reused for every call to
    ``extract_entities``. ``format_entities`` and ``filter_entities_by_type``
    are pure helpers that operate on plain lists of dicts.
    """

    def __init__(self, model_name: str = "dbmdz/bert-large-cased-finetuned-conll03-english"):
        """Build the NER pipeline for ``model_name``.

        ``aggregation_strategy="simple"`` is requested so that results carry
        the ``word`` / ``entity_group`` / ``score`` keys that
        ``format_entities`` reads.
        """
        self.ner_pipeline = pipeline("ner", model=model_name, aggregation_strategy="simple")

    def extract_entities(self, text: str):
        """Run the NER pipeline on ``text`` and return its raw result."""
        return self.ner_pipeline(text)

    def format_entities(self, entities) -> list:
        """Reduce raw pipeline entities to ``entity`` / ``type`` / ``score`` dicts.

        Args:
            entities: Iterable of dicts with ``word``, ``entity_group`` and
                ``score`` keys (the shape returned by ``extract_entities``).

        Returns:
            A list of ``{"entity": str, "type": str, "score": float}`` dicts,
            in input order, with ``score`` rounded to two decimals.

        Raises:
            KeyError: If an entity lacks one of the three expected keys.
        """
        return [
            {
                "entity": entity["word"],
                "type": entity["entity_group"],
                # Convert to a built-in float *before* rounding: scores can
                # arrive as numpy.float32, and round() on that type keeps the
                # float32 value, which formats as 0.8500000238418579 instead
                # of 0.85 and is rejected by the stdlib json encoder.
                "score": round(float(entity["score"]), 2),
            }
            for entity in entities
        ]

    def filter_entities_by_type(self, entities, entity_type: str) -> list:
        """Return the formatted entities whose ``type`` equals ``entity_type``.

        ``entities`` must be in the shape produced by ``format_entities``
        (each item is indexed with the ``"type"`` key).
        """
        return [entity for entity in entities if entity["type"] == entity_type]

def main():
    """Demo: run NER on a fixed sample and print all, PER, and ORG entities."""
    sample = """
    Elon Musk is the CEO of SpaceX and Tesla, which are headquartered in California.
    On October 1, 2023, he announced the launch of a new rocket named Starship.
    """

    extractor = NERExtractor()

    # Extract and format in one step; the raw pipeline output is not needed.
    entities = extractor.format_entities(extractor.extract_entities(sample))

    print("Extracted Entities:")
    for item in entities:
        print(f"Entity: {item['entity']}, Type: {item['type']}, Score: {item['score']}")

    # Each section prints its heading, then the entities carrying that tag.
    sections = (
        ("\nFiltered Entities (Person):", "PER"),
        ("\nFiltered Entities (Organization):", "ORG"),
    )
    for heading, tag in sections:
        print(heading)
        for match in extractor.filter_entities_by_type(entities, tag):
            print(match)

# Run the demo only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()




config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Extracted Entities:
Entity: Elon Musk, Type: PER, Score: 1.0
Entity: SpaceX, Type: ORG, Score: 1.0
Entity: Tesla, Type: ORG, Score: 1.0
Entity: California, Type: LOC, Score: 1.0
Entity: Starship, Type: MISC, Score: 0.8500000238418579

Filtered Entities (Person):
{'entity': 'Elon Musk', 'type': 'PER', 'score': 1.0}

Filtered Entities (Organization):
{'entity': 'SpaceX', 'type': 'ORG', 'score': 1.0}
{'entity': 'Tesla', 'type': 'ORG', 'score': 1.0}
