In [1]:
from polyner import PolyNER
import pandas as pd

# Initialize the processor
processor = PolyNER()

# Test text with multiple languages and named entities
test_text = """
Apple Inc. was founded by Steve Jobs in Cupertino, California.
Google tiene su sede en Mountain View, California.
Amazon wurde von Jeff Bezos in Seattle gegründet.
Microsoft was founded by Bill Gates and Paul Allen.
La Tour Eiffel est située à Paris, France.
"""

print("Testing multilingual processing with different confidence thresholds...\n")

# Test with the library's default threshold (the banner below states 0.5;
# NOTE(review): confirm against the installed PolyNER version).
print("=== DEFAULT CONFIDENCE (0.5) ===")
try:
    result_default = processor.process_multi(test_text)

    # Display all tokens and their properties
    print("All tokens:")
    print(result_default)

    # Keep only the rows that carry an entity label
    entities_default = result_default[result_default["entity_label"].notna()]
    print("\nEntities detected (default confidence):")

    # The frame printed by this notebook exposes the score as "confidence";
    # the previous check for "entity_score" never matched, so scores were
    # silently omitted. "entity_score" is kept as a fallback in case another
    # PolyNER version uses that name.
    entity_columns = ["token", "language", "entity_label"]
    score_column = next(
        (
            name
            for name in ("confidence", "entity_score")
            if name in entities_default.columns
        ),
        None,
    )
    if score_column is not None:
        entity_columns.append(score_column)
    print(entities_default[entity_columns])

except Exception as e:
    # Demo notebook: report the failure and let the remaining cells run.
    print(f"Error with default confidence: {e}")

print("\n" + "="*50 + "\n")


Testing multilingual processing with different confidence thresholds...

=== DEFAULT CONFIDENCE (0.5) ===


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


All tokens:
         token language  is_emoji  norm_token entity_label  confidence
0        Apple       en     False       apple          ORG    0.996401
1         Inc.       en     False        inc.          ORG    0.996401
2          was       en     False         was         None         NaN
3      founded       en     False     founded         None         NaN
4           by       en     False          by         None         NaN
5        Steve       en     False       steve          PER    0.999432
6         Jobs       en     False        jobs          PER    0.999432
7           in       en     False          in         None         NaN
8    Cupertino       en     False   cupertino          LOC    0.998989
9            ,       en     False           ,         None         NaN
10  California       en     False  california          LOC    0.999100
11           .       en     False           .         None         NaN
12      Google       en     False      google          ORG    0.9

In [2]:
# Re-run the same text with confidence_threshold=0.8
print("=== HIGH CONFIDENCE (0.8) ===")
try:
    result_high = processor.process_multi(test_text, confidence_threshold=0.8)

    # Rows with a non-null entity_label are the detected entities
    labelled_high = result_high["entity_label"].notna()
    entities_high = result_high.loc[labelled_high]

    print("Entities detected (high confidence):")
    print(entities_high)
except Exception as err:
    # Report the failure instead of aborting the notebook run
    print(f"Error with high confidence: {err}")

# Visual separator between experiments
print(f"\n{'=' * 50}\n")



=== HIGH CONFIDENCE (0.8) ===


Device set to use cpu


Entities detected (high confidence):
         token language  is_emoji  norm_token entity_label  confidence
0        Apple       en     False       apple          ORG    0.996401
1         Inc.       en     False        inc.          ORG    0.996401
5        Steve       en     False       steve          PER    0.999432
6         Jobs       en     False        jobs          PER    0.999432
8    Cupertino       en     False   cupertino          LOC    0.998989
10  California       en     False  california          LOC    0.999100
12      Google       en     False      google          ORG    0.991959
17    Mountain       en     False    mountain          LOC    0.999322
18        View       en     False        view          LOC    0.999322
20  California       en     False  california          LOC    0.999098
22      Amazon       de     False      amazon          ORG    0.991528
25        Jeff       de     False        jeff          PER    0.999698
26       Bezos       de     False       

In [3]:
# Re-run the same text with confidence_threshold=0.2
print("=== LOW CONFIDENCE (0.2) ===")
try:
    result_low = processor.process_multi(test_text, confidence_threshold=0.2)

    # Rows with a non-null entity_label are the detected entities
    labelled_low = result_low["entity_label"].notna()
    entities_low = result_low.loc[labelled_low]

    print("Entities detected (low confidence):")
    print(entities_low)
except Exception as err:
    # Report the failure instead of aborting the notebook run
    print(f"Error with low confidence: {err}")

# Visual separator between experiments
print(f"\n{'=' * 50}\n")

=== LOW CONFIDENCE (0.2) ===


Device set to use cpu


Entities detected (low confidence):
         token language  is_emoji  norm_token entity_label  confidence
0        Apple       en     False       apple          ORG    0.996401
1         Inc.       en     False        inc.          ORG    0.996401
5        Steve       en     False       steve          PER    0.999432
6         Jobs       en     False        jobs          PER    0.999432
8    Cupertino       en     False   cupertino          LOC    0.998989
10  California       en     False  california          LOC    0.999100
12      Google       en     False      google          ORG    0.991959
17    Mountain       en     False    mountain          LOC    0.999322
18        View       en     False        view          LOC    0.999322
20  California       en     False  california          LOC    0.999098
22      Amazon       de     False      amazon          ORG    0.991528
25        Jeff       de     False        jeff          PER    0.999698
26       Bezos       de     False       b

In [4]:
from polyner import PolyNER

# Fresh processor instance for the batch demo
processor = PolyNER()

# One short sentence per input text
texts = [
    "Apple Inc. is based in Cupertino.",
    "Google tiene su sede en Mountain View.",
    "Amazon wurde von Jeff Bezos gegründet.",
]

# process_batch_multi yields one result per input text; each result is
# indexed like a DataFrame in the loop below
results = processor.process_batch_multi(texts)

# Print the labelled tokens of every text, numbered from 1
for text_number, tokens in enumerate(results, start=1):
    print(f"Text {text_number} entities:")
    has_label = tokens["entity_label"].notna()
    labelled_tokens = tokens.loc[has_label]
    print(labelled_tokens[["token", "language", "entity_label"]])

Device set to use cpu
Device set to use cpu
Device set to use cpu


Text 1 entities:
       token language entity_label
0      Apple       en          ORG
1       Inc.       en          ORG
5  Cupertino       en          LOC
Text 2 entities:
      token language entity_label
0    Google       nl          ORG
5  Mountain       nl          LOC
6      View       nl          LOC
Text 3 entities:
    token language entity_label
0  Amazon       de          ORG
3    Jeff       de          PER
4   Bezos       de          PER
