# ***Prepoznavanje imenovanih entiteta (NER)***

### Opis: Ispitajte algoritme koji identificiraju imena, lokacije i organizacije unutar teksta.

##### Molim vas da stavite NER.ipynb u folder/mapu i pokrenete ćeliju ispod tako da se svi potrebni podaci skinu (oko 1.01 GB)
* U slučaju da već imate spojen Kaggle račun, jedna od ćelija se može preskočiti

In [None]:
# Potrebno za sve celije ispod
import os

In [None]:
from pathlib import Path

# Best-effort probe for the classic Jupyter Notebook packages. Nothing below
# depends on its outcome, so ordinary failures (e.g. a missing package) are
# deliberately ignored. `except Exception` is used instead of a bare
# `except:` so that KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    from notebook import notebookapp
    import requests, json
    # Base name of the file that '../kernel.json' (relative to the cwd) resolves to
    connection_file = os.path.basename(os.path.realpath(os.path.join(os.getcwd(), '..', 'kernel.json')))
except Exception:
    pass

# Folder containing 'NER.ipynb' (the relative path is resolved against the cwd)
notebook_path = Path("NER.ipynb").resolve().parent

# Create the 'data' folder next to 'NER.ipynb'
dest_dir = notebook_path / "data"
dest_dir.mkdir(parents=True, exist_ok=True)


In [None]:
import io, zipfile, urllib.request, shutil

# === Settings ===
repo = "tinficok-faks/NER"   # GitHub repository as owner/repo
branch = "main"              # branch whose snapshot is downloaded
subfolder = "data"           # only this folder of the repo is extracted

# === Download the ZIP snapshot of the branch into memory ===
zip_url = f"https://github.com/{repo}/archive/refs/heads/{branch}.zip"
with urllib.request.urlopen(zip_url) as resp:
    zip_bytes = resp.read()

# === Extract ONLY the requested subfolder into dest_dir ===
found_any = False
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
    names = z.namelist()
    if not names:
        raise RuntimeError("Empty zip from GitHub.")

    # The first path component of the first entry is taken as the archive's
    # top-level folder (expected to look like 'NER-main')
    top = names[0].split("/")[0]
    prefix = f"{top}/{subfolder.strip('/')}/"

    for member in names:
        # Ignore directory entries and everything outside the subfolder
        if member.endswith("/") or not member.startswith(prefix):
            continue
        found_any = True
        rel_path = member[len(prefix):]  # path relative to the subfolder
        target_path = os.path.join(dest_dir, rel_path)
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with z.open(member) as src, open(target_path, "wb") as dst:
            shutil.copyfileobj(src, dst)

if not found_any:
    raise FileNotFoundError(f"Subfolder '{subfolder}' not found in repo {repo}@{branch}.")


🟨 Ćelija ispod se može preskočiti ako već imate postavljenu config datoteku s API ključem nekog Kaggle računa

In [None]:
# Install the Kaggle command-line client (IPython shell escape)
!pip install kaggle

# Target file path: 'kaggle.json' inside the notebook folder
dest_file = notebook_path / "kaggle.json"

# Download the file
# NOTE(review): this fetches an API credential from a plain URL; anyone who
# can read that URL can use the key. Consider having users supply their own
# kaggle.json instead.
url = "https://raw.githubusercontent.com/tinficok-faks/NER/main/kaggle.json"
urllib.request.urlretrieve(url, dest_file)

print(f"Downloaded kaggle.json to: {dest_file}")

# This is the notebook author's own API key.
# Point the Kaggle client at the current working directory for its config.
# NOTE(review): assumes the cwd is the folder kaggle.json was saved to
# (notebook_path) - confirm.
os.environ["KAGGLE_CONFIG_DIR"] = os.getcwd()

🟩 Ova ćelija je potrebna za preuzimanje baze gradova i kompanija

In [None]:
# Download both Kaggle datasets and unzip them into the 'data' folder
# ("data" is a relative path, so it is resolved by the shell command itself)
!kaggle datasets download -d juanmah/world-cities -p "data" --unzip
!kaggle datasets download -d peopledatalabssf/free-7-million-company-dataset -p "data" --unzip

In [None]:
# 'kaggle.json' is not needed any more, so remove it.
# Delete or comment out this cell if you would rather keep the file.
file_to_delete = notebook_path.joinpath("kaggle.json")
if file_to_delete.exists():
    os.remove(file_to_delete)

# Početak NER programa

Sve potrebne biblioteke za projekt

In [None]:
# Required libraries
import os
import random
import nltk
import regex as re
import pandas as pd
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report

# Download the NLTK tokenizer data used by word_tokenize.
# Older NLTK releases look up the 'punkt' resource, newer ones look up
# 'punkt_tab', so both are requested to keep the notebook working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

Učitavanje podataka
* imena
* gradovi
* kompanije
* države

In [None]:
# Load Croatian and English first names from four text files
def load_names():
    """Return the set of lower-cased first names from the four name lists.

    Reads the English and Croatian male/female name files from the ``data``
    folder (UTF-8, one entry per line). Surrounding whitespace is stripped
    and blank lines are skipped, so the empty string never ends up in the
    set (the same rule ``load_countries`` applies).

    Returns:
        set[str]: Unique names, lower-cased.

    Raises:
        OSError: If one of the four files cannot be opened.
    """
    names = set()
    files = ['ENG-muska-imena.txt', 'ENG-zenska-imena.txt', 'HR-muska-imena.txt', 'HR-zenska-imena.txt']
    for file in files:
        path = os.path.join('data', file)
        with open(path, 'r', encoding='utf-8') as f:
            # The generator is fully consumed while the file is still open
            names.update(name.lower() for name in map(str.strip, f) if name)
    return names

# Load city names from the cities database (a CSV file)
def load_cities():
    """Return the set of lower-cased city names from ``data/worldcities.csv``.

    Only the ``city`` column is used; rows with a missing city are dropped.
    """
    table = pd.read_csv('data/worldcities.csv')
    city_column = table['city'].dropna()
    lowered = city_column.str.lower()
    return set(lowered.unique())

# Load the ~7M company names from the companies database (a CSV file)
def load_companies():
    """Return the set of lower-cased names from ``data/companies_sorted.csv``.

    Only the ``name`` column is read from the file; rows with a missing name
    are dropped.
    """
    table = pd.read_csv(
        'data/companies_sorted.csv',
        usecols=['name'],
        encoding='utf-8',
        low_memory=False,
    )
    company_names = table['name'].dropna()
    return set(company_names.str.lower().unique())

# Load country names in Croatian and English from text files
def load_countries():
    """Return the set of lower-cased country names from the two country lists.

    Reads ``countries.txt`` and ``drzave.txt`` from the ``data`` folder
    (UTF-8, one entry per line), strips surrounding whitespace and ignores
    blank lines.
    """
    countries = set()
    for filename in ('countries.txt', 'drzave.txt'):
        with open(os.path.join('data', filename), 'r', encoding='utf-8') as handle:
            stripped = (raw.strip() for raw in handle)
            # Empty strings are falsy, which filters out the blank lines
            countries.update(entry.lower() for entry in stripped if entry)
    return countries

# Load every lookup set once, at module level, so that later cells can do
# plain set-membership tests against them.
names_set = load_names()
cities_set = load_cities()
companies_set = load_companies()
countries_set = load_countries()

Tokenizira rečenice i pretvara ih u BIO (beginning, inside, outside) oznake

In [None]:
def prepare_data(sentences):
    """Tokenise annotated sentences and derive a BIO label for every token.

    Args:
        sentences: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is an iterable of ``(entity_text, tag)`` pairs.

    Returns:
        list[tuple[list[str], list[str]]]: One ``(tokens, labels)`` pair per
        sentence. For each annotation only the first matching occurrence in
        the sentence is labelled (``B-<tag>`` followed by ``I-<tag>``);
        every other token keeps the label ``'O'``.
    """
    data = []
    for sent, annotations in sentences:
        tokens = word_tokenize(sent)
        labels = ['O'] * len(tokens)
        for entity, tag in annotations:
            entity_tokens = word_tokenize(entity)
            span = len(entity_tokens)
            # An entity that tokenises to nothing would "match" the empty
            # slice at position 0 and mislabel the first token (or raise
            # IndexError on an empty sentence), so it is skipped.
            if span == 0:
                continue
            for start in range(len(tokens) - span + 1):
                if tokens[start:start + span] == entity_tokens:
                    labels[start] = 'B-' + tag
                    labels[start + 1:start + span] = ['I-' + tag] * (span - 1)
                    break  # only the first occurrence is labelled
        data.append((tokens, labels))
    return data

* **word2features** - pravi rječnik značajki za treniranje modela kao što je CRF, koristi posebne oznake za BOS (beginning of sentence) i EOS (end of sentence)
* **sent2features** - pokreće funkciju *word2features* za svaki token u rečenici
* **sent2labels** - vraća nepromijenjenu listu oznaka/tagova

In [None]:
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i``.

    Features cover the token itself (lower-cased form, casing/digit flags,
    3-character prefix and suffix, membership in the module-level
    ``names_set``, ``cities_set``, ``companies_set`` and ``countries_set``)
    plus the lower-cased form and title-case flag of the previous and next
    token. ``'BOS'`` / ``'EOS'`` flags are set instead when there is no
    previous / next token.

    Args:
        sent: Sequence of token strings.
        i: Index into ``sent`` of the token to describe.

    Returns:
        dict: Feature name -> value for the token.
    """
    word = sent[i]
    lowered = word.lower()  # computed once, reused by every lookup below
    features = {
        'bias': 1.0,
        'word.lower()': lowered,
        'word.istitle()': word.istitle(),
        'word.isupper()': word.isupper(),
        'word.isdigit()': word.isdigit(),
        'suffix3': word[-3:],
        'prefix3': word[:3],
        'in_names': lowered in names_set,
        'in_cities': lowered in cities_set,
        'in_companies': lowered in companies_set,
        # countries_set is loaded together with the other lookup sets but
        # was never consulted; expose it as a feature like the others.
        'in_countries': lowered in countries_set,
    }

    if i > 0:
        previous = sent[i - 1]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        following = sent[i + 1]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return one ``word2features`` dictionary per token of ``sent``, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(labels):
    """Return ``labels`` unchanged (the same object, not a copy)."""
    return labels

Učitavanje podataka za treniranje
* train_1000 - ima 1000 rečenica
* train_2000 - ima 2000 rečenica
* train_7000 - ima 7000 rečenica

In [None]:
# Training-data loader
def load_training_data(filepath):
    """Read a token-per-line training file into ``(tokens, labels)`` pairs.

    Each non-blank line must consist of exactly two whitespace-separated
    fields, ``token label``. A blank line ends the current sentence. Lines
    with any other number of fields are reported with ``print`` and skipped.

    Args:
        filepath: Path of the UTF-8 encoded training file.

    Returns:
        list[tuple[list[str], list[str]]]: One ``(tokens, labels)`` pair per
        non-empty sentence, in file order.
    """
    sentences = []
    current = ([], [])  # (tokens, labels) of the sentence being collected

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw in handle:
            entry = raw.strip()
            if entry:
                fields = entry.split()
                if len(fields) == 2:
                    current[0].append(fields[0])
                    current[1].append(fields[1])
                else:
                    print(f"Skipping malformed line: {entry}")
            elif current[0]:
                # Blank line after at least one token: the sentence is complete
                sentences.append(current)
                current = ([], [])

    # The file may end without a trailing blank line
    if current[0]:
        sentences.append(current)

    return sentences

# Load the dataset; exactly one of the three training files is active.
# Uncomment a different line to use one of the other files instead.
# data = load_training_data('data/train_1000.txt')
# data = load_training_data('data/train_2000.txt')
data = load_training_data('data/train_7000.txt')

Slijed događaja
* Randomiziranje rečenica kako bi se izbjegla pristranost
* X - popis nizova značajki za svaku rečenicu iz *sent2features*
* y - popis nizova oznaka za svaku rečenicu
* Podjela na 80% treniranje i 20% testiranje
* *Train CRF* - kreira Conditional Random Field i prilagođava ga oznakama za učenje

In [None]:
# Prepare training data: shuffle the sentences in place first
random.shuffle(data)

# One label sequence and one feature sequence per sentence, in the same order
y = [sentence[1] for sentence in data]
X = [sent2features(sentence[0]) for sentence in data]

# Train/Test Split: 20 % of the sentences are held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train CRF model
crf = CRF(algorithm='lbfgs', max_iterations=1000)
crf.fit(X_train, y_train)

Provjerava koliko se dobro predviđene oznake modela podudaraju s ispravnim oznakama neviđenih rečenica

In [None]:
# Evaluate model: predict labels for the held-out test sentences and print
# the report that flat_classification_report builds from them
y_pred = crf.predict(X_test)
print(flat_classification_report(y_test, y_pred))

Funkcija uzima tekst i vraća entitete (B,I,O) koje pronađe istrenirani CRF model

In [None]:
def predict_entities(text):
    """Tag ``text`` with the trained CRF and return the entities it finds.

    The text is tokenised with ``word_tokenize``, labelled by the global
    ``crf`` model, and consecutive BIO labels are merged into entity spans.

    Decoding rules:
      * ``B-X`` always starts a new entity of type ``X``.
      * ``I-X`` extends the open entity only when that entity is also of
        type ``X``. Otherwise (different type, or no open entity) it starts
        a new entity of type ``X``, so tokens of different types are never
        merged into one span and no labelled token is silently dropped.
      * Any other label closes the open entity.

    Args:
        text: Raw sentence to analyse.

    Returns:
        list[tuple[str, str]]: ``(entity_text, entity_type)`` pairs in order
        of appearance; ``entity_text`` is the entity's tokens joined by
        single spaces.
    """
    tokens = word_tokenize(text)
    preds = crf.predict_single(sent2features(tokens))

    entities = []
    current = []        # tokens of the entity currently being built
    current_tag = None  # type of that entity

    for token, label in zip(tokens, preds):
        is_inside = label.startswith("I-")
        tag = label[2:] if (is_inside or label.startswith("B-")) else None

        # Continuation of the open entity: same type required
        if is_inside and current and tag == current_tag:
            current.append(token)
            continue

        # Every other label ends the entity that is currently open
        if current:
            entities.append((" ".join(current), current_tag))
            current, current_tag = [], None

        # B-X, or an I-X that could not continue anything, opens a new entity
        if tag is not None:
            current, current_tag = [token], tag

    if current:
        entities.append((" ".join(current), current_tag))
    return entities

Ručno provjeravanje pozivanjem funkcije 'predict_entities' što će model izbaciti

In [None]:
# Manual spot check: print the entities found in a few hand-written sentences
print(predict_entities("Ivan radi za Rimac Automobili i živi u Splitu u Hrvatskoj"))
print(predict_entities("Marla Vidaković works for Microsoft in London."))
print(predict_entities("Tihomira Bilića nema na igralištu pored Adrie"))

Pokretanje NER modela na svim rečenicama u fileovima
* test ima 98 rečenica
* test_veliki ima 2000 rečenica

In [None]:
# Run NER on every sentence in the selected test file (one sentence per line)
# test_file = 'data/test.txt'
test_file = 'data/test_veliki.txt'

with open(test_file, 'r', encoding='utf-8') as f:
    # Strip surrounding whitespace and drop blank lines
    test_sentences = [line.strip() for line in f if line.strip()]

# A real newline escape here: the previous doubled backslash printed the
# two characters backslash + 'n' instead of an empty line.
print(f"Loaded {len(test_sentences)} test sentences\n")

for i, sent in enumerate(test_sentences, 1):
    entities = predict_entities(sent)
    print(f"{i:>3}. {sent}")
    for ent, tag in entities:
        # Entity text is padded to 25 columns so the tags line up
        print(f"    -> {ent:25}  [{tag}]")
    print()