In [4]:
import re
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Relative path of the FAQ CSV that is read with pandas further down.
CSV_PATH = "baza_faq.csv"
# Relative path of the saved bundle that is read with torch.load further down.
MODEL_PATH = "models/intent_bilstm.pt"  

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression: shows the selected device as the cell output.
DEVICE


'cpu'

In [5]:
# Matches runs of ASCII letters, Croatian diacritic letters and digits.
_TOKEN_RE = re.compile(r"[a-zA-ZčćšđžČĆŠĐŽ0-9]+")


def tokenize(text: str):
    """Split ``text`` into lowercase alphanumeric tokens.

    A falsy ``text`` (``None`` or ``""``) yields an empty list. Every character
    that the token pattern does not match acts as a separator and is dropped.

    Returns:
        list[str]: the tokens in their order of appearance.
    """
    if not text:
        return []
    normalized = text.lower().strip()
    return [match.group(0) for match in _TOKEN_RE.finditer(normalized)]

def encode(text: str, stoi: dict):
    """Convert ``text`` into a list of vocabulary ids.

    The text is split with ``tokenize``. A token that is missing from ``stoi``
    gets the id stored under ``"<unk>"``, or ``1`` when ``stoi`` has no such key.

    Returns:
        list: one id per token, in token order (empty when there are no tokens).
    """
    fallback_id = stoi.get("<unk>", 1)
    token_ids = []
    for token in tokenize(text):
        token_ids.append(stoi.get(token, fallback_id))
    return token_ids

class BiLSTMIntent(nn.Module):
    """Intent classifier: embedding -> bidirectional LSTM -> dropout -> linear layer."""

    def __init__(self, vocab_size: int, num_classes: int, emb_dim: int = 128, hidden_dim: int = 128, dropout: float = 0.3):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            num_classes: number of output logits.
            emb_dim: embedding size, which is also the LSTM input size.
            hidden_dim: LSTM hidden size per direction.
            dropout: dropout probability applied before the linear layer.
        """
        super().__init__()
        # Token id 0 is the padding index of the embedding table.
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(p=dropout)
        # The classifier sees both directions concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=num_classes)

    def forward(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        """Compute class logits for a batch of padded id sequences.

        Args:
            x: token ids, batch-first.
            lengths: number of real (non-padding) tokens in each sequence.

        Returns:
            The output of the final linear layer (one row of logits per sequence).
        """
        embedded = self.emb(x)
        # Packing makes the LSTM stop at each sequence's true length;
        # pack_padded_sequence wants the lengths on the CPU.
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (hidden, _) = self.lstm(packed_seq)
        # Last two entries of the final hidden state: forward then backward direction.
        final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(self.drop(final_state))

In [7]:
# Load the saved bundle onto the CPU and pull out the vocabulary / label mappings.
# NOTE(review): torch.load relies on pickle; only load bundles from a trusted source.
bundle = torch.load(MODEL_PATH, map_location="cpu")
stoi = bundle["stoi"]
itos = bundle["itos"]
intent2id = bundle["intent2id"]
id2intent = bundle["id2intent"]

# Build the model and load the trained weights.
# The vocabulary size and class count come from the bundle; the remaining
# constructor arguments use the class defaults, so they presumably match the
# values used at training time — TODO confirm.
model = BiLSTMIntent(vocab_size=len(itos), num_classes=len(intent2id)).to(DEVICE)
model.load_state_dict(bundle["state_dict"])
# Evaluation mode: turns the dropout layer off for inference.
model.eval()

# Load the FAQ data.
df = pd.read_csv(CSV_PATH, encoding="utf-8")

# Retrieval uses only the TRAIN split (no "cheating" with test rows).
train_df = df[df["split"] == "train"].reset_index(drop=True)

# Build one TF-IDF index per intent: a vectorizer fitted on that intent's
# training questions (unigrams and bigrams), the resulting matrix, and the rows
# themselves so that a matrix row position maps back to its question/answer.
intent_index = {}
for intent in train_df["intent"].unique():
    sub = train_df[train_df["intent"] == intent].reset_index(drop=True)
    vect = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    X = vect.fit_transform(sub["question"].tolist())
    intent_index[intent] = {"df": sub, "vectorizer": vect, "X": X}


In [8]:
@torch.no_grad()
def predict_topk_intents(message: str, k: int = 3):
    """Return the ``k`` most probable intents for ``message``.

    Uses the module-level ``model``, ``stoi``, ``id2intent`` and ``DEVICE``.

    Args:
        message: the user query.
        k: how many intents to return; capped at the number of classes.

    Returns:
        list[tuple[str, float]]: ``(intent, probability)`` pairs, most probable
        first. The list is empty when ``message`` produces no tokens.
    """
    token_ids = encode(message, stoi)
    if not token_ids:
        return []

    # Batch containing the single query: shape [1, T].
    batch = torch.tensor([token_ids], dtype=torch.long).to(DEVICE)
    seq_lengths = torch.tensor([len(token_ids)], dtype=torch.long).to(DEVICE)

    scores = model(batch, seq_lengths)
    probabilities = torch.softmax(scores, dim=1).squeeze(0)

    top_probs, top_ids = torch.topk(probabilities, k=min(k, probabilities.numel()))
    ranked = []
    for class_id, prob in zip(top_ids.tolist(), top_probs.tolist()):
        ranked.append((id2intent[int(class_id)], float(prob)))
    return ranked

def retrieve_within_intent(message: str, intent: str):
    """Find the stored question of ``intent`` that is most similar to ``message``.

    Looks up the TF-IDF index for ``intent`` in the module-level ``intent_index``
    and ranks its questions by cosine similarity to the query.

    Returns:
        tuple: ``(match, score)``. ``match`` is a dict with the keys ``answer``,
        ``matched_question``, ``answer_id`` and ``intent``; ``score`` is the
        cosine similarity of the best row. ``(None, 0.0)`` is returned when
        ``intent`` has no index.
    """
    entry = intent_index.get(intent)
    if entry is None:
        return None, 0.0

    questions = entry["df"]
    query_vec = entry["vectorizer"].transform([message])
    similarities = cosine_similarity(query_vec, entry["X"]).ravel()

    # Position of the highest similarity; the first one wins on ties.
    top_pos = int(np.argmax(similarities))
    hit = questions.iloc[top_pos]
    match = {
        "answer": hit["answer"],
        "matched_question": hit["question"],
        "answer_id": hit["answer_id"],
        "intent": intent,
    }
    return match, float(similarities[top_pos])


In [9]:
def chat_once(message: str, min_sim: float = 0.20, topk: int = 3):
    """Answer a single user message.

    The message is classified with ``predict_topk_intents``; the best intent is
    then searched with ``retrieve_within_intent``.

    Args:
        message: the user query; ``None`` is treated as an empty string.
        min_sim: similarity below which the stored answer is replaced by a
            request for more detail.
        topk: number of intent candidates to report.

    Returns:
        dict: keys ``answer``, ``intent``, ``similarity``, ``matched_question``,
        ``answer_id`` and ``top_intents`` (a list of ``{"intent", "prob"}`` dicts).
    """

    def reply(answer, intent=None, similarity=0.0, matched_question=None, answer_id=None, top_intents=()):
        # Single place that fixes the shape and key order of every response.
        return {
            "answer": answer,
            "intent": intent,
            "similarity": similarity,
            "matched_question": matched_question,
            "answer_id": answer_id,
            "top_intents": list(top_intents),
        }

    message = (message or "").strip()
    if not message:
        return reply("Molim upiši pitanje.")

    ranked = predict_topk_intents(message, k=topk)
    if not ranked:
        return reply("Nisam siguran da razumijem pitanje. Možeš li ga preformulirati?")

    candidates = [{"intent": name, "prob": prob} for name, prob in ranked]
    best_intent = ranked[0][0]
    match, sim = retrieve_within_intent(message, best_intent)

    if match is None:
        return reply(
            "Nemam bazu odgovora za taj intent.",
            intent=best_intent,
            top_intents=candidates,
        )

    # Below the threshold the match is still reported, but its answer is withheld.
    if sim < min_sim:
        answer = "Nisam siguran da imam točan odgovor. Možeš li navesti više detalja (npr. koji studij/semestar/rok)?"
    else:
        answer = match["answer"]

    return reply(
        answer,
        intent=best_intent,
        similarity=sim,
        matched_question=match["matched_question"],
        answer_id=match["answer_id"],
        top_intents=candidates,
    )


In [10]:
# Smoke test: send one question through chat_once; the returned dict is shown
# as the cell output.
chat_once("Koje je radno vrijeme referade?")


{'answer': 'Radno vrijeme za studente je ponedjeljak–petak 7:30–14:30.',
 'intent': 'referada',
 'similarity': 0.7432511013213126,
 'matched_question': 'Radno vrijeme referade?',
 'answer_id': 'A011',
 'top_intents': [{'intent': 'referada', 'prob': 0.9740541577339172},
  {'intent': 'knjiznica', 'prob': 0.01690058410167694},
  {'intent': 'studentski_zbor', 'prob': 0.00561949098482728}]}

In [11]:
def chat_loop(min_sim: float = 0.20, topk: int = 3):
    """Run an interactive console chat until the user quits.

    Each line read from standard input is answered with ``chat_once`` and
    printed together with debug details (intent, similarity, answer id, matched
    question and the top intent candidates).

    The loop ends when the user types ``exit``, ``quit`` or ``q`` (in any letter
    case), when standard input is closed, or when the read is interrupted.

    Args:
        min_sim: forwarded to ``chat_once``.
        topk: forwarded to ``chat_once``.
    """
    print("MATHOS chatbot (upiši 'exit' za izlaz)\n")
    while True:
        try:
            user = input("TI: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin (EOF) or an interrupt while waiting for input:
            # say goodbye and leave the loop instead of surfacing a traceback.
            print("\nBOT: Bok!")
            break

        if user.lower() in {"exit", "quit", "q"}:
            print("BOT: Bok!")
            break

        out = chat_once(user, min_sim=min_sim, topk=topk)

        print(f"BOT: {out['answer']}")
        # Debug info (useful during development).
        print(f"     intent={out['intent']}  sim={out['similarity']:.2f}  answer_id={out['answer_id']}")
        if out.get("matched_question"):
            print(f"     match: {out['matched_question']}")
        if out.get("top_intents"):
            tops = ", ".join(f"{x['intent']}:{x['prob']:.2f}" for x in out["top_intents"])
            print(f"     top: {tops}")
        print()

# Start the interactive session; this call waits for typed input and only
# returns after the user enters 'exit', 'quit' or 'q'.
chat_loop(min_sim=0.20, topk=3)


MATHOS chatbot (upiši 'exit' za izlaz)



TI:  Kada radi referada??


BOT: Radno vrijeme za studente je ponedjeljak–petak 7:30–14:30.
     intent=referada  sim=0.32  answer_id=A011
     match: Kad referada radi sa studentima?
     top: referada:0.87, upisi:0.03, povijest:0.02



TI:  Kak


BOT: Nisam siguran da imam točan odgovor. Možeš li navesti više detalja (npr. koji studij/semestar/rok)?
     intent=mathos_cup  sim=0.00  answer_id=A129
     match: Što je MathOS cup?
     top: mathos_cup:0.19, anketa:0.15, knjiznica:0.14



TI:  Gdje se MOgu upisati?


BOT: Studentska referada je na adresi Trg Ljudevita Gaja 6, 31000 Osijek. 
     intent=referada  sim=0.47  answer_id=A012
     match: Gdje se nalazi referada?
     top: referada:0.61, nastava:0.18, knjiznica:0.11



TI:  exit


BOT: Bok!
