In [None]:
!pip install gradio transformers scikit-learn emoji "clean-text[gpl]" --quiet
import os
import re
import gradio as gr
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from emoji import demojize
from cleantext import clean


# Directory holding the fine-tuned classifier (and, normally, its tokenizer).
MODEL_PATH = "/content/drive/MyDrive/SENTIEMENT_MODEL/bert_turk_sentiment_model"

# File names that indicate a tokenizer has been saved next to the model.
# "vocab.txt" is what a BERT WordPiece tokenizer writes; without it in this
# list a folder containing a saved BERT tokenizer is misdetected as empty.
_TOKENIZER_FILES = ("vocab.txt", "vocab.json", "sentencepiece.bpe.model", "tokenizer.json")

files = os.listdir(MODEL_PATH)

if not any(name in files for name in _TOKENIZER_FILES):
    print("Tokenizer klasörde bulunamadı → indiriliyor...")
    # NOTE(review): this identifier must resolve (relative local directory or
    # Hub repo id) for the fallback to work — confirm it is the intended source.
    tokenizer = AutoTokenizer.from_pretrained("bert_turk_sentiment_model")
    # Persist it so the next run takes the local branch.
    tokenizer.save_pretrained(MODEL_PATH)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Run on the GPU when one is available; eval() puts the model in inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Class index -> display name used in the UI output.
id2label = {0: "Negative", 1: "Notr", 2: "Positive"}


def clean_text(text: str) -> str:
    """Normalize a raw input string before it is tokenized.

    Non-string inputs are converted with ``str()``. The text is passed
    through ``demojize`` (space delimiters), every underscore is turned
    into a space, the result is run through ``cleantext.clean`` with the
    options below, runs of three or more identical characters are reduced
    to two, and whitespace is collapsed.

    Returns:
        The normalized, stripped string.
    """
    raw = text if isinstance(text, str) else str(text)

    # Note: the underscore replacement applies to the whole string, not
    # only to the names produced by demojize.
    with_emoji_names = demojize(raw, delimiters=(" ", " ")).replace("_", " ")

    cleaner_options = dict(
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        no_emoji=True,
        no_line_breaks=True,
        replace_with_url=" ",
        replace_with_email=" ",
        replace_with_phone_number=" ",
        replace_with_number=" ",
        replace_with_currency_symbol=" ",
        lang="tr",
    )
    normalized = clean(with_emoji_names, **cleaner_options)

    # Any character repeated 3+ times in a row is squeezed down to exactly two.
    squeezed = re.sub(r"(.)\1{2,}", r"\1\1", normalized)
    # Collapse whitespace runs to single spaces and trim both ends.
    return re.sub(r"\s+", " ", squeezed).strip()


# Load the labelled dataset used for similarity retrieval; later code in this
# file reads its "text" and "label" columns.
train_df = pd.read_csv("/content/drive/MyDrive/SENTIEMENT_MODEL/clean_sentiment_dataset.csv")


def normalize_label_to_id(x):
    """Map a raw dataset label to a class id (0 negative, 1 neutral, 2 positive).

    Args:
        x: The raw label. Integers are returned unchanged. Floats holding a
            whole number (as pandas produces when a numeric column contains
            NaN) are converted to that integer. Anything else is compared,
            case-insensitively, against the known English/Turkish aliases.

    Returns:
        The class id as ``int``. Unrecognised values (including NaN and
        non-integral floats) fall back to 1 (neutral).
    """
    if isinstance(x, (int, np.integer)):
        return int(x)

    # A label column with missing values is read as float, so 0.0 / 2.0 must
    # be treated like 0 / 2 instead of falling through to the neutral default.
    if isinstance(x, (float, np.floating)):
        value = float(x)
        return int(value) if value.is_integer() else 1

    # str.lower() turns the Turkish capital "İ" into "i" + U+0307 (combining
    # dot above); strip that mark so "POZİTİF" / "NEGATİF" match their aliases.
    s = str(x).strip().lower().replace("\u0307", "")

    if s in ["0", "negative", "negatif"]:
        return 0
    if s in ["1", "notr", "neutral", "nötr", "notr.", "nötr."]:
        return 1
    if s in ["2", "positive", "pozitif"]:
        return 2

    # Unknown label text: default to neutral.
    return 1

# Numeric class id for every row, derived from the raw "label" column.
train_df["label_id"] = train_df["label"].apply(normalize_label_to_id)


# Fit the TF-IDF vocabulary and vectorize the corpus in one pass; the
# resulting matrix is reused below instead of re-transforming the same texts.
vectorizer = TfidfVectorizer(max_features=5000)
train_vectors_all = vectorizer.fit_transform(train_df["text"].astype(str))

# class id -> (rows of that class with a fresh 0..n-1 index, their TF-IDF rows).
# Positions in the frame and in the matrix line up one-to-one.
class_data = {}
label_ids = train_df["label_id"].to_numpy()
for lid in [0, 1, 2]:
    row_idx = np.flatnonzero(label_ids == lid)
    sub_df = train_df.iloc[row_idx].reset_index(drop=True)
    # Slicing the already-computed matrix yields the same rows that a second
    # transform() of the subset would produce.
    sub_vectors = train_vectors_all[row_idx]
    class_data[lid] = (sub_df, sub_vectors)


def retrieve_similar_by_label(text, label_id, top_k=3):
    """Return the training examples most similar to ``text`` within one class.

    Similarity is the cosine similarity between TF-IDF vectors. Only examples
    whose class id equals ``label_id`` are searched; when ``label_id`` has no
    entry in ``class_data`` the whole training set is searched instead.

    Args:
        text: Query text (vectorized with the module-level ``vectorizer``).
        label_id: Class id used to select the candidate pool.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame copy with at most ``top_k`` rows, most similar first.
        Empty when the pool is empty or ``top_k`` is not positive.
    """
    if label_id in class_data:
        pool_df, pool_vectors = class_data[label_id]
    else:
        pool_df, pool_vectors = train_df, train_vectors_all

    # Clamp to the pool size. A non-positive limit must short-circuit:
    # slicing with [-0:] would otherwise return every row.
    limit = min(top_k, len(pool_df))
    if limit <= 0:
        return pool_df.iloc[0:0].copy()

    query_vec = vectorizer.transform([text])
    scores = cosine_similarity(query_vec, pool_vectors).flatten()
    # argsort is ascending: take the last `limit` positions and reverse them.
    best = scores.argsort()[-limit:][::-1]
    return pool_df.iloc[best].copy()


def get_top_keywords(text, top_n=5):
    """Return the highest-weighted TF-IDF terms of ``text`` as a string.

    Only terms that actually occur in ``text`` (non-zero TF-IDF weight) are
    considered, so a short text yields fewer than ``top_n`` keywords rather
    than being padded with unrelated vocabulary entries.

    Args:
        text: Text to analyse with the module-level ``vectorizer``.
        top_n: Maximum number of keywords to return.

    Returns:
        Up to ``top_n`` terms joined by ", ", highest weight first; an empty
        string when no vocabulary term occurs in ``text`` or ``top_n <= 0``.
    """
    row = vectorizer.transform([text]).tocsr()

    # For a single-row CSR matrix, .data / .indices hold the stored weights
    # and their vocabulary column ids.
    present = row.data > 0
    if top_n <= 0 or not present.any():
        return ""

    term_ids = row.indices[present]
    weights = row.data[present]
    ranked = np.argsort(weights)[::-1][:top_n]

    vocabulary = np.asarray(vectorizer.get_feature_names_out())
    return ", ".join(vocabulary[term_ids[ranked]])


def predict_sentiment_with_probs(text):
    """Classify ``text`` and build the two strings shown in the UI.

    The input is cleaned with ``clean_text``, tokenized (truncated to 128
    tokens) and passed through the model; the arg-max class of the softmax
    output is the prediction. Up to three training examples of the predicted
    class are then retrieved together with their top keywords.

    Returns:
        A ``(result_text, explanation)`` tuple: the first string holds the
        original text, the cleaned text and the prediction with the three
        class probabilities; the second lists the retrieved examples, one
        per line.
    """
    cleaned = clean_text(text)

    encoded = tokenizer(
        cleaned,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = F.softmax(logits, dim=1).cpu().numpy().flatten()

    pred_id = int(np.argmax(probs))
    pred_label = id2label[pred_id]

    neighbours = retrieve_similar_by_label(cleaned, pred_id, top_k=3)

    # One bullet per retrieved example: its text, stored label and keywords.
    bullet_lines = []
    for raw_example, example_label in zip(neighbours["text"], neighbours["label"]):
        example_text = str(raw_example)
        keywords = get_top_keywords(example_text)
        bullet_lines.append(
            f"- \"{example_text}\" (Etiket: {example_label} | Keywords: {keywords})"
        )
    explanation = "\n".join(bullet_lines)

    prob_text = f"Negative={probs[0]:.2f} | Notr={probs[1]:.2f} | Positive={probs[2]:.2f}"

    result_text = (
        f"Orijinal Metin:\n{text}\n\n"
        f"Temizlenmiş Metin:\n{cleaned}\n\n"
        f"Tahmin: {pred_label} ({prob_text})"
    )

    return result_text, explanation


# Single multi-line text input for the sentence to classify.
text_input = gr.Textbox(lines=3, placeholder="Metin gir...")

# Two text outputs, matching the (result_text, explanation) tuple returned by
# predict_sentiment_with_probs.
result_outputs = [
    gr.Textbox(label="Tahmin + Temizlenmiş Metin + Olasılıklar"),
    gr.Textbox(label="Benzer Örnekler + Anahtar Kelimeler"),
]

demo = gr.Interface(
    fn=predict_sentiment_with_probs,
    inputs=text_input,
    outputs=result_outputs,
    title="Türkçe Sentiment + Akıllı Temizleme + Label-Filtered Retrieval",
    description="Metin normalize edilir, model sınıflandırır, sonra SADECE aynı label içinden en benzer örnekler gösterilir.",
)

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/175.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://403b5b535993d04b3d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in

