## Set Environment

In [None]:
from minio import Minio
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from google import genai
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from matplotlib.colors import Normalize

## Connect to MinIO & Download CSV

In [None]:
# 1) Connect to MinIO
# The endpoint and credentials can be overridden with the MINIO_ENDPOINT,
# MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables, so private
# credentials never have to be written into the notebook. The fallbacks are
# the values this notebook originally hard-coded for play.min.io.
# NOTE: `client` is rebound to the Gemini client in the "Create Topic Tags"
# section, so it only refers to MinIO within this cell.
import os

client = Minio(
    os.environ.get("MINIO_ENDPOINT", "play.min.io"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "Q3AM3UQ867SPQQA43P2F"),
    secret_key=os.environ.get(
        "MINIO_SECRET_KEY", "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG"
    ),
    secure=True,
)

bucket_name = "sit-bcc-news"
object_name = "bbc_news.csv"
# Local path the object is written to; the loading cell reads the same file.
download_to = "bbc_news.csv"

# 2) Download file
client.fget_object(bucket_name, object_name, download_to)

## Load & Explore Dataset

In [None]:
# Read the CSV from the path the download step wrote to (`download_to`)
# instead of a hard-coded "/content/..." path, which only resolves when the
# notebook's working directory happens to be /content.
df = pd.read_csv(download_to)

# The dataset must contain a column named "title".
# NOTE: astype(str) turns missing titles into the literal string "nan".
titles = df["title"].astype(str).tolist()

## Topic Modeling With BERTopic


In [None]:
# Count-based vectorizer configured with scikit-learn's built-in English
# stop-word list; it is passed to BERTopic as `vectorizer_model` below.
vectorizer_model = CountVectorizer(stop_words="english")

In [13]:
# Load the "all-MiniLM-L6-v2" sentence-transformers model and encode all
# titles once, up front; the embeddings are handed to BERTopic's
# fit_transform in a later cell.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(titles, show_progress_bar=True)

NameError: name 'SentenceTransformer' is not defined

In [None]:
# Configure BERTopic with the custom vectorizer and a minimum topic size of
# 250, then fit it on the titles using the precomputed embeddings.
# `topics` is stored on the DataFrame in the next cell; `probs` is not used
# again in the visible notebook.
topic_model = BERTopic(
    language="english",
    min_topic_size= 250,
    vectorizer_model=vectorizer_model
)
topics, probs = topic_model.fit_transform(titles, embeddings)

In [None]:
# Store each title's assigned topic id alongside the article row
# (`topics` was produced from `titles`, which was built from `df["title"]`).
df["topic"] = topics

## Check the number of clusters and the details of each topic.

In [None]:
# Display the per-topic summary table produced by the fitted model.
topic_model.get_topic_info()

In [None]:
# Keep the topic summary in `df_topics` with a fresh 0..n-1 index; the
# tagging loop later reads its "Topic" and "Representation" columns.
topic_info = topic_model.get_topic_info()
df_topics = topic_info.reset_index(drop=True)
# Preview the first rows of the two columns used for tagging.
df_topics[["Topic", "Representation"]].head()

## Visualize the Topics

In [None]:
# Render BERTopic's built-in topic overview figure for the fitted model.
topic_model.visualize_topics()

In [None]:
# Render BERTopic's built-in per-topic bar chart for the fitted model.
topic_model.visualize_barchart()

## Create Topic Tags

In [None]:
# Build the Gemini client from an API key supplied through the environment
# instead of a literal in the notebook, so the key is never committed or
# shared with the file.
# NOTE(review): the key that used to be hard-coded here has been exposed in
# source and should be treated as compromised and rotated.
# NOTE: this rebinds `client`, which earlier held the MinIO client.
import os

_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before creating the Gemini client."
    )

client = genai.Client(api_key=_gemini_api_key)

def generate_topic_tag(keywords):
    """Ask the Gemini model for a short category label for one topic.

    Args:
        keywords: Representative keywords for the topic. The value is
            interpolated into the prompt as-is via an f-string.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the response carries no text.
    """
    prompt = f"""
    You are an expert at naming news categories.
    Given a list of representative keywords from a topic model, generate a short, concise, human-friendly topic label.

    Rules:
    - Maximum 3–5 words
    - No special characters
    - Use clear English
    - It must be a category name, not a sentence.

    Keywords: {keywords}

    Respond with only the topic label.
    """

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt
    )

    # `response.text` may be None (e.g. no text part in the reply); calling
    # .strip() on it directly would raise AttributeError and abort the whole
    # tagging loop, so fall back to an empty label instead.
    return (response.text or "").strip()


In [None]:
# Build one record per topic: its id, its representative keywords, and the
# label generated for those keywords. The key order fixes the column order
# of the resulting DataFrame.
topic_tags = [
    {
        "Topic": topic_id,
        "Keywords": keywords,
        "Tag": generate_topic_tag(keywords),
    }
    for topic_id, keywords in zip(df_topics["Topic"], df_topics["Representation"])
]

# Tabular view of the topic -> tag mapping; displayed as the cell output.
df_tags = pd.DataFrame(topic_tags)
df_tags

## Merged DataFrame

In [None]:
# Attach each article's topic keywords and tag by matching the article's
# "topic" id to the tag table's "Topic" id. The left join keeps every
# article row, including any whose topic has no entry in df_tags.
df_merged = df.merge(df_tags, left_on="topic", right_on="Topic", how="left")

## Search news by tag

In [None]:
# -------------------------
# UI Widgets
# -------------------------
# Two topics can receive the same generated label; de-duplicate (keeping
# first-seen order) so each tag appears only once in the dropdown. The search
# itself matches every article carrying the selected tag.
tag_dropdown = widgets.Dropdown(
    options=df_tags["Tag"].drop_duplicates().tolist(),
    description="Tag:",
    style={"description_width": "80px"},
    layout=widgets.Layout(width="400px")
)

# Button that triggers the search for the selected tag.
search_button = widgets.Button(
    description="Search",
    button_style="primary",
    layout=widgets.Layout(width="150px")
)

# Output widget that the click handler renders its results into.
output_area = widgets.Output()

# -------------------------
# Search Function
# -------------------------
def search_by_tag(tag_name):
    """Return the title, description and topic of articles tagged `tag_name`.

    Rows are taken from the module-level `df_merged` where the "Tag" column
    equals `tag_name`.
    """
    has_tag = df_merged["Tag"] == tag_name
    wanted_columns = ["title", "description", "topic"]
    return df_merged.loc[has_tag, wanted_columns]

# -------------------------
# Event Handler
# -------------------------
def on_search_click(b):
    """Click handler: show the articles for the tag selected in the dropdown.

    Args:
        b: Argument supplied by the button's click callback; not used here.
    """
    with output_area:
        # Remove whatever the previous search rendered.
        output_area.clear_output()

        selected_tag = tag_dropdown.value
        matches = search_by_tag(selected_tag)

        print(f"Showing results for tag: {selected_tag}\n")
        # Show only the first 50 matching rows.
        display(matches.head(50))

# Run the search handler whenever the Search button is clicked.
search_button.on_click(on_search_click)

# -------------------------
# Display UI
# -------------------------
# Render the dropdown, the button and the results area together.
display(tag_dropdown, search_button, output_area)

## Visualization: Number of News Articles per Tag (Bar Chart)

In [None]:
# Count the number of news articles per tag, largest first.
tag_counts = df_merged["Tag"].value_counts().sort_values(ascending=False)

tags = tag_counts.index
counts = tag_counts.values

# Scale the counts so each bar's color shade follows its count.
norm = Normalize(vmin=counts.min(), vmax=counts.max())

# Reversed viridis colormap: darker where the count is larger.
colors = cm.viridis_r(norm(counts))

fig, ax = plt.subplots(figsize=(14, 7))
bars = ax.bar(tags, counts, color=colors)

# Write each bar's count just above it, centered horizontally.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, height, f"{height}", ha="center", va="bottom", fontsize=9)

ax.set_title("NUMBER OF NEWS ARTICLES PER TAG", fontsize=18)
ax.set_xlabel("Tag", fontsize=13)
ax.set_ylabel("Count", fontsize=13)

# Slant the tag labels so long names do not overlap.
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
