## Text Classification Function

### Imports and Loading

In [1]:
# imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from lightgbm import LGBMClassifier

import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import joblib

In [2]:
# Load the persisted classifier and the text vectorizer from the local data/ folder.
# SECURITY: joblib.load unpickles arbitrary Python objects, so only load files
# that come from a trusted source.
model = joblib.load('data/trained_model.joblib')
vectorizer = joblib.load('data/vectorizer.joblib')

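**Note:** the model and vectorizer are loaded with `joblib`, which relies on pickle, so only load files from a trusted source. See the scikit-learn notes on the security and maintainability limitations of persisted models: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations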


In [3]:
# Compiled once and reused. Raw strings keep the regex escapes (\[, \w, \d, \S, \.)
# from being parsed as invalid *string* escapes, which newer Python versions warn about.
# Alternatives, in order: [bracketed text], words containing a digit, http(s) links,
# www links, <tags>, and any single ASCII punctuation character.
_NOISE_PATTERN = re.compile(
    r'\[.*?\]|\w*\d\w*|https?://\S+|www\.\S+|<.*?>+|[%s]'
    % re.escape(string.punctuation)
)


def preprocess_text(text):
    '''Lowercase ``text`` and delete noisy fragments from it.

    The input is coerced with ``str()``, so non-string values (e.g. ``None``)
    are accepted. Removed fragments are: text in square brackets, words
    containing digits, http(s)/www links, angle-bracket tags and ASCII
    punctuation (``string.punctuation``; typographic quotes and dashes are kept).
    Matches are deleted, not replaced with a space, so surrounding whitespace
    is left as it was.

    Returns the cleaned string.
    '''
    return _NOISE_PATTERN.sub('', str(text).lower())

# Holds the shared stemmer so it is constructed once instead of on every call.
_STEMMER_CACHE = {}


def _english_stemmer():
    '''Return the shared English Snowball stemmer, creating it on first use.'''
    if "english" not in _STEMMER_CACHE:
        _STEMMER_CACHE["english"] = nltk.SnowballStemmer("english")
    return _STEMMER_CACHE["english"]


def apply_stemming(sentence):
    '''Stem every token of ``sentence`` with the English Snowball stemmer.

    Tokens are obtained by splitting on a single space, so empty tokens
    produced by consecutive spaces are stemmed and re-joined as well.

    Returns the stemmed tokens joined by single spaces.
    '''
    stemmer = _english_stemmer()
    return ' '.join(stemmer.stem(word) for word in sentence.split(' '))

def preprocess_and_clean(sentence):
    '''Turn raw text into the cleaned, stop-word-free, stemmed string.

    Pipeline: ``preprocess_text`` -> drop NLTK English stop words ->
    ``apply_stemming``. Tokens are split on a single space throughout.

    Returns the processed text as one space-joined string.
    '''
    cleaned_text = preprocess_text(sentence)
    # A set gives constant-time membership checks for each token.
    stop_words = set(stopwords.words('english'))
    # NOTE(review): punctuation is already stripped at this point, so contractions
    # reach this filter without their apostrophe. Left as is so the output presumably
    # stays consistent with the text the vectorizer was fitted on -- TODO confirm.
    kept_text = ' '.join(
        word for word in cleaned_text.split(' ') if word not in stop_words
    )
    # apply_stemming splits on ' ' and stems token by token, so a single call
    # produces the same string as stemming each word separately and re-joining.
    return apply_stemming(kept_text)

### Classify Function

In [7]:
def returnClassifyResults(
    txt: str, model, vectorizer, k: int = 3
) -> "tuple[np.ndarray, np.ndarray]":
    """
    Return the ``k`` most probable categories for ``txt`` with their probabilities.

    Parameters
    ----------
    txt : str
        Raw text; it is run through ``preprocess_and_clean`` before vectorizing.
    model
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
    vectorizer
        Fitted text vectorizer exposing ``transform``.
    k : int, default 3
        Number of categories to return. If ``k`` exceeds the number of
        classes, all classes are returned.

    Returns
    -------
    (categories, probabilities)
        Two sequences of equal length taken from ``model.classes_`` and the
        predicted probabilities, ordered from most to least probable.

    Raises
    ------
    ValueError
        If ``k`` is negative (a negative slice bound would otherwise silently
        drop the *least* probable classes instead of limiting the result).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    processed_txt = preprocess_and_clean(txt)
    # transform() only needs an iterable of documents, so a one-element list is
    # enough; no DataFrame round-trip is required for a single text.
    x_vectorized = vectorizer.transform([processed_txt])
    # predict_proba returns one row per document; there is exactly one here.
    probabilities = model.predict_proba(x_vectorized)[0]
    # argsort is ascending, so reverse it before taking the first k indices.
    topk_indices = np.argsort(probabilities)[::-1][:k]
    return model.classes_[topk_indices], probabilities[topk_indices]


In [22]:
# Sample news paragraph used as a manual smoke test for the classifier.
txt = """
This year’s flu season is on track to be the worst it’s been since before the Covid-19 pandemic, as respiratory illnesses surge to a second peak. There have already been at least 24 million illnesses, 310,000 hospitalizations, and 13,000 deaths from flu, the US Centers for Disease Control and Prevention estimates. The cumulative hospitalization rate – about 64 stays for every 100,000 people, as of February 1 – is the highest it’s been at this point in the season for the past seven years. Flu activity is high or very high in all but six states.
"""
# Ask for the three most probable categories for the sample text.
topk_categories, topk_probabilities = returnClassifyResults(txt, model, vectorizer, k=3)

# Print each category with its probability, in the order they were returned.
for category, probability in zip(topk_categories, topk_probabilities):
    print(f"Category: {category}, Probability: {probability}")

Category: health, Probability: 0.9772184712538277
Category: society, Probability: 0.019720840618475097
Category: economy, business and finance, Probability: 0.001408854446298189


### Interface

In [49]:
def pseudoClassifyAPI(txt: str):
    """Classify ``txt`` and return a ``{category: probability}`` mapping.

    Uses the module-level ``model`` and ``vectorizer`` and keeps the three
    most probable categories reported by ``returnClassifyResults``.
    """
    labels, scores = returnClassifyResults(txt, model, vectorizer, k=3)
    return dict(zip(labels, scores))

In [28]:
import gradio as gr

# Minimal demo UI: free text in, the three highest-scoring categories out.
gr.Interface(
    fn=pseudoClassifyAPI,
    inputs='text',
    outputs=gr.Label(num_top_classes=3),
    theme='ocean',
    # Spelling corrected: the title previously read "classificatiom".
    title='Multi-label news category classification',
).launch()

* Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.




In [41]:
import gradio as gr

import gradio as gr
import requests
from bs4 import BeautifulSoup
import random

# Function to fetch a random CNN Lite article
def _fetch_soup(url, timeout):
    """Return the parsed HTML of ``url``, or None on a network error or non-200 status."""
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, "html.parser")


def get_random_cnn_article():
    """Fetch the text of a randomly chosen article linked from the CNN Lite index.

    Returns
    -------
    str
        The article's ``<p>`` texts joined by newlines, cut off at the first
        occurrence of 'See Full Web Article'. On failure a short message is
        returned instead of raising ("Failed to fetch articles.",
        "No articles found." or "Failed to fetch the article."), so the
        caller always receives a string it can display.
    """
    base_url = "https://lite.cnn.com/"
    # requests has no default timeout; without one an unreachable host would
    # block the caller indefinitely.
    timeout_seconds = 10

    index_soup = _fetch_soup(base_url, timeout_seconds)
    if index_soup is None:
        return "Failed to fetch articles."

    # Keep site-relative links only. The bare "/" points back at the index
    # itself and "//host/..." is a protocol-relative link to another host,
    # so neither can be appended to base_url as an article path.
    articles = [
        a["href"]
        for a in index_soup.find_all("a", href=True)
        if a["href"].startswith("/")
        and not a["href"].startswith("//")
        and a["href"] != "/"
    ]
    if not articles:
        return "No articles found."

    # Pick a random article
    random_article = random.choice(articles)
    # base_url ends with "/" and the href starts with "/": drop one of them so
    # the resulting URL does not contain a double slash.
    article_url = base_url.rstrip("/") + random_article

    article_soup = _fetch_soup(article_url, timeout_seconds)
    if article_soup is None:
        return "Failed to fetch the article."

    paragraphs = [p.get_text() for p in article_soup.find_all("p")]
    # Keep only the text that precedes the first 'See Full Web Article'.
    return "\n".join(paragraphs).split('See Full Web Article')[0]



In [50]:
# Two-column demo UI: text entry and action buttons on the left, the
# classification result on the right.
with gr.Blocks(theme='ocean') as demo:
    gr.Markdown("# Multi-label news category classification")
    
    with gr.Row():
        # Left column
        with gr.Column(scale=1):
            text_input = gr.Textbox(label="Enter text for classification")
            random_button = gr.Button("Get a random today's CNN article")
            
            # Clear and Classify sit side by side under the textbox.
            with gr.Row():
                clear_button = gr.Button("Clear")
                classify_button = gr.Button("Classify", variant="primary")
        
        # Right column
        with gr.Column(scale=1):
            output = gr.Label(num_top_classes=3)
    
    # Put the string returned by get_random_cnn_article into the textbox.
    random_button.click(
        fn=get_random_cnn_article,
        outputs=text_input
    )
    
    # Reset both widgets: empty string for the textbox, None for the label.
    clear_button.click(
        fn=lambda: ("", None), 
        outputs=[text_input, output]
    )
    
    # Classify the textbox content and show the returned mapping in the label.
    classify_button.click(
        fn=pseudoClassifyAPI,
        inputs=text_input,
        outputs=output
    )

# Start the local web server for the UI.
demo.launch()

* Running on local URL:  http://127.0.0.1:7880

To create a public link, set `share=True` in `launch()`.




In [52]:
import requests

# Query the remote service's /info endpoint and show the raw response body.
# The timeout keeps this cell from hanging indefinitely if the host is
# unreachable; requests applies no timeout unless one is given.
requests.get('http://35.92.204.37/info', timeout=10).text

'{"name":"news-cateorization","description":"Categorization API for Tzu-Jo Hsu\'s NLP project demo."}'