In [17]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from google.colab import drive

# Disable Weights & Biases logging.
# NOTE: transformers reports this variable as deprecated (see the warning in
# this cell's output); the `report_to` argument of TrainingArguments is the
# supported way to control logging integrations.
os.environ["WANDB_DISABLED"] = "true"

# Mount Google Drive so the dataset paths below resolve after a runtime
# restart. Skipped when the mount point is already populated.
DRIVE_MOUNT_POINT = "/content/drive"
if not os.path.isdir(os.path.join(DRIVE_MOUNT_POINT, "MyDrive")):
    drive.mount(DRIVE_MOUNT_POINT)

# Load your dataset
train_path = "/content/drive/MyDrive/final pro1/train.csv"
test_path = "/content/drive/MyDrive/final pro1/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Class Mapping
# Keys are the raw 1-based "Class Index" values as stored in the CSV files.
category_mapping = {1: "Politics", 2: "Sports", 3: "Entertainment", 4: "Technology"}
# Shift labels to 0-based: the classifier is created with num_labels=4 and
# therefore expects targets in the range 0..3.
train_df["Class Index"] -= 1
test_df["Class Index"] -= 1


def _combine_title_and_description(frame):
    """Return one text Series per row: "<Title> <Description>".

    Missing values are replaced by empty strings first; otherwise a single
    missing Title or Description would turn the whole concatenation into NaN
    and the tokenizer would receive a non-string value.
    """
    title = frame["Title"].fillna("").astype(str)
    description = frame["Description"].fillna("").astype(str)
    return title + " " + description


# Combine title + description
train_df["text"] = _combine_title_and_description(train_df)
test_df["text"] = _combine_title_and_description(test_df)

# ✅ TinyBERT model + tokenizer
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    examples come out with the same length.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **tokenizer_options)

# HuggingFace Dataset
def _frame_to_tokenized_dataset(frame):
    """Turn a prepared DataFrame into a tokenized HuggingFace Dataset.

    Keeps only the "text" and "Class Index" columns, renames the latter to
    "label", then applies tokenize_function in batched mode.
    """
    labelled = frame[["text", "Class Index"]].rename(columns={"Class Index": "label"})
    return Dataset.from_pandas(labelled).map(tokenize_function, batched=True)


dataset_train = _frame_to_tokenized_dataset(train_df)
dataset_test = _frame_to_tokenized_dataset(test_df)

# TinyBERT Model for Classification
# The classification head has four outputs, one per news category.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Training Settings
# Evaluate and checkpoint once per epoch; three epochs, batch size 16.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Explicitly turn off all logging integrations (W&B, TensorBoard, ...).
    # This is the supported replacement for the deprecated WANDB_DISABLED
    # environment variable.
    report_to="none",
)

# Train Model
# The test split is used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
)
trainer.train()

# ✅ Save fine-tuned TinyBERT
# Model weights first, then the tokenizer files, into the same Drive folder.
fine_tuned_dir = "/content/drive/MyDrive/fine_tuned_tinybert"
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)

print("✅ TinyBERT training complete and saved in Google Drive.")


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.2515,0.247906
2,0.1841,0.231755
3,0.1437,0.235665


✅ TinyBERT training complete and saved in Google Drive.


Streamlit application code for the AWS EC2 instance: loads the fine-tuned model from S3 and logs users and predictions to an RDS (MySQL) database

In [None]:
import streamlit as st
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import boto3
import os
import pymysql
from datetime import datetime
import tempfile

# -------------------- CONFIGURATION --------------------

# RDS MySQL connection settings.
# Values are read from environment variables so that secrets never live in
# the source file. The non-secret settings fall back to the previous
# defaults; the password has no default and must be provided through the
# RDS_PASSWORD environment variable (an empty password will be rejected by
# the server at connect time).
RDS_HOST = os.environ.get('RDS_HOST', 'database-1.csv4o824ohqz.us-east-1.rds.amazonaws.com')
RDS_USER = os.environ.get('RDS_USER', 'admin')
RDS_PASSWORD = os.environ.get('RDS_PASSWORD', '')
RDS_DB = os.environ.get('RDS_DB', 'news_db')

# S3 model location
S3_BUCKET = 'news-article-bucket1'
S3_MODEL_PATH = 'fine_tuned_tinybert/'  # Ensure trailing slash

# Category mapping: 0-based class index predicted by the model -> label shown
# to the user and stored in the prediction log.
category_mapping = {0: "Politics", 1: "Sports", 2: "Entertainment", 3: "Technology"}

# -------------------- LOAD MODEL FROM S3 --------------------

@st.cache_resource
def load_model_from_s3():
    """Download the fine-tuned model from S3 and load it for inference.

    Every object under ``S3_MODEL_PATH`` in ``S3_BUCKET`` is downloaded into a
    fresh temporary directory (stored flat, under its base name), then the
    tokenizer and classification model are loaded from that directory. The
    model is switched to eval mode before it is returned.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the returned
    objects instead of downloading again on every script rerun.

    Returns:
        tuple: ``(tokenizer, model)``.

    Raises:
        FileNotFoundError: if no downloadable object exists under the prefix.
    """
    s3 = boto3.client('s3')
    temp_dir = tempfile.mkdtemp()

    # Download model files. A paginator is used because a single
    # list_objects_v2 call returns only one page of keys.
    paginator = s3.get_paginator('list_objects_v2')
    downloaded = 0
    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_MODEL_PATH):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):
                # "Folder" placeholder objects carry no file content.
                continue
            file_path = os.path.join(temp_dir, os.path.basename(key))
            s3.download_file(S3_BUCKET, key, file_path)
            downloaded += 1

    if downloaded == 0:
        # Fail with an actionable message instead of letting from_pretrained
        # complain about an empty directory.
        os.rmdir(temp_dir)
        raise FileNotFoundError(
            f"No model files found under s3://{S3_BUCKET}/{S3_MODEL_PATH}"
        )

    tokenizer = BertTokenizer.from_pretrained(temp_dir)
    model = BertForSequenceClassification.from_pretrained(temp_dir)
    model.eval()
    return tokenizer, model

tokenizer, model = load_model_from_s3()

# -------------------- DATABASE FUNCTIONS --------------------

def connect_db():
    """Open and return a new pymysql connection built from the RDS_* settings."""
    connection_settings = {
        "host": RDS_HOST,
        "user": RDS_USER,
        "password": RDS_PASSWORD,
        "database": RDS_DB,
    }
    return pymysql.connect(**connection_settings)

def log_user(user_name):
    """Insert one row into ``user_logs`` recording a login.

    Args:
        user_name: name entered on the login page; stored together with the
            current local time (``datetime.now()``).

    Any exception raised by the database driver propagates to the caller; the
    connection is closed in all cases so a failed insert does not leak it.
    """
    sql = "INSERT INTO user_logs (user_name, login_time) VALUES (%s, %s)"
    conn = connect_db()
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, (user_name, datetime.now()))
        conn.commit()
    finally:
        conn.close()

def log_prediction(user_name, title, description, predicted_label):
    """Insert one row into ``classification_logs`` for a prediction.

    Args:
        user_name: name of the user who requested the classification.
        title: news title as entered by the user.
        description: news description as entered by the user.
        predicted_label: category name produced by the model.

    The row is timestamped with the current local time (``datetime.now()``).
    Any exception raised by the database driver propagates to the caller; the
    connection is closed in all cases so a failed insert does not leak it.
    """
    sql = """
        INSERT INTO classification_logs (user_name, title, description, predicted_label, timestamp)
        VALUES (%s, %s, %s, %s, %s)
    """
    conn = connect_db()
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, (user_name, title, description, predicted_label, datetime.now()))
        conn.commit()
    finally:
        conn.close()

# -------------------- STREAMLIT APP --------------------

# Page control
# The current page lives in session state so it survives Streamlit reruns.
if 'page' not in st.session_state:
    st.session_state.page = 'login'

# Same truncation length the model was fine-tuned with (the training cell
# tokenizes with max_length=256); keeps inference inputs consistent with
# what the model saw during training.
MAX_SEQ_LENGTH = 256


def _rerun_app():
    """Ask Streamlit to rerun the script now, if this version supports it.

    Uses ``st.rerun`` when available and falls back to the older
    ``st.experimental_rerun``; does nothing when neither exists, in which
    case the new page appears on the next user interaction.
    """
    rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun", None)
    if rerun is not None:
        rerun()


# Page 1: Login
if st.session_state.page == 'login':
    st.title("🔐 Welcome to News Classifier")
    user_name = st.text_input("Enter your name to start:")
    if st.button("Start"):
        clean_name = user_name.strip()
        if clean_name:
            # Store and log the name without surrounding whitespace.
            st.session_state.user_name = clean_name
            log_user(clean_name)
            st.session_state.page = 'classify'
            # Without a rerun this script run would finish on the login
            # branch and the classification page would only show up after
            # another interaction.
            _rerun_app()
        else:
            st.warning("Please enter your name.")

# Page 2: Classification
elif st.session_state.page == 'classify':
    st.title("📰 News Article Classification")
    st.write(f"Welcome, **{st.session_state.user_name}**! Enter a news title and description.")

    title = st.text_input("Enter News Title:")
    description = st.text_area("Enter News Description:")

    if st.button("Classify"):
        if not title.strip() or not description.strip():
            st.warning("Both fields are required.")
        else:
            # Same "title + space + description" layout used for training.
            combined_text = title + " " + description
            inputs = tokenizer(
                combined_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_SEQ_LENGTH,
            )

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = model(**inputs)
                prediction = torch.argmax(outputs.logits, dim=1).item()
                label = category_mapping[prediction]

            # Show result
            st.success(f"Predicted Category: **{label}** 🏷️")

            # Log to RDS
            log_prediction(st.session_state.user_name, title, description, label)
