In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/urdu-sentence-labeled/Balanced_Sentiments_dataset_bert_model.xlsx


In [None]:
import pandas as pd
import re
import nltk
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.linear_model import LogisticRegression

# NLTK resources: uncomment the downloads below on a fresh environment where the 'punkt' / 'stopwords' data are missing
# nltk.download('punkt')
# nltk.download('stopwords')

# Shared output area: the button callbacks below render their messages and
# plots inside this widget.
output = widgets.Output()

# Module-level text box for Urdu input; it is created here but only shown
# once test_model displays it.
text_input = widgets.Text(
    placeholder='Type a sentence in Urdu and press Enter',
    description="Enter Urdu text:",
)

# ✅ Urdu Preprocessing Function
def urdu_preprocessor(text):
    """Normalize an Urdu sentence before vectorization.

    The input is coerced with ``str``, every character in the punctuation
    class is replaced by a space, and whitespace runs are collapsed. The text
    is then tokenized with NLTK and tokens found in the Urdu stopword set are
    dropped.

    Args:
        text: Raw sentence. Any object is accepted because it is passed
            through ``str`` first.

    Returns:
        str: The remaining tokens joined by single spaces. If tokenization or
        stopword loading raises, the punctuation-stripped text is returned
        unchanged instead (best-effort fallback).
    """
    # NOTE(review): 'ء' (hamza) is a letter rather than a punctuation mark;
    # confirm that stripping it is intended before changing this pattern.
    text = re.sub(r'[۔،؛؟!٭ء]', ' ', str(text))  # Remove Urdu punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    try:
        tokens = word_tokenize(text)
        urdu_stopwords = _get_urdu_stopwords()
        return ' '.join(token for token in tokens if token not in urdu_stopwords)
    except Exception:
        # Deliberate best-effort: a tokenizer or corpus failure must not abort
        # a whole DataFrame.apply, so fall back to the cleaned raw text.
        return text


# Filled on first successful lookup so the corpus is not re-read per sentence.
_URDU_STOPWORDS_CACHE = None


def _get_urdu_stopwords():
    """Return the Urdu stopword set, loading it from NLTK at most once."""
    global _URDU_STOPWORDS_CACHE
    if _URDU_STOPWORDS_CACHE is None:
        # A failure here propagates to the caller and leaves the cache unset,
        # so a later call retries.
        if 'urdu' in stopwords.fileids():
            _URDU_STOPWORDS_CACHE = frozenset(stopwords.words('urdu'))
        else:
            _URDU_STOPWORDS_CACHE = frozenset()
    return _URDU_STOPWORDS_CACHE

# 📚 Train function
# Workbook reported by the input-directory listing in the notebook's first cell.
URDU_DATASET_PATH = "/kaggle/input/urdu-sentence-labeled/Balanced_Sentiments_dataset_bert_model.xlsx"


def train_model(b, data_path=URDU_DATASET_PATH):
    """Train, evaluate and persist the Urdu sentiment classifier.

    Button callback. Loads the labelled workbook, preprocesses the sentences,
    fits a TF-IDF + logistic-regression pipeline on an 80/20 split, prints a
    classification report, saves a confusion-matrix image, pickles the model
    and vectorizer, and writes the test-set predictions to Excel. All
    messages and the plot are rendered inside the shared ``output`` widget.

    Args:
        b: The clicked button (supplied by ``Button.on_click``); unused.
        data_path: Excel workbook to read. It must provide the columns
            'Urdu Sentence' and 'Sentiment'.

    Returns:
        None. On a data-loading problem an error line is printed and the
        function returns early without training.
    """
    with output:
        clear_output(wait=True)
        print("🔄 Training started...")

        # 📁 Load dataset
        try:
            # NOTE(review): the sheet name is carried over unchanged; confirm
            # the workbook really has a sheet called 'Sheet1'.
            df = pd.read_excel(data_path, sheet_name='Sheet1')
            df.columns = df.columns.str.strip()
            df = df.dropna(subset=['Urdu Sentence', 'Sentiment'])
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            return

        if df.empty:
            # train_test_split would raise on an empty frame; report it instead.
            print("❌ Error loading data: no rows with both 'Urdu Sentence' and 'Sentiment'")
            return

        # 🧹 Preprocess Urdu (assign returns a new frame instead of mutating)
        df = df.assign(cleaned_text=df['Urdu Sentence'].apply(urdu_preprocessor))

        # 🧪 Split data
        X_train, X_test, y_train, y_test = train_test_split(
            df['cleaned_text'], df['Sentiment'], test_size=0.2, random_state=42
        )

        # 🔤 TF-IDF
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # 🧠 Train model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train_vec, y_train)
        print("✅ Model trained!")

        # 📊 Evaluation
        y_pred = model.predict(X_test_vec)
        print("📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        # 🔥 Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(
            cm, annot=True, fmt='d', cmap='YlGnBu',
            xticklabels=model.classes_, yticklabels=model.classes_, ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        # The title names the estimator that is actually fitted above.
        ax.set_title("Confusion Matrix (Logistic Regression - Urdu)")
        fig.tight_layout()
        fig.savefig("urdu_confusion_matrix.png")
        plt.show()
        # Release the figure so repeated clicks do not accumulate open figures.
        plt.close(fig)

        # 💾 Save model and vectorizer
        # File names are kept as-is because test_model loads these exact paths.
        with open("urdu_sentiment_nb_classifier.pkl", "wb") as f:
            pickle.dump(model, f)
        print("✅ Saved: urdu_sentiment_nb_classifier.pkl")

        with open("urdu_tfidf_vectorizer.pkl", "wb") as f:
            pickle.dump(vectorizer, f)
        print("✅ Saved: urdu_tfidf_vectorizer.pkl")

        # 💾 Save predictions to Excel
        # X_test holds the preprocessed text; the raw sentences are recovered
        # through the row labels that the split preserved.
        results_df = pd.DataFrame({
            'Original Sentence': df.loc[X_test.index, 'Urdu Sentence'].values,
            'Cleaned Sentence': X_test.values,
            'Actual Sentiment': y_test.values,
            'Predicted Sentiment': y_pred
        })
        results_df.to_excel("urdu_predictions_with_actual_predicated.xlsx", index=False)
        print("📁 Saved predictions to 'urdu_predictions_with_actual_predicated.xlsx'")

# 🔍 Test function
def test_model(b):
    """Load the saved classifier and offer an interactive prediction box.

    Button callback. Unpickles the model and TF-IDF vectorizer from the
    current directory, wires the shared ``text_input`` widget so that
    submitting it prints the predicted sentiment, and displays the text box
    inside the shared ``output`` widget.

    Args:
        b: The clicked button (supplied by ``Button.on_click``); unused.

    Returns:
        None. If either pickle cannot be loaded an error line is printed and
        the function returns without showing the text box.
    """
    with output:
        clear_output(wait=True)
        print("🔍 Testing loaded model...")

        try:
            # NOTE(security): pickle.load can execute arbitrary code from the
            # file. Only load artifacts written by train_model or another
            # trusted source.
            with open("urdu_sentiment_nb_classifier.pkl", "rb") as f:
                model = pickle.load(f)
            with open("urdu_tfidf_vectorizer.pkl", "rb") as f:
                vectorizer = pickle.load(f)
            print("✅ Model and Vectorizer loaded!")
        except Exception as e:
            print(f"❌ Error loading model/vectorizer: {e}")
            return

        # Define the submit function
        def on_submit(change):
            with output:
                clear_output(wait=True)
                print("📝 Enter key pressed, processing input...")
                input_text = text_input.value.strip()
                if input_text:
                    try:
                        processed_input = urdu_preprocessor(input_text)
                        input_vector = vectorizer.transform([processed_input])
                        prediction = model.predict(input_vector)[0]
                        print(f"💡 Predicted Sentiment: {prediction}")
                    except Exception as e:
                        print(f"❌ Error predicting sentiment: {e}")
                else:
                    print("ℹ Please enter text to get a sentiment prediction.")
                # The text box lives inside `output`, so the clear above
                # removed it; show it again so further sentences can be tried.
                display(text_input)

        # on_submit callbacks are not trait observers, so they have to be
        # removed explicitly; otherwise every click on the test button would
        # stack one more handler bound to an older model.
        previous_handler = getattr(test_model, "_submit_handler", None)
        if previous_handler is not None:
            text_input.on_submit(previous_handler, remove=True)
        text_input.on_submit(on_submit)
        test_model._submit_handler = on_submit

        # ✅ Show the input box only during testing
        display(text_input)

# 🔘 Create buttons with layout
# Both buttons share one fixed width so they line up in the HBox.
button_layout = widgets.Layout(width='300px')
train_button = widgets.Button(
    description="Train Urdu_Sentence Model",
    button_style='success',
    layout=button_layout
)
# The label states the action, mirroring the "Train ..." button beside it.
test_button = widgets.Button(
    description="Test Urdu_Sentence Model",
    button_style='info',
    layout=button_layout
)

# Each click calls the handler with the clicked button as its only argument.
train_button.on_click(train_model)
test_button.on_click(test_model)

# 📺 Display the UI
display(widgets.HBox([train_button, test_button]))
display(output)

HBox(children=(Button(button_style='success', description='Train Urdu_Sentence Model', layout=Layout(width='30…

Output()