In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data
nltk.download('punkt')

# Load the dataset
df = pd.read_excel('/content/myFinalDataset(edited version).xlsx')

# Remove rows with NaN labels
df = df.dropna(subset=['label'])

# Ensure labels are integers
df['label'] = df['label'].astype(int)

# Extract the texts and labels
texts = df['text'].tolist()
labels = df['label'].tolist()

bengali_stop_words = {'আমি', 'তুমি', 'সে', 'আমাদের', 'তোমাদের', 'করে', 'তা', 'কিছু', 'কিছুই', 'এই', 'যে', 'এক',
                      'এটা', 'এ', 'হয়', 'কি', 'ও', 'এবং', 'করতে', 'হয়ে', 'থেকে', 'হয়েছে', 'হয়েছিল', 'থাকে',
                      'থাকা', 'যায়', 'যা', 'নিয়ে', 'না', 'বলে', 'এমন', 'করা', 'জন্য', 'মাধ্যমে', 'কিন্তু', 'আপনি', 'আমার', 'তার', 'এখন',
                      'সঙ্গে', 'তারা', 'করছে', 'এইটা', 'তাদের', 'সেটা', 'সম্পর্কে', 'হতে', 'যেতে', 'সেখান', 'সেটি', 'তারেকে', 'এইচেসে', 'করবেন',
                      'অন্য', 'অন্যান্য', 'বার', 'বা', 'প্রায়', 'আবার', 'আগে', 'এস', 'আগেই', 'যেমন', 'হলে', 'এটি', 'মাত্র', 'কিছুদিন', 'তাহলে',
                      'সেও', 'কেউ', 'মোটামুটি', 'হলো', 'জানা', 'হচ্ছে', 'সব', 'আসে', 'কয়েক', 'বেশি', 'সমস্ত', 'মোটেই', 'যান', 'সহ', 'তিনি',
                      'অথবা', 'যদি', 'দিয়ে', 'আবার', 'পারে', 'কারণ', 'কম', 'হল', 'হলেও', 'কেন', 'বাংলা', 'এখানে', 'কোনো', 'পরে', 'গেল',
                      'সেই', 'দেখা', 'হয়েছে', 'হলেই', 'এসে', 'বিশেষ', 'ওঁরা', 'করি', 'মোট', 'হতেই', 'চেয়ে', 'সম্প্রতি'
}
# Preprocess text
def preprocess_text(text):
    # Remove non-Bengali characters and numbers
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    tokens = [token for token in tokens if token.lower() not in bengali_stop_words]

    return ' '.join(tokens)

# Preprocess the texts
df['text'] = df['text'].apply(preprocess_text)

# Extract the texts and labels again after preprocessing
texts = df['text'].tolist()
labels = df['label'].tolist()

# Split the dataset into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Vectorize the texts using Count Vectorizer
vectorizer = CountVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Train a Naive Bayes model
model = MultinomialNB()
model.fit(X_train, train_labels)

# Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(test_labels, predictions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.8235
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       492
           1       0.84      0.85      0.84       400
           2       0.85      0.76      0.80       366

    accuracy                           0.82      1258
   macro avg       0.83      0.82      0.82      1258
weighted avg       0.83      0.82      0.82      1258

