In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the spreadsheet and keep only the rows that carry a label.
df = (
    pd.read_excel('/content/myFinalDataset(edited version).xlsx')
    .dropna(subset=['label'])
)

# With the NaN labels gone, the label column can be cast to int.
df['label'] = df['label'].astype(int)

# Define Bengali stop words.
# This set is what preprocess_text consults when it filters tokens: any
# whitespace-separated token that is equal to one of these strings is dropped.
# Some entries (e.g. 'আবার') are listed more than once; because this is a set
# literal the repeats collapse into a single member.
bengali_stop_words = {'আমি', 'তুমি', 'সে', 'আমাদের', 'তোমাদের', 'করে', 'তা', 'কিছু', 'কিছুই', 'এই', 'যে', 'এক',
                      'এটা', 'এ', 'হয়', 'কি', 'ও', 'এবং', 'করতে', 'হয়ে', 'থেকে', 'হয়েছে', 'হয়েছিল', 'থাকে',
                      'থাকা', 'যায়', 'যা', 'নিয়ে', 'না', 'বলে', 'এমন', 'করা', 'জন্য', 'মাধ্যমে', 'কিন্তু', 'আপনি', 'আমার', 'তার', 'এখন',
                      'সঙ্গে', 'তারা', 'করছে', 'এইটা', 'তাদের', 'সেটা', 'সম্পর্কে', 'হতে', 'যেতে', 'সেখান', 'সেটি', 'তারেকে', 'এইচেসে', 'করবেন',
                      'অন্য', 'অন্যান্য', 'বার', 'বা', 'প্রায়', 'আবার', 'আগে', 'এস', 'আগেই', 'যেমন', 'হলে', 'এটি', 'মাত্র', 'কিছুদিন', 'তাহলে',
                      'সেও', 'কেউ', 'মোটামুটি', 'হলো', 'জানা', 'হচ্ছে', 'সব', 'আসে', 'কয়েক', 'বেশি', 'সমস্ত', 'মোটেই', 'যান', 'সহ', 'তিনি',
                      'অথবা', 'যদি', 'দিয়ে', 'আবার', 'পারে', 'কারণ', 'কম', 'হল', 'হলেও', 'কেন', 'বাংলা', 'এখানে', 'কোনো', 'পরে', 'গেল',
                      'সেই', 'দেখা', 'হয়েছে', 'হলেই', 'এসে', 'বিশেষ', 'ওঁরা', 'করি', 'মোট', 'হতেই', 'চেয়ে', 'সম্প্রতি'
}
# Preprocess the text data
def preprocess_text(text, stop_words=None):
    """Reduce a raw document to space-separated Bengali tokens without stop words.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. a NaN from an empty
        spreadsheet cell) is treated as an empty document.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to the module-level ``bengali_stop_words``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = bengali_stop_words

    # Keep only code points of the Bengali Unicode block (U+0980-U+09FF) and
    # whitespace. This drops Latin letters, ASCII digits and ASCII/Devanagari
    # punctuation (including the danda, U+0964); Bengali digits are kept.
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)

    # Remove the symbols that live inside the Bengali block (currency marks,
    # isshar, ganda mark, abbreviation sign). The previous r'[^\w\s]' filter
    # must not be used here: Python's \w does not match combining marks, so it
    # also deleted every vowel sign, virama and nukta and corrupted the words.
    text = re.sub(r'[\u09F2\u09F3\u09FA\u09FB\u09FD]', '', text)

    # Whitespace tokenization followed by stop-word filtering.
    tokens = [word for word in text.split() if word not in stop_words]

    return ' '.join(tokens)

# Apply preprocessing to the text column
df['text'] = df['text'].apply(preprocess_text)

# Extract the texts and labels
texts = df['text'].tolist()
labels = df['label'].tolist()

# Split the dataset into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Vectorize the texts using TF-IDF.
# The documents are already space-separated tokens, so tokens are taken as runs
# of non-whitespace. The default token_pattern is built on \w, which would
# split Bengali words at every vowel sign / virama.
vectorizer = TfidfVectorizer(max_features=3000, token_pattern=r'\S+')#9000
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=100)#4500
model.fit(X_train, train_labels)

# Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(test_labels, predictions))


Accuracy: 0.8490
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       492
           1       0.89      0.84      0.87       400
           2       0.85      0.81      0.83       366

    accuracy                           0.85      1258
   macro avg       0.85      0.85      0.85      1258
weighted avg       0.85      0.85      0.85      1258



In [None]:
# Logistic Regression with Word2Vec features
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'punkt' resource
nltk.download('punkt')

# Read the spreadsheet and discard rows whose label is missing
df = pd.read_excel('/content/myFinalDataset(edited version).xlsx').dropna(subset=['label'])

# Cast the label column to int
df['label'] = df['label'].astype(int)

# Pull the texts and labels out as plain Python lists
texts, labels = df['text'].tolist(), df['label'].tolist()

# 80/20 train/test split with a fixed random_state
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize every document with NLTK's word_tokenize
train_tokens = list(map(word_tokenize, train_texts))
test_tokens = list(map(word_tokenize, test_texts))

# Fit Word2Vec on the training tokens
w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=3000,
    window=50,
    min_count=1,
    workers=4,
)

# Generate average Word2Vec embeddings for each text
def get_avg_word2vec(tokens_list, model, vector_size):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    tokens_list : iterable of list of str
        One token list per document.
    model : object exposing ``.wv``
        ``model.wv`` must support ``token in model.wv`` and
        ``model.wv[list_of_tokens]`` (as a trained Word2Vec model does).
    vector_size : int
        Fallback dimensionality, used only when ``model.wv`` does not expose
        a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        One row per document. Documents with no in-vocabulary token get an
        all-zero row.
    """
    # The zero fallback must have the same length as the averaged vectors,
    # otherwise the rows are ragged. Trust the model's own dimensionality over
    # the caller-supplied value.
    dim = getattr(model.wv, 'vector_size', None) or vector_size

    embeddings = []
    for tokens in tokens_list:
        valid_tokens = [token for token in tokens if token in model.wv]
        if valid_tokens:
            embeddings.append(np.mean(model.wv[valid_tokens], axis=0))
        else:
            embeddings.append(np.zeros(dim))
    return np.array(embeddings)

# Build the document features. The fallback dimensionality is taken from the
# trained model itself so that it always matches the word vectors (the model
# is trained with vector_size=3000, so a hard-coded 100 would not match).
X_train = get_avg_word2vec(train_tokens, w2v_model, vector_size=w2v_model.wv.vector_size)
X_test = get_avg_word2vec(test_tokens, w2v_model, vector_size=w2v_model.wv.vector_size)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

# Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(test_labels, predictions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.4221
              precision    recall  f1-score   support

           0       0.42      0.78      0.54       492
           1       0.92      0.03      0.06       400
           2       0.41      0.37      0.39       366

    accuracy                           0.42      1258
   macro avg       0.58      0.39      0.33      1258
weighted avg       0.58      0.42      0.35      1258



In [None]:
# Logistic Regression with pre-trained FastText feature extraction
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'punkt' resource
nltk.download('punkt')

# Read the spreadsheet and discard rows whose label is missing
df = pd.read_excel('/content/myFinalDataset(edited version).xlsx').dropna(subset=['label'])

# Cast the label column to int
df['label'] = df['label'].astype(int)

# Pull the texts and labels out as plain Python lists
texts, labels = df['text'].tolist(), df['label'].tolist()

# 80/20 train/test split with a fixed random_state
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize every document with NLTK's word_tokenize
train_tokens = list(map(word_tokenize, train_texts))
test_tokens = list(map(word_tokenize, test_texts))

# Download pre-trained FastText embeddings (this is a large file, around 2.4GB)
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz

# Unzip the file
!gunzip cc.bn.300.vec.gz

# Load pre-trained FastText embeddings
fasttext_model_path = 'cc.bn.300.vec'
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_model_path, binary=False)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


--2024-05-18 17:29:55--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 108.156.201.76, 108.156.201.129, 108.156.201.112, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|108.156.201.76|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 881274182 (840M) [binary/octet-stream]
Saving to: ‘cc.bn.300.vec.gz’


2024-05-18 17:30:08 (63.6 MB/s) - ‘cc.bn.300.vec.gz’ saved [881274182/881274182]

gzip: cc.bn.300.vec already exists; do you wish to overwrite (y or n)? n
	not overwritten


In [None]:
# Generate average FastText embeddings for each text
def get_avg_fasttext(tokens_list, model, vector_size=500):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    tokens_list : iterable of list of str
        One token list per document.
    model : keyed-vector object
        Must support ``token in model`` and ``model[list_of_tokens]``.
    vector_size : int, optional
        Fallback dimensionality, used only when ``model`` does not expose a
        ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        One row per document. Documents with no in-vocabulary token get an
        all-zero row.
    """
    # The zero fallback must have the same length as the averaged vectors,
    # otherwise the rows are ragged. Prefer the model's own dimensionality
    # over the default argument.
    dim = getattr(model, 'vector_size', None) or vector_size

    embeddings = []
    for tokens in tokens_list:
        valid_tokens = [token for token in tokens if token in model]
        if valid_tokens:
            embeddings.append(np.mean(model[valid_tokens], axis=0))
        else:
            embeddings.append(np.zeros(dim))
    return np.array(embeddings)

# Turn each tokenized split into a matrix of averaged FastText vectors
X_train, X_test = (
    get_avg_fasttext(split_tokens, fasttext_model)
    for split_tokens in (train_tokens, test_tokens)
)

# Fit the Logistic Regression classifier (fit returns the estimator itself)
model = LogisticRegression(max_iter=1000).fit(X_train, train_labels)

# Score the held-out split and print the metrics
predictions = model.predict(X_test)
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(test_labels, predictions))

Accuracy: 0.7623
              precision    recall  f1-score   support

           0       0.71      0.82      0.76       492
           1       0.82      0.72      0.77       400
           2       0.79      0.73      0.76       366

    accuracy                           0.76      1258
   macro avg       0.77      0.76      0.76      1258
weighted avg       0.77      0.76      0.76      1258

