<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Paper(1_s2_0_S0933365723002300_main)_Bangla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Load dataset
file_name = list(uploaded.keys())[0]
data = pd.read_excel(file_name)  # Change this if the dataset isn't in CSV format


Saving Bangla2_translated_Arabic.xlsx to Bangla2_translated_Arabic.xlsx


In [3]:
# Libraries for text processing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Set Arabic stopwords
arabic_stopwords = set(stopwords.words('arabic'))

def clean_text(text):
    # Remove URLs, hashtags, mentions, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'#\w+|@\w+', '', text)
    text = re.sub(r'[^ء-ي]', ' ', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in arabic_stopwords])
    return text

# Apply cleaning function to the dataset
data['cleaned_text'] = data['tweets_arabic'].apply(clean_text)  # Replace 'text_column' with the actual column name


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Bag of Words
bow_vectorizer = CountVectorizer(max_features=5000)
bow_features = bow_vectorizer.fit_transform(data['cleaned_text'])

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text'])


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# Define features and labels
X = tfidf_features  # Or use bow_features for Bag-of-Words
y = data['labels']  # Replace 'label' with your label column name

# Initialize classifiers
models = {
    'LGBM': LGBMClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression()
}

# Number of epochs
num_epochs = 5
results = {model_name: [] for model_name in models.keys()}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate each model over multiple epochs
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Split data randomly for each epoch


    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate F1 score for each epoch
        f1 = f1_score(y_test, y_pred, average='weighted')
        results[model_name].append(f1)

        # Print epoch results for each model
        print(f"{model_name} - Epoch {epoch + 1}:")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("F1 Score:", f1)
        print(classification_report(y_test, y_pred))

# Calculate the average F1 score for each model
avg_f1_scores = {model_name: np.mean(scores) for model_name, scores in results.items()}

# Find the model with the highest average F1 score
best_model_name = max(avg_f1_scores, key=avg_f1_scores.get)
best_model = models[best_model_name]

print(f"\nBest model based on average F1 score over {num_epochs} epochs: {best_model_name}")
print(f"Average F1 Score: {avg_f1_scores[best_model_name]:.4f}")

# Retrain the best model on the entire training set and evaluate on test data
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print(f"\nFinal Evaluation of Best Model ({best_model_name}) on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.




Epoch 1/5
[LightGBM] [Info] Number of positive: 793, number of negative: 2338
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3640
[LightGBM] [Info] Number of data points in the train set: 3131, number of used features: 296
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.253274 -> initscore=-1.081228
[LightGBM] [Info] Start training from score -1.081228
LGBM - Epoch 1:
Accuracy: 0.8186462324393359
F1 Score: 0.807164920299487
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       592
           1       0.68      0.48      0.56       191

    accuracy                           0.82       783
   macro avg       0.76      0.70      0.72       783
weighted avg       0.81      0.82      0.81       783

RandomForest - Epoch 1:
Accuracy: 

In [6]:
# Final evaluation of the best model on the test set with precision to 5 decimal places
from sklearn.metrics import precision_score, recall_score

print(f"\nFinal Evaluation of Best Model ({best_model_name}) on Test Set:")

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Display results with up to 5 decimal places
print(f"Accuracy: {accuracy:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")

# Detailed classification report
report = classification_report(y_test, y_pred, digits=5)
print("\nClassification Report:")
print(report)



Final Evaluation of Best Model (RandomForest) on Test Set:
Accuracy: 0.83908
Precision: 0.83310
Recall: 0.83908
F1 Score: 0.83493

Classification Report:
              precision    recall  f1-score   support

           0    0.87702   0.91554   0.89587       592
           1    0.69697   0.60209   0.64607       191

    accuracy                        0.83908       783
   macro avg    0.78700   0.75882   0.77097       783
weighted avg    0.83310   0.83908   0.83493       783

