<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Paper(1_s2_0_S0933365723002300_main)_Bangla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Import necessary libraries
import pandas as pd
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Load dataset
file_name = list(uploaded.keys())[0]
data = pd.read_excel(file_name)  # Change this if the dataset isn't in Excel format

# Libraries for text processing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Download VADER for sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Set English stopwords
english_stopwords = set(stopwords.words('english'))


Saving Bangla2_translated.xlsx to Bangla2_translated.xlsx


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:

def clean_text(text):
    # Check if the input is a string
    if isinstance(text, str):
        # Remove URLs, hashtags, mentions, and special characters
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'#\w+|@\w+', '', text)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        # Remove stopwords
        text = ' '.join([word.lower() for word in text.split() if word not in english_stopwords])
        return text
    else:
        # Handle non-string values, e.g., by returning an empty string
        return ''

# Apply cleaning function to the dataset
data['cleaned_text'] = data['tweets_english'].apply(clean_text)  # Replace 'Tweets_english' with the actual column name

# Sentiment Analysis using VADER
vader = SentimentIntensityAnalyzer()
data['sentiment_score'] = data['cleaned_text'].apply(lambda x: vader.polarity_scores(x)['compound'])

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Bag of Words
bow_vectorizer = CountVectorizer(max_features=5000)
bow_features = bow_vectorizer.fit_transform(data['cleaned_text'])

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text'])

# Concatenate sentiment score as an additional feature
import scipy.sparse as sp
X = sp.hstack([tfidf_features, data[['sentiment_score']]])

# Label column
y = data['labels']  # Replace 'Labels' with your label column name

# Sampling methods
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Initialize classifiers
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

models = {
    'LGBM': LGBMClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression()
}

# Sampling techniques to try
sampling_methods = {
    'No Sampling': None,
    'Oversampling': RandomOverSampler(),
    'Undersampling': RandomUnderSampler(),
    'Combined Sampling': SMOTEENN()
}

# Number of epochs
num_epochs = 5
results = {f"{model_name}_{sampling}": [] for model_name in models.keys() for sampling in sampling_methods.keys()}

# Train and evaluate each model over multiple epochs with each sampling method
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Split data for each epoch
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=epoch)

    for sampling_name, sampler in sampling_methods.items():
        if sampler:
            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        else:
            X_resampled, y_resampled = X_train, y_train

        for model_name, model in models.items():
            model.fit(X_resampled, y_resampled)
            y_pred = model.predict(X_test)

            # Calculate F1 score for each epoch
            f1 = f1_score(y_test, y_pred, average='weighted')
            results[f"{model_name}_{sampling_name}"].append(f1)

            # Print epoch results for each model and sampling method
            print(f"{model_name} with {sampling_name} - Epoch {epoch + 1}:")
            print("Accuracy:", accuracy_score(y_test, y_pred))
            print("F1 Score:", f1)
            print(classification_report(y_test, y_pred))

# Calculate the average F1 score for each model and sampling method
avg_f1_scores = {model_name: np.mean(scores) for model_name, scores in results.items()}

# Find the model and sampling method with the highest average F1 score
best_model_name = max(avg_f1_scores, key=avg_f1_scores.get)
best_model_type, best_sampling_method = best_model_name.split('_')
best_model = models[best_model_type]

print(f"\nBest model based on average F1 score over {num_epochs} epochs: {best_model_name}")
print(f"Average F1 Score: {avg_f1_scores[best_model_name]:.4f}")

# Retrain the best model on the entire training set with the selected sampling method and evaluate on test data
if best_sampling_method != 'No Sampling':
    best_sampler = sampling_methods[best_sampling_method]
    X_resampled, y_resampled = best_sampler.fit_resample(X_train, y_train)
else:
    X_resampled, y_resampled = X_train, y_train

best_model.fit(X_resampled, y_resampled)
y_pred = best_model.predict(X_test)

print(f"\nFinal Evaluation of Best Model ({best_model_name}) on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))



Epoch 1/5
[LightGBM] [Info] Number of positive: 796, number of negative: 2335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6022
[LightGBM] [Info] Number of data points in the train set: 3131, number of used features: 420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254232 -> initscore=-1.076168
[LightGBM] [Info] Start training from score -1.076168
LGBM with No Sampling - Epoch 1:
Accuracy: 0.8288633461047255
F1 Score: 0.8191897053430318
              precision    recall  f1-score   support

           0       0.86      0.93      0.89       595
           1       0.70      0.51      0.59       188

    accuracy                           0.83       783
   macro avg       0.78      0.72      0.74       783
weighted avg       0.82      0.83      0.82       783

RandomForest wit

In [9]:
from sklearn.metrics import precision_score, recall_score

# Final Evaluation of Best Model on Test Set with results up to 5 decimal places
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nFinal Evaluation of Best Model ({best_model_name}) on Test Set:")
print(f"Accuracy: {accuracy:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")
print(classification_report(y_test, y_pred, digits=5))



Final Evaluation of Best Model (SVM_Oversampling) on Test Set:
Accuracy: 0.81992
Precision: 0.81965
Recall: 0.81992
F1 Score: 0.81978
              precision    recall  f1-score   support

           0    0.87695   0.87847   0.87771       576
           1    0.66019   0.65700   0.65860       207

    accuracy                        0.81992       783
   macro avg    0.76857   0.76774   0.76815       783
weighted avg    0.81965   0.81992   0.81978       783

