<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Paper(1_s2_0_S0933365723002300_main)_Arabic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import necessary libraries
import pandas as pd
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Load dataset
file_name = list(uploaded.keys())[0]
data = pd.read_excel(file_name)  # Change this if the dataset isn't in CSV format


Saving Arabic_Depression_10.000_Tweets.xlsx to Arabic_Depression_10.000_Tweets.xlsx


In [4]:
# Libraries for text processing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Set Arabic stopwords
arabic_stopwords = set(stopwords.words('arabic'))

def clean_text(text):
    # Remove URLs, hashtags, mentions, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'#\w+|@\w+', '', text)
    text = re.sub(r'[^ء-ي]', ' ', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in arabic_stopwords])
    return text

# Apply cleaning function to the dataset
data['cleaned_text'] = data['tweet'].apply(clean_text)  # Replace 'text_column' with the actual column name


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Bag of Words
bow_vectorizer = CountVectorizer(max_features=5000)
bow_features = bow_vectorizer.fit_transform(data['cleaned_text'])

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text'])


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# Define features and labels
X = tfidf_features  # Or use bow_features for Bag-of-Words
y = data['label']  # Replace 'label' with your label column name

# Initialize classifiers
models = {
    'LGBM': LGBMClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression()
}

# Number of epochs
num_epochs = 5
results = {model_name: [] for model_name in models.keys()}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate each model over multiple epochs
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Split data randomly for each epoch


    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate F1 score for each epoch
        f1 = f1_score(y_test, y_pred, average='weighted')
        results[model_name].append(f1)

        # Print epoch results for each model
        print(f"{model_name} - Epoch {epoch + 1}:")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("F1 Score:", f1)
        print(classification_report(y_test, y_pred))

# Calculate the average F1 score for each model
avg_f1_scores = {model_name: np.mean(scores) for model_name, scores in results.items()}

# Find the model with the highest average F1 score
best_model_name = max(avg_f1_scores, key=avg_f1_scores.get)
best_model = models[best_model_name]

print(f"\nBest model based on average F1 score over {num_epochs} epochs: {best_model_name}")
print(f"Average F1 Score: {avg_f1_scores[best_model_name]:.4f}")

# Retrain the best model on the entire training set and evaluate on test data
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print(f"\nFinal Evaluation of Best Model ({best_model_name}) on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))



Epoch 1/5
[LightGBM] [Info] Number of positive: 3988, number of negative: 4012
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8975
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 405
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498500 -> initscore=-0.006000
[LightGBM] [Info] Start training from score -0.006000
LGBM - Epoch 1:
Accuracy: 0.946
F1 Score: 0.945962561489617
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       988
           1       0.98      0.91      0.94      1012

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

RandomForest - Epoch 1:
Accuracy: 0.958
F1 Sco