<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Paper(3569580)_Arabic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Mount Google Drive at /content/drive.
# The FastText vectors used later in this notebook are read from
# /content/drive/MyDrive/embeddings, so this mount has to succeed first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Step 2: Install required libraries
!pip install pandas nltk torch torchvision

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Initialize NLTK resources
import nltk
# Tokenizer data: older NLTK releases read 'punkt', while recent releases make
# word_tokenize look up 'punkt_tab' instead. NLTK is installed unpinned above,
# so fetch both to keep the notebook runnable on a fresh runtime.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list used by data_cleaning.
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:

# Step 3: Data Cleaning and Data Pre-Processing
def data_cleaning(df, text_column='tweet_english'):
    """Normalise the tweets in ``df[text_column]`` and return them as a Series.

    Each tweet is lower-cased, tokenised with NLTK's ``word_tokenize``, reduced
    to purely alphabetic tokens, stripped of English stop words and stemmed with
    the Porter stemmer. The surviving stems are joined with single spaces.

    Args:
        df: DataFrame holding the raw text. It is modified in place: a
            ``cleaned_text`` column is added (or overwritten).
        text_column: Name of the column containing the text to clean.
            Defaults to ``'tweet_english'``.

    Returns:
        The new ``df['cleaned_text']`` Series, index-aligned with ``df``.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found; available columns: {list(df.columns)}"
        )

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Return the cleaned, stemmed form of a single tweet."""
        if not isinstance(text, str):
            # Missing cells (None / NaN / NA) must not go through str():
            # str(float('nan')) == 'nan', which is alphabetic and would end up
            # in the output as a bogus word.
            if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
                return ''
            text = str(text)

        # The text is lower-cased once here, so tokens need no second .lower().
        tokens = word_tokenize(text.lower())
        return ' '.join(
            stemmer.stem(token)
            for token in tokens
            if token.isalpha() and token not in stop_words
        )

    df['cleaned_text'] = df[text_column].apply(preprocess_text)
    return df['cleaned_text']


In [3]:

# Step 5: FCL method to detect depression (PyTorch model)
class FCLModel(nn.Module):
    """Embedding -> Conv1d -> MaxPool -> LSTM -> sigmoid binary classifier.

    This variant consumes integer token-id sequences. Note that a later cell in
    this notebook defines another ``FCLModel`` (taking ``input_dim``) which
    replaces this class once that cell is executed.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding; also the number of input
            channels of the convolution.

    Input to ``forward``: a ``LongTensor`` of shape ``(batch, seq_len)`` with
    ``seq_len >= 5`` (the convolution kernel size).
    Output: a tensor of shape ``(batch, 1)`` with probabilities in ``[0, 1]``.
    """

    def __init__(self, vocab_size, embed_dim):
        super(FCLModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Convolve along the token axis, treating embedding dimensions as channels.
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=64, kernel_size=5)
        self.pool = nn.MaxPool1d(kernel_size=2)
        # NOTE(review): dropout is constructed but never applied in forward,
        # matching the other FCLModel in this notebook - confirm whether it
        # should be used.
        self.dropout = nn.Dropout(0.5)
        # The LSTM reads the 64 convolution channels at each pooled time step.
        self.lstm = nn.LSTM(64, 100, batch_first=True)
        self.fc = nn.Linear(100, 1)  # Binary classification

    def forward(self, x):
        x = self.embedding(x)                     # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)                    # (batch, embed_dim, seq_len) for Conv1d
        x = self.pool(torch.relu(self.conv1(x)))  # (batch, 64, pooled_len)
        x = x.permute(0, 2, 1)                    # (batch, pooled_len, 64) for batch_first LSTM
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])                  # Use the last time step only
        return torch.sigmoid(x)

In [4]:
!pip install gensim
import gensim

from google.colab import files
# Opens Colab's upload dialog; upload the .xlsx dataset by hand.
# `uploaded` is keyed by the uploaded file name(s).
uploaded = files.upload()

# Read the first uploaded file (whatever its name is) as an Excel sheet.
df = pd.read_excel(next(iter(uploaded.keys())))
# Keep a reproducible 40% random sample of the rows. This overwrites `df`,
# so the full frame is no longer available after this line.
df = df.sample(frac=0.4, random_state=42)
# Data Cleaning and Pre-Processing: adds a 'cleaned_text' column to `df`
# and returns that column.
cleaned_data = data_cleaning(df)




Saving Arabic_Depression_10.000_Tweets_translated (2).xlsx to Arabic_Depression_10.000_Tweets_translated (2).xlsx


KeyError: 'tweets'

In [7]:

# Run the cleaning step on the sampled frame (same call as at the end of the
# data-loading cell above).
cleaned_data = data_cleaning(df)



In [5]:
# Preview the first rows to check the column names used by the later cells.
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,label,tweet_english
6252,1278,عيد قيامة مجيد يا شباب اتمنى ايامنا كلها تبقى ...,0,"Merry Christmas, guys. I hope all our days rem..."
4684,5141,أليس مُملًا أن تحزن لنفس السبب يومياً,1,Isn't it boring to be sad for the same reason ...
1731,1784,انا هاليومين صاير عندي ضيق في تنفس بشكل غير طب...,1,These two days I became abnormally short of br...
4742,5201,لم تعد الأفلام والمسلسلات مهرب، ولا الأغاني وا...,1,"Movies and series are no longer an escape, son..."
4521,4757,خذلاني لم أتوقع يوما أن أخذل بهاذا الشكل أشعر ...,1,He let me down. I never expected to be let dow...


In [8]:
# Load the pre-trained word vectors from Google Drive (Drive must be mounted).
# KeyedVectors.load reads a file that was written by gensim's own save().
# NOTE(review): presumably cc.en.300.model was converted from the published
# FastText English vectors - confirm how the file was produced.
# Change the path below to wherever your FastText model is stored.
ft_model = gensim.models.KeyedVectors.load('/content/drive/MyDrive/embeddings/cc.en.300.model')

In [9]:

# Step 4: Creation of Vocabulary for FCL method
def create_embedding_layer(cleaned_data, ft_model):
    """Turn each cleaned tweet into one fixed-size vector by averaging word vectors.

    Args:
        cleaned_data: Iterable of whitespace-separated token strings, one per tweet.
        ft_model: Word-vector lookup supporting ``word in ft_model``,
            ``ft_model[word]`` and a ``vector_size`` attribute.

    Returns:
        A float32 ``numpy.ndarray`` of shape ``(n_tweets, ft_model.vector_size)``.
        Row ``i`` is the mean of the vectors of the words of tweet ``i`` that
        are known to ``ft_model``; tweets with no known word get an all-zero
        row. An empty ``cleaned_data`` yields shape ``(0, vector_size)`` so
        that ``result.shape[1]`` is always valid.
    """
    dim = ft_model.vector_size
    rows = []
    for tweet in cleaned_data:
        # Skip out-of-vocabulary words so the lookup cannot raise KeyError.
        known_vectors = [ft_model[word] for word in tweet.split() if word in ft_model]
        if known_vectors:
            rows.append(np.mean(known_vectors, axis=0))
        else:
            # No known word at all: fall back to a zero vector.
            rows.append(np.zeros(dim, dtype=np.float32))

    if not rows:
        return np.empty((0, dim), dtype=np.float32)
    # Fix the dtype explicitly so it does not depend on whether a zero-vector
    # fallback happened to occur in this particular dataset.
    return np.asarray(rows, dtype=np.float32)
# Build the feature matrix: one averaged word-vector per cleaned tweet
# (this is a plain numpy array, not a trainable embedding layer).
embeddings = create_embedding_layer(cleaned_data, ft_model)


In [10]:
# Prepare labels. They are matched to `embeddings` purely by position: both are
# derived from the rows of `df` in the same order.
labels = df['label'].values  # Adjust the column name to your dataset

# Split the data into train (80%) and test (20%) sets with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
# Step 5: FCL method to detect depression (PyTorch model)
class FCLModel(nn.Module):
    """Conv1d -> MaxPool -> LSTM -> sigmoid classifier over dense feature vectors.

    ``forward`` takes a float tensor of shape ``(batch, n_features)`` and
    returns a ``(batch, 1)`` tensor of probabilities. The feature axis is
    treated as a one-channel sequence for the convolution, so it must be at
    least as long as the kernel (5).

    Args:
        input_dim: Accepted for the caller's convenience; no layer size
            depends on it.
    """

    def __init__(self, input_dim):
        super().__init__()
        # No embedding layer: the inputs are already dense vectors.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=5)
        self.pool = nn.MaxPool1d(kernel_size=2)
        # Constructed but not applied anywhere in forward().
        self.dropout = nn.Dropout(0.5)
        # The LSTM's per-step feature size equals the conv's 64 output channels.
        self.lstm = nn.LSTM(64, 100, batch_first=True)
        # Single output unit for binary classification.
        self.fc = nn.Linear(100, 1)

    def forward(self, x):
        # (batch, n_features) -> (batch, 1, n_features): add the channel axis.
        with_channel = x.unsqueeze(1)
        conv_out = torch.relu(self.conv1(with_channel))
        pooled = self.pool(conv_out)  # (batch, 64, steps)
        # batch_first LSTM wants (batch, steps, channels).
        sequence = pooled.permute(0, 2, 1)
        lstm_out, _ = self.lstm(sequence)
        # Classify from the final time step's hidden output.
        final_step = lstm_out[:, -1, :]
        return torch.sigmoid(self.fc(final_step))
# Build FCL model. The feature width is passed as input_dim, although the
# constructor defined above does not use that argument for any layer size.
model = FCLModel(input_dim=embeddings.shape[1])






In [11]:
# progress bar
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm  # Import tqdm for progress bar

def evaluate_model(model, x_train, y_train, x_test, y_test, epochs=10, batch_size=32):
    """Train ``model`` in place, then print binary-classification metrics.

    Despite its name this function performs the whole training run (Adam,
    lr=0.001, binary cross-entropy on the model's probability output) before
    evaluating on the test split.

    Args:
        model: Module mapping a float32 batch to ``(batch, 1)`` probabilities.
            Its parameters are updated in place.
        x_train: Training features, array-like of shape ``(n_train, n_features)``.
        y_train: Training labels (0/1), array-like of length ``n_train``.
        x_test: Test features, array-like of shape ``(n_test, n_features)``.
        y_test: Test labels (0/1), array-like of length ``n_test``.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per optimisation step. Batches are taken
            in the given order; the data is not shuffled between epochs.

    Returns:
        None. The confusion matrix, precision, recall, F1 and accuracy are
        printed.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Convert once up front instead of rebuilding tensors from numpy for every
    # batch of every epoch.
    train_inputs = torch.as_tensor(x_train, dtype=torch.float32)
    train_targets = torch.as_tensor(y_train, dtype=torch.float32).view(-1, 1)
    test_inputs = torch.as_tensor(x_test, dtype=torch.float32)
    num_train = len(train_inputs)

    for epoch in range(epochs):
        model.train()
        # tqdm renders one progress bar per epoch.
        for start in tqdm(range(0, num_train, batch_size), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch"):
            batch_inputs = train_inputs[start:start + batch_size]
            batch_targets = train_targets[start:start + batch_size]

            optimizer.zero_grad()
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()

    # Evaluate
    model.eval()
    with torch.no_grad():
        # Flatten (n_test, 1) -> (n_test,) so the metrics receive 1-D
        # predictions matching the 1-D labels.
        probabilities = model(test_inputs).reshape(-1).numpy()
    y_pred = (probabilities > 0.5).astype(int)

    # Metrics
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))


In [12]:
# Train the model with the default settings (10 epochs, batch size 32) and
# print the test-set metrics.
evaluate_model(model, x_train, y_train, x_test, y_test)


Epoch 1/10: 100%|██████████| 100/100 [00:16<00:00,  5.92batch/s]
Epoch 2/10: 100%|██████████| 100/100 [00:16<00:00,  6.16batch/s]
Epoch 3/10: 100%|██████████| 100/100 [00:09<00:00, 10.35batch/s]
Epoch 4/10: 100%|██████████| 100/100 [00:09<00:00, 10.35batch/s]
Epoch 5/10: 100%|██████████| 100/100 [00:08<00:00, 12.19batch/s]
Epoch 6/10: 100%|██████████| 100/100 [00:09<00:00, 10.99batch/s]
Epoch 7/10: 100%|██████████| 100/100 [00:10<00:00,  9.75batch/s]
Epoch 8/10: 100%|██████████| 100/100 [00:09<00:00, 10.73batch/s]
Epoch 9/10: 100%|██████████| 100/100 [00:08<00:00, 11.25batch/s]
Epoch 10/10: 100%|██████████| 100/100 [00:09<00:00, 10.62batch/s]


Confusion Matrix:
[[297 101]
 [ 93 309]]
Precision: 0.7536585365853659
Recall: 0.7686567164179104
F1 Score: 0.7610837438423645
Accuracy: 0.7575
