<a href="https://colab.research.google.com/github/vnavya2004/Depressiondetection_BTP/blob/main/Spanish(roberta)%2CBangla(hugging%20multilingual).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Roberta spanish
!pip install tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Upload the file
uploaded = files.upload()
df = pd.read_csv("spanish.csv", encoding='latin1')  # or specify the correct encoding of your CSV file

X_train, X_test, y_train, y_test = train_test_split(df['traducido'], df['class'], stratify=df['class'])

X_train = X_train.astype(str)
X_test = X_test.astype(str)

y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Load BERT pre-trained models
bert_preprocess = hub.KerasLayer("https://kaggle.com/models/kaggle/xlm-roberta/frameworks/TensorFlow2/variations/multi-cased-preprocess/versions/1")
bert_encoder = hub.KerasLayer("https://www.kaggle.com/models/kaggle/xlm-roberta/frameworks/TensorFlow2/variations/multi-cased-l-12-h-768-a-12/versions/1")

# Build the model
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Additional LSTM layer
reshaped_output = tf.keras.layers.Reshape((1, 768))(outputs['pooled_output'])
lstm_layer = tf.keras.layers.LSTM(128, name='lstm')(reshaped_output)

# Dropout layer
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")(lstm_layer)

# Output layer
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(dropout_layer)

# Create the model
model = tf.keras.Model(inputs=[text_input], outputs=output_layer)
model.summary()

# Compile and train the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Consider increasing the batch size
model.fit(X_train, y_train, epochs=5, batch_size=32)

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Collecting tensorflow_text
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.15.0


Saving spanish.csv to spanish.csv
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_word_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_type_ids': (None,                                            
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             }                              

In [None]:
#Bangla
# Imports
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.exceptions import UndefinedMetricWarning
import warnings
import numpy as np

# Ignore UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Data loading and preprocessing
uploaded = files.upload()  # Assuming you're using Google Colab
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)

# Define preprocessing steps (adjust based on data format and requirements)
import re
import string

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['tweets'] = df['tweets'].apply(preprocess_text)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['tweets'], df['labels'], stratify=df['labels'])

# Tokenization with Transformers
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize inputs
train_encodings = tokenizer(X_train.tolist(), padding='max_length', truncation=True, return_tensors='tf')
test_encodings = tokenizer(X_test.tolist(), padding='max_length', truncation=True, return_tensors='tf')

# Create and train the model
model = TFBertModel.from_pretrained("bert-base-multilingual-cased")

# Adjust sequence length if necessary
max_length = 128  # Choose an appropriate length based on your data
train_encodings = {key: val[:, :max_length] for key, val in train_encodings.items()}
test_encodings = {key: val[:, :max_length] for key, val in test_encodings.items()}

input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)

bert_outputs = model(input_ids, attention_mask=attention_mask)

# Add layers for your specific task
dropout = tf.keras.layers.Dropout(0.1)(bert_outputs['last_hidden_state'][:, 0, :])
output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([train_encodings['input_ids'], train_encodings['attention_mask']], y_train, epochs=5, batch_size=8)

# Evaluate or predict
y_pred = model.predict([test_encodings['input_ids'], test_encodings['attention_mask']])
# y_pred = (y_pred > 0.5).astype(int)  # Assuming binary classification
y_pred = np.where(y_pred > 0.5,1,0)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)  # Adjusted to handle zero division
recall = recall_score(y_test, y_pred)  # Adjusted to handle zero division
f1 = f1_score(y_test, y_pred)  # Adjusted to handle zero division

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Saving Bangla2.xlsx to Bangla2 (3).xlsx


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.7487231869254342
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score
from tqdm import tqdm
from google.colab import files

uploaded = files.upload()

# Read the Excel file
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)

# Specify the columns for features (tweets) and labels
tweets_column = 'tweets'
labels_column = 'labels'
NUM_LABELS = len(df[labels_column].unique())
df.head()
possible_labels = df[labels_column].unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
X_train, X_test, y_train, y_test = train_test_split(df[tweets_column], df[labels_column], stratify=df[labels_column])


tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

encoded_data_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train.values)

# Tokenize the validation data
encoded_data_val = tokenizer.batch_encode_plus(
    X_test.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_test.values)

dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                            attention_masks_val,
                           labels_val)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=NUM_LABELS,output_attentions=False,output_hidden_states=False)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 4
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=RandomSampler(dataset_val), batch_size=32)
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals
for epoch in range(1, epochs + 1):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write('\nEpoch {epoch}')
    tqdm.write(f'Training loss: {loss_train_avg}')
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

# Evaluating the Model
accuracy_per_class(predictions, true_vals)



Saving Bangla2.xlsx to Bangla2 (3).xlsx


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch {epoch}
Training loss: 0.5662687711915058


100%|██████████| 31/31 [00:14<00:00,  2.10it/s]


Validation loss: 0.48463686916135973
F1 Score (weighted): 0.6916359877301393





Epoch {epoch}
Training loss: 0.536273728639943


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.47250235609469876
F1 Score (weighted): 0.7614965301959521





Epoch {epoch}
Training loss: 0.49434868288822775


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.6507439805615333
F1 Score (weighted): 0.7862498598167327





Epoch {epoch}
Training loss: 0.5065628905166499


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.5740481759271314
F1 Score (weighted): 0.8070165061746973





Epoch {epoch}
Training loss: 0.4347562735419078


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.6449720587941908
F1 Score (weighted): 0.8120129284883236





Epoch {epoch}
Training loss: 0.4115614439573327


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.7496442237207966
F1 Score (weighted): 0.8283489181862164





Epoch {epoch}
Training loss: 0.38471898037244945


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.8657095428676375
F1 Score (weighted): 0.8135742738207564





Epoch {epoch}
Training loss: 0.3523930533128037


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.8261266177700411
F1 Score (weighted): 0.8363054495161606





Epoch {epoch}
Training loss: 0.3329922088062009


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]


Validation loss: 0.8963103294372559
F1 Score (weighted): 0.8278171070879359





Epoch {epoch}
Training loss: 0.2988838489684124


100%|██████████| 31/31 [00:14<00:00,  2.09it/s]

Validation loss: 0.8815302396974256
F1 Score (weighted): 0.8390815837216109
Class: 0
Accuracy:657/733

Class: 1
Accuracy:165/246




