In [1]:
!pip install transformers

[0m

In [2]:
# Import required libraries
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Load the dataset
df = pd.read_csv("/kaggle/input/depression-dataset/train_data.csv")

# Split the dataset into training and testing sets
df = df.sample(frac=0.01)
train_size = int(0.8 * len(df))
train_data = df[:train_size]
val_data = df[train_size:]

# Load the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Encode the training and testing data
train_encodings = tokenizer(train_data["text"].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_data["text"].tolist(), truncation=True, padding=True)

# Create TensorFlow datasets
with tf.device('CPU'):
  train_dataset = (
      tf.data.Dataset.from_tensor_slices(
          (dict(train_encodings), train_data["label"].tolist())
      )
      .shuffle(10000)
      .batch(4)
  )

  test_dataset = tf.data.Dataset.from_tensor_slices(
      (dict(val_encodings), val_data["label"].tolist())
  ).batch(4)

# Load the BERT model for sequence classification
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(train_dataset, epochs=10, validation_data=test_dataset, batch_size=4)

# # Chat with the user to detect depression
# while True:
#     text = input("How are you feeling today? ")
#     encoding = tokenizer(text, truncation=True, padding=True, return_tensors='tf')
#     output = model(encoding)[0]
#     prediction = tf.argmax(output, axis=1)
#     if prediction == 1:
#         print("It seems like you might be feeling depressed. Please consider seeking help.")
#     else:
#         print("It's great to hear that you're doing well!")

# Save the model
model.save_pretrained("/kaggle/working/depression_bert_model.h5")


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [1]:
import pickle

# saving
with open('/kaggle/working/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


NameError: name 'tokenizer' is not defined

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Load the test dataset
test_data = pd.read_csv("/kaggle/input/depression-dataset/test_data.csv")

# Encode the test data
test_encodings = tokenizer(test_data["text"].tolist(), truncation=True, padding=True)

# Create TensorFlow dataset
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), test_data["label"].tolist())
).batch(4)

# Evaluate the model on the test dataset
y_true = []
y_pred = []
for batch in test_dataset:
    batch_input = batch[0]
    batch_labels = batch[1]
    batch_output = model(batch_input)[0].numpy()
    batch_pred = np.argmax(batch_output, axis=1)
    y_true.extend(batch_labels)
    y_pred.extend(batch_pred)

# Compute evaluation metrics
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc_roc = roc_auc_score(y_true, y_pred)

print("Accuracy: {:.4f}".format(acc))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("AUC-ROC: {:.4f}".format(auc_roc))

Accuracy: 0.9127
Precision: 0.8731
Recall: 0.9660
F1 Score: 0.9172
AUC-ROC: 0.9127


In [6]:
# print confusion matrix all 4 parts
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("True Negatives: {}".format(confusion_matrix(y_true, y_pred)[0][0]))
print("False Negatives: {}".format(confusion_matrix(y_true, y_pred)[1][0]))
print("True Positives: {}".format(confusion_matrix(y_true, y_pred)[1][1]))
print("False Positives: {}".format(confusion_matrix(y_true, y_pred)[0][1]))

Confusion Matrix:
[[11162  1827]
 [  443 12573]]
True Negatives: 11162
False Negatives: 443
True Positives: 12573
False Positives: 1827


In [2]:
import pickle
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Load the saved tokenizer
tokenizer_path = '/kaggle/working/tokenizer.pickle'
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the saved model
model_path = '/kaggle/working/depression_bert_model.h5'
model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

# Define the sentence to predict
sentence = "I'm feeling sad today"

# Tokenize and preprocess the input sentence
inputs = tokenizer(sentence, truncation=True, padding=True, return_tensors='tf')

# Make the prediction
output = model(inputs)[0]
prediction = tf.argmax(output, axis=1)

# Print the predicted value
if prediction == 1:
    print("It seems like you might be feeling depressed. Please consider seeking help.")
else:
    print("It's great to hear that you're doing well!")

EOFError: Ran out of input