In [1]:
!pip install --upgrade huggingface-hub

Collecting huggingface-hub
  Downloading huggingface_hub-0.24.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.24.2-py3-none-any.whl (417 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.2/417.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.23.2
    Uninstalling huggingface-hub-0.23.2:
      Successfully uninstalled huggingface-hub-0.23.2
Successfully installed huggingface-hub-0.24.2


In [2]:
from huggingface_hub import login
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load dataset
df = pd.read_csv('/kaggle/input/newdatasetwithreddit/ChatGPT combined dataset.csv')
df['Disorder'] = df['Disorder'].apply(lambda x: [x])

# Filter dataset for anxiety and depression
df = df[df['Disorder'].apply(lambda x: x[0] in ['Anxiety', 'depression'])]

# Handle missing values
df = df.dropna(subset=['Text'])

# Preprocess data
X = df['Text'].astype(str).values  # Ensure all entries are strings
y = df['Disorder'].values

# Convert labels to one-hot encoding
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y).argmax(axis=1)  # Convert to integer labels

# Check class distribution
class_counts = df['Disorder'].value_counts()
print("Class distribution:\n", class_counts)

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the tokenization function
tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")

def tokenize_function(texts):
    return tokenizer(list(texts), padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_function(X_train)
val_encodings = tokenize_function(X_val)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': y_train  # Ensure labels are in the correct format
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': y_val  # Ensure labels are in the correct format
})

datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# Define compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    true = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(true, preds, average='weighted')
    acc = accuracy_score(true, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments with early stopping and learning rate scheduler
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    save_steps=1000,
    fp16=True,  # Use mixed precision training
    lr_scheduler_type='linear',
    logging_first_step=True,
)

# Load model
model = BertForSequenceClassification.from_pretrained("mental/mental-bert-base-uncased", num_labels=2) 

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    compute_metrics=compute_metrics,
)

# Train and evaluate
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

# Save model
model.save_pretrained('bert-mental-disorders-model')
tokenizer.save_pretrained('bert-mental-disorders-tokenizer')


Class distribution:
 Disorder
[Anxiety]       1479
[depression]     913
Name: count, dtype: int64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5748,0.506833,0.817549,0.806367,0.833891,0.817549
2,0.1998,0.235247,0.912256,0.910706,0.915171,0.912256
3,0.2299,0.20904,0.931755,0.931352,0.931823,0.931755
4,0.2088,0.288868,0.933148,0.932724,0.933298,0.933148
5,0.2092,0.650474,0.924791,0.924314,0.924825,0.924791
6,0.2126,0.924631,0.93454,0.934739,0.935244,0.93454
7,0.0316,1.420146,0.906685,0.905337,0.908214,0.906685
8,0.1099,1.126914,0.931755,0.93168,0.93164,0.931755
9,0.0,1.247696,0.924791,0.924379,0.924721,0.924791
10,0.0991,1.247883,0.924791,0.924791,0.924791,0.924791




{'eval_loss': 0.9246307611465454, 'eval_accuracy': 0.9345403899721448, 'eval_f1': 0.9347394637382008, 'eval_precision': 0.9352444189487449, 'eval_recall': 0.9345403899721448, 'eval_runtime': 3.1342, 'eval_samples_per_second': 229.086, 'eval_steps_per_second': 3.829, 'epoch': 10.0}


('bert-mental-disorders-tokenizer/tokenizer_config.json',
 'bert-mental-disorders-tokenizer/special_tokens_map.json',
 'bert-mental-disorders-tokenizer/vocab.txt',
 'bert-mental-disorders-tokenizer/added_tokens.json',
 'bert-mental-disorders-tokenizer/tokenizer.json')

In [13]:
import joblib

# Save mlb
joblib.dump(mlb, 'mlb1.pkl')


['mlb1.pkl']

In [15]:
!pip install groq

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.9.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.9.0


In [17]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib
from groq import Groq
from typing import Dict, List, Tuple

class MentalHealthPredictionPipeline:
    def __init__(self, model_path: str, tokenizer_path: str, mlb_path: str, groq_api_key: str):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.mlb = joblib.load(mlb_path)
        self.groq_client = Groq(api_key=groq_api_key)

    def summarize_text(self, input_text: str) -> str:
        try:
            completion = self.groq_client.chat.completions.create(
                model="llama3-8b-8192",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a mental health expert in anxiety and depression. Summarize the content preserving emotions for a doctor's interpretation. Only summarize, do not respond to the user."
                    },
                    {
                        "role": "user",
                        "content": input_text
                    }
                ],
                temperature=0.7,
                max_tokens=1024,
                top_p=1,
                stream=False,
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"Error during summarization: {e}")
            return "Error in summarization."

    def predict_disorders(self, text: str, threshold: float = 0.08) -> Tuple[Dict[str, float], List[str]]:
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        probabilities = torch.sigmoid(outputs.logits).squeeze().tolist()
        predictions = [prob > threshold for prob in probabilities]
        
        disorder_probabilities = {disorder: prob for disorder, prob in zip(self.mlb.classes_, probabilities)}
        predicted_disorders = [disorder for disorder, prediction in zip(self.mlb.classes_, predictions) if prediction]
        
        return disorder_probabilities, predicted_disorders

    def process_input(self, input_text: str) -> Dict:
        summary = self.summarize_text(input_text)
        disorder_probabilities, predicted_disorders = self.predict_disorders(input_text)
        
        return {
            "original_text": input_text,
            "summary": summary,
            "disorder_probabilities": disorder_probabilities,
            "predicted_disorders": predicted_disorders
        }

# Usage
GROQ_API_KEY = "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"
pipeline = MentalHealthPredictionPipeline(
    model_path='bert-mental-disorders-model',
    tokenizer_path='bert-mental-disorders-tokenizer',
    mlb_path='mlb1.pkl',
    groq_api_key=GROQ_API_KEY
)

input_text = "i am unable to eat and go days without talking to people"
result = pipeline.process_input(input_text)

print(f"Original Text: {result['original_text']}")
print(f"Summarized Text: {result['summary']}")
print("Probabilities by Disorder:")
for disorder, prob in result['disorder_probabilities'].items():
    print(f"{disorder}: {prob:.4f}")
print(f"Predicted Disorders: {result['predicted_disorders']}")


Original Text: i am unable to eat and go days without talking to people
Summarized Text: The individual is experiencing significant distress, struggling with basic daily needs such as eating, and is also feeling isolated and disconnected from others. This suggests a high level of anxiety and potentially depressive symptoms. The inability to eat and go without talking to people for days may indicate a loss of appetite due to emotional distress, and social withdrawal, which can be a common symptom of depression. The individual may be experiencing feelings of hopelessness, helplessness, and a lack of motivation, making it difficult for them to engage in daily activities and interact with others.
Probabilities by Disorder:
Anxiety: 0.0003
depression: 0.9997
Predicted Disorders: ['depression']


In [21]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib
from datasets import load_dataset
import pandas as pd
from groq import Groq
from typing import Dict, List, Tuple

class MentalHealthPredictionPipeline:
    def __init__(self, model_path: str, tokenizer_path: str, mlb_path: str, groq_api_key: str):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.mlb = joblib.load(mlb_path)
        self.groq_client = Groq(api_key=groq_api_key)

    def summarize_text(self, input_text: str) -> str:
        completion = self.groq_client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {
                    "role": "system",
                    "content": "You are a mental health expert in anxiety and depression. Summarize the content preserving emotions for a doctor's interpretation. Only summarize, do not respond to the user."
                },
                {
                    "role": "user",
                    "content": input_text
                }
            ],
            temperature=0.7,
            max_tokens=1024,
            top_p=1,
            stream=False,
        )
        return completion.choices[0].message.content

    def predict_disorders(self, text: str, threshold: float = 0.08) -> Tuple[Dict[str, float], List[str]]:
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        probabilities = torch.sigmoid(outputs.logits).squeeze().tolist()
        predictions = [prob > threshold for prob in probabilities]
        
        disorder_probabilities = {disorder: prob for disorder, prob in zip(self.mlb.classes_, probabilities)}
        predicted_disorders = [disorder for disorder, prediction in zip(self.mlb.classes_, predictions) if prediction]
        
        return disorder_probabilities, predicted_disorders

    def process_input(self, input_text: str) -> Dict:
        summary = self.summarize_text(input_text)
        disorder_probabilities, predicted_disorders = self.predict_disorders(input_text)
        
        return {
            "original_text": input_text,
            "summary": summary,
            "disorder_probabilities": disorder_probabilities,
            "predicted_disorders": predicted_disorders
        }

# Usage
GROQ_API_KEY = "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"
pipeline = MentalHealthPredictionPipeline(
    model_path='bert-mental-disorders-model',
    tokenizer_path='bert-mental-disorders-tokenizer',
    mlb_path='mlb1.pkl',
    groq_api_key=GROQ_API_KEY
)

# Load the dataset
dataset = load_dataset("solomonk/reddit_mental_health_posts")

# Filter the dataset for the "depression" subreddit
depression_posts = dataset['train'].filter(lambda x: x['subreddit'] == 'depression')

# Convert to a Pandas DataFrame for easier handling
depression_posts_df = pd.DataFrame(depression_posts)

# Test the model on multiple posts
for index, row in depression_posts_df.iterrows():
    input_text = row['body']
    result = pipeline.process_input(input_text)
    
    print(f"Original Text: {result['original_text']}")
    print(f"Summarized Text: {result['summary']}")
    print("Probabilities by Disorder:")
    for disorder, prob in result['disorder_probabilities'].items():
        print(f"{disorder}: {prob:.4f}")
    print(f"Predicted Disorders: {result['predicted_disorders']}")
    print("\n" + "="*50 + "\n")



Repo card metadata block was not found. Setting CardData to empty.


Original Text: *not sure if this counts as self-pity and I’m just being a big baby self-victimizer*
Summarized Text: Summary:

The individual is expressing feelings of self-doubt and self-criticism, potentially indicating a lack of self-compassion. They may be experiencing negative self-talk and self-blame, which can exacerbate anxiety and depression. The phrase "self-victimizer" suggests a sense of guilt and shame, further highlighting the need for self-compassion and acceptance.
Probabilities by Disorder:
Anxiety: 0.0003
depression: 0.9998
Predicted Disorders: ['depression']


Original Text: [removed]
Summarized Text: I am not able to access or summarize the content as it was removed. If you would like to provide the content, I would be happy to summarize it for a doctor's interpretation, preserving the emotions and emotions involved. Please provide the content, and I will do my best to assist you.
Probabilities by Disorder:
Anxiety: 0.0157
depression: 0.9886
Predicted Disorders: ['d

KeyboardInterrupt: 

In [22]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib
from datasets import load_dataset
import pandas as pd
from groq import Groq
from typing import Dict, List, Tuple

class MentalHealthPredictionPipeline:
    def __init__(self, model_path: str, tokenizer_path: str, mlb_path: str, groq_api_key: str):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.mlb = joblib.load(mlb_path)
        self.groq_client = Groq(api_key=groq_api_key)

    def summarize_text(self, input_text: str) -> str:
        completion = self.groq_client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {
                    "role": "system",
                    "content": "You are a mental health expert in anxiety and depression. Summarize the content preserving emotions for a doctor's interpretation. Only summarize, do not respond to the user."
                },
                {
                    "role": "user",
                    "content": input_text
                }
            ],
            temperature=0.7,
            max_tokens=1024,
            top_p=1,
            stream=False,
        )
        return completion.choices[0].message.content

    def predict_disorders(self, text: str, threshold: float = 0.08) -> Tuple[Dict[str, float], List[str]]:
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        probabilities = torch.sigmoid(outputs.logits).squeeze().tolist()
        predictions = [prob > threshold for prob in probabilities]
        
        disorder_probabilities = {disorder: prob for disorder, prob in zip(self.mlb.classes_, probabilities)}
        predicted_disorders = [disorder for disorder, prediction in zip(self.mlb.classes_, predictions) if prediction]
        
        return disorder_probabilities, predicted_disorders

    def process_input(self, input_text: str) -> Dict:
        summary = self.summarize_text(input_text)
        disorder_probabilities, predicted_disorders = self.predict_disorders(input_text)
        
        return {
            "original_text": input_text,
            "summary": summary,
            "disorder_probabilities": disorder_probabilities,
            "predicted_disorders": predicted_disorders
        }

# Usage
GROQ_API_KEY = "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"
pipeline = MentalHealthPredictionPipeline(
    model_path='bert-mental-disorders-model',
    tokenizer_path='bert-mental-disorders-tokenizer',
    mlb_path='mlb1.pkl',
    groq_api_key=GROQ_API_KEY
)

# Load the dataset
ds = load_dataset("cypsiSAS/template_dataset_anxiety")

# Filter the dataset for the "depression" subreddit
depression_posts = dataset['train']

# Convert to a Pandas DataFrame for easier handling
depression_posts_df = pd.DataFrame(depression_posts)

# Test the model on multiple posts
for index, row in depression_posts_df.iterrows():
    input_text = row['body']
    result = pipeline.process_input(input_text)
    
    print(f"Original Text: {result['original_text']}")
    print(f"Summarized Text: {result['summary']}")
    print("Probabilities by Disorder:")
    for disorder, prob in result['disorder_probabilities'].items():
        print(f"{disorder}: {prob:.4f}")
    print(f"Predicted Disorders: {result['predicted_disorders']}")
    print("\n" + "="*50 + "\n")



Downloading data:   0%|          | 0.00/68.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/66161 [00:00<?, ? examples/s]

Original Text: A few months ago I was accepted into this full time software engineering fellowship and it’s made me realize that I CANNOT work sustainably to save my life. It’s so hard to prioritize my time when I get so hyper focused on each task or just on something completely irrelevant. 

I was just diagnosed last year so I’m still learning how to learn with ADHD but I feel even more pressure to work so much harder to prove my worth bc I’m a Black woman in engineering. I’ve been falling into a really unhealthy cycle of taking more than my prescribed dose to work longer bc I’d waste so much time during the day and it’s just gotten out of hand. it’s like you go your whole life feeling so dumb and incompetent and now you don’t and you just wanna learn everything all the time but that’s just??? not sustainable or normal. anyways idk sorry for the rant, I’m just tired & don’t know what to do
Summarized Text: Here is a summarized version of the content, preserving emotions for a doctor's

KeyboardInterrupt: 