In [1]:
!pip install textaugment textblob
!pip install textblob googletrans==4.0.0-rc1

Collecting textaugment
  Downloading textaugment-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting googletrans>=2 (from textaugment)
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting nltk (from textaugment)
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting httpx==0.13.3 (from googletrans>=2->textaugment)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans>=2->textaugment)
  Downloading hstspreload-2024.7.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans>=2->textaugment)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans>=2->textaugment)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans>=2->textaugment)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metada

In [2]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load conversational dataset
conversational_dataset_path = '/kaggle/input/medsynfuel/Disorder Data.csv'  # Update the path accordingly
df = pd.read_csv(conversational_dataset_path)

# Preprocess data
df['disorders'] = df['disorders'].apply(lambda x: x.split(', '))  # Split disorders into list

# Filter dataset for anxiety and depression
df = df[df['disorders'].apply(lambda x: any(disorder in ['anxiety', 'depression'] for disorder in x))]

# Handle missing values
df = df.dropna(subset=['text'])

# Preprocess data
X = df['text'].astype(str).values  # Ensure all entries are strings
y = df['disorders'].values

# Convert labels to one-hot encoding
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

# Check class distribution
class_counts = df['disorders'].apply(lambda x: ', '.join(x)).value_counts()
print("Class distribution:\n", class_counts)

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_))

# Tokenize and encode inputs
def tokenize_function(texts):
    return tokenizer(list(texts), padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(X_train)
val_encodings = tokenize_function(X_val)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': torch.tensor(y_train, dtype=torch.float32)  # Ensure labels are in the correct format
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': torch.tensor(y_val, dtype=torch.float32)  # Ensure labels are in the correct format
})

datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# Define compute_metrics function
def compute_metrics(p):
    preds = torch.sigmoid(torch.tensor(p.predictions))
    preds = (preds > 0.5).int()
    true = torch.tensor(p.label_ids)
    
    precision, recall, f1, _ = precision_recall_fscore_support(true, preds, average='weighted')
    acc = accuracy_score(true, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments with early stopping and learning rate scheduler
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    evaluation_strategy='epoch',
    save_steps=1000,
    fp16=True,  # Use mixed precision training
    lr_scheduler_type='linear',
    logging_first_step=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    compute_metrics=compute_metrics,
)

# Train and evaluate
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

# Save model
model.save_pretrained('bert-mental-disorders-model')
tokenizer.save_pretrained('bert-mental-disorders-tokenizer')


2024-07-29 16:10:28.708773: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 16:10:28.708887: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 16:10:28.823915: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Class distribution:
 disorders
depression    1107
anxiety        893
Name: count, dtype: int64


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5881,0.589725,0.675,0.728326,0.794303,0.675
2,0.4843,0.468892,0.788333,0.80281,0.852107,0.788333
3,0.3865,0.382192,0.765,0.795406,0.839116,0.778333
4,0.3494,0.306967,0.808333,0.804433,0.864996,0.808333
5,0.3187,0.278994,0.808333,0.804433,0.864996,0.808333
6,0.2693,0.284532,0.74,0.795486,0.885496,0.74
7,0.2374,0.270268,0.808333,0.804433,0.864996,0.808333
8,0.2683,0.271113,0.823333,0.817658,0.846279,0.823333
9,0.2068,0.268459,0.76,0.818288,0.781877,0.86
10,0.2962,0.271609,0.808333,0.79725,0.858314,0.808333




{'eval_loss': 0.2684585452079773, 'eval_accuracy': 0.76, 'eval_f1': 0.8182879049262028, 'eval_precision': 0.78187738852008, 'eval_recall': 0.86, 'eval_runtime': 2.5678, 'eval_samples_per_second': 233.663, 'eval_steps_per_second': 3.894, 'epoch': 10.0}


('bert-mental-disorders-tokenizer/tokenizer_config.json',
 'bert-mental-disorders-tokenizer/special_tokens_map.json',
 'bert-mental-disorders-tokenizer/vocab.txt',
 'bert-mental-disorders-tokenizer/added_tokens.json')

In [3]:
import joblib

# Save mlb
joblib.dump(mlb, 'mlb2.pkl')


['mlb2.pkl']

In [4]:
%%capture
!pip install groq

In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib
from groq import Groq
from typing import Dict, List, Tuple

class MentalHealthPredictionPipeline:
    def __init__(self, model_path: str, tokenizer_path: str, mlb_path: str, groq_api_key: str):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.mlb = joblib.load(mlb_path)
        self.groq_client = Groq(api_key=groq_api_key)

    def summarize_text(self, input_text: str) -> str:
        try:
            completion = self.groq_client.chat.completions.create(
                model="llama3-8b-8192",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a mental health expert in anxiety and depression. Summarize the content preserving emotions for a doctor's interpretation. Only summarize, do not respond to the user."
                    },
                    {
                        "role": "user",
                        "content": input_text
                    }
                ],
                temperature=0.7,
                max_tokens=1024,
                top_p=1,
                stream=False,
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"Error during summarization: {e}")
            return "Error in summarization."

    def predict_disorders(self, text: str, threshold: float = 0.08) -> Tuple[Dict[str, float], List[str]]:
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        probabilities = torch.sigmoid(outputs.logits).squeeze().tolist()
        predictions = [prob > threshold for prob in probabilities]
        
        disorder_probabilities = {disorder: prob for disorder, prob in zip(self.mlb.classes_, probabilities)}
        predicted_disorders = [disorder for disorder, prediction in zip(self.mlb.classes_, predictions) if prediction]
        
        return disorder_probabilities, predicted_disorders

    def process_input(self, input_text: str) -> Dict:
        summary = self.summarize_text(input_text)
        disorder_probabilities, predicted_disorders = self.predict_disorders(input_text)
        
        return {
            "original_text": input_text,
            "summary": summary,
            "disorder_probabilities": disorder_probabilities,
            "predicted_disorders": predicted_disorders
        }

# Usage
GROQ_API_KEY = "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"
pipeline = MentalHealthPredictionPipeline(
    model_path='bert-mental-disorders-model',
    tokenizer_path='bert-mental-disorders-tokenizer',
    mlb_path='mlb2.pkl',
    groq_api_key=GROQ_API_KEY
)

input_text = "i am unable to eat and go days without talking to people"
result = pipeline.process_input(input_text)

print(f"Original Text: {result['original_text']}")
print(f"Summarized Text: {result['summary']}")
print("Probabilities by Disorder:")
for disorder, prob in result['disorder_probabilities'].items():
    print(f"{disorder}: {prob:.4f}")
print(f"Predicted Disorders: {result['predicted_disorders']}")


Original Text: i am unable to eat and go days without talking to people
Summarized Text: The individual is experiencing significant social withdrawal and changes in appetite, which are common symptoms of depression. The inability to eat and days without talking to people can be indicative of a lack of interest or pleasure in activities, a hallmark symptom of depression. Additionally, this pattern may be accompanied by feelings of hopelessness, helplessness, and worthlessness, which can further exacerbate the individual's emotional distress.
Probabilities by Disorder:
anxiety: 0.5265
depression: 0.5227
Predicted Disorders: ['anxiety', 'depression']


In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib
from datasets import load_dataset
import pandas as pd
from groq import Groq
from typing import Dict, List, Tuple

class MentalHealthPredictionPipeline:
    def __init__(self, model_path: str, tokenizer_path: str, mlb_path: str, groq_api_key: str):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.mlb = joblib.load(mlb_path)
        self.groq_client = Groq(api_key=groq_api_key)

    def summarize_text(self, input_text: str) -> str:
        completion = self.groq_client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {
                    "role": "system",
                    "content": "You are a mental health expert in anxiety and depression. Summarize the content preserving emotions for a doctor's interpretation. Only summarize, do not respond to the user."
                },
                {
                    "role": "user",
                    "content": input_text
                }
            ],
            temperature=0.7,
            max_tokens=1024,
            top_p=1,
            stream=False,
        )
        return completion.choices[0].message.content

    def predict_disorders(self, text: str, threshold: float = 0.07) -> Tuple[Dict[str, float], List[str]]:
        self.model.eval()
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        probabilities = torch.sigmoid(outputs.logits).squeeze().tolist()
        predictions = [prob > threshold for prob in probabilities]
        
        disorder_probabilities = {disorder: prob for disorder, prob in zip(self.mlb.classes_, probabilities)}
        predicted_disorders = [disorder for disorder, prediction in zip(self.mlb.classes_, predictions) if prediction]
        
        return disorder_probabilities, predicted_disorders

    def process_input(self, input_text: str) -> Dict:
        summary = self.summarize_text(input_text)
        disorder_probabilities, predicted_disorders = self.predict_disorders(input_text)
        
        return {
            "original_text": input_text,
            "summary": summary,
            "disorder_probabilities": disorder_probabilities,
            "predicted_disorders": predicted_disorders
        }

# Usage
GROQ_API_KEY = "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"
pipeline = MentalHealthPredictionPipeline(
    model_path='bert-mental-disorders-model',
    tokenizer_path='bert-mental-disorders-tokenizer',
    mlb_path='mlb2.pkl',
    groq_api_key=GROQ_API_KEY
)

# Load the dataset
dataset = load_dataset("solomonk/reddit_mental_health_posts")

# Filter the dataset for the "depression" subreddit
depression_posts = dataset['train'].filter(lambda x: x['subreddit'] == 'depression')

# Convert to a Pandas DataFrame for easier handling
depression_posts_df = pd.DataFrame(depression_posts)

# Test the model on multiple posts
for index, row in depression_posts_df.iterrows():
    input_text = row['body']
    result = pipeline.process_input(input_text)
    
    print(f"Original Text: {result['original_text']}")
    print(f"Summarized Text: {result['summary']}")
    print("Probabilities by Disorder:")
    for disorder, prob in result['disorder_probabilities'].items():
        print(f"{disorder}: {prob:.4f}")
    print(f"Predicted Disorders: {result['predicted_disorders']}")
    print("\n" + "="*50 + "\n")



Repo card metadata block was not found. Setting CardData to empty.


Original Text: *not sure if this counts as self-pity and I’m just being a big baby self-victimizer*
Summarized Text: The individual is expressing feelings of self-doubt and self-criticism, potentially indicating low self-esteem and negative self-talk. They may be struggling with self-compassion and are unsure if their emotions are justified or if they are being overly critical towards themselves. This could be a sign of underlying anxiety or depression, and it may be beneficial to explore these feelings further to understand the root cause and develop more adaptive coping mechanisms.
Probabilities by Disorder:
anxiety: 0.5345
depression: 0.5191
Predicted Disorders: ['anxiety', 'depression']


Original Text: [removed]
Summarized Text: Here is a summary of the content, written in a neutral and objective tone to facilitate a doctor's interpretation:

**Chief Complaint:** The patient presents with symptoms of anxiety and depression, reporting feelings of excessive worry, fear, and sadness 

KeyboardInterrupt: 

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib

model_path = 'bert-mental-disorders-model'
tokenizer_path = 'bert-mental-disorders-tokenizer'
mlb_path = 'mlb2.pkl'

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

mlb = joblib.load(mlb_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict_disorders(text, threshold=0.08):
    model.eval()
    
    # Tokenize and encode the input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    
    # Get the model's output without gradient calculation
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Calculate the probabilities using sigmoid activation
    logits = outputs.logits
    probabilities = torch.sigmoid(logits).squeeze().tolist()
    
    # Create a dictionary of disorder probabilities
    disorder_probabilities = {disorder: prob for disorder, prob in zip(mlb.classes_, probabilities)}
    
    # Determine the predicted disorder with the highest probability above the threshold
    max_prob = max(probabilities)
    if max_prob > threshold:
        predicted_disorder = mlb.classes_[probabilities.index(max_prob)]
    else:
        predicted_disorder = None
    
    return disorder_probabilities, predicted_disorder

# Example usage
input_text = """My cousin has been missing nearly a week and they just found her body. Last time I saw her, she was full of life and her beautiful, bubbly self; she was working part time with a circus while she studied and learning stilt walking, a huge change for her at nearly 5' tall."""

# Predict disorders for the input text
disorder_probabilities, predicted_disorder = predict_disorders(input_text)

# Print the results
print(f"Input Text: {input_text}")
print("Probabilities by Disorder:")
for disorder, prob in disorder_probabilities.items():
    print(f"{disorder}: {prob:.4f}")
print(f"Predicted Disorder: {predicted_disorder}")


Input Text: My cousin has been missing nearly a week and they just found her body. Last time I saw her, she was full of life and her beautiful, bubbly self; she was working part time with a circus while she studied and learning stilt walking, a huge change for her at nearly 5' tall.
Probabilities by Disorder:
anxiety: 0.6340
depression: 0.4209
Predicted Disorder: anxiety


In [5]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import joblib

# Paths
model_path = 'bert-mental-disorders-model'
tokenizer_path = 'bert-mental-disorders-tokenizer'
mlb_path = 'mlb2.pkl'

# Load model, tokenizer, and multilabel binarizer
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
mlb = joblib.load(mlb_path)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict_disorders(text, threshold=0.08):
    model.eval()
    
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    
    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Apply sigmoid to logits to get probabilities
    logits = outputs.logits
    probabilities = torch.sigmoid(logits).squeeze().tolist()
    
    # Create a dictionary of disorder probabilities
    disorder_probabilities = {disorder: prob for disorder, prob in zip(mlb.classes_, probabilities)}
    
    # Select the disorder with the highest probability above the threshold
    max_prob = max(probabilities)
    if max_prob > threshold:
        predicted_disorder = mlb.classes_[probabilities.index(max_prob)].lower()
    else:
        predicted_disorder = None
    
    return disorder_probabilities, predicted_disorder

# Load dataset
dataset_path = '/kaggle/input/text-and-mental-disorder-data/ChatGPT Mental Health Data.csv'
data = pd.read_csv(dataset_path)

results = []

# Iterate through the dataset and make predictions
for index, row in data.iterrows():
    text = row['text']
    actual_disorder = row['disorder'].lower()
    
    disorder_probabilities, predicted_disorder = predict_disorders(text)
    
    results.append({
        'Text': text,
        'Actual Disorder': actual_disorder,
        'Predicted Disorder': predicted_disorder,
        'Probabilities by Disorder': disorder_probabilities
    })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Function to calculate accuracy
def calculate_accuracy(df):
    correct_predictions = 0
    for index, row in df.iterrows():
        actual = row['Actual Disorder']
        predicted = row['Predicted Disorder']
        if actual == predicted:
            correct_predictions += 1
    return correct_predictions / len(df)

# Calculate accuracy
accuracy = calculate_accuracy(results_df)
print(f"Accuracy: {accuracy:.4f}")

# Save results to a CSV file
results_df.to_csv('model_prediction_results.csv', index=False)

# Print results for verification
for result in results:
    print(f"Actual Disorder: {result['Actual Disorder']}")
    print(f"Predicted Disorder: {result['Predicted Disorder']}")
    print("\n")


Accuracy: 0.5260
Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: anxiety
Predicted Disorder: anxiety


Actual Disorder: anxiety
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: anxiety
Predicted Disorder: anxiety


Actual Disorder: anxiety
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: depression


Actual Disorder: anxiety
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: depression


Actual Disorder: anxiety
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Disorder: depression
Predicted Disorder: anxiety


Actual Di