In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mental-health-corpus/mental_health.csv


In [2]:
import pandas as pd
import re
import string
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torchtext.vocab import build_vocab_from_iterator

In [4]:
# Emoji / pictographic symbol code points, compiled once instead of on every call.
#
# Every range below covers symbol blocks only. The previous pattern contained the
# range U+24C2..U+1F251, which spans most of the Basic Multilingual Plane and
# therefore also deleted CJK ideographs, kana, Hangul and other letters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"  # tiles, cards, enclosed chars, flags, pictographs, emoticons,
                             # transport, supplemental/extended pictographs, chess symbols
    "\U00002000-\U0000206F"  # General Punctuation (includes ‼ and the zero-width joiner)
    "\U00002300-\U000023FF"  # Miscellaneous Technical
    "\U000024C2-\U00002BFF"  # circled M through Misc Symbols and Arrows
                             # (covers Geometric Shapes, Misc Symbols, Dingbats)
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji that live inside CJK symbol blocks
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji/text presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_punctuation_and_emojis(text):
    """Strip ASCII punctuation and emoji/pictographic symbols from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character of ``string.punctuation`` removed and
        every run of characters matched by ``_EMOJI_PATTERN`` deleted.
        Letters of any script (including CJK and Hangul) are preserved.
        Removed characters are deleted, not replaced by a space.
    """
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove emojis
    return _EMOJI_PATTERN.sub('', text)

def process_text(text):
    """Lower-case ``text``, then pass it through ``remove_punctuation_and_emojis``.

    Args:
        text: The raw string to normalise.

    Returns:
        The lower-cased string as returned by ``remove_punctuation_and_emojis``.
    """
    lowered = text.lower()
    return remove_punctuation_and_emojis(lowered)

In [5]:
# Load the corpus and normalise every entry of the 'text' column.
df = pd.read_csv('/kaggle/input/mental-health-corpus/mental_health.csv')
# Each value is coerced with str() first, so non-string cells are cleaned as their string form.
df['text'] = [process_text(str(raw_text)) for raw_text in df['text']]
df.head()

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


In [6]:
# Create label mapping: each distinct label gets a consecutive integer id,
# numbered in the order the labels first appear in df['label'].
unique_labels = df['label'].unique()
num_labels = len(unique_labels)
label_dict = dict(zip(unique_labels, range(num_labels)))
print("Label mapping:", label_dict)

Label mapping: {0: 0, 1: 1}


In [7]:
# Build vocabulary
def yield_tokens(data_iter):
    """Lazily yield one whitespace-split token list per text in ``data_iter``.

    Args:
        data_iter: An iterable of strings.

    Yields:
        ``text.split()`` for each ``text`` in ``data_iter``, in order.
    """
    yield from (document.split() for document in data_iter)

# NOTE(review): the vocabulary is built from every row of df['text'], i.e. before
# the train/validation split, so validation-only words also receive their own index.
token_stream = yield_tokens(df['text'])
vocab = build_vocab_from_iterator(token_stream, specials=["<unk>"])
# Any token that is not in the vocabulary resolves to the index of "<unk>".
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

In [10]:
# LSTM Model
class LSTMClassifier(nn.Module):
    """Embedding -> (optionally bidirectional, stacked) LSTM -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output logits.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between stacked
            LSTM layers, and to the final hidden state before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only applies dropout between stacked layers; with a single layer a
        # non-zero value has no effect and triggers a UserWarning, so pass 0 there.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
            batch_first=True,
        )
        # A bidirectional LSTM contributes one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices, batch dimension first
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of raw logits with ``output_dim`` values per batch element.
        """
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            # The last two entries of `hidden` are the top layer's forward and
            # backward final states; concatenate them along the feature axis.
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1,:,:]
        return self.fc(self.dropout(hidden))

In [11]:
# Dataset class
class CustomDataset(Dataset):
    """Dataset that turns raw texts into fixed-length tensors of vocabulary indices.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        vocab: Mapping from token to integer index (looked up with ``vocab[token]``).
        max_len: Exact number of indices returned per example; longer texts are
            truncated, shorter ones are right-padded.
        pad_token: Token whose index is used for padding. Defaults to ``"<unk>"``
            (the previous hard-coded behaviour); pass a dedicated padding token
            if the vocabulary has one so padding and unknown words stay distinct.
    """

    def __init__(self, texts, labels, vocab, max_len, pad_token="<unk>"):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
        self.pad_token = pad_token

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_index_tensor, label_tensor)`` for example ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Convert text to indices, keeping at most max_len tokens.
        numericalized_text = [self.vocab[token] for token in text.split()][:self.max_len]

        # Right-pad up to max_len; the pad index is only looked up when needed.
        missing = self.max_len - len(numericalized_text)
        if missing > 0:
            numericalized_text.extend([self.vocab[self.pad_token]] * missing)

        # Explicit dtype: an empty list would otherwise produce a float tensor,
        # which nn.Embedding rejects.
        return torch.tensor(numericalized_text, dtype=torch.long), torch.tensor(label)

In [12]:
# Prepare data
max_len = 128  # or any other suitable length

# Hold out 20% of the rows for validation; labels are mapped to their integer ids first.
encoded_labels = df['label'].map(label_dict)
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

train_dataset = CustomDataset(X_train.tolist(), y_train.tolist(), vocab, max_len)
val_dataset = CustomDataset(X_val.tolist(), y_val.tolist(), vocab, max_len)

# Only the training loader shuffles; both use the same batch size and worker count.
loader_options = dict(batch_size=64, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [13]:
# Initialize model
# Seed torch's global RNG before the weights are created so a fresh
# Restart & Run All starts from the same initial parameters.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = num_labels
n_layers = 2
bidirectional = True
dropout = 0.5

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)

# Enable multi-GPU support using the GPUs that are actually present.
# A hard-coded device_ids=[0, 1] fails on a machine with a single GPU; deriving
# the list from device_count() keeps the two-GPU behaviour and still works with
# one GPU. With no GPU the list is empty, so None is passed and DataParallel
# falls back to its own default.
gpu_ids = list(range(torch.cuda.device_count()))
model = nn.DataParallel(model, device_ids=gpu_ids or None)
model = model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [14]:
# Training loop
# Trains `model` for `num_epochs` passes over `train_loader`; after every epoch
# it evaluates on `val_loader` and prints the losses and classification metrics.
# Adam is created with its default hyperparameters (no arguments are passed).
optimizer = Adam(model.parameters())
# The loss is applied directly to the model output, i.e. to unnormalised logits.
criterion = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    # ---- training phase ----
    model.train()  # training mode (dropout active)
    total_train_loss = 0  # running sum of per-batch loss values for this epoch
    for batch_text, batch_labels in train_loader:
        batch_text, batch_labels = batch_text.to(device), batch_labels.to(device)
        
        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        predictions = model(batch_text)
        loss = criterion(predictions, batch_labels)
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
    
    # ---- validation phase ----
    model.eval()  # evaluation mode (dropout disabled)
    total_val_loss = 0
    # Predictions and targets are collected over the whole validation set so the
    # metrics below are computed once per epoch rather than per batch.
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():  # no gradient tracking during evaluation
        for batch_text, batch_labels in val_loader:
            batch_text, batch_labels = batch_text.to(device), batch_labels.to(device)
            
            predictions = model(batch_text)
            loss = criterion(predictions, batch_labels)
            total_val_loss += loss.item()
            
            # Predicted class = index of the largest logit along dim 1.
            _, predicted = torch.max(predictions, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())
    
    # 'weighted' averages the per-class scores weighted by each class's support.
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    
    # Reported losses are the epoch's summed batch losses divided by the number
    # of batches (a mean over batches, not over individual samples).
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {total_train_loss/len(train_loader):.4f}")
    print(f"Validation Loss: {total_val_loss/len(val_loader):.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("--------------------")

Epoch 1/10
Training Loss: 0.4953
Validation Loss: 0.3461
Accuracy: 0.8595
Precision: 0.8649
Recall: 0.8595
F1 Score: 0.8590
--------------------
Epoch 2/10
Training Loss: 0.3906
Validation Loss: 0.2880
Accuracy: 0.8869
Precision: 0.8870
Recall: 0.8869
F1 Score: 0.8869
--------------------
Epoch 3/10
Training Loss: 0.2915
Validation Loss: 0.2968
Accuracy: 0.8944
Precision: 0.8961
Recall: 0.8944
F1 Score: 0.8943
--------------------
Epoch 4/10
Training Loss: 0.2528
Validation Loss: 0.2357
Accuracy: 0.9085
Precision: 0.9088
Recall: 0.9085
F1 Score: 0.9085
--------------------
Epoch 5/10
Training Loss: 0.2393
Validation Loss: 0.2719
Accuracy: 0.8980
Precision: 0.8995
Recall: 0.8980
F1 Score: 0.8979
--------------------
Epoch 6/10
Training Loss: 0.2101
Validation Loss: 0.2139
Accuracy: 0.9146
Precision: 0.9146
Recall: 0.9146
F1 Score: 0.9146
--------------------
Epoch 7/10
Training Loss: 0.1884
Validation Loss: 0.2450
Accuracy: 0.9146
Precision: 0.9151
Recall: 0.9146
F1 Score: 0.9146
------