In [1]:
import pandas as pd

# Read the labelled Reddit posts CSV into a DataFrame
dataset = pd.read_csv('./dataset/500_Reddit_users_posts_labels.csv')

# Sanity check: dimensions first, then a preview of the leading rows
print(dataset.shape, dataset.head(), sep='\n')

(500, 3)
     User                                               Post       Label
0  user-0  ['Its not a viable option, and youll be leavin...  Supportive
1  user-1  ['It can be hard to appreciate the notion that...    Ideation
2  user-2  ['Hi, so last night i was sitting on the ledge...    Behavior
3  user-3  ['I tried to kill my self once and failed badl...     Attempt
4  user-4  ['Hi NEM3030. What sorts of things do you enjo...    Ideation


In [2]:
# preprocess the dataset 
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import re

# 1. Drop the 'User' column; only the post text and its label are used below.
#    errors='ignore' keeps this cell re-runnable once the column is already gone.
dataset = dataset.drop(columns=['User'], errors='ignore')
print(dataset.head())

# 2. Delete the leading "['" and trailing "']" in the Post column.
#    str.strip treats its argument as a set of characters, so any run of
#    '[', ']' or "'" at either end of the string is removed.
dataset['Post'] = dataset['Post'].str.strip("['']")
print(dataset.head())

# 3. Encode the 'Label' column
# Supportive = 0, Indicator = 1, Ideation = 2, Behavior = 3, Attempt = 4
label_mapping = {
    'Supportive': 0,
    'Indicator': 1,
    'Ideation': 2,
    'Behavior': 3,
    'Attempt': 4
}
# Skip the encoding when the column already holds the integer codes (cell re-run):
# mapping the codes a second time would silently turn every label into NaN.
if not dataset['Label'].isin(list(label_mapping.values())).all():
    encoded_labels = dataset['Label'].map(label_mapping)
    # Series.map yields NaN for values missing from the mapping; fail loudly
    # instead of letting NaN labels flow into the split and the model.
    if encoded_labels.isna().any():
        unknown_labels = sorted(
            dataset.loc[encoded_labels.isna(), 'Label'].astype(str).unique()
        )
        raise ValueError(f"Unmapped values in 'Label' column: {unknown_labels}")
    dataset['Label'] = encoded_labels.astype('int64')
print(dataset.head())

# 4. Remove punctuation and special characters from the 'Post' column and switch to lowercase
#    (the pattern keeps only word characters and whitespace).
dataset['Post'] = dataset['Post'].apply(lambda x: re.sub(r'[^\w\s]', '', x)).str.lower()
print(dataset.head())

                                                Post       Label
0  ['Its not a viable option, and youll be leavin...  Supportive
1  ['It can be hard to appreciate the notion that...    Ideation
2  ['Hi, so last night i was sitting on the ledge...    Behavior
3  ['I tried to kill my self once and failed badl...     Attempt
4  ['Hi NEM3030. What sorts of things do you enjo...    Ideation
                                                Post       Label
0  Its not a viable option, and youll be leaving ...  Supportive
1  It can be hard to appreciate the notion that y...    Ideation
2  Hi, so last night i was sitting on the ledge o...    Behavior
3  I tried to kill my self once and failed badly ...     Attempt
4  Hi NEM3030. What sorts of things do you enjoy ...    Ideation
                                                Post  Label
0  Its not a viable option, and youll be leaving ...      0
1  It can be hard to appreciate the notion that y...      2
2  Hi, so last night i was sitting on th

In [3]:
# Divide the dataset into training, validation, and test sets
from sklearn.model_selection import train_test_split
# First cut: hold out 20% of the rows, stratified on 'Label' with a fixed seed.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    stratify=dataset['Label'],
    random_state=42,
)
# Second cut: halve the held-out rows into validation and test, again stratified.
val_data, test_data = train_test_split(
    test_data,
    test_size=0.5,
    stratify=test_data['Label'],
    random_state=42,
)

In [4]:
# Load the pretrained BERT model and tokenizer
from transformers import BertTokenizer, BertForSequenceClassification

# Single checkpoint name shared by the tokenizer and the classifier so the two
# cannot drift apart.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# Size the classification head from label_mapping (defined in the preprocessing
# cell) instead of repeating the class count as a magic number.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=len(label_mapping),
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Tokenize the text data and prepare it for training
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report

# Prefer the GPU when CUDA is usable; otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one post per item for sequence classification.

    Args:
        data: Table supporting ``len()`` and ``.iloc[idx]`` row access, with a
            'Post' entry (text) and a 'Label' entry (integer class id) per row.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``; it must
            return a mapping with 'input_ids' and 'attention_mask' tensors that
            carry a leading batch dimension of size 1.
        max_length: Number of tokens each sample is padded/truncated to.
            Defaults to 512, which the original comments give as BERT's maximum
            input length. Smaller values reduce the memory needed per batch.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output with
            its leading batch dimension removed) and 'labels' (0-d long tensor).
        """
        row = self.data.iloc[idx]  # single positional lookup for both fields
        text = row['Post']
        label = row['Label']

        # use the tokenizer to encode the text
        encoding = self.tokenizer(
            text,
            truncation=True,  # inputs longer than max_length tokens are truncated
            padding='max_length',  # shorter inputs are padded up to max_length
            max_length=self.max_length,
            return_tensors='pt')
        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze() would
            # also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),  # vocabulary index of every token
            'attention_mask': encoding['attention_mask'].squeeze(0),  # marks which tokens are padding
            'labels': torch.tensor(int(label), dtype=torch.long)  # class id of this sample
        }
    
# Wrap every split in a CustomDataset so samples are tokenized on access
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split, tokenizer) for split in (train_data, val_data, test_data)
)

# Batches of 8 throughout; only the training loader reshuffles its samples.
# NOTE(review): the recorded run of this notebook hit a CUDA out-of-memory error
# on a 4 GiB GPU with this batch size and 512-token inputs - consider lowering it.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=8)
    for split_dataset in (val_dataset, test_dataset)
)

# Loss function and optimizer used for fine-tuning
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Show the first training sample, then the length of its token id sequence
print(train_dataset[0])
print(f"Sample size: {len(train_dataset[0]['input_ids'])} tokens")


{'input_ids': tensor([  101,  5791,  2025,  3733,  1045,  2444,  1999,  1996,  4643,  2149,
         2004,  2005, 12163,  2490,  2004,  5976,  2004,  2009,  2089,  4025,
         1045,  2428,  2123,  2102,  2215,  2008,  1045,  2215,  2000,  7796,
         2026,  2769,  1045,  2215,  2000,  2113,  2008,  2026,  2769,  2003,
         2445,  2000,  2033,  2138,  1045,  2106,  2242,  2025,  3432,  2138,
        10047,  2091,  2006,  2026,  6735,  2004,  1045,  2056,  4921,  2063,
         2042, 11573,  1998,  1045,  2064,  2102,  2425,  2017,  2129,  2172,
         2009,  7807,  2026,  2540,  2000,  2156,  2619,  2008,  2015,  5525,
         8084,  2507,  2033,  2769,  2005,  1037,  7954,  2009,  2074,  5683,
         3308,  2025,  2008,  1045,  2123,  2102,  9120,  2009,  2021,  1045,
         2514,  2008,  2009,  3475,  2102,  2157,  2635,  2769,  2013,  2178,
         3969,  2040,  3791,  2009, 13367,  1045,  5247,  2026,  6385,  1999,
        15967, 12039,  2004,  2026,  2814,  4025, 

In [6]:
# train the model and evaluate it
def train(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and report the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.logits`` and ``.loss``.
        train_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable applied as ``criterion(logits, labels)``. When
            ``None``, the loss computed by the model itself (``outputs.loss``)
            is used instead.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean loss over the batches seen, or ``nan`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Honour the criterion passed by the caller rather than silently
        # ignoring it; fall back to the model's own loss only when none is given.
        loss = outputs.loss if criterion is None else criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches ourselves so an empty loader cannot trigger a ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Training Loss: {avg_loss:.4f}")
    return avg_loss

def evaluate(model, val_loader, device):
    """Score the model on a validation loader and print accuracy plus a per-class report.

    Relies on the module-level ``label_mapping`` (class name -> integer id) for
    the class names and ids shown in the report.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        val_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they never need to be
            # moved to the model's device.
            all_labels.extend(batch['labels'].tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    report = classification_report(
        all_labels,
        all_preds,
        # Pin the class ids explicitly: without labels=, a class absent from both
        # the targets and the predictions makes target_names mismatch and raises.
        labels=list(label_mapping.values()),
        target_names=list(label_mapping.keys()),
        # Classes that are never predicted get 0.0 without a warning per metric.
        zero_division=0,
    )
    print("Classification Report:\n", report)
    return accuracy

# Fine-tune for a fixed number of epochs, validating after every pass
num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    evaluate(model=model, val_loader=val_loader, device=device)


Epoch 1/5


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 3.33 GiB is allocated by PyTorch, and 107.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# test the model on the test set and print the accuracy and classification report
def test(model, test_loader, device):
    """Score the model on the test loader.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        test_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, report)`` - the value from ``accuracy_score`` and the
        text produced by ``classification_report``.
    """
    # Class names listed in the order of their integer ids (0..4).
    class_names = ['Supportive', 'Indicator', 'Ideation', 'Behavior', 'Attempt']

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they stay off the device.
            all_labels.extend(batch['labels'].tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(
        all_labels,
        all_preds,
        # Pin the class ids explicitly: without labels=, a class absent from both
        # the targets and the predictions makes target_names mismatch and raises.
        labels=list(range(len(class_names))),
        target_names=class_names,
        # Classes that are never predicted get 0.0 without a warning per metric.
        zero_division=0,
    )
    return accuracy, report

# Score the held-out test split, then print both metrics
test_accuracy, test_report = test(model=model, test_loader=test_loader, device=device)
print('Test Accuracy:', test_accuracy)
print('Test Classification Report:', test_report, sep='\n')

Test Accuracy: 0.36
Test Classification Report:
              precision    recall  f1-score   support

  Supportive       0.46      0.55      0.50        11
   Indicator       0.00      0.00      0.00        10
    Ideation       0.34      0.71      0.46        17
    Behavior       0.00      0.00      0.00         8
     Attempt       0.00      0.00      0.00         4

    accuracy                           0.36        50
   macro avg       0.16      0.25      0.19        50
weighted avg       0.22      0.36      0.27        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


: 

In [8]:
# Save the trained model
import os
# Target directory for the fine-tuned weights and the tokenizer files
model_save_path = './saved_model_0'
os.makedirs(model_save_path, exist_ok=True)  # no error when the directory already exists

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./saved_model_0\\tokenizer_config.json',
 './saved_model_0\\special_tokens_map.json',
 './saved_model_0\\vocab.txt',
 './saved_model_0\\added_tokens.json')

In [11]:
import json

# Persist the test metrics (accuracy plus the text report) as a JSON file
results_save_path = './test_results.json'
test_results = dict(
    accuracy=test_accuracy,
    classification_report=test_report,
)

with open(results_save_path, 'w') as f:
    json.dump(test_results, f, indent=4)

print(f'Test results saved to {results_save_path}')

Test results saved to ./test_results.json
