In [1]:
import re
import os
import torch
import string
import pandas as pd
from unidecode import unidecode
from nltk.corpus import stopwords
from transformers import LongformerTokenizer, LongformerModel

# Point the Hugging Face cache at a local directory.
# The path is a raw string on purpose: in a normal literal the '\t' in
# 'D:\transformers_cache' is parsed as a TAB character, corrupting the path.
# NOTE(review): transformers is already imported above this line; HF_HOME may
# need to be set before that import for the library to pick it up -- confirm.
os.environ['HF_HOME'] = r'D:\transformers_cache'

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the dataset and add an integer 'label' column derived from the textual 'Label' column.
# Labels missing from label_map become NaN under Series.map.
df = pd.read_csv('500_Reddit_users_posts_labels.csv')
label_map = {'Supportive': 0, 'Indicator': 1, 'Ideation': 2, 'Behavior': 3, 'Attempt': 4}
df['label'] = df['Label'].map(label_map)

# Load the NLTK English stop words, then take the words below (mostly negations)
# out of the set so that preprocessing does not delete them from the text.
stop_words = set(stopwords.words('english'))
words_to_remove = [
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'don', "don't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
    'mustn', "mustn't", 'needn', "needn't", 'no', 'nor', 'not',
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
]
stop_words.difference_update(words_to_remove)

def preprocess_text(text):
    """Normalize one raw post into lowercase, punctuation-free, stop-word-filtered text.

    Steps, in order: remove e-mail addresses and URLs, transliterate to ASCII,
    lowercase, delete ASCII punctuation, and drop every word found in the
    module-level ``stop_words`` set. Words are re-joined with single spaces.

    Args:
        text: The raw text of a single post (a ``str``).

    Returns:
        The cleaned text. It may be an empty string if nothing survives filtering.
    """
    # Remove e-mail addresses and URLs first, while '@', ':' and '/' are still in the text.
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Transliterate BEFORE lowercasing and punctuation removal: transliteration
    # can produce ASCII punctuation (e.g. a curly apostrophe becomes "'") and
    # capital letters, which would otherwise slip past the two steps below.
    text = unidecode(text)

    # Lowercase, then delete every ASCII punctuation character.
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Drop stop words. split()/join also collapses whitespace runs and leaves
    # no leading or trailing spaces, so no separate strip() is required.
    # NOTE(review): punctuation is gone by now, so stop words that contain an
    # apostrophe can never match a token here -- confirm this is acceptable.
    return ' '.join(word for word in text.split() if word not in stop_words)


# Fetch the pretrained Longformer tokenizer and encoder, then place the encoder
# on the device selected earlier in this cell.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model = model.to(device)

# Pre-process and embed every post contained in one element of the df['Post'] column.
def process_post(posts, max_length=512):
    """Return one embedding per individual post found in a single ``Post`` cell.

    The cell is split on the separator ``', '``; each piece is cleaned with
    ``preprocess_text``, tokenized, and passed through the module-level
    ``model``. The hidden state at the first token position is kept as that
    post's embedding.

    Args:
        posts: One value of the ``Post`` column, a string holding several posts.
            NOTE(review): presumably the ``str()`` of a Python list, so the first
            and last pieces keep their ``['`` / ``']`` wrappers until
            ``preprocess_text`` strips punctuation -- confirm against the CSV.
        max_length: Maximum number of tokens kept per post; longer inputs are
            truncated by the tokenizer. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        A list of NumPy arrays, one per post, in the order the posts appear.
    """
    post_texts = posts.split("', '")
    print(f"Number of posts is: {len(post_texts)}")

    # Collect the embedding of each individual post.
    embeddings_list = []

    # Inference only: disabling autograd avoids building a computation graph
    # for every forward pass, which saves memory and time.
    with torch.no_grad():
        for text in post_texts:
            preprocessed_text = preprocess_text(text)

            inputs = tokenizer(
                preprocessed_text,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
            ).to(device)
            outputs = model(**inputs)

            # Take the hidden state at sequence position 0 (the [CLS]-style
            # token), drop the batch dimension, and move it to host memory.
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()
            embeddings_list.append(cls_embedding)

    print("-----")
    return embeddings_list

# Apply process_post to every element of the df['Post'] column and store the
# resulting list of per-post embeddings in a new 'enc' column.
df['enc'] = df['Post'].apply(process_post)

# Persist the DataFrame, including the new 'enc' column, as a pickle file.
df.to_pickle('reddit_data_processed_without_frames.pkl')


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Number of posts is: 1
-----
Number of posts is: 8
-----
Number of posts is: 4
-----
Number of posts is: 4
-----
Number of posts is: 6
-----
Number of posts is: 27
-----
Number of posts is: 30
-----
Number of posts is: 18
-----
Number of posts is: 3
-----
Number of posts is: 4
-----
Number of posts is: 1
-----
Number of posts is: 4
-----
Number of posts is: 3
-----
Number of posts is: 33
-----
Number of posts is: 4
-----
Number of posts is: 22
-----
Number of posts is: 9
-----
Number of posts is: 2
-----
Number of posts is: 3
-----
Number of posts is: 56
-----
Number of posts is: 2
-----
Number of posts is: 9
-----
Number of posts is: 10
-----
Number of posts is: 1
-----
Number of posts is: 21
-----
Number of posts is: 19
-----
Number of posts is: 130
-----
Number of posts is: 1
-----
Number of posts is: 25
-----
Number of posts is: 5
-----
Number of posts is: 16
-----
Number of posts is: 14
-----
Number of posts is: 9
-----
Number of posts is: 34
-----
Number of posts is: 4
-----
Numbe

In [2]:
# Reload the pickled DataFrame saved by the previous cell and print its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you trust.
df = pd.read_pickle('reddit_data_processed_without_frames.pkl')
print(df.head())

     User                                               Post       Label  \
0  user-0  ['Its not a viable option, and youll be leavin...  Supportive   
1  user-1  ['It can be hard to appreciate the notion that...    Ideation   
2  user-2  ['Hi, so last night i was sitting on the ledge...    Behavior   
3  user-3  ['I tried to kill my self once and failed badl...     Attempt   
4  user-4  ['Hi NEM3030. What sorts of things do you enjo...    Ideation   

   label                                                enc  
0      0  [[-0.080956124, 0.017096339, 0.035033993, -0.0...  
1      2  [[-0.05704766, 0.05428099, 0.017478283, -0.049...  
2      3  [[-0.04890902, -0.009876962, 0.024664512, -0.0...  
3      4  [[-0.078842014, 0.0068427566, -0.007107684, -0...  
4      2  [[-0.027730918, 0.06775871, -0.013432516, -0.0...  
