In [1]:
import re
import os
import torch
import spacy
import string
import numpy as np
import pandas as pd
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.corpus import framenet as fn
from transformers import LongformerTokenizer, LongformerModel

# 加载 spaCy 模型
nlp = spacy.load("en_core_web_lg")

os.environ['HF_HOME'] = 'D:\transformers_cache'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

df = pd.read_csv('500_Reddit_users_posts_labels.csv')
label_map = {'Supportive': 0, 'Indicator': 1, 'Ideation': 2, 'Behavior': 3, 'Attempt': 4}
df['label'] = df['Label'].map(label_map)

# load NLTK stopwords
stop_words = set(stopwords.words('english'))
words_to_remove = ['ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'don', "don't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'no', 'nor', 'not', 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]                 
stop_words.difference_update(words_to_remove)

# Load Longformer model and tokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096').to(device)

# Function to preprocess text
def preprocessing(text):
    # Remove Email 和 URL
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text) 
    
    # transform to lowercase
    text = text.lower()
    
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # remove accents
    text = unidecode(text)

    # remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # remove extra whitespace
    text = text.strip()

    return text

# Function to get embeddings
def text_embedding(preprocessed_text):
    inputs = tokenizer(preprocessed_text, return_tensors='pt', truncation=True, max_length=512).to(device)
    outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()  # [CLS] token embedding
    
    return cls_embedding

# Function to detect frames from specific tokens
def detect_framenet_frames(tokens):
    frame_results = {}
    for token in tokens:
        # frames_by_lemma returns a list of Frame objects for the given token
        frames = fn.frames_by_lemma(token)
        if frames:
            frame_results[token] = [frame.name for frame in frames]
        else:
            frame_results[token] = [] 
    return frame_results

# Function to extract frames from preprocessed text
def extract_frames(preprocessed_text):
    # Frame extraction from text
    doc = nlp(preprocessed_text)

    # get verb and adjective tokens
    verb_tokens = [token.text for token in doc if token.pos_ == "VERB"]
    adj_tokens = [token.text for token in doc if token.pos_ == "ADJ"]
    combined_tokens = verb_tokens + adj_tokens

    # detect frames for the combined tokens
    frame_results = detect_framenet_frames(combined_tokens)

    # count occurrences of each frame
    frame_count = {}
    for frames in frame_results.values():
        for frame in frames:
            frame_count[frame] = frame_count.get(frame, 0) + 1

    # sort frames by their count in descending order
    sorted_frame_count = dict(sorted(frame_count.items(), key=lambda item: item[1], reverse=True))

    # if no frames found, return an empty list
    if not sorted_frame_count:
        return []

    # get the most common frames
    max_count = next(iter(sorted_frame_count.values()))
    most_common_frames = [frame for frame, count in sorted_frame_count.items() if count == max_count]

    return most_common_frames

# Function to concatenate text and frame embeddings
def concatenating(text_emb, frame_emb):
    return np.concatenate((text_emb, frame_emb))

# Function to process each user's posts
def process_single_user_posts(row):
    # get post list from a user's posts
    posts = row['Post']
    post = posts.split("', '")
    print(f"Number of posts is: {len(post)}")

    embeddings_list = []
    frames_list = []
    frame_embeddings_list = []
    concatenated_embeddings_list = []

    # iterate through each post
    for text in post:
        # post in posts after preprocessing
        preprocessed_text = preprocessing(text)
        # print(f"Preprocessed text: {preprocessed_text}")

        # 1. tokenize and get embeddings
        text_emb = text_embedding(preprocessed_text)
        embeddings_list.append(text_emb)

        # 2. extract frames
        frames = extract_frames(preprocessed_text)
        # print(f"Extracted frames: {frames}")
        frames_list.append(frames)

        if frames:
            # 3. concatenate frames and get embeddings
            concatenated_frames = ' '.join(frames)
            # preprocess
            concatenated_frames = concatenated_frames.replace('_', ' ')
            concatenated_frames = preprocessing(concatenated_frames)
            # print(f"Concatenated frames: {concatenated_frames}")
            frame_emb = text_embedding(concatenated_frames)
        else:
            frame_emb = np.zeros(text_emb.shape)
        frame_embeddings_list.append(frame_emb)

        # 4. concatenate text embeddings and frame embeddings
        concatenated_embeddings = concatenating(text_emb, frame_emb)
        concatenated_embeddings_list.append(concatenated_embeddings)

    return pd.Series({
        'text_emb': embeddings_list,
        'frames': frames_list,
        'frames_emb': frame_embeddings_list,
        'enc': concatenated_embeddings_list
    })


df[['text_emb', 'frames', 'frames_emb', 'enc']] = df.apply(process_single_user_posts, axis=1)

# Save the DataFrame to pickle files
df.to_pickle('reddit_data_processed.pkl')


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Number of posts is: 1
Number of posts is: 8
Number of posts is: 4
Number of posts is: 4
Number of posts is: 6
Number of posts is: 27
Number of posts is: 30
Number of posts is: 18
Number of posts is: 3
Number of posts is: 4
Number of posts is: 1
Number of posts is: 4
Number of posts is: 3
Number of posts is: 33
Number of posts is: 4
Number of posts is: 22
Number of posts is: 9
Number of posts is: 2
Number of posts is: 3
Number of posts is: 56
Number of posts is: 2
Number of posts is: 9
Number of posts is: 10
Number of posts is: 1
Number of posts is: 21
Number of posts is: 19
Number of posts is: 130
Number of posts is: 1
Number of posts is: 25
Number of posts is: 5
Number of posts is: 16
Number of posts is: 14
Number of posts is: 9
Number of posts is: 34
Number of posts is: 4
Number of posts is: 1
Number of posts is: 5
Number of posts is: 4
Number of posts is: 3
Number of posts is: 21
Number of posts is: 1
Number of posts is: 3
Number of posts is: 2
Number of posts is: 30
Number of posts

In [2]:
print(df.head())

     User                                               Post       Label  \
0  user-0  ['Its not a viable option, and youll be leavin...  Supportive   
1  user-1  ['It can be hard to appreciate the notion that...    Ideation   
2  user-2  ['Hi, so last night i was sitting on the ledge...    Behavior   
3  user-3  ['I tried to kill my self once and failed badl...     Attempt   
4  user-4  ['Hi NEM3030. What sorts of things do you enjo...    Ideation   

   label                                           text_emb  \
0      0  [[-0.080956124, 0.017096339, 0.035033993, -0.0...   
1      2  [[-0.05704766, 0.05428099, 0.017478283, -0.049...   
2      3  [[-0.04890902, -0.009876962, 0.024664512, -0.0...   
3      4  [[-0.078842014, 0.0068427566, -0.007107684, -0...   
4      2  [[-0.027730918, 0.06775871, -0.013432516, -0.0...   

                                              frames  \
0                           [[Awareness, Certainty]]   
1  [[Emotion_directed], [Desirability], [Stimulus...

In [1]:
import pandas as pd

df = pd.read_pickle('emb_with_negative.pkl')

# 打印行数
print(f"Number of rows in the DataFrame: {len(df)}")

Number of rows in the DataFrame: 59
