In [5]:
from transformers import BertTokenizer, BertModel
import torch

# 加载分词器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_text(text):
    # 将文本编码为token IDs
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')
    
    # 使用BERT模型
    with torch.no_grad():
        outputs = model(input_ids)
    
    # 获取最后一层的隐藏状态
    last_hidden_state = outputs.last_hidden_state
    # 取[CLS] token的表示作为整个句子的表示
    sentence_representation = last_hidden_state[:, 0, :]
    return sentence_representation

# 示例文本
text = "Here is an example of a normal-length text."
representation = encode_text(text)

print(representation.shape)  # 输出表示的形状


torch.Size([1, 768])


In [7]:
from transformers import BertTokenizer, BertModel
import torch

# 初始化tokenizer和model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 假设我们有一个字典，键为用户ID，值为该用户的帖子列表
users_posts = {
    "user1": ["This is the first post content. Here are more details.", "Discussion about the second topic."],
    "user2": ["Here is another user's first post content.", "Here's the second post, including multiple viewpoints.", "Content of the third post."]
}

# 准备处理所有帖子
all_embeddings = {}

for user, posts in users_posts.items():
    # 处理每个用户的帖子
    processed_posts = [tokenizer.cls_token + " " + post for post in posts]
    encoded_input = tokenizer(processed_posts, add_special_tokens=True, return_tensors='pt', padding=True, truncation=True, max_length=4096)
    
    with torch.no_grad():
        outputs = model(**encoded_input)
    
    # 提取每个帖子的[CLS]标记的输出作为帖子的嵌入表示
    embeddings = outputs.last_hidden_state[:, 0, :]
    all_embeddings[user] = embeddings

# 输出每个用户的帖子嵌入维度，以确保正确处理
for user, embeddings in all_embeddings.items():
    print(f"User {user} has embeddings with shape: {embeddings.shape}")


User user1 has embeddings with shape: torch.Size([2, 768])
User user2 has embeddings with shape: torch.Size([3, 768])


In [1]:

# 对用户的帖子文本进行文本表示

from transformers import BertTokenizer, BertModel
import torch
import pickle

import pandas as pd
# 初始化tokenizer和model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


# 将模型移到GPU上
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)


# 将reddit_json中的subreddit中set集合转化为列表
def contents_elements(data_set):
    # 验证数据类型
    # print(f"Data type: {type(data_set)}")
    # 如果是集合，则转换为列表并打印
    if isinstance(data_set, set):
        data_list = list(data_set)
        return data_list
    else:
        print("Data is not a set.")

# 读取帖子pkl——json数据
def read_posts_data(file_path):
    with open(file_path, 'rb') as f:
        posts_data = pickle.load(f)
    return posts_data


def encode_posts(posts_data):
    # all_embeddings = {}

    user_labels = []
    user_names = []

    user_posts_embedings_dic = {
        'user':[],
        'embeddings':[],
        'labels':[]
    }
    
    data = read_posts_data(posts_data)
    for i in range(len(data)):
        user_posts  = []
        user_labels.append(data[i]['label'])
        user_names.append(data[i]['user'])
        for j in range(len(data[i]['subreddit'])):
            content = contents_elements(data[i]['subreddit'][j])[0]
            content = content.lower()
            user_posts.append(content)
        processed_posts = [tokenizer.cls_token + " " + post for post in user_posts]

        # print(processed_posts)

        encoded_input = tokenizer(processed_posts, add_special_tokens=True, return_tensors='pt', padding=True, truncation=True, max_length=512)

        with torch.no_grad():
            outputs = model(**encoded_input)

        # 提取每个帖子的[CLS]标记的输出作为帖子的嵌入表示
        embeddings = outputs.last_hidden_state[:, 0, :]
        # print(embeddings.shape)
        # 一定要加这行代码不然，存储的文件会很大，因为embeddings是tensor类型
        embeddings =  embeddings.cpu().numpy()

        user_posts_embedings_dic['user'].append(user_names[i])
        user_posts_embedings_dic['embeddings'].append(embeddings)
        user_posts_embedings_dic['labels'].append(user_labels[i])

        # print(user_posts_embedings_dic)
        # break

        # user_posts_embeddings_dic['user'] = user_names[i]
        # user_posts_embeddings['label'] = user_labels[i]
        # user_posts_embeddings['embeddings'] = embeddings

        
        # user_posts_embedings_dic[]
        # user_posts_embedings.append(user_posts_embeddings)

    # 将user_posts_embedings_dic转化为df对象
    user_posts_embedings_df = pd.DataFrame(user_posts_embedings_dic)
    # print(user_posts_embedings_df)


    # 将得到的user_posts_embeddings写入pkl文件
    with open('../data/bert_embeddings_csv.pkl', 'wb') as f:
        pickle.dump(user_posts_embedings_df, f)


posts_data = '../data/reddit_json.pkl'
encode_posts(posts_data)


In [23]:
# 读取存储的帖子文本表示

def read_emb_pkl(file_path):
    with open(file_path, 'rb') as f:
        posts_data = pickle.load(f)
    return posts_data

def read_posts_embeddings(path):
    data = read_emb_pkl(path)
    # 读取其中embeddings的嵌入表示
    for i in range(len(data)):
        # print(data[i]['embeddings'])
        if i == 1:
            print(data[i]['embeddings'].shape)
            break

path = '../data/bert_embeddings.pkl'
read_posts_embeddings(path= path)

(8, 768)


In [20]:
import numpy as np
import pandas as pd
import pickle

# 读取帖子文本表示
def read_embeddings(file_path):
    with open(file_path, 'rb') as f:
        embeddings = pickle.load(f)
    return embeddings

reddit_bert = '../data/bert_embeddings.pkl'
embeddings = read_embeddings(reddit_bert)

post_bert = []

# 提取嵌入表示和标签
for con in embeddings:
    X = con['embeddings']
    X = np.array2string(X)
    post_bert.append(X)
    # break
# 将post_bert转化为PLK
with open('../data/post_bert.pkl', 'wb') as f:
    pickle.dump(post_bert, f)

# 读取post_bert.pkl文件
def read_post_bert(file_path):
    with open(file_path, 'rb') as f:
        post_bert = pickle.load(f)
    return post_bert

post_bert = read_post_bert('../data/post_bert.pkl')
print(len(post_bert[0])) 

clean_str = post_bert[0].strip('[]')
array_restored = np.fromstring(clean_str, dtype=int, sep=' ')

print(array_restored.shape)
# df = pd.DataFrame(post_bert)

# df.to_csv('../data/post_bert.csv', index=False, header=False)

12673
(1,)


  array_restored = np.fromstring(clean_str, dtype=int, sep=' ')


In [9]:
# 读取pkl文件

import pickle

def read_pkl(file_path):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    return data

# 读取reddit_json.pkl文件
reddit_json = '../data/bert_embeddings_csv.pkl'



data = read_pkl(reddit_json)

# 找到data中的embeddings的最大长度
max_length = 0

for i in range(len(data)):
    # print(len(data))
    if len(data.iloc[i]['embeddings']) > max_length:
        max_length = len(data.iloc[i]['embeddings'])
        # print(max_length)
        # break

print(max_length)

# print(data[0]['embeddings'].shape)


# 找到reddit_json.pkl文件中的subreddit中的set集合


292
