# 数据导入

In [1]:
import json

datas = []
# 打开JSON文件
with open('data\\TweetReviews_new.json', 'r', encoding='utf-8') as file:
    datas = json.load(file)

print(datas[0:5])

[{'_id': {'$oid': '65b08e2adc0113065d559971'}, 'o_tid': '1502290921521618952', 'cid': '1574597356313616389', 'seq': 1, 'retweet_num': 2, 'quote_num': 0, 'comment_num': 1, 'like_num': 3, 'view_num': 0, 'created_t': {'$date': '2022-09-27T03:10:39.000Z'}, 'ts': {'$date': '2024-01-24T04:12:26.364Z'}, 'comment_user': {'uid': '1390526374893457408', 'screen_name': 'Pastie1999', 'nick_name': 'King Saifalaah', 'created_t': {'$date': '2021-05-07T04:38:51.000Z'}, 'brief_introduction': 'Second King of the UK 🇬🇧 Martyn Son of Martin grandson of Martin. The Pastie King, i.e., King Saifalaah 👽\n\nSaifalaah27-Ytube \nAskTheAudience-Deso', 'pos': 'England, United Kingdom', 'follow_num': 99, 'fan_num': 192, 'post_num': 2775}}, {'_id': {'$oid': '65b08e2adc0113065d559975'}, 'o_tid': '1501962856555757577', 'cid': '1502291189868998661', 'seq': 1, 'retweet_num': 0, 'quote_num': 0, 'comment_num': 0, 'like_num': 0, 'view_num': 0, 'created_t': {'$date': '2022-03-11T14:31:45.000Z'}, 'ts': {'$date': '2024-01-24T0

# 关键数据提取

In [2]:
import datetime
import pandas as pd
def getDate(t_str):
    t_str = t_str.split('.')[0]
    d = datetime.datetime.strptime(t_str, '%Y-%m-%dT%H:%M:%S')
    return d
ct = datas[0]['created_t']['$date']
print(ct)
d = getDate(ct)
print(d)

o_tids = [] # 原推文id
seqs = [] # 排序
like_nums = [] # 点赞
retweet_nums = [] # 快转
quote_nums = [] # 转发
comment_nums = [] # 评论
view_nums = [] # 浏览数
follow_nums = [] # 关注
fan_nums = [] # 粉丝
post_nums = [] # 发推数
created_ts = [] # 创建时间

for data in datas:
    o_tids.append(data['o_tid'])
    seqs.append(data['seq'])
    like_nums.append(data['like_num'])
    retweet_nums.append(data['retweet_num'])
    quote_nums.append(data['quote_num'])
    comment_nums.append(data['comment_num'])
    view_nums.append(data['view_num'])
    follow_nums.append(data['comment_user']['follow_num'])
    fan_nums.append(data['comment_user']['fan_num'])
    post_nums.append(data['comment_user']['post_num'])
    created_ts.append(getDate(data['created_t']['$date']).timestamp())

# df = pd.DataFrame(data=[seqs, like_nums, retweet_nums, quote_nums, comment_nums, view_nums, follow_nums, fan_nums, post_nums, created_ts], columns=['seq', 'like_num', 'retweet_num', 'quote_num', 'comment_num', 'view_num', 'follow_num', 'fan_num', 'post_num', 'date'])
df = pd.DataFrame(data={
    'o_tid': o_tids,
    'seq': seqs,
    'like_num': like_nums,
    'retweet_num': retweet_nums,
    'quote_num': quote_nums,
    'comment_num': comment_nums,
    'view_num': view_nums,
    'follow_num': follow_nums,
    'fan_num': fan_nums,
    'post_num': post_nums,
    'date': created_ts
})
df.sort_values(['o_tid', 'seq'], inplace=True)
df.head(100)
df.to_csv('data/res.csv', index=False)


2022-09-27T03:10:39.000Z
2022-09-27 03:10:39


In [3]:
group = df.groupby(df.o_tid)
for g in group:
    # print(type(g[0])) # 0是组号，1是数据
    # print(g[0])
    g[1].to_csv(f'data/tweet_id/{g[0]}.csv', index=False)
    # break

# 模型定义

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=9, out_features=32, dtype=float),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=1, dtype=float)
        )

    def forward(self, x):
        x = self.model(x)
        return x


# 数据集定义

In [5]:
from torch.utils.data import Dataset
class MyDataSet(Dataset):
    def __init__(self, df):
        self.x = df.drop(labels=['seq'], axis=1)
        self.x = torch.from_numpy(self.x.values)
        self.x = self.x.double()
        self.y = torch.from_numpy(df['seq'].values)
        self.y = self.y.double()
        # print(self.x.size())

    def __len__(self):
        return len(self.x)

    def __getitem__(self, item):
        x = self.x[item]
        y = self.y[item]
        return x, y

class PairWiseDataset(Dataset):
    def __init__(self, df):
        self.df = df.drop(labels=['seq'], axis=1)
        self.df = torch.from_numpy(self.df.values)
        self.index = []
        n = self.df.shape[0]
        print(self.df.shape)
        print(n)
        for i in range(n):
            for j in range(i+1, n):
                self.index.append([i, j])

        self.y = torch.from_numpy(df['seq'].values)

    def __len__(self):
        return len(self.index)

    def __getitem__(self, item):
        i = self.index[item]
        x1 = self.df[i[0]]
        x2 = self.df[i[1]]
        y1 = self.y[i[0]]
        y2 = self.y[i[1]]
        return x1, x2, y1, y2




# 准备数据集

In [6]:
from torch.utils.data import DataLoader
# # 超参数
# lr = 1e-8
# epochs = 300
# batch_size = 16
# train_rate = 0.8
# train_len = int(train_rate * len(df))
#
# df = pd.read_csv('data/res.csv')
# train_ds = MyDataSet(df[:train_len])
# trainLoader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
# test_ds = MyDataSet(df[train_len:])
# testLoader = DataLoader(test_ds, shuffle=True, batch_size=1)


In [7]:
import torch.optim as optim
# def train_model(model):
#     loss_list = []
#
#     optimizer = optim.Adam(model.parameters(), lr=lr)
#     criterion = nn.MSELoss(reduction='mean')
#     model.train()
#     for epoch in range(epochs):
#         tot_loss = 0.0
#         batch_num = 0
#         for i, (x, y) in enumerate(trainLoader):
#             batch_num += 1
#             optimizer.zero_grad()
#             pred_y = model(x)
#
#             loss = criterion(pred_y, y)
#             loss.backward()
#             optimizer.step()
#             # print(loss)
#             # print(y.size(0))
#             tot_loss += loss.item() / y.size(0)
#
#         print(f'epoch = {epoch}  loss = {tot_loss / batch_num}')
#         loss_list.append(tot_loss / batch_num)
#
#     loss_df = pd.DataFrame(data=loss_list, columns=['loss'])
#     loss_df.to_csv('loss.csv', index=False)
#
# model = MyModel()
# train_model(model)


In [8]:
from torch.utils.data import DataLoader

# df = pd.read_csv('data/tweet_id/474188805541748736.csv')
# df = pd.read_csv('data/tweet_id/1319748517397659654.csv')
# df = pd.read_csv('data/tweet_id/1324002766386323456.csv')
# df = pd.read_csv('data/tweet_id/1329901868315623424.csv')
# df = pd.read_csv('data/tweet_id/1332477653685825536.csv')
# df = pd.read_csv('data/tweet_id/1332518860944076802.csv')
# df = pd.read_csv('data/tweet_id/1332552283553476608.csv')
df = pd.read_csv('data/tweet_id/1332713121413214208.csv')
# df = df[:9]
# 超参数
lr = 1e-7
epochs = 300
batch_size = 1
train_rate = 0.1
train_len = int(train_rate * len(df))


# train_ds = PairWiseDataset(df[:train_len])
# trainLoader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
# test_ds = PairWiseDataset(df[train_len:])
# testLoader = DataLoader(test_ds, shuffle=True, batch_size=1)

train_ds_x = df.drop(labels=['seq', 'o_tid'], axis=1)
train_ds_x = torch.from_numpy(train_ds_x.values)
train_ds_y = torch.from_numpy(df['seq'].values)

tot = train_ds_x.shape[0]

# model = MyModel()
model = torch.load('./model/model_model.pth')
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MarginRankingLoss()
for epoch in range(epochs):
    tot_loss = 0.0
    batch_num = 0
    for i in range(tot):
        for j in range(i+1, tot):
            batch_num += 1
            optimizer.zero_grad()
            pred_y_1 = model(train_ds_x[i])
            pred_y_2 = model(train_ds_x[j])
            target = 1
            if train_ds_y[i] > train_ds_y[j]:
                target = -1
            loss = criterion(pred_y_1, pred_y_2, torch.tensor([target]))
            loss.backward()
            optimizer.step()
            tot_loss += loss.item()
        # print(i)

    print(f'epoch = {epoch}  loss = {tot_loss / batch_num}')



# for i, (x1, x2, y1, y2) in enumerate(trainLoader):
#     print("--------------------------------")
#     print("x1")
#     print(x1)
#     print("x2")
#     print(x2)
#     print("y1")
#     print(y1)
#     print("y2")
#     print(y2)
#     break

epoch = 0  loss = 0.039777707860988824
epoch = 1  loss = 0.03595682270376549
epoch = 2  loss = 0.19388461785009867
epoch = 3  loss = 0.037724785868922556
epoch = 4  loss = 0.03647122701687503
epoch = 5  loss = 0.33824767368913383
epoch = 6  loss = 0.03437943354810426
epoch = 7  loss = 0.0472337856744516
epoch = 8  loss = 0.03507786830258519
epoch = 9  loss = 0.083509106964871
epoch = 10  loss = 0.036786019853557485
epoch = 11  loss = 0.048675060750208
epoch = 12  loss = 0.03596132951903131
epoch = 13  loss = 0.06806215870870089
epoch = 14  loss = 0.03713265269183788
epoch = 15  loss = 0.07741613195422493
epoch = 16  loss = 0.0363564938825347
epoch = 17  loss = 0.052442323913566735
epoch = 18  loss = 0.03471307193208339
epoch = 19  loss = 0.08611982412429645
epoch = 20  loss = 0.03482061955650897
epoch = 21  loss = 0.06467269602431981
epoch = 22  loss = 0.03699148698339033
epoch = 23  loss = 0.06063010275855169
epoch = 24  loss = 0.036311151724735646
epoch = 25  loss = 0.035561942932557

In [9]:
WEIGHT = './model/model_weights.pth'
MODEL = './model/model_model.pth'
torch.save(model.state_dict(), WEIGHT)
torch.save(model, MODEL)

In [10]:
# model = torch.load('./model/model_model.pth')