In [1]:
# Required modules:
# numpy
# transformers
# pandas
# sklearn
# matplotlib
# seaborn

import numpy as np
# pip install transformers
# Installation can be fiddly: if it fails because some package cannot be found,
# install that missing package first and then retry installing transformers.
import transformers
import pandas
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Both the tokenizer and the BERT model are loaded from the same pretrained
# checkpoint, so the checkpoint id is spelled out once and reused.
PRETRAINED_CHECKPOINT = "hfl/rbt3"
tokenizer = transformers.BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = transformers.BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# measure: decide whether a news item is policy-related by comparing it with
# items that are already known to be policy-related.
def measure(sentence, tokenizer, model, policy_matrix, boundary=0.9):
    """Return True when `sentence` is, on average, similar enough to the references.

    The sentence is encoded with `tokenizer`, passed through `model`, and the
    second element of the model output (for a BERT model this is presumably the
    pooled sentence vector) is compared by cosine similarity with every row of
    `policy_matrix`. The mean of those similarities is tested against `boundary`.

    Parameters:
        sentence: the item to measure; must be text.
        tokenizer: tokenizer/encoder, called as
            ``tokenizer(sentence, padding=True, return_tensors="pt")``.
        model: BERT model, called with ``input_ids`` and ``attention_mask``.
        policy_matrix: model outputs for the known policy-related items, one
            row per item (e.g. the return value of ``build_policy_matrix``).
            A torch tensor or anything ``np.asarray`` accepts.
        boundary: similarity threshold, 0.9 by default.

    Returns:
        bool: True if the mean cosine similarity is strictly greater than
        `boundary`. False if it is not, or if `policy_matrix` has no rows.
    """
    # Local import: the "pt" tensors requested from the tokenizer already
    # require PyTorch, so this adds no new dependency to the notebook.
    import torch

    if isinstance(policy_matrix, torch.Tensor):
        # Tensors that track gradients or live on another device cannot be
        # converted to NumPy directly.
        policy_matrix = policy_matrix.detach().cpu()

    # Use a plain 2-D ndarray rather than np.matrix: np.matrix is deprecated by
    # NumPy and rejected by recent scikit-learn releases.
    policy_vectors = np.atleast_2d(np.asarray(policy_matrix))
    if policy_vectors.size == 0:
        # Nothing to compare against; the mean of zero similarities would be
        # NaN, which never exceeds the boundary.
        return False

    encode = tokenizer(sentence, padding=True, return_tensors="pt")
    # Inference only: skip building an autograd graph.
    with torch.no_grad():
        result = model(encode['input_ids'], attention_mask=encode['attention_mask'])

    # result[1][0]: second model output, first (and only) sentence in the batch.
    sentence_vector = np.asarray(result[1][0].detach()).reshape(1, -1)

    # One vectorised call gives the similarity of every reference row to the
    # sentence (shape: n_references x 1).
    similarities = cosine_similarity(policy_vectors, sentence_vector)
    return bool(similarities.mean() > boundary)

In [3]:
# build_policy_matrix: builds the policy_matrix consumed by measure().
def build_policy_matrix(dataset, tokenizer, model, batch_size=None):
    """Encode every title in `dataset` and return the model's second output per title.

    Parameters:
        dataset: dataset object read with pandas; must have a ``title`` column.
            Anything where ``list(dataset['title'])`` yields the titles works.
        tokenizer: tokenizer/encoder, called as
            ``tokenizer(titles, padding=True, return_tensors="pt")``.
        model: BERT model, called with ``input_ids`` and ``attention_mask``.
        batch_size: optional number of titles per forward pass. ``None`` (the
            default) sends all titles through the model in a single batch, as
            before. Pass a positive integer to bound memory use on large
            datasets.

    Returns:
        A detached torch tensor with one row per title, taken from index 1 of
        the model output (for a BERT model this is presumably the pooled
        output).

    Raises:
        ValueError: if there are no titles, or `batch_size` is not positive.
    """
    # Local import: the "pt" tensors requested from the tokenizer already
    # require PyTorch, so this adds no new dependency to the notebook.
    import torch

    titles = list(dataset['title'])
    if not titles:
        raise ValueError("dataset['title'] is empty; cannot build a policy matrix")
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    step = len(titles) if batch_size is None else int(batch_size)

    transformed_batches = []
    # Inference only: without no_grad the model keeps every intermediate
    # activation alive for a backward pass that never happens.
    with torch.no_grad():
        for start in range(0, len(titles), step):
            enc = tokenizer(titles[start:start + step], padding=True, return_tensors="pt")
            res = model(enc['input_ids'], attention_mask=enc['attention_mask'])
            transformed_batches.append(res[1].detach())

    if len(transformed_batches) == 1:
        return transformed_batches[0]
    return torch.cat(transformed_batches, dim=0)

In [4]:
# Load the CSV whose 'title' column is encoded into the reference matrix in the
# next cell (presumably the items already known to be policy-related; confirm
# against the data).
tr = pandas.read_csv("./training/test.csv")

In [5]:
# Encode every title in `tr` once; `mtx` is the reference matrix that measure()
# compares new sentences against.
mtx = build_policy_matrix(tr, tokenizer, model)

In [6]:
# Take the first 10 rows of the cleaned dataset as candidates to classify.
tr2 = pandas.read_csv("./training/cleaned.csv").iloc[:10]
# Pick the third row (position 2) as the sample item.
newo = tr2.iloc[2]

In [7]:
# Classify the sample title against the reference matrix using the default
# boundary of 0.9.
measure(newo['title'], tokenizer, model, mtx)

False

In [8]:
# Display the title that was just classified.
newo['title']

'中秋庆佳节,同心谱新篇:特莱维狮董事长徐建成中秋致辞'