In [53]:
##加载模型
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
tokenizer = AutoTokenizer.from_pretrained("detected_model")
model = AutoModelForCausalLM.from_pretrained("detected_model", torch_dtype=torch.bfloat16)

In [54]:
#加载数据集
import os
os.environ['TOKENIZERS_PARALLELISM'] = "True"
flatten = lambda l : [x for s in l for x in s]
shuffle = lambda l : random.sample(l, k=len(l))
data_path='dataset/valid.json'
print("loading from json...")
with open(data_path, 'r') as f:
    data = f.read()
    lines=json.loads(data)
tokens=[d['text'] for d in lines]

loading from json...


In [55]:

sum_input=[]
sum_output=[]
for i in range(tokens):
    #对tokens[0]进行实验，查看一下效果
    token=tokens[i].split()
    print(token)
    #本来想取一半的，考虑到模型本身的能力，改成3/4（这个地方可以多试试，我也不确定取什么好）
    quarter_index=len(token)//4

    input=' '.join(token[:quarter_index])
    input_text=input.replace('Ġ','')
    input_id=tokenizer.encode(input_text, return_tensors='pt')
    output=model.generate(input_id, max_length=len(token),num_return_sequences=1, temperature=0.7)
    output_text=tokenizer.decode(output[0], skip_special_tokens=True)
    output_text=output_text.replace('Ġ','')
    sum_input.append(input_text)
    sum_output.append(output_text)
    print("input:",input_text)
    print("output:",output_text)
    print(tokens[i])

['Authorities', 'in', 'eastern', 'Switzerland', 'have', 'ordered', 'residents', 'of', 'the', 'village', 'of', 'Brienz', 'to', 'evacuate', 'by', 'Friday', 'evening', 'because', 'geologists', 'say', 'a', 'mass', 'of', '2m', 'cubic', 'metres', 'of', 'Alpine', 'rock', 'looming', 'overhead', 'could', 'break', 'loose', 'and', 'spill', 'down', 'in', 'coming', 'weeks.Local', 'leaders', 'told', 'a', 'town', 'hall', 'and', 'press', 'event', 'on', 'Tuesday', 'that', 'residents', 'would', 'have', 'to', 'leave', 'by', '6pm', 'on', 'Friday', 'but', 'could', 'return', 'to', 'the', 'village', 'from', 'time', 'to', 'time', 'starting', 'on', 'Saturday,', 'depending', 'on', 'the', 'risk', 'level,', 'but', 'not', 'stay', 'overnight.Officials', 'said', 'measurements', 'indicated', 'a', '“strong', 'acceleration', 'over', 'a', 'large', 'area”', 'in', 'recent', 'days,', 'and', '“up', 'to', '2m', 'cubic', 'metres', 'of', 'rock', 'material', 'will', 'collapse', 'or', 'slide', 'in', 'the', 'coming', 'seven', 'to



input: Authorities in eastern Switzerland have ordered residents of the village of Brienz to evacuate by Friday evening because geologists say a mass of 2m cubic metres of Alpine rock looming overhead could break loose and spill down in coming weeks.Local leaders told a town hall and press event on Tuesday that residents would have to leave by 6pm on Friday but could return to the village from time to time starting on Saturday, depending on the risk level, but not stay overnight.Officials said measurements indicated a “strong acceleration over a large area” in recent days, and “up to 2m cubic metres of rock material will collapse or slide in the coming seven to 24 days”.The centuries-old village straddles German- and Romansch-speaking parts of the eastern Graubünden region, sitting south-west of Davos at an altitude of about 1,150 metres (3,800ft).
output: Authorities in eastern Switzerland have ordered residents of the village of Brienz to evacuate by Friday evening because geologists

In [59]:
#评估相似度(使用SequenceMatcher)（效果不是很好，不能很好地区分）
from difflib import SequenceMatcher
sum_similarity=[]
for i in range(len(tokens)):
    matcher=SequenceMatcher(None,sum_input[i], sum_output[i])
    similarity=matcher.ratio()
    sum_similarity.append(similarity)
with open('sum_similarity.json','w') as f:
    json.dump(sum_similarity,f)

0.29906229068988616


In [60]:
#评估相似度(使用余弦相似度) 莫名其妙很高，我还没发现哪里有问题
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
sum1_similarity=[]
for i in range(tokens):
    # 将tokens转换为token ids
    token_ids1 = tokenizer.encode(' '.join(sum_input[i]), return_tensors='pt')
    token_ids2 = tokenizer.encode(' '.join(sum_output[i]), return_tensors='pt')

    # 将token ids转换为向量
    vec1 = model.get_input_embeddings()(token_ids1).to(dtype=torch.float32).detach().numpy()
    vec2 = model.get_input_embeddings()(token_ids2).to(dtype=torch.float32).detach().numpy()

    # 将三维张量转换为二维张量
    vec1 = np.mean(vec1, axis=1)
    vec2 = np.mean(vec2, axis=1)

    # 计算vec1和vec2的余弦相似度
    similarity = cosine_similarity(vec1, vec2)
    sum1_similarity.append(similarity)
with open('sum1_similarity.json','w') as f:
    json.dump(sum1_similarity,f)

Token indices sequence length is longer than the specified maximum sequence length for this model (2866 > 512). Running this sequence through the model will result in indexing errors


[[0.9986857]]


In [62]:
#喂给bert看cls的相似度 也感觉有点高
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# 初始化tokenizer和model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
sum2_similarity=[]
for i in range(tokens):
    # 使用tokenizer对句子进行编码，然后将编码的结果喂给model
    inputs1 = tokenizer(sum_input[i], return_tensors='pt', truncation=True, max_length=512, padding='max_length')
    inputs2 = tokenizer(output_text, return_tensors='pt', truncation=True, max_length=512, padding='max_length')

    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    # 提取出CLS标记
    cls1 = outputs1[0][0][0].numpy()
    cls2 = outputs2[0][0][0].numpy()

    # 计算cls1和cls2的余弦相似度
    similarity = cosine_similarity(cls1.reshape(1, -1), cls2.reshape(1, -1))
    sum2_similarity.append(similarity)

with open('sum2_similarity.json','w') as f:
    json.dump(sum2_similarity,f)




[[0.84120107]]
