In [1]:
import os
import random
import numpy as np
import tensorflow as tf
from collections import defaultdict

读取数据

In [2]:
def read_question(filename, data_path):
    """Read the question bank into a dict mapping question id -> question text.

    The file is tab-separated: one header row (skipped) followed by
    ``question_id<TAB>question`` rows. Reading stops at the first blank line.

    Args:
        filename: Name of the question-bank file.
        data_path: Directory containing the file ("" for the current directory).

    Returns:
        dict mapping each question id (str) to its question text (str). When
        an id appears more than once, the last row wins. An empty file yields
        an empty dict.

    Raises:
        ValueError: If a non-blank row does not split into exactly two
            tab-separated fields.
    """
    question_dict = {}
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(os.path.join(data_path, filename), 'r', encoding='utf-8') as lines:
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(lines, None)
        for line in lines:
            if not line.strip():
                break
            question_id, question = line.strip().split('\t')
            question_dict[question_id] = question

    return question_dict

In [3]:
def read_answer(filename, data_path="answer"):
    """Parse an answer file of ``query<TAB>id1,id2,...`` rows.

    Parsing ends at the first blank line (or at end of file).

    Args:
        filename: Name of the answer file.
        data_path: Directory that holds the file.

    Returns:
        A pair ``(queries, answers)``: ``queries[i]`` is the i-th query split
        on single spaces, ``answers[i]`` is the matching list of ids split on
        commas.
    """
    parsed_queries, parsed_answers = [], []
    answer_file = os.path.join(data_path, filename)
    with open(answer_file, 'r') as handle:
        for raw in handle:
            row = raw.strip()
            if row == "":
                break
            query_part, answer_part = row.split('\t')
            parsed_queries.append(query_part.strip().split(' '))
            parsed_answers.append(answer_part.strip().split(','))
    return parsed_queries, parsed_answers

In [4]:
# Build both lookup directions for the question bank: id -> text and text -> id.
id_to_question = read_question("question_bank.tsv","")
# If two ids share the same question text, only one of them survives in this reverse map.
question_to_id = {v : k for k, v in id_to_question.items()}

In [5]:
def read_dataset(filename, data_path, isTrain):
    """Read either the training pairs or the test queries.

    In both modes reading stops at the first blank line.

    Args:
        filename: Name of the dataset file.
        data_path: Directory containing the file ("" for the current directory).
        isTrain: When truthy, parse a training file: one header row (skipped)
            followed by ``query<TAB>clarifying_question`` rows. Otherwise parse
            a test file with one query per line and no header.

    Returns:
        Training mode: ``(train_queries, clarifying_questions,
        query_to_question)`` where the two lists are row-aligned and
        ``query_to_question`` is a ``defaultdict(list)`` mapping each query to
        all of its clarifying questions in file order.
        Test mode: a list of query strings.

    Raises:
        ValueError: In training mode, if a non-blank row does not split into
            exactly two tab-separated fields.
    """
    path = os.path.join(data_path, filename)

    if not isTrain:
        test_queries = []
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(path, 'r', encoding='utf-8') as lines:
            for line in lines:
                if not line.strip():
                    break
                test_queries.append(line.strip())
        return test_queries

    train_queries = []
    clarifying_questions = []
    query_to_question = defaultdict(list)
    with open(path, 'r', encoding='utf-8') as lines:
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(lines, None)
        for line in lines:
            if not line.strip():
                break
            query, clarifying_question = line.strip().split('\t')
            train_queries.append(query)
            clarifying_questions.append(clarifying_question)
            query_to_question[query].append(clarifying_question)

    return train_queries, clarifying_questions, query_to_question

In [6]:
# Load the training pairs (the training file's first row is skipped as a header)
# and the plain list of test queries.
train_queries, clarifying_questions, query_to_question = read_dataset("training.tsv", "", True)
test_queries = read_dataset("test_set.tsv", "", False)

In [7]:
# Quick sanity checks on the loaded data: raw and de-duplicated sizes.
print(len(test_queries))
print(len(set(train_queries)))
print(len(clarifying_questions))
print(len(set(clarifying_questions)))
# Prints "True" only when the second test query also occurs among the training queries.
if test_queries[1] in query_to_question.keys():
    print("True")
# Number of clarifying questions recorded for the first training query.
print(len(query_to_question[train_queries[0]]))

46
237
10727
3033
36


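Idea 1: the dataset has len(clarifying_questions) examples.
     For each query, a random draw decides whether it is paired with its original clarifying question or with a randomly chosen one.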

In [8]:
def data_preprocessing2(train_queries, clarifying_questions, query_to_question):
    """Build one training example per (query, clarifying question) row.

    For every row a coin flip decides whether the query keeps its own
    clarifying question (label 0, positive) or is paired with a question
    drawn at random from all questions that do not belong to that query
    (label 1, negative).

    Args:
        train_queries: Queries, row-aligned with ``clarifying_questions``.
        clarifying_questions: The clarifying question of each row.
        query_to_question: Mapping from a query to all of its clarifying
            questions.

    Returns:
        ``(train_sentence1, train_sentence2, labels)``: three row-aligned
        lists holding the query, the paired question and the 0/1 label.

    Raises:
        IndexError: If a negative must be drawn for a query to which every
            clarifying question belongs (the candidate pool is empty).
    """
    train_sentence1 = []
    train_sentence2 = []
    labels = []

    count = 0
    # Negative candidates per query, built on first use and then reused.
    # A set is used for the membership test so building a pool is linear in
    # the number of questions instead of (questions x questions-of-query).
    # The pool keeps duplicates and original order, so random.choice draws
    # from exactly the same list as a full rebuild would.
    negative_pools = {}
    for (query, question) in zip(train_queries, clarifying_questions):
        train_sentence1.append(query)
        if random.random() >= 0.5:
            train_sentence2.append(question)
            labels.append(0)
        else:
            if query not in negative_pools:
                own_questions = set(query_to_question[query])
                negative_pools[query] = [
                    item for item in clarifying_questions if item not in own_questions
                ]
            train_sentence2.append(random.choice(negative_pools[query]))
            count += 1
            labels.append(1)
    print("Number of negative samples: ",count)
    return train_sentence1, train_sentence2, labels

# Build the training set with data_preprocessing2 and inspect the first four examples.
train_sentence1, train_sentence2, labels = data_preprocessing2(train_queries, clarifying_questions, query_to_question)
print(train_sentence1[:4])
print(train_sentence2[:4])
print(labels[:4])

Number of negative samples:  5302
['Tell me about Obama family tree.', 'Tell me about Obama family tree.', 'Tell me about Obama family tree.', 'Tell me about Obama family tree.']
['are you interested in seeing barack obamas family', 'are you interested in learning the history of the beatles', 'do you need technical support related to your service', 'would you like to know who is currently alive from president obamas family tree']
[0, 1, 1, 0]


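Idea 2: the dataset has 2 * len(clarifying_questions) examples.
1) Randomly decide whether a query is paired with its original clarifying question or with a random one.
2) Obtain the negative samples by shuffling the clarifying questions.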

In [8]:
def data_preprocessing(train_queries, clarifying_questions, query_to_question):
    """Create labelled (query, question, label) training triples.

    The first half of the result holds every original pair with label 0
    (positive). The second half re-pairs the queries with a shuffled copy of
    the questions: a re-paired question that still belongs to its query keeps
    label 0, every other pair gets label 1 (negative).

    Args:
        train_queries: Queries, row-aligned with ``clarifying_questions``.
        clarifying_questions: The clarifying question of each row.
        query_to_question: Mapping from a query to all of its clarifying
            questions.

    Returns:
        A list of ``(query, question, label)`` tuples, twice as long as the
        number of input pairs.
    """
    original_pairs = list(zip(train_queries, clarifying_questions))

    # Positive samples: the untouched pairs, labelled 0.
    samples = [(query, question, 0) for query, question in original_pairs]

    # Candidate negatives: the same questions in shuffled order.
    shuffled = [question for _, question in original_pairs]
    random.shuffle(shuffled)

    negatives = 0
    for query, question in zip(train_queries, shuffled):
        if question in query_to_question[query]:
            # The shuffle landed on a valid question for this query.
            samples.append((query, question, 0))
        else:
            negatives += 1
            samples.append((query, question, 1))
    print("Number of negative samples: ", negatives)
    return samples

In [9]:
# Build the labelled pairs, shuffle them and split 80% / 20% into train / dev.
train_input = data_preprocessing(train_queries, clarifying_questions, query_to_question)
random.shuffle(train_input)
input_len = len(train_input)

# zip(*rows) transposes the (sentence1, sentence2, label) rows into three tuples.
train_s1, train_s2, train_labels = zip(*(train_input[:(input_len*8) // 10]))
dev_s1, dev_s2, dev_labels = zip(*(train_input[(input_len*8) // 10:]))

Number of negative samples:  10661


实现2

In [10]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from sentence_transformers.cross_encoder.evaluation import CEBinaryAccuracyEvaluator, CECorrelationEvaluator, CEBinaryClassificationEvaluator
from sentence_transformers.cross_encoder import CrossEncoder
from torch.utils.data import DataLoader
from datetime import datetime
from transformers import optimization
import torch
import sentence_transformers

# Cross-encoder on top of distilroberta-base with a single output (num_labels=1).
model = CrossEncoder('distilroberta-base', num_labels=1)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

In [11]:
# Seed Python's RNG so the shuffled negatives and the train/dev split are the
# same on every run; unseeded runs produce a different number of negative
# samples each time.
random.seed(42)
train_input = data_preprocessing(train_queries, clarifying_questions, query_to_question)
random.shuffle(train_input)
input_len = len(train_input)

# 80% / 20% train / dev split; zip(*rows) transposes the
# (sentence1, sentence2, label) rows into three tuples.
split_index = (input_len*8) // 10
train_s1, train_s2, train_labels = zip(*(train_input[:split_index]))
dev_s1, dev_s2, dev_labels = zip(*(train_input[split_index:]))

# Wrap each pair in an InputExample; labels are passed as floats.
train_samples = [
    InputExample(texts=[s1, s2], label=float(label))
    for (s1, s2, label) in zip(train_s1, train_s2, train_labels)
]
dev_samples = [
    InputExample(texts=[s1, s2], label=float(label))
    for (s1, s2, label) in zip(dev_s1, dev_s2, dev_labels)
]

Number of negative samples:  10659


In [12]:
# Timestamped output directory so repeated runs do not overwrite each other.
model_save_path = 'result/model1-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)
# No sentence-transformers `losses.*` object is built here: it was never passed
# to fit(), and CrossEncoder.fit takes its loss through the `loss_fct` argument
# (left at its default below), so the old ContrastiveLoss instance had no effect.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples, name='nsp_dev')#dev_s1, dev_s2, dev_labels
model.fit(train_dataloader=train_dataloader,
          optimizer_class=optimization.AdamW,
          optimizer_params={'lr':3e-06},
          epochs=10,
          warmup_steps=1000,
          evaluator=evaluator,
          evaluation_steps=1000,
          save_best_model=True,
          output_path=model_save_path)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

Iteration:   0%|          | 0/430 [00:00<?, ?it/s]

In [13]:
# Re-evaluate the model on the dev samples, this time with the binary-accuracy evaluator.
evaluator = CEBinaryAccuracyEvaluator.from_input_examples(dev_samples, name='nsp_dev')
evaluator(model)

0.9459333488697274

In [14]:
# Initial idea: wrap one more loop around the candidates, i.e. build a list of
# lists so that each prediction call handles one slice (one test query paired
# with every question in the bank).
test_sentences = [
    [[query, question] for question in list(id_to_question.values())]
    for query in test_queries
]
print(test_sentences[0][:4])

[['Tell me about Computers', 'a total cholesterol of 180 to 200 mgdl 10 to 111 mmoll or less is considered best levels between 70 and 189 mgdl 39 and 105 mmoll are most often considered too high'], ['Tell me about Computers', 'about how many years experience do you want the instructor to have'], ['Tell me about Computers', 'according to anima the bible or what other source'], ['Tell me about Computers', 'ae you looking for examples of septic system design']]


In [15]:
# Meant to be merged with the cells below into one loop.
# Reload the cross-encoder from the directory written during training.
model = CrossEncoder(model_save_path)

In [16]:
def writeAnswer(test_queries, test_top_50, filename, data_path="answer"):
    """Write one ``query<TAB>id1,id2,...`` line per test query.

    Args:
        test_queries: Query strings, in output order.
        test_top_50: ``test_top_50[i]`` is the list of question ids (strings)
            selected for ``test_queries[i]``.
        filename: Name of the file to create (an existing file is overwritten).
        data_path: Directory to write into; it must already exist.

    Returns:
        None. An empty ``test_queries`` produces an empty file. A query with
        no ids is written as ``query<TAB>`` so the row keeps its delimiter.

    Raises:
        IndexError: If ``test_top_50`` has fewer entries than ``test_queries``.
    """
    with open(os.path.join(data_path, filename), 'w') as f:
        for i, query in enumerate(test_queries):
            # join avoids repeated string concatenation and never strips the
            # tab when the id list is empty.
            f.write(query + "\t" + ",".join(test_top_50[i]) + "\n")
    return

In [17]:
# Score every candidate question for each test query and keep 50 ids per query.
test_top_50 = []
for sentences in test_sentences:
    top_50 = []
    predictions = model.predict(sentences)
    # argsort is ascending, so this keeps the 50 LOWEST scores. The training
    # data labels matching pairs 0 and mismatches 1, so low scores are
    # presumably the best matches -- TODO confirm against the model output.
    b = np.argsort(predictions)[:50]
    for i in b:
        # Map the candidate's question text back to its question id.
        top_50.append(question_to_id[sentences[i][1]])
    test_top_50.append(top_50)

# predictions = model.predict(test_sentences[0])
# b = np.argsort(predictions)[:50]
# top_50 = []
# for i in b:
#     print(question_to_id[test_sentences[0][i][1]])
#     print(test_sentences[0][i])
#     top_50.append(question_to_id[test_sentences[0][i][1]])
# Persist the ranked ids, one line per test query.
writeAnswer(test_queries, test_top_50, "zsyanswer-epochs10-3e06-batch40.txt")

In [19]:
# Print the question text behind each selected id of the last test query.
for i in test_top_50[-1]:
    print(id_to_question[i])

what information about woman are you looking for
why are you interested in learning more about male menopause
are you interested in the differences between male and female menopause
would you like the evolution of men or women
are you looking for a playlist of mothers days songs
are you male or female
do you want to see images of her
are you looking for lyrics from the songs from the music man
would you like to know about its benefit for a pregnant woman
do you want to listen to songs from the music man
are you looking for a summary of the music man plot
do you want songs that were released on mothers day in a specific year
would you like to know about specific artists related to mothers day
would you like a list of reviews for the film rain man
do you have an artist that you want to hear in mothers day songs
do you want to know about a specific person
do you want to watch the music man film
are you referring to the voyager 1 or voyager 2
are you referring to the person
do you want to 

In [27]:
# Reload a previously written answer file and print every id of its last row
# followed by the corresponding question text.
test_q, test_answers = read_answer("zsyanswer-epochs9-3e06-batch16.txt")
for item in test_answers[-1]:
    print(item)
    print(id_to_question[item])

Q04000
what information about woman are you looking for
Q00429
are you interested in the differences between male and female menopause
Q03113
why are you interested in learning more about male menopause
Q03269
would you like the evolution of men or women
Q00091
are you interested in a particular hair dye or a history of hair dye
Q03988
are you male or female
Q02435
do you want to see images of her
Q00067
are you inquiring about mister rogers the show or mister rogers the television personality
Q03947
do you want to know her name
Q00845
are you looking for general information about teddy bears and are you looking to buy a teddy bear
Q01891
do you want the name of one person
Q02690
were you interested in how much weight people lost
Q01539
do you have any type of asbestos in mind when you ask about the dangers of asbestos
Q03995
male or female
Q03392
would you like to know about its benefit for a pregnant woman
Q01279
are you referring to the person
Q01242
are you referring to roosevelt i

In [173]:
# Report whether TensorFlow can see a GPU.
if __name__ == "__main__":
    # tf.test.is_gpu_available() is deprecated; the supported check is
    # whether list_physical_devices('GPU') returns a non-empty list.
    print(bool(tf.config.list_physical_devices('GPU')))
    print(tf.config.list_physical_devices('GPU'))
    print(tf.test.gpu_device_name())
    #print(device_lib.list_local_devices())

True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
/device:GPU:0


实现1

In [9]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch, gc

# Load the pretrained BERT tokenizer and the next-sentence-prediction model.
# Note: this rebinds the notebook-level name `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Tokenise all training pairs at once; each pair is truncated/padded to 512 tokens.
# NOTE(review): padding='max_length' pads every example to the full 512 tokens,
# which looks wasteful for short questions -- consider dynamic padding; confirm
# before changing.
inputs = tokenizer(train_s1, train_s2, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
# Attach the labels as a column: wrapping the label sequence in a list gives a
# single row, and .T transposes it to one label per row.
inputs['labels'] = torch.LongTensor([train_labels]).T

In [12]:
class MeditationsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict-like batch of tokenizer encodings.

    ``encodings`` must support ``.items()`` and ``['input_ids']``; every value
    is indexable along its first dimension, and all values are expected to
    have the same length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of one example's field."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct UserWarning;
            # clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return ``{field: tensor}`` for the example at position ``idx``."""
        return {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` field."""
        # Key access works for plain dicts as well as objects that also
        # expose the field as an attribute.
        return len(self.encodings['input_ids'])

In [19]:
# Wrap the encodings in a dataset and iterate it in shuffled mini-batches of 4.
dataset = MeditationsDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

In [25]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device =torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move the model's parameters to that device.
model.to(device)

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
from transformers import AdamW

# Switch the model to training mode and create the optimiser over all parameters.
model.train()
optim = AdamW(model.parameters(), lr=5e-6)

In [21]:
# Collect unreachable Python objects, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [22]:
from tqdm import tqdm

# Number of full passes over the training loader.
epochs = 2

for epoch in range(epochs):
    # wrap the dataloader in a tqdm progress bar
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # reset the gradients accumulated in the previous step
        optim.zero_grad()
        # move every tensor the model needs onto the training device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        # forward pass; passing labels makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       labels=labels)
        # extract loss
        loss = outputs.loss
        # backpropagate: compute the gradient of the loss for every trainable parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the epoch and the current batch loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        # NOTE(review): emptying the CUDA cache after every batch is presumably
        # a workaround for memory pressure -- confirm it is still needed.
        torch.cuda.empty_cache()

  """
Epoch 0: 100%|█████████████████████████████████████████████████████████| 4291/4291 [28:30<00:00,  2.51it/s, loss=0.866]
Epoch 1: 100%|█████████████████████████████████████████████████████| 4291/4291 [1:13:06<00:00,  1.02s/it, loss=0.00776]


In [23]:
# Serialise the whole model object to the file 'model1'.
# NOTE(review): this stores the full object rather than just the weights;
# consider model.save_pretrained(...) or saving the state_dict -- confirm how
# the file is loaded later before changing.
torch.save(model, 'model1')

In [22]:
# Score a single dev pair with the fine-tuned next-sentence-prediction model.
# Requires `tokenizer`, `model`, `device`, `dev_s1` and `dev_s2` from earlier
# cells; on a fresh kernel run those cells first.
# Inference mode: disable dropout so the logits are deterministic.
model.eval()
dev_inputs = tokenizer(dev_s1[1], dev_s2[1], return_tensors='pt', max_length=512, truncation=True, padding='max_length').to(device)
# No gradients are needed for a forward-only pass.
with torch.no_grad():
    dev_outputs = model(**dev_inputs)
print(dev_s1[1] + "\n..." + dev_s2[1])
print(dev_outputs.logits)
# torch.softmax requires the dimension to normalise over; the class scores
# sit on the last axis of the logits.
torch.softmax(dev_outputs.logits, dim=-1)

NameError: name 'tokenizer' is not defined

分类问题

In [25]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Paraphrase-detection demo with a BERT model fine-tuned on MRPC.
# Note: this rebinds the notebook-level names `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

# Output index 0 / 1 of the classifier, in this order.
classes = ["negative", "positive"]

sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

# Encoding a sentence pair lets the tokenizer insert the model-specific
# separator tokens and build the attention mask.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")

# The first element of the model output holds the classification logits.
paraphrase_classification_logits = model(paraphrase)[0]
not_paraphrase_classification_logits = model(not_paraphrase)[0]

# Softmax over the class axis, then take the single example of the batch.
paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
print(paraphrase_results)
print(not_paraphrase_results)

# Should be paraphrase
for i, class_name in enumerate(classes):
    percentage = int(round(paraphrase_results[i] * 100))
    print(f"{class_name}: {percentage}%")

# Should not be paraphrase
for i, class_name in enumerate(classes):
    percentage = int(round(not_paraphrase_results[i] * 100))
    print(f"{class_name}: {percentage}%")

Some layers from the model checkpoint at bert-base-cased-finetuned-mrpc were not used when initializing TFBertForSequenceClassification: ['dropout_183']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at bert-base-cased-finetuned-mrpc.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


[0.09536304 0.904637  ]
[0.94038326 0.05961675]
negative: 10%
positive: 90%
negative: 94%
positive: 6%
