In [1]:
from bert import *

print("Loading BERT...")
# `trained` records whether a fine-tuned checkpoint was found in ./results.
# The training cell reads it to decide between trainer.train() and
# trainer.evaluate().
trained = False
try:
    model = BertForSequenceClassification.from_pretrained("./results")
    # The checkpoint loaded successfully, so there is nothing left to train.
    trained = True
except (OSError, ValueError):
    # No usable checkpoint in ./results: start from the public base weights
    # with a fresh 10-way classification head. Only "cannot load" errors are
    # caught so that KeyboardInterrupt and genuine bugs still surface.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=10
    )
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Loading BERT...


In [2]:
from collections import Counter

# Load the corpus. Per the original note, Dataset splits it into 0.8 train and
# 0.2 test; the split itself happens inside Dataset, not in this cell.
raw_data = Dataset("./train.txt")

# Class distribution before the optional oversampling step.
count = Counter(label[0] for label in raw_data.label_train)
print(count)

# Optional oversampling of two classes (currently disabled):
# for idx in range(len(raw_data.label_train)):
#     if raw_data.label_train[idx][0] in {"Content-Container","Instrument-Agency"}:
#         raw_data.data_train.append(raw_data.data_train[idx])
#         raw_data.label_train.append(raw_data.label_train[idx])

# Class distribution after the optional oversampling step.
count = Counter(label[0] for label in raw_data.label_train)
print(count)

# Tokenise both splits; each split is padded and truncated independently.
train_encodings = tokenizer(raw_data.data_train, truncation=True, padding=True)
test_encodings = tokenizer(raw_data.data_test, truncation=True, padding=True)

# Each label entry is indexed with [0] to obtain the relation name.
train_relations = [label[0] for label in raw_data.label_train]
test_relations = [label[0] for label in raw_data.label_test]

# Fit the encoder on the training labels only, then reuse that mapping for the
# test labels so both splits share the same integer ids.
label_encoder = LabelEncoder()
train_label_ids = label_encoder.fit_transform(train_relations)
test_label_ids = label_encoder.transform(test_relations)

train_dataset = RelationExtractionDataset(train_encodings, train_label_ids)
test_dataset = RelationExtractionDataset(test_encodings, test_label_ids)

training_args = TrainingArguments(
    # Output locations
    output_dir="./results",  # checkpoints
    logging_dir="./logs",  # training logs
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    # Optimisation
    warmup_steps=500,  # learning-rate warmup steps
    weight_decay=0.01,
    # Evaluation on a step schedule; the best model is reloaded at the end
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

# Bundle the model, the hyper-parameters and both splits into one Trainer.
# NOTE(review): the test split is also the evaluation set, so with
# load_best_model_at_end=True the checkpoint choice presumably sees test data.
# Confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Counter({'Other': 929, 'Entity-Destination': 929, 'Component-Whole': 929, 'Member-Collection': 929, 'Instrument-Agency': 929, 'Entity-Origin': 929, 'Product-Producer': 929, 'Message-Topic': 929, 'Cause-Effect': 929, 'Content-Container': 929})
Counter({'Other': 929, 'Entity-Destination': 929, 'Component-Whole': 929, 'Member-Collection': 929, 'Instrument-Agency': 929, 'Entity-Origin': 929, 'Product-Producer': 929, 'Message-Topic': 929, 'Cause-Effect': 929, 'Content-Container': 929})


In [3]:
# `trained` is set by the model-loading cell: evaluate when it is truthy,
# otherwise fine-tune the model.
if trained:
    trainer.evaluate()
else:
    trainer.train()

 14%|█▍        | 500/3486 [00:48<04:51, 10.25it/s]
  0%|          | 0/160 [00:00<?, ?it/s][A
  7%|▋         | 11/160 [00:00<00:01, 104.05it/s][A{'loss': 0.14686508178710939, 'learning_rate': 5e-05, 'epoch': 0.43029259896729777}

 13%|█▎        | 21/160 [00:00<00:01, 100.11it/s][A
 19%|█▉        | 31/160 [00:00<00:01, 98.10it/s] [A
 26%|██▌       | 41/160 [00:00<00:01, 97.02it/s][A
 32%|███▏      | 51/160 [00:00<00:01, 96.00it/s][A
 38%|███▊      | 61/160 [00:00<00:01, 95.58it/s][A
 44%|████▍     | 71/160 [00:00<00:00, 95.28it/s][A
 51%|█████     | 81/160 [00:00<00:00, 95.07it/s][A
 57%|█████▋    | 91/160 [00:00<00:00, 94.66it/s][A
 63%|██████▎   | 101/160 [00:01<00:00, 94.37it/s][A
 69%|██████▉   | 111/160 [00:01<00:00, 94.17it/s][A
 76%|███████▌  | 121/160 [00:01<00:00, 94.30it/s][A
 82%|████████▏ | 131/160 [00:01<00:00, 94.12it/s][A
 88%|████████▊ | 141/160 [00:01<00:00, 94.26it/s][A

 14%|█▍        | 500/3486 [00:50<04:51, 10.25it/s]
100%|██████████| 160/160 [00:01<00

In [13]:
# Evaluate on the Trainer's eval_dataset; as the last bare expression in the
# cell, the returned metrics are displayed as the cell output.
# NOTE(review): this cell's execution count (13) is out of sequence with its
# neighbours. Re-run the notebook top to bottom to confirm the reported metrics.
trainer.evaluate()


  0%|          | 0/160 [00:00<?, ?it/s][A
  5%|▌         | 8/160 [00:00<00:01, 79.10it/s][A
 10%|█         | 16/160 [00:00<00:01, 77.14it/s][A
 15%|█▌        | 24/160 [00:00<00:01, 77.05it/s][A
 20%|██        | 32/160 [00:00<00:01, 76.99it/s][A
 25%|██▌       | 40/160 [00:00<00:01, 76.98it/s][A
 30%|███       | 48/160 [00:00<00:01, 76.97it/s][A
 35%|███▌      | 56/160 [00:00<00:01, 76.96it/s][A
 40%|████      | 64/160 [00:00<00:01, 77.08it/s][A
 45%|████▌     | 72/160 [00:00<00:01, 77.11it/s][A
 50%|█████     | 80/160 [00:01<00:01, 77.06it/s][A
 55%|█████▌    | 88/160 [00:01<00:00, 76.81it/s][A
 60%|██████    | 96/160 [00:01<00:00, 77.04it/s][A
 65%|██████▌   | 104/160 [00:01<00:00, 77.22it/s][A
 70%|███████   | 112/160 [00:01<00:00, 77.37it/s][A
 75%|███████▌  | 120/160 [00:01<00:00, 77.24it/s][A
 80%|████████  | 128/160 [00:01<00:00, 77.58it/s][A
 85%|████████▌ | 136/160 [00:01<00:00, 76.85it/s][A
 90%|█████████ | 144/160 [00:01<00:00, 76.74it/s][A
 95%|█████████▌

{'eval_loss': 1.5048869848251343, 'epoch': 8.0}

In [14]:
# Persist the current model. No path is passed, so it presumably goes to
# training_args.output_dir ("./results"), which is where the model-loading
# cell looks for a checkpoint. Verify against the Trainer docs.
trainer.save_model()

Output result on test.txt

In [4]:
# Accuracy on the test split

# Element [0] of the prediction output is used as the raw class scores.
raw_score = trainer.predict(test_dataset, ["labels"])[0]
score = torch.softmax(torch.tensor(raw_score), 1, torch.float32)

# One predicted class id per example.
label_ids = [row.argmax() for row in score]

# Count the positions where the prediction matches the encoded true label.
correct_num = sum(
    1 for predicted, expected in zip(label_ids, test_label_ids) if predicted == expected
)
total = len(test_label_ids)
print(f"\nCorrect: {correct_num} of {total}, {correct_num / total}")

 94%|█████████▍| 151/160 [00:01<00:00, 94.14it/s]
Correct: 620 of 640, 0.96875


In [5]:
def print_matrix(real_label, predict_label, num_classes=10):
    """Build a row-normalised confusion matrix.

    Despite the name, nothing is printed; the matrix is returned.

    Args:
        real_label: sequence of true class ids (ints in ``[0, num_classes)``).
        predict_label: sequence of predicted class ids, indexed in parallel
            with ``real_label``.
        num_classes: number of classes, i.e. the side length of the matrix.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        A ``(num_classes, num_classes)`` float array. Entry ``[r][p]`` is the
        fraction of samples with true class ``r`` that were predicted as
        ``p``. Rows for classes with no true samples are all zeros.
    """
    # numpy.float was removed in NumPy 1.24, so use the explicit float64 dtype.
    ret = numpy.zeros((num_classes, num_classes), dtype=numpy.float64)
    for i, real in enumerate(real_label):
        ret[real][predict_label[i]] += 1
    ret_sum = ret.sum(axis=1, keepdims=True)
    # Divide each row by its total, skipping empty rows so they stay 0 rather
    # than becoming nan/inf.
    return numpy.divide(ret, ret_sum, out=numpy.zeros_like(ret), where=ret_sum != 0)
# Row-normalised confusion matrix for the test split.
m = print_matrix(test_label_ids.tolist(), [t.item() for t in label_ids])

# Class names in encoder order, i.e. the row/column order of the matrix.
print(label_encoder.inverse_transform(list(range(10))))

# One matrix row per output line, every cell followed by ", ".
for row in m:
    print("".join("%.2f, " % (cell) for cell in row))

['Cause-Effect' 'Component-Whole' 'Content-Container' 'Entity-Destination'
 'Entity-Origin' 'Instrument-Agency' 'Member-Collection' 'Message-Topic'
 'Other' 'Product-Producer']
0.97, 0.00, 0.00, 0.00, 0.02, 0.00, 0.00, 0.00, 0.02, 0.00, 
0.01, 0.96, 0.00, 0.00, 0.00, 0.00, 0.00, 0.02, 0.01, 0.00, 
0.00, 0.00, 0.97, 0.00, 0.00, 0.00, 0.00, 0.00, 0.03, 0.00, 
0.00, 0.00, 0.00, 0.99, 0.00, 0.00, 0.00, 0.00, 0.01, 0.00, 
0.00, 0.00, 0.00, 0.00, 0.97, 0.00, 0.00, 0.00, 0.01, 0.01, 
0.00, 0.00, 0.00, 0.00, 0.00, 0.97, 0.00, 0.00, 0.03, 0.00, 
0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.98, 0.00, 0.02, 0.00, 
0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 
0.01, 0.00, 0.00, 0.00, 0.00, 0.02, 0.02, 0.01, 0.93, 0.01, 
0.00, 0.00, 0.00, 0.00, 0.00, 0.02, 0.00, 0.00, 0.00, 0.98, 
100%|██████████| 160/160 [00:12<00:00, 94.14it/s]

In [6]:
# Read the sentences to classify. Each line is expected to hold the sentence
# wrapped in double quotes after some prefix; only the quoted text is kept.
text = []
with open("./test.txt") as test_file:  # closed even if parsing fails
    for line in test_file:
        line = line.rstrip("\r\n")
        if not line:
            continue  # tolerate blank lines (e.g. a trailing empty line)
        quote_index = line.index('"')
        sentence = line[quote_index + 1 :]
        # Drop the closing quote explicitly instead of slicing a fixed two
        # characters, so a last line without a newline is not cut short.
        if sentence.endswith('"'):
            sentence = sentence[:-1]
        text.append(sentence)

# Tokenise and wrap in the dataset type; the zero labels are placeholders
# because the true labels of test.txt are unknown.
text_batch = tokenizer(text, truncation=True, padding=True)
test_batch = RelationExtractionDataset(text_batch, torch.zeros(len(text)))
raw_score = trainer.predict(test_batch, ["labels"])[0]
score = torch.softmax(torch.tensor(raw_score), 1, torch.float32)

559it [00:21, 91.38it/s]

In [7]:
# Take the highest-scoring class id per row of `score`, then map the ids back
# to relation names with the fitted encoder.
label_ids = list(map(torch.argmax, score))
labels = label_encoder.inverse_transform(label_ids)

In [8]:
# Write one predicted relation name per line. The context manager closes the
# file even if writing raises.
with open("./output.txt", "w") as output_file:
    output_file.writelines(line + "\n" for line in labels)

In [11]:
# Per-class sample counts of the training labels as they currently stand in
# the kernel. Relies on Counter having been imported by an earlier cell.
data_statistics = Counter(label[0] for label in raw_data.label_train)
print(data_statistics)

Counter({'Instrument-Agency': 2052, 'Content-Container': 2052, 'Component-Whole': 1026, 'Other': 1026, 'Member-Collection': 1026, 'Cause-Effect': 1026, 'Entity-Destination': 1026, 'Message-Topic': 1026, 'Product-Producer': 1026, 'Entity-Origin': 1026})
