In [0]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

In [0]:
# Install the package that provides the `bert` modules imported in the next cell.
!pip install bert-tensorflow

In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [0]:
# Set the output directory where the model files are saved.

OUTPUT_DIR = 'OUTPUT_DIR_NAME'
DO_DELETE = True  # When True, remove an existing OUTPUT_DIR before recreating it.
USE_BUCKET = False  # When True, OUTPUT_DIR is rewritten to gs://BUCKET/OUTPUT_DIR.
BUCKET = 'BUCKET_NAME'

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  # Only needed for bucket access, hence imported inside this branch.
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Nothing to delete (presumably the directory did not exist yet).
    pass
  except tf.errors.OpError as err:
    # Deletion stays best-effort, but an unexpected failure is no longer hidden
    # and unrelated exceptions (e.g. KeyboardInterrupt) are not swallowed.
    print('Could not delete {}: {}'.format(OUTPUT_DIR, err))
tf.gfile.MakeDirs(OUTPUT_DIR)
print(' 输出目录为: {} '.format(OUTPUT_DIR))

In [0]:
from tensorflow import keras
import os
import re

# Load the GDELT news model data files fetched from GitHub.
def load_directory_data(directory):
  """Read every file in `directory` into a DataFrame.

  Args:
    directory: Path of a folder; each entry in it is opened and read as text.

  Returns:
    A DataFrame with one row per file and two columns: "sentence" (the whole
    file content) and "sentiment" (the second group of digits of a file name
    shaped like "<digits>--<digits>.txt", as a string, or None when the name
    does not start with that pattern).
  """
  # Raw string so that "\d" is a regex class rather than a string escape.
  name_pattern = re.compile(r"\d+--(\d+)\.txt")
  sentences = []
  sentiments = []
  # Sorted so that the row order does not depend on the directory listing order.
  for file_name in sorted(os.listdir(directory)):
    with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
      sentences.append(f.read())
    matched = name_pattern.match(file_name)
    # Keep the captured digits themselves, not the re.Match object.
    sentiments.append(matched.group(1) if matched else None)
  return pd.DataFrame({"sentence": sentences, "sentiment": sentiments})

# Load the positive and the negative examples.
def load_dataset(directory):
  """Combine the "pos" and "neg" sub-folders of `directory` into one frame.

  Rows read from "pos" get polarity 1 and rows read from "neg" get polarity 0.
  The concatenated rows are shuffled and re-indexed from zero.
  """
  labelled_frames = []
  for subfolder, polarity in (("pos", 1), ("neg", 0)):
    frame = load_directory_data(os.path.join(directory, subfolder))
    frame["polarity"] = polarity
    labelled_frames.append(frame)

  # sample(frac=1) returns every row in random order.
  shuffled = pd.concat(labelled_frames).sample(frac=1)
  return shuffled.reset_index(drop=True)

# Download location of the data set.
def download_and_load_datasets(force_download=False):
  """Fetch the chinaNews archive and load its train and test splits.

  Args:
    force_download: Accepted by the signature but not read anywhere in this
      function.

  Returns:
    A `(train_df, test_df)` tuple built by `load_dataset` from the
    "chinaNews/train" and "chinaNews/test" folders that sit next to the path
    returned by `tf.keras.utils.get_file`.
  """
  archive_path = tf.keras.utils.get_file(
      fname="chinaNews.zip",
      origin="https://raw.githubusercontent.com/sunlizhuang/GDELT_chinaNews/master/chinaNews.zip",
      extract=True)
  extracted_root = os.path.join(os.path.dirname(archive_path), "chinaNews")

  train_df, test_df = [
      load_dataset(os.path.join(extracted_root, split_name))
      for split_name in ("train", "test")
  ]
  return train_df, test_df

In [0]:
# Fetch the data set and load the train and test DataFrames.
train, test = download_and_load_datasets()

In [0]:
# Draw up to SAMPLE_SIZE random rows from each split. DataFrame.sample raises
# a ValueError when asked for more rows than the frame holds (without
# replacement), so the request is clamped to the frame length.
SAMPLE_SIZE = 11998
train = train.sample(min(SAMPLE_SIZE, len(train)))
test = test.sample(min(SAMPLE_SIZE, len(test)))

In [0]:
# Display the column names of the training frame.
train.columns

In [0]:
# Names of the columns holding the input text and the class label.
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# 0 denotes a negative sentiment, 1 denotes a positive sentiment.
label_list = [0, 1]

In [0]:
def _row_to_input_example(row):
  """Wrap one DataFrame row as a single-text InputExample (no guid, no text_b)."""
  return bert.run_classifier.InputExample(
      guid=None,
      text_a=row[DATA_COLUMN],
      text_b=None,
      label=row[LABEL_COLUMN])

# Both splits go through the same conversion; axis=1 passes one row at a time.
train_InputExamples = train.apply(_row_to_input_example, axis=1)
test_InputExamples = test.apply(_row_to_input_example, axis=1)

In [0]:
# Load the pre-trained model: handle of the hub module used by the tokenizer and by create_model.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the hub module's "tokenization_info" signature.

  The vocabulary file and the lower-casing flag are evaluated in a throw-away
  graph and session, then handed to `bert.tokenization.FullTokenizer`.
  """
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    with tf.Session() as session:
      fetched = session.run([info["vocab_file"], info["do_lower_case"]])

  vocab_path, lower_case = fetched
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

# Tokenizer shared by the feature conversion cell and by predictNews.
tokenizer = create_tokenizer_from_hub_module()

In [0]:
# Maximum sequence length, set to 400.
MAX_SEQ_LENGTH = 400
# Convert the training and test examples into the input features that BERT accepts.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Create the BERT news sentiment classification model.

  Args:
    is_predicting: When true, no loss is built and only predictions are
      returned.
    input_ids: Fed to the hub module's "tokens" signature as `input_ids`.
    input_mask: Fed to the hub module's "tokens" signature as `input_mask`.
    segment_ids: Fed to the hub module's "tokens" signature as `segment_ids`.
    labels: Integer class ids; one-hot encoded to compute the loss.
    num_labels: Number of classes (size of the classification layer).

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
    `(loss, predicted_labels, log_probs)`.
  """
  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Classification layer applied to the pooled output.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting. It is skipped when predicting so that
    # predictions are not randomly perturbed.
    # NOTE(review): evaluation still passes through dropout, because this
    # function only knows `is_predicting`, not whether it is training.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the last (class) axis already removes that axis. The former
    # tf.squeeze additionally dropped a leading dimension of size one, turning
    # the result for a single example into a scalar.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # When predicting, only the labels and the log-probabilities are needed.
    if is_predicting:
      return (predicted_labels, log_probs)

    # One-hot encode the labels.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # Cross-entropy between the predictions and the actual labels.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

In [0]:
# Build the model function used by the Estimator.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Return a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: Number of classes, forwarded to `create_model`.
    learning_rate: Forwarded to `bert.optimization.create_optimizer`.
    num_train_steps: Forwarded to `bert.optimization.create_optimizer`.
    num_warmup_steps: Forwarded to `bert.optimization.create_optimizer`.

  Returns:
    A function `model_fn(features, labels, mode, params)` that returns a
    `tf.estimator.EstimatorSpec` for the given mode.
  """

  def metric_fn(label_ids, predicted_labels):
    """Build the evaluation metric ops comparing labels with predictions."""
    return {
        "accuracy": tf.metrics.accuracy(label_ids, predicted_labels),
        "f1_score": tf.contrib.metrics.f1_score(label_ids, predicted_labels),
        # Previously computed but never reported.
        "auc": tf.metrics.auc(label_ids, predicted_labels),
        "precision": tf.metrics.precision(label_ids, predicted_labels),
        "recall": tf.metrics.recall(label_ids, predicted_labels),
        "true_positives": tf.metrics.true_positives(
            label_ids, predicted_labels),
        "true_negatives": tf.metrics.true_negatives(
            label_ids, predicted_labels),
        "false_positives": tf.metrics.false_positives(
            label_ids, predicted_labels),
        "false_negatives": tf.metrics.false_negatives(
            label_ids, predicted_labels),
    }

  def model_fn(features, labels, mode, params):
    """Build the graph for one mode.

    `labels` and `params` are not read; the label ids are taken from
    `features["label_ids"]`.
    """
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    if is_predicting:
      (predicted_labels, log_probs) = create_model(
          is_predicting, input_ids, input_mask, segment_ids, label_ids,
          num_labels)

      predictions = {
          # Despite the key name, these are the log-probabilities returned by
          # create_model.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Training and evaluation share the model and the loss.
    (loss, predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids,
        num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer is only built when it is actually going to be run.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps,
          use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op)

    # Evaluation: report the performance metrics.
    eval_metrics = metric_fn(label_ids, predicted_labels)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn

In [0]:
# Training hyperparameters.
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Fraction of the training steps used as warm-up steps.
WARMUP_PROPORTION = 0.1
# Intervals (in steps) passed to the Estimator's RunConfig.
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [0]:
# Compute the number of training steps and how many of them are warm-up steps.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [0]:
# Configure where and how often checkpoints and summaries are saved.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [0]:
# Build the model function for the two-class problem and wrap it in an Estimator.
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

In [0]:
# Input function feeding the training features to the Estimator.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [0]:
# Train the model and report how long it took.
print('开始训练!')  # Plain literal: the former f-string had no placeholders.
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("耗时 ", datetime.now() - current_time)

In [0]:
# Input function feeding the test features to the Estimator (not in training mode).
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [0]:
# Evaluate on the test features; the returned metrics are shown as the cell output.
estimator.evaluate(input_fn=test_input_fn, steps=None)

In [0]:
def predictNews(in_sentences):
  """Classify each text in `in_sentences` with the trained estimator.

  Args:
    in_sentences: Sequence of strings to classify.

  Returns:
    A list of `(sentence, prediction['probabilities'], label_name)` tuples in
    input order, where `label_name` is "Negative" for label 0 and "Positive"
    for label 1.
  """
  class_names = ["Negative", "Positive"]

  # guid="" and label=0 are dummy values used only to build the InputExample.
  examples = []
  for sentence in in_sentences:
    examples.append(
        run_classifier.InputExample(
            guid="", text_a=sentence, text_b=None, label=0))

  features = run_classifier.convert_examples_to_features(
      examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(
      features=features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)

  results = []
  for sentence, prediction in zip(in_sentences,
                                  estimator.predict(predict_input_fn)):
    label_name = class_names[prediction['labels']]
    results.append((sentence, prediction['probabilities'], label_name))
  return results

In [0]:
# Sample input for predictNews: a list holding one English news article (text kept verbatim).
News = [
  '''UN Chief Should Lead by Example on Human Rights
Louis Charbonneau is United Nations Director, Human Rights Watch UNITED NATIONS,
Feb 25 2020 (IPS) - United Nations Secretary-General Antonio Guterres has long needed
to overhaul his approach to human rights. Hopefully his call to action announced in Geneva 
yesterday is the start of something new.Guterres’ low-key approach to human rights may 
have been calculated to avoid conflicts with big powers like the United States, Russia, 
China, and Saudi Arabia. But human rights groups and former senior UN officials have criticized 
it for being ineffectual.The secretary-general’s new initiative contains some excellent ideas. 
The link he makes between human rights and the impacts of climate change is crucial, and those 
who fight to protect the environment are increasingly at risk.Forest defenders in Brazil 
and elsewhere are threatened, attacked, and killed by those who seek to benefit from the 
forests’ destruction. And Guterres is right to highlight the risks posed by new technologies, 
whether it involves government surveillance, artificial intelligence, or fully autonomous weapons, 
so-called “killer robots.”The test for any initiative is the implementation. No one is suggesting 
the secretary-general do everything alone. But he needs to lead by example.That means publicly 
calling out rights abusers and advocating for victims. Human rights violations aren’t like 
natural disasters.They are frequently planned and executed by government officials or their 
agents – whether it’s the mass arbitrary detention of Uyghurs in China, Myanmar’s ethnic 
cleansing campaign against Rohingya Muslims, indiscriminate Russian-Syrian bombing of 
civilians in Idlib, or the forced separation of children from their parents at the US border.
It also means using the authority of the secretary-general’s office to launch investigations 
and fact-finding missions when appropriate. That includes launching an inquiry into China’s 
massive rights violations in Xinjiang, and pressing for an international accountability 
mechanism on Sri Lanka.The secretary-general should order a follow-up inquiry into the murder 
of Washington Post columnist Jamal Khashoggi to help determine whether Saudi Arabia’s top leadership 
ordered his slaying. He should also publicly release the findings of his inquiry into attacks on 
hospitals and other protected facilities in Syria, likely carried out by the Russian-Syrian alliance.
None of this is to say Guterres should abandon “private diplomacy” with governments. But he should 
re-emphasize public diplomacy on human rights at the UN. Human rights advocacy shouldn’t be the sole 
responsibility of High Commissioner for Human Rights Michelle Bachelet and her office.The secretary-general
should be the UN’s leading voice on human rights, not only working in the background.
Secretary-General Guterres has issued a call to action on human rights. Now it’s up to him to act.''',
]

In [0]:
# Classify the sample article(s) defined in News.
predictions = predictNews(News)

In [0]:
# Show the (sentence, probabilities, label) tuples returned by predictNews.
print(predictions)