<a href="https://colab.research.google.com/github/thanit456/NeuroSummary/blob/two/headline_classification_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load preprocessed data

In [1]:
import tensorflow as tf
tf.__version__

'2.2.0-rc4'

In [0]:
import pickle
import numpy as np
import tensorflow.keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tqdm

THAIGOV_PATH = '/content/drive/Shared drives/NeuroSummary/data/b_data_playground/thai_gov_split/'

with open(THAIGOV_PATH + 'train.pkl', 'rb') as f:
  train_df = pickle.load(f)

with open(THAIGOV_PATH + 'val.pkl', 'rb') as f:
  val_df = pickle.load(f)

with open(THAIGOV_PATH + 'test.pkl', 'rb') as f:
  test_df = pickle.load(f)

In [0]:
with open(THAIGOV_PATH + 'train_stop.pkl', 'rb') as f:
  train_stop_df = pickle.load(f)

with open(THAIGOV_PATH + 'val_stop.pkl', 'rb') as f:
  val_stop_df = pickle.load(f)

with open(THAIGOV_PATH + 'test_stop.pkl', 'rb') as f:
  test_stop_df = pickle.load(f)

In [0]:
def create_label(df_headline, df_content):
  labels = []
  for idx in tqdm.tqdm(range(len(df_content))):
    tmp = []
    for word in df_content[idx]:
      if word in df_headline.iloc[idx]:
        tmp.append(1)
      else:
        tmp.append(0)
    labels.append(tmp)
  return labels

In [5]:
train_df.head()

Unnamed: 0,content,headline,class
8608,"[กรมเจ้าท่า, จท., กระทรวงคมนาคม, อาศัย, อำนาจ,...","[กรมเจ้าท่า, ปรับปรุง, อัตรา, ค่า, โดยสาร, เรื...",ด้านเศรษฐกิจ
8593,"[รอง, นรม, พล.อ.ฉัตรชัยฯ, เป็น, ประธาน, การ, ป...","[รอง, นรม, พล.อ.ฉัตรชัยฯ, เป็น, ประธาน, การ, ป...",ข่าวทำเนียบรัฐบาล
10488,"[พลเอก สุรศักดิ์ กาญจนรัตน์, รัฐมนตรี, ว่าการ,...","[รมว., ทส., เปิด, งาน, ประชารัฐร่วมใจ, คน, ลำ,...",ด้านความมั่นคง
12624,"[เมื่อ, วัน, ที่, 5, กุมภาพันธ์, 2561, เวลา, 1...","[นายก, รัฐมนตรี, ลง, พื้นที่, เยี่ยมชม, วิถี, ...",ด้านเศรษฐกิจ
17242,"[กระทรวงดิจิทัลฯ, กระทรวงดิจิทัล, ฯ, สนับสนุน,...","[กระทรวงดิจิทัลฯ, หนุน, สดช., จับ, มือ, ทีโอที...",ด้านสังคม


# Create dictionary

In [0]:
train_set = train_stop_df 
val_set = val_stop_df
test_set = test_stop_df

In [7]:
## ! use only n first words for headline generation
def use_firt_n_words(df_content, n):
  new_ls = []
  for content in df_content:
    new_ls.append(content[:n])
  return new_ls

words_300_train_contents = use_firt_n_words(train_set['content'], n=300)
words_300_val_contents = use_firt_n_words(val_set['content'], n=300)
words_300_test_contents = use_firt_n_words(test_set['content'], n=300)

train_labels = create_label(train_set['headline'],  words_300_train_contents)
val_labels = create_label(val_set['headline'],  words_300_val_contents)
test_labels = create_label(test_set['headline'],  words_300_test_contents)

100%|██████████| 10632/10632 [00:24<00:00, 436.92it/s]
100%|██████████| 2619/2619 [00:05<00:00, 462.58it/s]
100%|██████████| 2619/2619 [00:05<00:00, 443.60it/s]


In [8]:
import collections
def create_index(input_data,threshold):
    input_text = [data for data in input_data]
    # counts of word type has to be above or equal threshold
    words = [word for sublist in input_text for word in sublist]
    print("words :",words)
    word_count_all = list()
    word_count = list()
    #use set and len to get the number of unique words
    word_count_all.extend(collections.Counter(words).most_common(len(set(words))))
    unkcnt = 0
    for (word,cnt) in word_count_all:
      if cnt >= threshold:
        word_count.append((word,cnt))
      else:
        unkcnt+=cnt
    #include a token for unknown word
    word_count.append(("UNK",unkcnt))
    #print out 10 most frequent words
    print("top 10: ",word_count[:10])
    print("bottom 10: ",word_count[-10:])
    dictionary = dict()
    dictionary["for_keras_zero_padding"] = 0
    for word in word_count:
      dictionary[word[0]] = len(dictionary)
    dictionary['<s>'] = len(dictionary) 
    dictionary['</s>'] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    
    return dictionary, reverse_dictionary
dict_t, rev_dict_t = create_index(input_data=words_300_train_contents,threshold=0)
print('Vocab size (Content): ',len(dict_t))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



top 10:  [('งาน', 36859), ('.', 24363), ('รัฐมนตรี', 21161), ('พัฒนา', 19106), ('ปี', 18514), ('ทำ', 17615), ('ประเทศ', 15572), ('คน', 14600), ('-', 14314), ('ไทย', 14233)]
bottom 10:  [('ดั้ง', 1), ('ประเทศไทยก', 1), ('พระบาทสมเด็จพระปรมินทรมหาภูมิ พลอดุลยเดช', 1), ('พระบรมราชโชวาท', 1), ('พระบามสมเด็จพระรัฐมนตรี', 1), ('เตารีด', 1), ('นาบ', 1), ('คณะวิศวะ', 1), ('มุมา', 1), ('UNK', 0)]
Vocab size (Content):  65244


In [10]:
dict_t['<s>']
list(dict_t.values())[-5:]

[65239, 65240, 65241, 65242, 65243]

In [0]:
def word_to_idx(input_data,dictionary) :
  X = list()
  for data in input_data:
    sub_data = []
    for word in data:
      if word in dictionary:
        sub_data.append(dictionary[word])
      else:
        sub_data.append(dictionary["UNK"])
    X.append(sub_data)
  return np.array(X)

In [0]:
X_train = word_to_idx(words_300_train_contents,dict_t)
y_train = train_labels
X_val = word_to_idx(words_300_val_contents,dict_t)
y_val = val_labels
X_test = word_to_idx(words_300_test_contents,dict_t)
y_test = test_labels

# Modelling

In [0]:
maxlen = 300
maxlen_output = 300
vocab_size = len(dict_t)
# output_vocab_size = len(dict_t)
m=15000
Tx=maxlen
Ty=maxlen_output

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply,Add,Conv1D,GRU,TimeDistributed
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding,concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping, TensorBoard
import tensorflow.keras.backend as K

import numpy as np
from datetime import datetime

In [0]:
# callbacks
curr_datetime = datetime.now().strftime("%Y%m%d-%H%M%S")

drive_path = '/content/drive/My Drive'
checkpoint_path=  drive_path + '/headline_thaigov_classification/weights/' + curr_datetime + "_weights-improvement-{epoch:02d}-{val_loss:.2f}.hdf5"


callback_list = [     
      ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min'), 
      EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100),
      # ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=100, min_lr=1e-10),
]

In [0]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import TimeDistributed
def extractive_classifcation_model():
  inputs = Input(shape=(maxlen, ))
  x = Embedding(vocab_size, maxlen, trainable=True)(inputs)
  x = LSTM(maxlen, return_sequences=True)(x)
  x = LSTM(maxlen, return_sequences=True)(x) 
  outputs = TimeDistributed(Dense(1, activation='sigmoid'))(x)
  model = Model(inputs=inputs, outputs=outputs)
  model.compile(optimizer=RMSprop(lr=1e-5), loss="binary_crossentropy", metrics=["accuracy"])
  model.summary()
  return model

In [19]:
extractive_model = extractive_classifcation_model()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 300)          19573200  
_________________________________________________________________
lstm (LSTM)                  (None, 300, 300)          721200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 300, 300)          721200    
_________________________________________________________________
time_distributed (TimeDistri (None, 300, 1)            301       
Total params: 21,015,901
Trainable params: 21,015,901
Non-trainable params: 0
_________________________________________________________________


In [0]:
padded_X_train = np.array(pad_sequences(X_train, maxlen=maxlen, padding='post'))
padded_X_val = np.array(pad_sequences(X_val, maxlen=maxlen, padding='post'))
padded_X_test = np.array(pad_sequences(X_test, maxlen=maxlen, padding='post'))

padded_y_train = np.array(pad_sequences(y_train, maxlen=maxlen_output, padding='post'))
padded_y_val = np.array(pad_sequences(y_val, maxlen=maxlen_output, padding='post'))
padded_y_test = np.array(pad_sequences(y_test, maxlen=maxlen_output, padding='post'))

reshaped_y_train = padded_y_train.reshape(-1, maxlen_output, 1)
reshaped_y_val = padded_y_val.reshape(-1, maxlen_output, 1)
reshaped_y_test = padded_y_test.reshape(-1, maxlen_output, 1)


In [21]:
padded_X_train[:5]

array([[ 264,   20,    3, ...,  432,   20,    3],
       [7587,   13,  100, ...,   55,   16,  624],
       [  87,   77, 2260, ...,    0,    0,    0],
       [ 209,   62,  371, ...,    0,    0,    0],
       [  35,  405,   26, ...,  329,   25,   82]], dtype=int32)

In [22]:
padded_X_val[:5]

array([[  264,    20,     3, ...,   811,    55,   807],
       [ 2309,    93,  8147, ...,    82, 10272,  8147],
       [  431,   529,   163, ...,   167,   108,     7],
       [  485,   369,    23, ..., 16251, 29276,   508],
       [  485,   886,   315, ...,     0,     0,     0]], dtype=int32)

In [23]:
extractive_model

<tensorflow.python.keras.engine.training.Model at 0x7f6221067a20>

In [0]:
extractive_model.load_weights('/content/drive/My Drive/headline_thaigov_classification/weights/20200509-210429_weights-improvement-141-0.31.hdf5')
# K.clear_session()
# extractive_model.fit(padded_X_train, reshaped_y_train, batch_size=1024, validation_data=(padded_X_val, reshaped_y_val), epochs=1000,
#                      callbacks=callback_list)
# extractive_model.fit(, y_train, batch_size = 32,epochs = 1)
# extractive_model.fit_generator(generator=training_generator, validation_data=validation_generator, epochs=10)

In [25]:
extractive_model.evaluate(padded_X_test, reshaped_y_test)



[0.32329538464546204, 0.8733994960784912]

In [0]:
y_pred = extractive_model.predict(padded_X_test)
y_pred = np.where(y_pred >= 0.5, 1, 0)

In [0]:
# test_df.head()
first_300_words = []
for i in test_df['content']:
  first_300_words.append([i[:300]])
test_df['first_300_words_content'] = first_300_words

In [44]:
y_pred = y_pred.reshape()

(2619, 300, 1)

In [45]:
def extractive_text(y_pred, chunk_text):
  ls = []
  for i in tqdm.tqdm(range(len(chunk_text))):
    tmp = []
    for j in range(len(chunk_text[i][0])):
      if y_pred[i][j] > 0:
        tmp.append(chunk_text[i][0][j])
    ls.append(tmp)
  return ls

np_test_df = test_df['first_300_words_content'].to_numpy()
extraction = extractive_text(y_pred, np_test_df)

# y_pred = [e[0] for e in y_pred]
for i in range(20):
  print('Content (at most 300 words) :', test_df.iloc[i]['first_300_words_content'])
  print('Headline                    :', test_df.iloc[i]['headline'])
  print('Prediction.                 :', extraction[i])
  print('Predicted Labels            :', *(y_pred.reshape(-1, 300))[i])
  print('Truth Labels                :', *(reshaped_y_test.reshape(-1, 300))[i])
  print()

100%|██████████| 2271/2271 [00:00<00:00, 3068.72it/s]


Content (at most 300 words) : [['นางอภิรดี ตันตราภรณ์', 'รัฐมนตรี', 'ว่าการ', 'กระทรวงพาณิชย์', 'ใน', 'ฐานะ', 'ประธาน', 'คณะ', 'กรรมการ', 'พิจารณา', 'การ', 'ทุ่ม', 'ตลาด', 'และ', 'การ', 'อุดหนุน', 'ทตอ.', 'ได้', 'เปิดเผย', 'ว่า', 'เมื่อ', 'ต้น', 'เดือน', 'มีนาคม', '2560', 'ที่', 'ผ่าน', 'มา', 'ได้', 'มี', 'การ', 'ประชุม', 'คณะ', 'กรรมการ ทตอ.', 'ซึ่ง', 'มี', 'มติ', 'ดัง', 'นี้', '1', '.', 'ให้', 'ขยาย', 'ระยะ', 'เวลา', 'การ', 'ใช้', 'มาตรการ', 'ชั่วคราว', 'ตอบโต้', 'การ', 'ทุ่มตลาด', 'ออก', 'ไป', 'อีก', '2', 'เดือน', 'ของ', 'สินค้า', '2', 'รายการ', 'ดัง', 'ต่อ', 'ไป', 'นี้', '1', '.', '1', 'สินค้า', 'หลอด', 'และ', 'ท่อ', 'ทำ', 'ด้วย', 'เหล็ก', 'หรือ', 'เหล็กกล้า', 'ที่', 'มี', 'แหล่ง', 'กำเนิด', 'จาก', 'สาธารณรัฐประชาชนจีน', 'และ', 'สาธารณรัฐเกาหลี', '1', '.', '2', 'สินค้า', 'เหล็ก', 'แผ่น', 'รีดร้อน', 'ชนิด', 'เป็นม้วน', 'และ', 'ไม่', 'เป็นม้วน', 'ที่', 'มี', 'แหล่ง', 'กำเนิด', 'จาก', 'สหพันธ์สาธารณรัฐบราซิล', 'สาธารณรัฐอิสลาม', 'อิหร่าน', 'และ', 'สาธารณรัฐตุรกี', '2', '.', 'ให้', 'เร

In [55]:
eval_y_true = reshaped_y_test.reshape(-1, maxlen)
eval_y_pred = y_pred.reshape(-1, 300)

true_positive = 0
false_positive = 0
false_negative = 0
true_negative = 0

for i in tqdm.tqdm(range(eval_y_true.shape[0])):
  for j in range(eval_y_true.shape[1]):
    if eval_y_true[i][j] == 1 and eval_y_pred[i][j] == 1:
      true_positive += 1
    elif eval_y_true[i][j] == 0 and eval_y_pred[i][j] == 1:
      false_positive += 1
    elif eval_y_true[i][j] == 1 and eval_y_pred[i][j] == 0:
      false_negative += 1
    elif eval_y_true[i][j] == 0 and eval_y_pred[i][j] == 0:
      true_negative += 1

print()
print('True positive : ', true_positive)
print('False positive : ', false_positive)
print('False negative : ', false_negative)
print('True negative : ', true_negative)

recall = true_positive / (true_positive + false_negative)
precision = true_positive / (true_positive + false_negative)
f1_score = 2 * true_positive / (2 * true_positive + false_positive + false_negative)
print('Recall    : ', recall)
print('Precision : ', precision)
print('F1 score  : ', f1_score)

100%|██████████| 2619/2619 [00:06<00:00, 381.66it/s]


True positive :  1247
False positive :  833
False negative :  98637
True negative :  684983
Recall    :  0.012484481999118977
Precision :  0.012484481999118977
F1 score  :  0.024459613196814563





In [49]:
# x = np.array([[1,0,0], [0,1,0]])
# count_zero = np.count_nonzero(x == 0)
# count_one = np.count_nonzero(x == 1)

AttributeError: ignored

In [0]:
def extractive_text(y_pred, chunk_text):
  ls = []
  for i in tqdm.tqdm(range(len(chunk_text))):
    tmp = []
    for j in range(len(chunk_text[i][0])):
      if y_pred[i][j] == 1:
        tmp.append(chunk_text[i][0][j])
    ls.append(tmp)
  return ls

np_test_df = test_df['first_300_words_content'].to_numpy()
extraction = extractive_text(y_pred, np_test_df)

for i in range(5):
  print('Content (at most 300 words) :', test_df.iloc[i]['first_300_words_content'])
  print('Headline                    :', test_df.iloc[i]['headline'])
  print('Prediction.                 :', extraction[i])
  print('Labels                      :', *y_pred[i])
  print()

100%|██████████| 2271/2271 [00:00<00:00, 3135.41it/s]


Content (at most 300 words) : [['นางอภิรดี ตันตราภรณ์', 'รัฐมนตรี', 'ว่าการ', 'กระทรวงพาณิชย์', 'ใน', 'ฐานะ', 'ประธาน', 'คณะ', 'กรรมการ', 'พิจารณา', 'การ', 'ทุ่ม', 'ตลาด', 'และ', 'การ', 'อุดหนุน', 'ทตอ.', 'ได้', 'เปิดเผย', 'ว่า', 'เมื่อ', 'ต้น', 'เดือน', 'มีนาคม', '2560', 'ที่', 'ผ่าน', 'มา', 'ได้', 'มี', 'การ', 'ประชุม', 'คณะ', 'กรรมการ ทตอ.', 'ซึ่ง', 'มี', 'มติ', 'ดัง', 'นี้', '1', '.', 'ให้', 'ขยาย', 'ระยะ', 'เวลา', 'การ', 'ใช้', 'มาตรการ', 'ชั่วคราว', 'ตอบโต้', 'การ', 'ทุ่มตลาด', 'ออก', 'ไป', 'อีก', '2', 'เดือน', 'ของ', 'สินค้า', '2', 'รายการ', 'ดัง', 'ต่อ', 'ไป', 'นี้', '1', '.', '1', 'สินค้า', 'หลอด', 'และ', 'ท่อ', 'ทำ', 'ด้วย', 'เหล็ก', 'หรือ', 'เหล็กกล้า', 'ที่', 'มี', 'แหล่ง', 'กำเนิด', 'จาก', 'สาธารณรัฐประชาชนจีน', 'และ', 'สาธารณรัฐเกาหลี', '1', '.', '2', 'สินค้า', 'เหล็ก', 'แผ่น', 'รีดร้อน', 'ชนิด', 'เป็นม้วน', 'และ', 'ไม่', 'เป็นม้วน', 'ที่', 'มี', 'แหล่ง', 'กำเนิด', 'จาก', 'สหพันธ์สาธารณรัฐบราซิล', 'สาธารณรัฐอิสลาม', 'อิหร่าน', 'และ', 'สาธารณรัฐตุรกี', '2', '.', 'ให้', 'เร

In [0]:
len(np_test_df[0][0])

300

In [0]:
ls = [1,2,3,4]
res = []
res.append(ls)
res.append(ls)
res = np.array(res)
res.shape

In [0]:
ls = []
for i in y_train:
  print(np.array(i))
  ls.append(np.array(i))
  break
ls = np.array(ls)
ls

In [0]:
# def define_models(n_input=maxlen,n_output=maxlen_output,n_units=32) :
#   #define training encoder model
#   encoder_inputs = Input(shape=(maxlen,))
#   encoder_embedding = Embedding(vocab_size, n_units)(encoder_inputs)
#   encoder  = LSTM(n_units, return_state=True)
#   encoder_outputs, state_h, state_c = encoder(encoder_embedding)
#   encoder_states = [state_h, state_c]

#   # define training decoder model
#   decoder_inputs = Input(shape=(maxlen_output,vocab_size ))
#   # decoder_embedding = Embedding(vocab_size, n_units)
#   # decoder_inputs2 = decoder_embedding(decoder_inputs)
#   decoder_lstm = LSTM(n_units, return_state=True, return_sequences=True)
#   decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

#   # dense_layer = Dense(VOCAB_SIZE, activation='softmax')
#   decoder_dense  = Dense(vocab_size, activation='softmax')
#   decoder_outputs = decoder_dense(decoder_outputs)
#   model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

#   #define inferencing encoder model
#   encoder_model = Model(encoder_inputs,encoder_states)
#   #define inference decoder
#   decoder_state_input_h = Input(shape=(n_units,))
#   decoder_state_input_c = Input(shape=(n_units,))
#   decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
#   decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
#   decoder_states = [state_h, state_c]
#   decoder_outputs = decoder_dense(decoder_outputs)
#   decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
#   # return all models
#   return model,encoder_model,decoder_model

In [0]:
# configure problem
# n_features = 50 + 1
n_steps_in = maxlen
n_steps_out = maxlen_output
train, infenc, infdec = define_models(n_units=64)
# train.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
train.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
train.summary()

In [0]:
train.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    epochs=300, callbacks=callback_list)