In [1]:
import json
import torch
import torch.nn as nn
import transformers
from transformers import BertTokenizer, BertModel
import tensorflow as tf
import keras
import numpy as np

2024-04-22 00:26:18.708307: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 00:26:18.708435: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 00:26:18.860293: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
#####
# Global variables
#####

# Check if CUDA can be used to speed up training/reasoning
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# CE
# Load BERT-large tokenizer and BERT-Large model
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = BertModel.from_pretrained('bert-large-uncased').to(device)  # Make sure the model is on the correct device

# Define BiGRU layer for CE
hidden_size = 1024  # For BERT-Large，the hidden_size should be 1024
bigru_layer = nn.GRU(input_size=1024, hidden_size=hidden_size, bidirectional=True, batch_first=True).to(device)

# NE
# Character-to-index mapping
char_to_index = {str(i): i for i in range(10)}
char_to_index['.'] = 10  # The index for the decimal point

# Maximum numeric length and character dimension
max_num_length = 10
char_dim = 11

# Initialize BiGRU for NE
input_size_NE = char_dim
hidden_size = 1024
bigru_model = nn.GRU(input_size=input_size_NE, hidden_size=hidden_size, bidirectional=True, batch_first=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [3]:
# Define a function to encode text using BERT and BiGRU
def encode_with_ce(texts):
    # Encode the texts
    encoded_input = tokenizer(texts, return_tensors='pt',padding='max_length', truncation=True, max_length=512,
                              add_special_tokens=True)

    # Make sure the input is also on the correct device
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    # Get the embedding using BERT-Large
    with torch.no_grad():
        output = bert_model(**encoded_input)

    # The BERT model outputs a tuple, and we are interested in the first element - the hidden state
    embeddings = output.last_hidden_state

    # Pass the embed to BiGRU
    bigru_output, _ = bigru_layer(embeddings)

    return bigru_output

In [4]:
# Convert answer's number into representations
def encode_and_process_number(number, max_num_length=10, char_dim=11, bigru_model=bigru_model):
    # Create an all-zero tensor
    encoded = torch.zeros(max_num_length, char_dim)

    # Calculate the left fill amount
    padding_size = max_num_length - len(number)

    # Fill encoding according to character
    for i, char in enumerate(number):
        if char in char_to_index:
            encoded[padding_size + i, char_to_index[char]] = 1

    # Add batch dimension
    encoded = encoded.unsqueeze(0)  # Make the tensor shape [1, max_num_length, char_dim]

    # Input the encoded tensor into BiGRU
    bigru_output, _ = bigru_model(encoded)

    # Return the output of BiGRU
    return bigru_output

In [5]:
# Converts the correct answer index to a unique thermal encoding
def one_hot_encode(index, num_classes):
    encoding = [0] * num_classes
    encoding[index] = 1
    return encoding

In [15]:
##########
# 1. Load the data
##########

# Import the training data
with open('/kaggle/input/nquad-dataset/NQuAD_train_first_10k.json', 'r', encoding='utf-8') as file:
        data_train = json.load(file)

# Import the testing data
with open('/kaggle/input/nquad-dataset/NQuAD_test_first_2k.json', 'r', encoding='utf-8') as file:
        data_test = json.load(file)

In [9]:
data_train[0]

{'news_article': ['宏達電（2498）昨(21)日股東會上，董事長王雪紅應允小股東請求，自掏腰包贈送股東會議程結束前完成報到程序且人在會場的小股東（不含公司內部股東），每人一台宏達電旗艦手機新HTC One 32G手機、單機市價約2.19萬元，創下台灣股東會史上單價最高的贈禮。同時，宏達電董監改選，前台積電（2330）總經理蔡力行當選新任董事，被外界認為是跟台積電聯手抗韓，針對三星電子的科技競爭而進行的結盟。',
  '據統計，當日符合條件股東數約100多位，依單機2.19萬元計算，王雪紅大手筆送給在場關心宏達電營運的小股東們逾200多萬元大禮。經過各家媒體近兩日報導，「宏達電」、「HTC」、「新HTC One」在網路上討論的熱烈程度又升高了，達到品牌與知名度的提升效果，宏達電行銷功力可謂更上層樓。',
  '宏達電今年的股東會也全面進行董監事改選，當選董事名單包括王雪紅、陳文琦、卓火土、蔡力行、David Bruce Yoffie；獨立董事為林振國、Josef Felder；新任監察人為威智投資(股)公司、朱黃傑。',
  '除了蔡力行是新赴任的宏達電董事外，其餘本次當選董事、監察人都為續任。',
  '蔡力行是台積電（2330）前總經理、現任台積電太陽能與固態照明董事長；因此外界也多解讀，在台積電董事長張忠謀公開多次盛讚宏達電手機產品，並倡議聯發科（2454）、鴻海（2317）、台積電與宏達電在各自的半導體IC、面板、晶圓產業、消費性手機領域一同對付三星電子；蔡力行昨日獲選出任宏達電董事一職，可謂是台灣科技廠聯手抗韓的最新發展。'],
 'question_stem': '王雪紅贈股東___G新One，創台股東會單價最高贈品                                                       ',
 'answer_options': ['100', '2.19', '21', '32'],
 'ans': 3,
 'target_num': '32',
 'sentences_containing_the_numeral_in_answer_options': [['當日符合條件股東數約100多位'],
  ['單機市價約2.19萬元', '依單機2.19萬元計算'],
  ['宏達電（2498）昨(2

In [13]:
data_train[1]

{'news_article': ['印刷電板大廠燿華(2367)於今(24)日召開股東常會，會議順利結束，議程照案通過(見圖)。燿華總經理許正弘於營業報告時指出，燿華去年營運狀況不佳，但今年以來，第二季營收比第一季好，獲利也有很大的改善，第三季比第二季好，至少會有2位數的成長，目前燿華幾乎都滿載。',
  '許正弘表示，第四季展望仍佳，但還不準，還要看消費性產品銷售狀況好不好，會再調整，許正弘表示，今年大環境景氣還是不好，但因為燿華在客戶調整以及開發，所以下半年營運成長幅度會好。',
  '許正弘表示，目前產能還是不夠，今年資本支出至少是15億元，這15億已都下訂單，有機會再上修，主要擴充會在任意層，中長期仍要觀察中低階智慧型手機的發展以及各家品牌廠的消長。',
  '法人預估，今年上下半年營收比重可望朝45:55分佈。',
  '燿華回顧2012年表示，2012年的全球消費主力只有智慧型手機及平板電腦，而且更過度在少數客戶，形成供需嚴重失衡，相對削價競爭歷年來罕見。燿華雖深耕於高端HDI產品，卻因供需失序、產品售價大幅下滑，導致2012年營收及獲利大幅衰退。',
  '燿華今股東常會順利通過財報。燿華2012年營收119.03億元，毛利率為7.6%，稅後虧損5.06億元，每股虧損0.88元。股東會通過不配發股利。'],
 'question_stem': '燿華：Q___獲利有很大改善，Q3至少二位數成長                                                          ',
 'answer_options': ['0.88', '2', '5.06', '7.6'],
 'ans': 1,
 'target_num': '2',
 'sentences_containing_the_numeral_in_answer_options': [['每股虧損0.88元'],
  ['至少會有2位數的成長'],
  ['稅後虧損5.06億元'],
  ['毛利率為7.6%']]}

In [18]:
##########
# 2. Generate question representations for training data
##########

# Prepare training data and one-hot labels lists
question_representations_train = []
one_hot_labels_train = []
 
# Iterate each sample in training data
for sample in data_train[:2]:
     # CE output list
        ce_output_list = []
     # NE output list
        ne_output_list = []
        
     # 2.1 CE
        # Process question stem
        stem_result = encode_with_ce(sample['question_stem'])
        ce_output_list.append(stem_result)
        
        # Process sentences_containing_the_numeral_in_answer_options
        for sentence_list in sample["sentences_containing_the_numeral_in_answer_options"]:
            
                if len(sentence_list) > 1:
                    combined_sentence = '。'.join(sentence.strip() for sentence in sentence_list)
                    print(combined_sentence)
                    result = encode_with_ce(combined_sentence)
                    ce_output_list.append(result)
                else:
                    print(sentence_list[0])
                    result = encode_with_ce(sentence_list[0].strip())
                    ce_output_list.append(result)
        for each in ce_output_list:
            print(each.shape)
    
    # 2.2 NE
        numbers = sample["answer_options"]
        for number in numbers:
            result = encode_and_process_number(number)
            ne_output_list.append(result)
        #             print(result.shape)
        
        for each in ne_output_list:
            print(each.shape)

    # 2.3 Concatenate
        all_output_list = ce_output_list + ne_output_list
        all_numpy_arrays_list = [tensor.detach().cpu().numpy() for tensor in all_output_list]

        # Use tf. Keras. The layers. Concatenate to joining together all these tensor
        concat_layer = tf.keras.layers.Concatenate(axis=1)
        concatenated_tensors = concat_layer(all_numpy_arrays_list)
        print("concatenated_tensors.shape")
        print(concatenated_tensors.shape)

        # 2.4 Apply global average pooling
        global_average_layer = tf.keras.layers.GlobalAveragePooling1D()
        pooled_tensor = global_average_layer(concatenated_tensors)
        print("pooled_tensor.shape")
        print(pooled_tensor.shape)

        # Convert it to a TensorFlow tensor
        pooled_tensor = tf.convert_to_tensor(pooled_tensor)

        # Add pooled tensor to question_representations
        question_representations_train.append(pooled_tensor)

        # 2.5 Convert index of answer into one-hot vector
        correct_answer_index = [sample['ans']]
        # Converts the correct answer index to TensorFlow's uniquely thermal coded tensor
        one_hot_label = tf.one_hot(correct_answer_index, depth=4)
        #         print(one_hot_label)
        #         print(type(one_hot_label))
        one_hot_labels_train.append(one_hot_label)
        print()

print(len(question_representations_train))
print(len(one_hot_labels_train))

當日符合條件股東數約100多位
單機市價約2.19萬元。依單機2.19萬元計算
宏達電（2498）昨(21)日股東會上
每人一台宏達電旗艦手機新HTC One 32G手機
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 10, 2048])
torch.Size([1, 10, 2048])
torch.Size([1, 10, 2048])
torch.Size([1, 10, 2048])
concatenated_tensors.shape
(1, 2600, 2048)
pooled_tensor.shape
(1, 2048)

每股虧損0.88元
至少會有2位數的成長
稅後虧損5.06億元
毛利率為7.6%
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 512, 2048])
torch.Size([1, 10, 2048])
torch.Size([1, 10, 2048])
torch.Size([1, 10, 2048])
torch.Size([1, 10, 2048])
concatenated_tensors.shape
(1, 2600, 2048)
pooled_tensor.shape
(1, 2048)

2
2


In [16]:
data_test[0]

{'news_article': ['日本蘋果情報網站iPhone Mania 23日報導，據調查公司IHS Markit公布的統計資料顯示，2016年全球最暢銷(出貨量最多)的智慧手機產品不是蘋果(Apple)去年9月推出的iPhone 7、而是前一世代的iPhone 6s，iPhone 7排第2；去年全球前10大暢銷智慧手機產品中，iPhone包辦前4名，除iPhone 6s、iPhone 7之外，iPhone 7 Plus、iPhone 6s Plus分居第3、4位。',
  '若單就上季(2016年10-12月)情況來看，全球最暢銷機種為iPhone 7，iPhone 7 Plus排第2。',
  '在蘋果死對頭三星電子的部分，旗艦機種Galaxy Note 7雖因連環爆而召回停售，不過因三星積極進行促銷活動、包含套裝販售可和Galaxy智慧手機連接使用的虛擬實境(VR)裝置Gear VR，故在前10大暢銷機種中、三星Galaxy產品拿下5個席次，其中Galaxy S7 Edge、Galaxy S7分居第5、9位。Galaxy S7/S7 Edge去年全球出貨量較前一代機種(Galaxy S6/S6 Edge)多出1,000萬支。',
  '另外，華為雖為全球第3大智慧手機廠，不過因採機海策略、推出各種價格帶的機種，故未能有1款產品擠進前10大行列。',
  'OPPO推出的OPPO A53位居第7位，是前10大暢銷機種中唯一一款非蘋果、三星的機種，且表現優於三星S7。OPPO去年全球智慧手機出貨量暴增109%，市佔排名從2015年的第7位躍升至第4位。',
  '據全球市場研究機構TrendForce公布的資料顯示，2016年全球智慧型手機出貨量為13.6億支，年成長4.7%。其中三星以22.8%的市佔率位居首位，其次分別為蘋果的15.3%、華為的9.6%、OPPO的7.2%、BBK/VIVO的6.0%、LG的5.5%、小米的3.7%、聯想的3.7%、TCL的3.7%和中興通訊(ZTE)的3.5%。',
  '＊編者按：本文僅供參考之用，並不構成要約、招攬或邀請、誘使、任何不論種類或形式之申述或訂立任何建議及推薦，讀者務請運用個人獨立思考能力，自行作出投資決定，如因相關建議招致損失，概與《精實財經媒體》、編者及作者無涉。'],
 'question_ste

In [19]:
##########
# 3. Make MLP model and put question representations and one-hot label list into MLP model
##########

# Construct a MLP model
mlp = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(2048,)),
        tf.keras.layers.Dropout(0.3),  # Add dropout
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(4, activation='softmax')
])

# Compile model
mlp.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
mlp.fit(question_representations_train, one_hot_labels_train, epochs=10, batch_size=32)

#####
# 4. Prepare testing data and make predictions
#####

# Prepare training data and one-hot labels lists
question_representations_test = []
one_hot_labels_test = []

# Iterate each sample in training data
for sample in data_test[:1]:
     # CE output list
        ce_output_list = []
     # NE output list
        ne_output_list = []
        
     # 2.1 CE
        # Process question stem
        stem_result = encode_with_ce(sample['question_stem'])
        ce_output_list.append(stem_result)
        
        # Process sentences_containing_the_numeral_in_answer_options
        for sentence_list in sample["sentences_containing_the_numeral_in_answer_options"]:
                if len(sentence_list) > 1:
                    combined_sentence = '。'.join(sentence.strip() for sentence in sentence_list)
                    print(combined_sentence)
                    result = encode_with_ce(combined_sentence)
                    ce_output_list.append(result)
                else:
                    print(sentence_list[0])
                    result = encode_with_ce(sentence_list[0].strip())
                    ce_output_list.append(result)
        for each in ce_output_list:
            print(each.shape)
    
    # 2.2 NE
        numbers = sample["answer_options"]
        for number in numbers:
            result = encode_and_process_number(number)
            ne_output_list.append(result)
        #             print(result.shape)
        
        for each in ne_output_list:
            print(each.shape)

    # 2.3 Concatenate
        all_output_list = ce_output_list + ne_output_list
        all_numpy_arrays_list = [tensor.detach().cpu().numpy() for tensor in all_output_list]

        # Use tf. Keras. The layers. Concatenate to joining together all these tensor
        concat_layer = tf.keras.layers.Concatenate(axis=1)
        concatenated_tensors = concat_layer(all_numpy_arrays_list)
        print("concatenated_tensors.shape")
        print(concatenated_tensors.shape)

        # 2.4 Apply global average pooling
        global_average_layer = tf.keras.layers.GlobalAveragePooling1D()
        pooled_tensor = global_average_layer(concatenated_tensors)
        print("pooled_tensor.shape")
        print(pooled_tensor.shape)

        # Convert it to a TensorFlow tensor
        pooled_tensor = tf.convert_to_tensor(pooled_tensor)

        # Add pooled tensor to question_representations
        question_representations_test.append(pooled_tensor)

        # 2.5 Convert index of answer into one-hot vector
        correct_answer_index = [sample['ans']]
        # Converts the correct answer index to TensorFlow's uniquely thermal coded tensor
        one_hot_label = tf.one_hot(correct_answer_index, depth=4)
        #         print(one_hot_label)
        #         print(type(one_hot_label))
        one_hot_labels_test.append(one_hot_label)
        print()

print(len(question_representations_test))
print(len(one_hot_labels_test))

# Evaluate the model
# Evaluate the model using the test set data question_representations_test and one_hot_labels_test
loss, accuracy = mlp.evaluate(question_representations_test, one_hot_labels_test)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 1.2536
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.7425
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.3804
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.1426
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 1.0000 - loss: 0.0678
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0303
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 1.0000 - loss: 0.0164
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 1.0000 - loss: 0.0023
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 