In [1]:
#Import Package
import pandas as pd
import numpy as np
import json, re, unicodedata, string, typing, time ,os
from collections import Counter
import pickle
from pythainlp.tokenize import word_tokenize
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# NOTE(review): hard-coded absolute path to one user's machine; this cell only runs as-is there.
# The relative paths used by later cells ('./data/...', './drqa/...') resolve against this directory.
os.chdir('C:/Users/taiti/OneDrive/MasterDegree/BADS9000_IS/ThaiDrQA')
print(f"Current working directory {os.getcwd()}")

Current working directory C:\Users\taiti\OneDrive\MasterDegree\BADS9000_IS\ThaiDrQA


In [29]:
def load_json(path):
    '''
    Reads a UTF-8 JSON file and returns the parsed object.
    Also prints how many records sit under the top-level 'data' key and
    the keys of the first of those records.
    :param str path: location of the JSON file
    :returns dict payload: the parsed JSON document
    '''
    with open(path, mode='r', encoding='utf-8') as handle:
        payload = json.loads(handle.read())

    records = payload['data']
    print("Length of data: ", len(records))
    print("Data Keys: ", records[0].keys())
    return payload

def load_article(file):
    '''
    Reads one article file as UTF-8 text.
    Loading is best-effort: when the file cannot be opened or decoded, the
    failure is reported on stdout and the placeholder "-" is returned so the
    caller can keep building its table.
    :param str file: path of the article text file
    :returns str context: file content, or "-" when the file could not be read
    '''
    try:
        with open(file, 'r', encoding='utf-8') as f:
            context = f.read()
    except (OSError, UnicodeDecodeError) as err:
        # Only I/O and decoding problems are expected here; anything else is a
        # programming error and should surface instead of being swallowed.
        context = "-"
        # Report the file that failed (the old message printed the builtin `id`).
        print(f"Error loading article {file}: {err}")
    return context

def gather_text_for_vocab(dfs:list):
    '''
    Gathers text from contexts and questions to build a vocabulary.
    For every dataframe the unique contexts come first, followed by the
    unique questions; dataframes are processed in the order given.
    :param list dfs: dataframes that each have 'context' and 'question' columns
    :returns list text: unique contexts and questions of every dataframe
    :raises assertion error: when the number of collected texts differs from the
                             sum of the per-column unique counts
    '''
    text = []
    total = 0
    for df in dfs:
        unique_contexts = list(df.context.unique())
        unique_questions = list(df.question.unique())
        total += df.context.nunique() + df.question.nunique()
        text.extend(unique_contexts + unique_questions)

    assert len(text) == total #For debugging code
    # Report the size of the list built here; the previous code read the global
    # `vocab_text`, which does not exist yet on a fresh kernel.
    print("Number of sentences in dataset: ", len(text))
    return text

def build_word_vocab(vocab_text):
    """
    Builds a frequency-ordered vocabulary from an iterable of texts.
    Every text is tokenized with the pythainlp 'newmm' engine
    (Tokenizer Ref : https://github.com/PyThaiNLP/pythainlp); tokens are then
    ranked from most to least frequent, ties keeping first-seen order.
    :param list vocab_text: texts to tokenize
    :returns
        dict word2idx: token -> rank
        dict idx2word: rank -> token
        list word_vocab: tokens ordered by descending frequency
    """
    frequency = Counter(
        token
        for sentence in vocab_text
        for token in word_tokenize(sentence, engine='newmm')
    )
    word_vocab = [token for token, _ in frequency.most_common()]
    print(f"raw-vocab: {len(word_vocab)}")
    print(f"vocab-length: {len(word_vocab)}")

    word2idx = {token: rank for rank, token in enumerate(word_vocab)}
    print(f"word2idx-length: {len(word2idx)}")

    idx2word = dict(enumerate(word_vocab))
    return word2idx, idx2word, word_vocab

def word_to_ids(text, word2idx):
    '''
    Maps a text to vocabulary ids. The text is tokenized with the pythainlp
    'newmm' engine and every token is looked up in word2idx.
    :param str text: text to be converted
    :param dict word2idx: word to id mapping
    :returns list words_ids: one id per token, in token order
    :raises KeyError: when a token is missing from word2idx
    :raises assertion error: sanity check that every token received an id
    '''
    tokens = list(word_tokenize(text, engine='newmm'))

    ids = []
    for token in tokens:
        ids.append(word2idx[token])

    assert len(ids) == len(tokens)
    return ids

In [6]:
# Load the ThaiQA development set; load_json also prints the record count and the keys of the first record.
data = load_json('./data/ThaiQACorpus-DevelopmentDataset.json')

Length of data:  4000
Data Keys:  dict_keys(['question_id', 'question', 'answer', 'answer_begin_position ', 'answer_end_position', 'article_id'])


In [7]:
# Turn every QA record into one flat row and attach the article text as its context
qa_lst = [
    {
        'article_id': qa['article_id'],
        'id': qa['question_id'],
        'context': load_article(f"./data/documents-nsc/{qa['article_id']}.txt"),
        'question': qa['question'],
        # the begin-position key really ends with a space in the corpus file
        'label': [qa['answer_begin_position '], qa['answer_end_position']],
        'answer': qa['answer'],
    }
    for qa in data['data']
]

df_qa = pd.DataFrame(qa_lst)
print(f"Shap:{df_qa.shape}")
display(df_qa.head(2))

Shap:(4000, 6)


Unnamed: 0,article_id,id,context,question,label,answer
0,115035,1,"<doc id=""115035"" url=""https://th.wikipedia.org...",สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,"[529, 538]",ฮิกกิ้นส์
1,376583,2,"<doc id=""376583"" url=""https://th.wikipedia.org...",ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,"[139, 144]",เมชตา


In [46]:
# Save the raw QA dataframe (before tokenization) so it can be reloaded without re-reading the articles.
# NOTE(review): hard-coded absolute path to one user's machine — consider a path relative to the project root.
df_qa.to_pickle('C:/Users/taiti/OneDrive/MasterDegree/BADS9000_IS/ThaiDrQA/data/df_thaiqa.pkl')

In [181]:
# Gather the unique contexts and questions of df_qa, then build the vocabulary from them.
# Both steps are timed with %time; tokenizing every text is the expensive part.
%time vocab_text = gather_text_for_vocab([df_qa])
%time word2idx, idx2word, word_vocab = build_word_vocab(vocab_text)

Number of sentences in dataset:  6266
Wall time: 192 ms
raw-vocab: 63648
vocab-length: 63648
word2idx-length: 63648
Wall time: 35 s


In [30]:
# Numericalize contexts and questions: each text is tokenized again and its tokens are
# mapped to vocabulary ids; a token missing from word2idx raises KeyError.
%time df_qa['context_ids']   = df_qa.context.apply(word_to_ids,   word2idx=word2idx)
%time df_qa['question_ids'] = df_qa.question.apply(word_to_ids,  word2idx=word2idx)
df_qa.head(2)

Wall time: 58.3 s
Wall time: 535 ms


In [137]:
def test_indices(df, idx2word):
    '''
    Performs the tests mentioned above. This method also gets the start and end of the answers
    with respect to the context_ids for each example.
    :param dataframe df: SQUAD df
    :param dict idx2word: inverse mapping of token ids to words
    :returns
        list start_value_error: example idx where the start idx is not found in the start spans
                                of the text
        list end_value_error: example idx where the end idx is not found in the end spans
                              of the text
        list assert_error: examples that fail assertion errors. A majority are due to the above errors
    '''

    start_value_error = []
    end_value_error = []
    assert_error = []
    for index, row in df.iterrows():
        answer_tokens = [w for w in word_tokenize(row['answer'] ,engine='newmm')]
        
        context_tokens = word_tokenize(row['context'] ,engine='newmm')
        context_span  = [(len("".join(context_tokens[0:i])), len("".join(context_tokens[0:i+1]))) 
                         for i,w in enumerate(context_tokens)]
        starts, ends = zip(*context_span)

        answer_start, answer_end = (row['label'][0]-1,row['label'][1]-1)

        try:
            start_idx = starts.index(answer_start)
        except:
            start_value_error.append(index)
        try:
            end_idx  = ends.index(answer_end)
        except:
            end_value_error.append(index)

        try:
            assert idx2word[row['context_ids'][start_idx]] == answer_tokens[0]
            assert idx2word[row['context_ids'][end_idx]] == answer_tokens[-1]
        except:
            assert_error.append(index)


    return start_value_error, end_value_error, assert_error

def get_error_indices(df, idx2word):
    '''
    Runs test_indices on df and merges its three result lists into a single
    set of row indices. The size of that set is printed.
    :param dataframe df: dataframe forwarded to test_indices
    :param dict idx2word: inverse mapping of token ids to words
    :returns set err_idx: every row index reported by test_indices
    '''
    bad_starts, bad_ends, bad_asserts = test_indices(df, idx2word)

    failing = set(bad_starts)
    failing.update(bad_ends)
    failing.update(bad_asserts)

    print(f"Number of error indices: {len(failing)}")
    return failing

In [139]:
%%time 
# Collect the row indices whose answer label cannot be aligned with the tokenized context.
train_err = get_error_indices(df_qa, idx2word)

# Drop those rows in place; df_qa is modified by this cell.
df_qa.drop(train_err, inplace=True)
print(f"Shape of data frame after drop error row: {df_qa.shape}")
# TODO: some rows fail the alignment check; investigate how to fix them instead of dropping them.

Number of error indices: 149
Wall time: 13min 3s


In [141]:
def index_answer(row, idx2word):
    '''
    Takes in a row of the dataframe or one training example and
    returns the start and end positions of the answer as token positions
    in the tokenized context, found by matching character spans.
    The label is read as [begin, end]; both values are shifted by -1 and then
    matched against the character offsets where context tokens start / end.
    :param row: mapping-like row with 'context', 'answer', 'label', 'context_ids'
    :param dict idx2word: inverse mapping of token ids to words
    :returns list: [start_idx, end_idx] positions into the context tokens
    :raises ValueError: when the answer start / end does not fall on a token boundary
    :raises assertion error: when the context tokens at the found positions differ
                             from the first / last answer token
    '''
    context_tokens = word_tokenize(row['context'], engine='newmm')

    # Character offsets of every token via a running total; re-joining the token
    # prefix for each position (as done before) is quadratic in the token count.
    starts, ends = [], []
    offset = 0
    for token in context_tokens:
        starts.append(offset)
        offset += len(token)
        ends.append(offset)

    answer_start, answer_end = (row['label'][0]-1, row['label'][1]-1)

    start_idx = starts.index(answer_start)
    end_idx = ends.index(answer_end)

    answer_tokens = word_tokenize(row['answer'], engine='newmm')
    # Key access (instead of attribute access) keeps this consistent with the
    # other row lookups above.
    context_ids = row['context_ids']
    assert idx2word[context_ids[start_idx]] == answer_tokens[0]
    assert idx2word[context_ids[end_idx]] == answer_tokens[-1]

    return [start_idx, end_idx]

In [145]:
%%time
# label_idx holds the [start, end] token positions of the answer inside context_ids;
# idx2word maps the ids found at those positions back to words.
label_idx = df_qa.apply(index_answer, axis=1, idx2word=idx2word)
df_qa['label_idx'] = label_idx
print(f"df_qa.shape:{df_qa.shape}")
display(df_qa.head(2))

train_df.shape:(3851, 9)


Unnamed: 0,article_id,id,context,question,label,answer,context_ids,question_ids,label_idx
0,115035,1,"<doc id=""115035"" url=""https://th.wikipedia.org...",สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,"[529, 538]",ฮิกกิ้นส์,"[101, 43, 0, 99, 27, 26953, 12, 0, 102, 27, 97...","[2036, 167, 82, 111, 6, 2705, 6355, 1, 114, 57...","[163, 164]"
1,376583,2,"<doc id=""376583"" url=""https://th.wikipedia.org...",ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,"[139, 144]",เมชตา,"[101, 43, 0, 99, 27, 26958, 12, 0, 102, 27, 97...","[1257, 188, 0, 62, 0, 6, 7727, 946, 82, 1, 487...","[55, 57]"


Wall time: 20.3 ms


In [201]:
def save_to_pickle(save_obj, path_file):
    '''
    Pickles save_obj to path_file and prints a confirmation line.
    :param save_obj: any picklable object
    :param str path_file: destination file path
    '''
    with open(path_file, 'wb') as file:
        pickle.dump(save_obj, file)
    # Derive the displayed name from the file name itself. The previous fixed
    # slice [27:-4] was only right for one specific directory prefix and a
    # three-letter extension.
    object_name = os.path.splitext(os.path.basename(path_file))[0]
    print(f"save {object_name} to {path_file} success")

In [202]:
# Save the current state: the processed dataframe plus the three vocabulary objects,
# each written as its own pickle file under ./drqa/1-tokenizers/result/.
save_to_pickle(df_qa , "./drqa/1-tokenizers/result/df_qa.pkl")
save_to_pickle(word2idx , "./drqa/1-tokenizers/result/dict_word2idx.pkl")
save_to_pickle(idx2word , "./drqa/1-tokenizers/result/dict_idx2word.pkl")
save_to_pickle(word_vocab , "./drqa/1-tokenizers/result/list_word_vocab.pkl")

save df_qa to ./drqa/1-tokenizers/result/df_qa.pkl success
save dict_word2idx to ./drqa/1-tokenizers/result/dict_word2idx.pkl success
save dict_idx2word to ./drqa/1-tokenizers/result/dict_idx2word.pkl success
save list_word_vocab to ./drqa/1-tokenizers/result/list_word_vocab.pkl success


In [203]:
# load object to use
def load_pickle(path_file):
    '''
    Restores one object from a pickle file and prints its type.
    :param str path_file: pickle file to read
    :returns: the unpickled object
    '''
    with open(path_file, mode='rb') as source:
        restored = pickle.load(source)

    print(f"load object from {path_file} success,that is {type(restored)}")
    return restored

In [205]:
# Reload the four saved artifacts under *_load names, leaving the in-memory originals untouched.
df_load = load_pickle("./drqa/1-tokenizers/result/df_qa.pkl")
word2idx_load = load_pickle("./drqa/1-tokenizers/result/dict_word2idx.pkl")
idx2word_load = load_pickle("./drqa/1-tokenizers/result/dict_idx2word.pkl")
vocab_load = load_pickle("./drqa/1-tokenizers/result/list_word_vocab.pkl")

load object from ./drqa/1-tokenizers/result/dict_word2idx.pkl success,that is <class 'dict'>
load object from ./drqa/1-tokenizers/result/dict_idx2word.pkl success,that is <class 'dict'>
load object from ./drqa/1-tokenizers/result/list_word_vocab.pkl success,that is <class 'list'>


In [209]:
# Load the pickled thwiki_itos object (presumably the index-to-string vocabulary of a pretrained Thai Wikipedia model — TODO confirm).
thwiki_itos = load_pickle("./data/pretrained_wiki/thwiki_itos.pkl")

load object from ./data/pretrained_wiki/thwiki_itos.pkl success,that is <class 'list'>


In [211]:
# Number of entries in the loaded thwiki_itos list.
len(thwiki_itos)

60005

In [None]:
def create_glove_matrix(path="./data//glove.840B.300d/glove.840B.300d.txt", dim=300):
    '''
    Parses the glove word vectors text file and returns a dictionary with the words as
    keys and their respective pretrained word vectors as values.
    Each line is expected to hold a word followed by `dim` space-separated numbers.
    :param str path: location of the vectors text file
    :param int dim: number of vector components per line
    :returns dict glove_dict: word -> float32 numpy vector of length dim
    '''
    glove_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Split from the right so a word that itself contains spaces stays in
            # one piece; a plain split(' ') would push parts of such a word into
            # the numeric columns and fail the float conversion.
            values = line.rstrip('\n').rsplit(' ', dim)
            if len(values) != dim + 1:
                # blank or truncated line: no complete vector to store
                continue
            glove_dict[values[0]] = np.asarray(values[1:], dtype="float32")

    # The with-block has already closed the file; no explicit close is needed.
    return glove_dict

In [213]:
import fasttext

ModuleNotFoundError: No module named 'fasttext'