# Set environment

## pip install new package

In [1]:
!pip install pythainlp
!pip install fasttext

Collecting pythainlp
  Downloading pythainlp-3.0.5-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 23.5 MB/s 
Collecting tinydb>=3.0
  Downloading tinydb-4.7.0-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, pythainlp
Successfully installed pythainlp-3.0.5 tinydb-4.7.0
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 6.1 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3134959 sha256=93ccb5e50bdf4209d84c4f76f8e3e4f8f70ab2b696a08b24c062d67b1d3f7f5a
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Success

## import package to environment 

In [1]:
#Import Package
import pandas as pd
import numpy as np
import json, re, unicodedata, string, typing, time ,os
from collections import Counter
import pickle
from pythainlp.tokenize import word_tokenize
from bs4 import BeautifulSoup
#import fasttext
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Google drive

In [3]:
#mount my google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Colab Notebooks/BADS9000_IS/Colab-DrQA

/content/drive/MyDrive/Colab Notebooks/BADS9000_IS/Colab-DrQA


## Function

In [3]:
def load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print("Length of data: ", len(data['data']))
    print("Data Keys: ", data['data'][0].keys())   
    return data

def load_article(file):
    try :
        with open(file, 'r', encoding='utf-8') as f:
            context = f.read()
    except :
        context = "-"
        print(f"Error Article id {id}")
    return  context

def gather_text_for_vocab(dfs:list):
    '''
    Gathers text from contexts and questions to build a vocabulary.
    '''
    text = []
    total = 0
    for df in dfs:
        unique_contexts = list(df.context.unique())
        unique_questions = list(df.question.unique())
        total += df.context.nunique() + df.question.nunique()
        text.extend(unique_contexts + unique_questions)
    
    assert len(text) == total #For debugging code
    print("Number of sentences in dataset: ", len(text))
    return text

def build_word_vocab(vocab_text):
    """Tokenizer Ref : https://github.com/PyThaiNLP/pythainlp """
    words = []
    for sent in vocab_text:
        for word in word_tokenize(sent ,engine='newmm'):
            words.append(word)
    word_counter = Counter(words)
    word_vocab = sorted(word_counter, key=word_counter.get, reverse=True)
    #ref : https://github.com/nicolas-ivanov/tf_seq2seq_chatbot/issues/15#issuecomment-246106807
    word_vocab.insert(0, '<unk>')  #"replace the unknown word --> My name is _unk_"
    word_vocab.insert(1, '<pad>') #"padded from either side to fit this length"
    print(f"raw-vocab: {len(word_vocab)}")
    print(f"vocab-length: {len(word_vocab)}")
    word2idx = {word:idx for idx, word in enumerate(word_vocab)}
    print(f"word2idx-length: {len(word2idx)}")
    idx2word = {v:k for k,v in word2idx.items()}
    
    return word2idx, idx2word, word_vocab

def word_to_ids(text, word2idx):
    '''
    Converts word text to their respective ids by mapping each word
    using word2idx. Input text is tokenized using spacy tokenizer first.
    :param str text: context text to be converted
    :param dict word2idx: word to id mapping
    :returns list context_ids: list of mapped ids
    :raises assertion error: sanity check
    '''
    words_tokens = [w for w in word_tokenize(text ,engine='newmm') ]
    words_ids = [word2idx[word] for word in words_tokens]
    
    assert len(words_ids) == len(words_tokens)
    return words_ids

def save_to_pickle(save_obj, path_file):
    with open(path_file, 'wb') as file:
        pickle.dump(save_obj, file)
    print(f"save {path_file[27:-4]} to {path_file} success")

def load_pickle(path_file):
    with open(path_file, 'rb') as file:
        load_obj = pickle.load(file)
        print(f"load object from {path_file} success,that is {type(load_obj)}")
        return load_obj

# Built data for Training

## Load Data JSON to DataFrame (PC)

In [4]:
#For PC
os.chdir('C:/Users/taiti/OneDrive/MasterDegree/BADS9000_IS/ThaiDrQA')
print(f"Current working directory {os.getcwd()}")

Current working directory C:\Users\taiti\OneDrive\MasterDegree\BADS9000_IS\ThaiDrQA


In [18]:
#Open Json File ThaiQA04K ThaiQA15K
file = "ThaiQA04K"
data = load_json(f"./data/{file}.json")

Length of data:  4000
Data Keys:  dict_keys(['question_id', 'question', 'answer', 'answer_begin_position', 'answer_end_position', 'article_id'])


In [19]:
data['data'][1]

{'question_id': 2,
 'question': 'ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวียต มีชื่อเรียกอีกชื่อว่าอะไร',
 'answer': 'เมชตา',
 'answer_begin_position': 139,
 'answer_end_position': 144,
 'article_id': 376583}

In [20]:
#load data JSON to DataFrame
qa_lst = []
for qa in data['data']:
    qa_dict = {}
    qa_dict['article_id'] = qa['article_id']
    qa_dict['id'] = qa['question_id']
    qa_dict['html_context']  = load_article( f"./data/documents-nsc/{qa['article_id']}.txt")
    qa_dict['question'] = qa['question']
    qa_dict['html_label'] = [qa['answer_begin_position'],qa['answer_end_position']]
    qa_dict['html_answer'] = qa['answer']
    qa_lst.append(qa_dict)

df_qa = pd.DataFrame(qa_lst)
df_qa = df_qa[~df_qa.html_answer.isin(['ไม่ใช่','ใช่'])]
print(f"Shap:{df_qa.shape}")
display(df_qa.head(2))

Shap:(4000, 6)


Unnamed: 0,article_id,id,html_context,question,html_label,html_answer
0,115035,1,"<doc id=""115035"" url=""https://th.wikipedia.org...",สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,"[529, 538]",ฮิกกิ้นส์
1,376583,2,"<doc id=""376583"" url=""https://th.wikipedia.org...",ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,"[139, 144]",เมชตา


In [21]:
save_to_pickle(df_qa , f"./drqa/1-tokenizers/result/df_{file}_0_origi.pkl")

save df_ThaiQA04K_0_origi to ./drqa/1-tokenizers/result/df_ThaiQA04K_0_origi.pkl success


## Data cleaning

In [6]:
file = "ThaiQA04K" #ThaiQA04K ThaiQA15K
df_qa = load_pickle(f"./drqa/1-tokenizers/result/df_{file}_0_origi.pkl")
df_qa = df_qa[~df_qa.html_answer.isin(['ไม่ใช่','ใช่'])] #Exclude Yes/No Answer

display(df_qa.head(2))
print(f"shape of df_qa {df_qa.shape}")

load object from ./drqa/1-tokenizers/result/df_ThaiQA04K.pkl success,that is <class 'pandas.core.frame.DataFrame'>


AttributeError: ignored

In [None]:
def label_answer(context ,answer):
  try:
    start_idx = context.index(answer)
    re_val = [start_idx, start_idx+len(answer)]
  except:
    re_val = np.nan
  return re_val
  
df_qa['context'] = df_qa['html_context'].apply(lambda x: re.sub(r'\s', ' ',BeautifulSoup(x, 'lxml').text))
df_qa['label']  = df_qa.apply(lambda x: label_answer(x['context'],x['html_answer']) , axis=1)
df_qa = df_qa[~df_qa.label.isna()]
display(df_qa.head(2))
print(f"shape of df_qa {df_qa.shape}")

Unnamed: 0,article_id,id,html_context,question,html_label,html_answer,context,label
0,115035,1,"<doc id=""115035"" url=""https://th.wikipedia.org...",สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,"[529, 538]",ฮิกกิ้นส์,เบนจี้ เบนจี้ () เป็นชื่อตัวละครหมาพันทางแสนรู...,"[447, 456]"
1,376583,2,"<doc id=""376583"" url=""https://th.wikipedia.org...",ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,"[139, 144]",เมชตา,ลูนา 1 ลูนา 1 (อี-1 ซีรีส์) ซึ่งในขณะนั้นรู้จั...,"[57, 62]"


shape of df_qa (14994, 8)


In [None]:
#Check map new answer
def getanswer(context,label):
  sidx,eidx = label
  return context[sidx:eidx]
print(f" Correct Answer : {sum(df_qa.apply(lambda x: getanswer(x['context'],x['label']) , axis=1) == df_qa['html_answer'])}")

 Correct Answer : 14994


In [None]:
save_to_pickle(df_qa.drop(columns=['html_context','html_label']).rename(columns={'html_answer':'answer'}) , 
               f"./drqa/1-tokenizers/result/df_{file}_cleaning.pkl")

save df_ThaiQACorpus_cleaning to ./drqa/1-tokenizers/result/df_ThaiQACorpus_cleaning.pkl success


## Create list of Word

In [None]:
file = "ThaiQACorpus"
df_qa = load_pickle(f"./drqa/1-tokenizers/result/df_{file}_cleaning.pkl")
print(f"shape of df_train {df_qa.shape}")
display(df_qa.head(2))

load object from ./drqa/1-tokenizers/result/df_ThaiQACorpus_cleaning.pkl success,that is <class 'pandas.core.frame.DataFrame'>
shape of df_train (14994, 6)


Unnamed: 0,article_id,id,question,answer,context,label
0,115035,1,สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,ฮิกกิ้นส์,เบนจี้ เบนจี้ () เป็นชื่อตัวละครหมาพันทางแสนรู...,"[447, 456]"
1,376583,2,ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,เมชตา,ลูนา 1 ลูนา 1 (อี-1 ซีรีส์) ซึ่งในขณะนั้นรู้จั...,"[57, 62]"


In [None]:
# gather text to build vocabularies
%%time
vocab_text = gather_text_for_vocab([df_qa])
word2idx, idx2word, word_vocab = build_word_vocab(vocab_text)

Number of sentences in dataset:  23669
raw-vocab: 111891
vocab-length: 111891
word2idx-length: 111891
CPU times: user 1min 47s, sys: 817 ms, total: 1min 47s
Wall time: 1min 47s


In [None]:
# numericalize context and questions
%%time
df_qa['context_ids']   = df_qa.context.apply(word_to_ids,   word2idx=word2idx)
df_qa['question_ids'] = df_qa.question.apply(word_to_ids,  word2idx=word2idx)
display(df_qa.head(2))

Unnamed: 0,article_id,id,question,answer,context,label,context_ids,question_ids
0,115035,1,สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,ฮิกกิ้นส์,เบนจี้ เบนจี้ () เป็นชื่อตัวละครหมาพันทางแสนรู...,"[447, 456]","[2695, 5276, 2, 2695, 5276, 2, 162, 2, 7, 65, ...","[2329, 133, 91, 131, 7, 2695, 5276, 3, 105, 64..."
1,376583,2,ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,เมชตา,ลูนา 1 ลูนา 1 (อี-1 ซีรีส์) ซึ่งในขณะนั้นรู้จั...,"[57, 62]","[1153, 155, 2, 50, 2, 1153, 155, 2, 50, 2, 10,...","[1153, 155, 2, 50, 2, 7, 6661, 840, 91, 3, 447..."


CPU times: user 3min 24s, sys: 588 ms, total: 3min 24s
Wall time: 3min 24s


In [None]:
def test_indices(df, idx2word):
    '''
    Performs the tests mentioned above. This method also gets the start and end of the answers
    with respect to the context_ids for each example.
    :param dataframe df: SQUAD df
    :param dict idx2word: inverse mapping of token ids to words
    :returns
        list start_value_error: example idx where the start idx is not found in the start spans
                                of the text
        list end_value_error: example idx where the end idx is not found in the end spans
                              of the text
        list assert_error: examples that fail assertion errors. A majority are due to the above errors
    '''

    start_value_error = []
    end_value_error = []
    assert_error = []
    for index, row in df.iterrows():
        answer_tokens = [w for w in word_tokenize(row['answer'] ,engine='newmm')]
        
        context_tokens = word_tokenize(row['context'] ,engine='newmm')
        context_span  = [(len("".join(context_tokens[0:i])), len("".join(context_tokens[0:i+1]))) 
                         for i,w in enumerate(context_tokens)]
        starts, ends = zip(*context_span)

        answer_start, answer_end = row['label']

        try:
            start_idx = starts.index(answer_start)
        except:
            start_value_error.append(index)
        try:
            end_idx  = ends.index(answer_end)
        except:
            end_value_error.append(index)

        try:
            assert idx2word[row['context_ids'][start_idx]] == answer_tokens[0]
            assert idx2word[row['context_ids'][end_idx]] == answer_tokens[-1]
        except:
            assert_error.append(index)


    return start_value_error, end_value_error, assert_error

def get_error_indices(df, idx2word):
    
    start_value_error, end_value_error, assert_error = test_indices(df, idx2word)
    err_idx = start_value_error + end_value_error + assert_error
    err_idx = set(err_idx)
    print(f"Number of error indices: {len(err_idx)}")
    
    return err_idx

In [None]:
%%time 
train_err = get_error_indices(df_qa, idx2word)

df_qa.drop(train_err, inplace=True)
print(f"Shape of data frame after drop error row: {df_qa.shape}")
#Some row is error explore that how to fixed that#

Number of error indices: 807
Shape of data frame after drop error row: (14187, 8)
CPU times: user 25min 57s, sys: 3.93 s, total: 26min 1s
Wall time: 25min 58s


In [None]:
def index_answer(row, idx2word):
    '''
    Takes in a row of the dataframe or one training example and
    returns a tuple of start and end positions of answer by calculating 
    spans.
    '''
    context_tokens = word_tokenize(row['context'] ,engine='newmm')
    context_span  = [(len("".join(context_tokens[0:i])), len("".join(context_tokens[0:i+1]))) 
                     for i,w in enumerate(context_tokens)]
    starts, ends = zip(*context_span)
    
    answer_start, answer_end = row['label']
    
    start_idx = starts.index(answer_start)
    end_idx  = ends.index(answer_end)
    
    answer_tokens = [w for w in word_tokenize(row['answer'] ,engine='newmm')]
    assert idx2word[row.context_ids[start_idx]] == answer_tokens[0]
    assert idx2word[row.context_ids[end_idx]] == answer_tokens[-1]
    
    return [start_idx, end_idx]

In [None]:
%%time
#label_idx is position in context_ids ,which value convert to word by idx2word[]
label_idx = df_qa.apply(index_answer, axis=1, idx2word=idx2word)
df_qa['label_idx'] = label_idx
print(f"df_qa.shape:{df_qa.shape}")
display(df_qa.head(2))

df_qa.shape:(14187, 9)


Unnamed: 0,article_id,id,question,answer,context,label,context_ids,question_ids,label_idx
0,115035,1,สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,ฮิกกิ้นส์,เบนจี้ เบนจี้ () เป็นชื่อตัวละครหมาพันทางแสนรู...,"[447, 456]","[2695, 5276, 2, 2695, 5276, 2, 162, 2, 7, 65, ...","[2329, 133, 91, 131, 7, 2695, 5276, 3, 105, 64...","[133, 134]"
1,376583,2,ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,เมชตา,ลูนา 1 ลูนา 1 (อี-1 ซีรีส์) ซึ่งในขณะนั้นรู้จั...,"[57, 62]","[1153, 155, 2, 50, 2, 1153, 155, 2, 50, 2, 10,...","[1153, 155, 2, 50, 2, 7, 6661, 840, 91, 3, 447...","[25, 27]"


CPU times: user 24min 32s, sys: 3.93 s, total: 24min 36s
Wall time: 24min 33s


## Save Data to plckle

In [None]:
#Save this State
save_to_pickle(df_qa , f"./drqa/1-tokenizers/result/df_{file}_prepairing.pkl")
save_to_pickle(word2idx , "./drqa/1-tokenizers/result/dict_word2idx.pkl")
save_to_pickle(idx2word , "./drqa/1-tokenizers/result/dict_idx2word.pkl")
save_to_pickle(word_vocab , "./drqa/1-tokenizers/result/list_word_vocab.pkl")

save df_ThaiQACorpus_prepairing to ./drqa/1-tokenizers/result/df_ThaiQACorpus_prepairing.pkl success
save dict_word2idx to ./drqa/1-tokenizers/result/dict_word2idx.pkl success
save dict_idx2word to ./drqa/1-tokenizers/result/dict_idx2word.pkl success
save list_word_vocab to ./drqa/1-tokenizers/result/list_word_vocab.pkl success


## Load Data

In [None]:
file = "ThaiQACorpus"
df_qa = load_pickle(f"./drqa/1-tokenizers/result/df_{file}_prepairing.pkl")
word2idx = load_pickle("./drqa/1-tokenizers/result/dict_word2idx.pkl")
idx2word = load_pickle("./drqa/1-tokenizers/result/dict_idx2word.pkl")
word_vocab = load_pickle("./drqa/1-tokenizers/result/list_word_vocab.pkl")

load object from ./drqa/1-tokenizers/result/df_ThaiQACorpus_prepairing.pkl success,that is <class 'pandas.core.frame.DataFrame'>
load object from ./drqa/1-tokenizers/result/dict_word2idx.pkl success,that is <class 'dict'>
load object from ./drqa/1-tokenizers/result/dict_idx2word.pkl success,that is <class 'dict'>
load object from ./drqa/1-tokenizers/result/list_word_vocab.pkl success,that is <class 'list'>


# Embedding 
ref : https://fasttext.cc/docs/en/crawl-vectors.html

## FASTTEXT

In [None]:
%%time
th_embedding = fasttext.load_model("./data/cc.th.300.bin")
print(f"Embedding dimension : {th_embedding.get_dimension()}")
print(f"Number of word in embedding : {len(th_embedding.get_words())}")
print(f"Example word in embedding : {th_embedding.get_words()[0:15]}")

Embedding dimension : 300
Number of word in embedding : 2000000
Example word in embedding : ['ที่', '</s>', "'", '.', 'การ', '-', 'ใน', 'และ', ')', '(', 'เป็น', 'ของ', ',', 'ได้', ':']
CPU times: user 5.73 s, sys: 8.09 s, total: 13.8 s
Wall time: 1min 47s




In [None]:
def create_embed_matrix():
  embed_matrix = {}
  for word in th_embedding.get_words():
    embed_matrix[word] = th_embedding.get_word_vector(word)
  return embed_matrix

In [None]:
%%time
embed_matrix = create_embed_matrix()
print(f"Size of embed_matrix : {len(embed_matrix)}")

Size of embed_matrix : 2000000
CPU times: user 25.9 s, sys: 7.89 s, total: 33.8 s
Wall time: 29.9 s


In [None]:
def create_word_embedding(embed_matrix):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.
    '''
    weights_matrix = np.zeros((len(word_vocab), 300))
    words_found = 0
    not_found = []
    for i, word in enumerate(word_vocab):
        try:
            weights_matrix[i] = embed_matrix[word]
            words_found += 1
        except:
          not_found.append(word)
    return weights_matrix, words_found ,not_found

In [None]:
%%time
weights_matrix, words_found, not_found = create_word_embedding(embed_matrix)
print(f"Total words found in glove vocab: {words_found}/{len(word_vocab)}", )

Total words found in glove vocab: 38076/61036
CPU times: user 148 ms, sys: 184 ms, total: 332 ms
Wall time: 328 ms


## PythaiNLP : thai2fit_wv

In [None]:
from pythainlp.corpus import get_corpus_path
from gensim.models import KeyedVectors
path = get_corpus_path("thai2fit_wv")
thaiW2V = KeyedVectors.load_word2vec_format(path, binary=True)
print(f"number of word in corpus {len(thaiW2V.vocab.keys()):0,.0f}")
print(f"number of dimention of vector {thaiW2V.vector_size}")

Corpus: thai2fit_wv
- Downloading: thai2fit_wv 0.1


100%|██████████| 62452646/62452646 [00:04<00:00, 12787470.92it/s]


number of word in corpus 51,358
number of dimention of vector 300


In [None]:
weights_matrix = np.zeros((len(word_vocab), 300))
words_found = 0
not_found = []
for i, word in enumerate(word_vocab):
  try:
    weights_matrix[i] = thaiW2V.get_vector(word)
    words_found += 1
  except:
    not_found.append(word)
print(f"Total words found in thainlp W2V: {words_found}/{len(word_vocab)}", )
print(f"Example word not found : {not_found[0:20]}")

Total words found in thainlp W2V: 29444/61036
Example word not found : [' ', 'ใน', '"', '\n', 'อร', '/', 'ส์', 'ต่าง ๆ', 'The', 'ยังมี', 'เป็นที่', 'ฯ', 'สเปน', 'พระองค์เจ้า', 'บี', 'จุฬาลงกรณ์', 'นี', 'ประ', 'มิวสิก', 'ณ์']


## PythaiNLP : ltw2v

In [None]:
from pythainlp.corpus import get_corpus_path
from gensim.models import KeyedVectors
path = get_corpus_path("ltw2v")
ltw2v = KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore')
print(f"number of word in corpus {len(ltw2v.vocab.keys()):0,.0f}")
print(f"number of dimention of vector {ltw2v.vector_size}")

Corpus: ltw2v
- Downloading: ltw2v 0.1


100%|██████████| 1178368364/1178368364 [00:10<00:00, 115097495.03it/s]


number of word in corpus 731,185
number of dimention of vector 400


In [None]:
%%time
embed_dict = {}
for word in ltw2v.vocab.keys():
  embed_dict[word] = ltw2v.get_vector(word)[0:300]
print(f"size of embed_dict is {len(embed_dict)}")

size of embed_dict is 731185
CPU times: user 1.35 s, sys: 26.1 ms, total: 1.37 s
Wall time: 1.37 s


In [None]:
%%time
weights_matrix = np.zeros((len(word_vocab), 300))
words_found = 0
not_found = []
for i, word in enumerate(word_vocab):
  try:
    weights_matrix[i] = ltw2v.get_vector(word)[0:300]
    words_found += 1
  except:
    not_found.append(word)
print(f"Total words found in ltw2w : {words_found}/{len(word_vocab)}", )
print(f"Example word not found : {not_found[0:20]}")
print(f"Shape of weights_matrix is : {weights_matrix.shape}")

Total words found in ltw2w : 87264/111891
Example word not found : ['<unk>', '<pad>', ' ', 'ต่าง ๆ', '  ', 'เลื่อน', 'เสรี', 'อัญเชิญ', 'สะดวก', 'ทางตะวันตก', 'แฮร์รี่ พอตเตอร์', 'สมเด็จพระเทพรัตนราชสุดาฯ สยามบรมราชกุมารี', 'เรื่อย ๆ', 'สมเด็จพระนางเจ้าสิริกิติ์ พระบรมราชินีนาถ', 'ผู้กำกับภาพยนตร์', 'ปน', 'วิศวกร', 'ทุก ๆ', '   ', 'จริง ๆ']
Shape of weights_matrix is : (111891, 300)
CPU times: user 380 ms, sys: 132 ms, total: 512 ms
Wall time: 511 ms


In [None]:
#Save data
%%time
save_to_pickle(embed_dict , "./drqa/1-tokenizers/result/dict_embed_ltw2v.pkl")
np.save('./drqa/1-tokenizers/result/dfqa2v_ltw2v.npy',weights_matrix)

save dict_embed_ltw2v to ./drqa/1-tokenizers/result/dict_embed_ltw2v.pkl success
CPU times: user 5.47 s, sys: 2.63 s, total: 8.1 s
Wall time: 37.5 s


## My Word2Vec by Gensim skip-gram

# prepairing data

In [None]:
df_qa.head(2)

Unnamed: 0,article_id,id,question,answer,context,label,context_ids,question_ids,label_idx
0,115035,1,สุนัขตัวแรกรับบทเป็นเบนจี้ในภาพยนตร์เรื่อง Ben...,ฮิกกิ้นส์,เบนจี้ เบนจี้ () เป็นชื่อตัวละครหมาพันทางแสนรู...,"[447, 456]","[2695, 5276, 2, 2695, 5276, 2, 162, 2, 7, 65, ...","[2329, 133, 91, 131, 7, 2695, 5276, 3, 105, 64...","[133, 134]"
1,376583,2,ลูนา 1 เป็นยานอวกาศลำแรกในโครงการลูนาของโซเวีย...,เมชตา,ลูนา 1 ลูนา 1 (อี-1 ซีรีส์) ซึ่งในขณะนั้นรู้จั...,"[57, 62]","[1153, 155, 2, 50, 2, 1153, 155, 2, 50, 2, 10,...","[1153, 155, 2, 50, 2, 7, 6661, 840, 91, 3, 447...","[25, 27]"


In [None]:
unique_ctx = df_qa.context.apply(lambda x: ",".join(word_tokenize(x ,engine='newmm'))).unique()
unique_qtn = df_qa.question.apply(lambda x: ",".join(word_tokenize(x ,engine='newmm'))).unique()

In [None]:
lst_ctx_qnt = np.hstack((unique_ctx,unique_qtn))
print(lst_ctx_qnt.shape)

(22663,)


In [None]:
np.save('./drqa/1-tokenizers/result/token_ctx_qnt.npy',lst_ctx_qnt)

# Trainmodel

In [None]:
!pip install gensim==3.8.3

Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 1.4 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3


In [None]:
# from gensim.test.utils import common_texts, get_tmpfile
# from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import numpy as np
import pickle

In [None]:
lst_ctx_qnt = np.load('./drqa/1-tokenizers/result/token_ctx_qnt.npy' ,allow_pickle=False)

ValueError: ignored

In [None]:
lst_input = [sent.split(',') for sent in lst_ctx_qnt]

NameError: ignored

In [None]:
model = Word2Vec(sent, min_count=1,size= 300,workers=3, window = 3, sg = 1) #sg = 1 is skip-gram ,sg= 0 is CBOW (default)

# Summary

Problem


1. Lost 50% of word in QA because word aren't found in thai Embedding Fasttext
2. https://pythainlp.github.io/docs/2.2/api/word_vector.html
3. https://pythainlp.github.io/dev-docs/api/word_vector.html
4. https://pythainlp.github.io/pythainlp-corpus/list-corpus.html
4. we use original context that is html.