In [23]:
import pathlib
import re
import sys
from string import punctuation
from typing import List, Tuple

import deepcut
import numpy as np
import pandas as pd
import thaispellcheck
from IPython.display import Audio
from langdetect import detect
from pythainlp import word_vector
from pythainlp.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

ROOT_DIR = pathlib.Path.cwd().parent
sys.path.insert(0, str(ROOT_DIR))
from src.normalizers import EnglishTextNormalizer
from src.evaluate_utils import isd
from src.post_processer import remove_longest_repeating_words

In [4]:
# Paths of the ground-truth and Thai/English result CSVs inside the target result folder.
# NOTE: the `*_dir` names point at CSV files, not directories.
target_folder = ROOT_DIR.joinpath(
    'data/result/Thonburian-whisper-deepbrain-lang',
    'th_Oppday_Q2_2023_IP_interfama',
)
label_dir = target_folder.joinpath('Groundtruth.csv')
th_result_dir = target_folder.joinpath('th_result.csv')
en_result_dir = target_folder.joinpath('en_result.csv')

# Word-vector model (thai2fit_wv, loaded through pythainlp) that evaluate_text indexes by token.
model = (
    word_vector
    .WordVector(model_name="thai2fit_wv")
    .get_model()
)

In [6]:
# Load the ground truth and both result tables; read_csv accepts the pathlib
# paths directly, so they are passed without a str() conversion.
label_df, th_result_df, en_result_df = (
    pd.read_csv(csv_path)
    for csv_path in (label_dir, th_result_dir, en_result_dir)
)

In [42]:
# Stack the Thai and English result frames row-wise.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of each input frame are repeated, so label-based lookups become
# ambiguous and the index written by to_csv is not unique.
merge_result_df = pd.concat(
    [th_result_df, en_result_df],
    axis=0,
    ignore_index=True,
)
merge_result_df.shape

(888, 3)

In [65]:
# Scratch check: stripping a whitespace-only string yields the empty string.
" ".strip()

''

In [112]:
# Tokens that evaluate_text skips entirely: they are neither looked up in the
# word-vector model nor reported as incorrect words.
EXCEPTION_WORDS = {'ใน', 'ประเทศเนี่ยโรง'}

In [145]:
def evaluate_text(text: str, window_length: int=3, threshold: float=0.3) -> Tuple[bool, float]:
    """Score how plausible a transcript line is using word-vector similarity.

    The text is tokenized with deepcut and each token is looked up in the
    module-level word-vector ``model``. Whitespace-only tokens and tokens listed
    in ``EXCEPTION_WORDS`` are skipped. A token that is missing from the model
    is printed and marks the text as incorrect.

    The remaining vectors are split into consecutive, non-overlapping windows of
    ``window_length`` vectors. For every full window the mean of the pairwise
    cosine-similarity matrix is taken, and the window means are averaged.
    Two properties are kept from the original scoring so existing thresholds
    stay comparable:

    * the matrix mean includes the diagonal (each vector compared with itself);
    * vectors left over after the last full window are not scored.

    Args:
        text: The sentence / transcript line to evaluate.
        window_length: Number of word vectors per window; must be >= 1. When it
            is at least the number of vectors, all vectors form one window.
        threshold: Texts whose mean similarity is below this value are flagged
            as incorrect.

    Returns:
        ``(is_correct, cosine_sim)``. ``is_correct`` is False when any token is
        missing from the model or the similarity is below ``threshold``.
        ``cosine_sim`` is 1.0 when fewer than two vectors are available.

    Raises:
        ValueError: If ``window_length`` is smaller than 1.
    """
    if window_length < 1:
        raise ValueError(f"window_length must be >= 1, got {window_length}")

    words = deepcut.tokenize(text)
    vectors = []
    is_correct = True
    for word in words:
        if not word.strip() or word in EXCEPTION_WORDS:
            continue
        try:
            vectors.append(model[word])
        except KeyError:
            # A token absent from the word-vector model may be jargon or a
            # misspelled/incorrect word; either way flag the text.
            is_correct = False
            print(f"incorrect words {word}")

    if len(vectors) < 2:
        # Not enough vectors to compare. Keep the out-of-vocabulary verdict
        # instead of unconditionally reporting the text as correct.
        return is_correct, 1.0

    if window_length >= len(vectors):
        cosine_sim = float(np.mean(cosine_similarity(vectors)))
    else:
        # Window starts are 0, w, 2w, ... for as long as a full window fits.
        window_scores = [
            np.mean(cosine_similarity(vectors[start:start + window_length]))
            for start in range(0, len(vectors) - window_length + 1, window_length)
        ]
        cosine_sim = float(np.mean(window_scores))

    print(f"cosine_similarity: {cosine_sim}")
    if cosine_sim < threshold:
        print(words)
        is_correct = False

    return is_correct, cosine_sim

In [147]:
# evaluate_text returns an (is_correct, cosine_similarity) tuple for each text.
# Unpack it so `is_correct` holds plain booleans and the score gets its own
# column, instead of storing the whole tuple in `is_correct`.
evaluations = [evaluate_text(text) for text in merge_result_df['text']]
merge_result_df['is_correct'] = [flag for flag, _ in evaluations]
merge_result_df['cosine_similarity'] = [score for _, score in evaluations]

0.68216884
cosine_similarity: 0.6821688413619995
cosine_similarity: 0.3707128167152405
cosine_similarity: 0.4303341805934906
incorrect words Auxility
incorrect words Day
cosine_similarity: 0.4031587839126587
cosine_similarity: 0.38849517703056335
incorrect words ไทม่าส์
cosine_similarity: 0.41859614849090576
incorrect words บริษัท Interfarm
cosine_similarity: 0.431443452835083
cosine_similarity: 0.42583149671554565
cosine_similarity: 0.40232759714126587
cosine_similarity: 0.40667060017585754
cosine_similarity: 0.3961417078971863
incorrect words ทรัลสุขภาพ
cosine_similarity: 0.40245503187179565
cosine_similarity: 0.47153547406196594
cosine_similarity: 0.3950575590133667
cosine_similarity: 0.40736478567123413
cosine_similarity: 0.43126532435417175
cosine_similarity: 0.4266875684261322
cosine_similarity: 0.4041251242160797
cosine_similarity: 0.42911002039909363
incorrect words อนุมัติธรรม
cosine_similarity: 0.4242708384990692
cosine_similarity: 0.41966065764427185
cosine_similarity: 0.454

In [79]:
# Save the evaluated frame. The path is relative, so the file is written to the
# notebook's current working directory.
merge_result_df.to_csv("checked_result.csv")

In [140]:
# Sample input for a manual spot check of evaluate_text. Despite the variable
# name, the recorded run below reports this text as correct.
incorrect = 'สวัสดีครับ'

In [141]:
# Spot check: displays the (is_correct, cosine_similarity) tuple for the sample text.
evaluate_text(incorrect)

0.68216884
['สวัสดี', 'ครับ']
cosine_similarity: 0.6821688413619995


(True, 0.68216884)