In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pythainlp.tokenize import word_tokenize
from string import punctuation

import numpy as np

import time

def reset(df):
    """Return a copy of ``df`` renumbered with a fresh 0..n-1 index.

    The previous index is discarded and the columns are left exactly as they
    were. Dropping the index directly (instead of materialising it as a
    column and then selecting the original columns again) keeps this working
    when the index name equals a column name and when column labels repeat.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to renumber; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and a default integer index.
    """
    return df.reset_index(drop=True)

print('OK !')

OK !


In [2]:
def canonicalize(string):
    """Tokenize ``string`` and return the cleaned, lower-cased tokens.

    The text is split with ``word_tokenize`` using the ``newmm`` engine. Each
    token is stripped of surrounding whitespace, then kept only if it is
    longer than one character and is not one of the known noise tokens.

    Parameters
    ----------
    string : str
        Raw text to tokenize.

    Returns
    -------
    list
        Lower-cased tokens in their original order.
    """
    # Multi-character tokens that carry no meaning; extend this set to add
    # further cleaning rules.
    noise_tokens = {'..', '...', 'ๆๆ'}

    normalized_tokens = []
    for raw_token in word_tokenize(string, engine = 'newmm'):
        token = raw_token.strip()
        # No separate string.punctuation test is needed: its entries are all
        # single characters, which the length check already rejects.
        if len(token) > 1 and token not in noise_tokens:
            normalized_tokens.append(token.lower())
    return normalized_tokens

def query(query_text, vectorizer, document_matrix, df, searched_field):
    """Return the rows of ``df`` that match ``query_text``, best match first.

    Parameters
    ----------
    query_text : str
        Free-text search string.
    vectorizer
        Object whose ``transform`` method turns a list of strings into the
        vector representation expected by ``find_similar``.
    document_matrix
        Document side of the similarity computation, forwarded to
        ``find_similar``. Row ``i`` is assumed to describe row ``i`` of
        ``df`` (positional) -- TODO confirm against how the matrix is built.
    df : pandas.DataFrame
        Frame holding the documents; it is not modified.
    searched_field : str
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A ``score`` column followed by the columns of ``df``, restricted to
        rows whose score is greater than zero, in the order produced by
        ``find_similar`` and with a fresh 0..n-1 index.
    """
    # Transform the query string using the vectorizer's vocabulary.
    query_vector = vectorizer.transform([query_text])

    # Ask for every document, then discard the ones that do not match at all.
    ranked = find_similar(query_vector, document_matrix, len(df))
    hits = [(int(position), float(score)) for position, score in ranked if score > 0]

    positions = [position for position, _ in hits]
    # An explicit float dtype keeps the column numeric even with no hits.
    scores = pd.DataFrame({'score': np.array([score for _, score in hits], dtype=float)})

    # Select by position so the lookup does not depend on df's index labels.
    matched = df.iloc[positions].reset_index(drop=True)
    return pd.concat([scores, matched], axis=1)


# Rank the rows of a term-document matrix against a query vector.
def find_similar(query_vector, td_matrix, top_k = 5):
    """Return the ``top_k`` rows of ``td_matrix`` most similar to ``query_vector``.

    The result is a list of ``(row_index, cosine_similarity)`` tuples ordered
    from the highest similarity to the lowest.
    """
    scores = cosine_similarity(query_vector, td_matrix).flatten()
    # argsort is ascending, so reverse it to put the best match first.
    best_first = scores.argsort()[::-1]
    leaders = best_first[0:top_k]
    return list(zip(leaders, scores[leaders]))

print('OK !')

OK !


In [3]:
# Build the TF-IDF vectorizer around our own tokenizer, `canonicalize`, so the
# corpus and every later query string are split into tokens the same way.
# NOTE(review): per the scikit-learn docs, sublinear_tf=True scales term
# frequency as 1 + log(tf) -- confirm this is the intended weighting.

tfidf_vectorizer = TfidfVectorizer(tokenizer=canonicalize, sublinear_tf=True)

# Marker so the cell output shows that it finished running.
print('OK')

OK


In [4]:
# Load the sample corpus from a path relative to the notebook's working
# directory; its 'content' column is the text that gets vectorized later on.
data = pd.read_csv('data/clean_sample_text.csv')

# Preview the first rows.
data.head()

Unnamed: 0,published,content,engagement,len
0,11/07/20 00:09,โดน​ค่า ATS 14\n บาท​กับ​หุ้น​อยู่​เลย,0.0,38
1,11/07/20 00:51,ถ้า​ตาม​นิยาม​นี้\n การ​ผูก​บัตร​เครดิต​กับ​แ...,0.0,184
2,11/07/20 01:30,ไร้​เงินสด เครดิต​ก็​ไม่​มี\n หนี้​นอก​ระบบ​ด...,0.0,60
3,11/07/20 01:34,ถั่ว​ต้ม​เลย ไร้​เงินสด\n ไร้​ของ​จะ​กิน,0.0,40
4,11/07/20 03:30,อยากให้ลดข้อแม้ในการเข้าถึงแหล่งเงินทุนของสถาบ...,0.0,278


In [5]:
# Fit the vectorizer on the corpus and build the TF-IDF document matrix.
# This can take time; after finishing, the transformed documents can be saved
# to a pickle file for later usage instead of being recomputed.
# perf_counter() is used for the timing because it is a monotonic clock meant
# for measuring durations, unlike the adjustable wall clock of time.time().
start = time.perf_counter()

tfidf_term_document_matrix = tfidf_vectorizer.fit_transform(data['content'])

print("Took " + "%.2f" % float(time.perf_counter() - start) + " sec")

Took 1.38 sec


In [6]:
# Search term (Thai) and the column the search is meant to target.
# NOTE: query() does not read searched_field at present.
query_text = "สนุก"
searched_field = 'content'

# Score the corpus against the search term; only rows with a positive
# similarity score come back, best match first.
result_list = query(query_text, tfidf_vectorizer, tfidf_term_document_matrix, data, searched_field)

# Despite the name, this is a DataFrame; display it as the cell output.
result_list

Unnamed: 0,score,published,content,engagement,len
0,0.439635,18/08/20 16:03,พวก​หัว​สาย​เอา​เงิน​ไป​จ่าย​กัน​สนุก​มา​นาน​แล้ว,0.0,49
1,0.36082,13/07/20 20:31,อัน​นี้​สนุก​แน่​ครับ\n https://www.facebook....,0.0,87
2,0.324446,13/07/20 20:35,อัน​นี้​มี​สนุก​และ​สี​สัน​แน่นอน\n https://w...,0.0,99


In [7]:
# Second example search: same call as before with a different Thai term.
# NOTE: query() does not read searched_field at present.
query_text = "เที่ยว"
searched_field = 'content'

# Score the corpus against the search term; only rows with a positive
# similarity score come back, best match first.
result_list = query(query_text, tfidf_vectorizer, tfidf_term_document_matrix, data, searched_field)

# Despite the name, this is a DataFrame; display it as the cell output.
result_list

Unnamed: 0,score,published,content,engagement,len
0,0.513902,17/08/20 17:35,ไป​เที่ยว​ไหน​น​น​น​น,0.0,21
1,0.444235,17/08/20 11:53,Sitanan Chuanoon เที่ยว​อีก​แล้ว,0.0,32
2,0.341085,17/08/20 11:36,Sageenah Nae มาก\n เที่ยว​ไหน​ล่ะ​หล่อน,0.0,39
3,0.336529,17/08/20 13:07,Aomm Thanomsub เที่ยว​สกล​จ้ะ​พี่,0.0,33
4,0.332468,17/08/20 20:48,Chanut Nut เเท็ก​ไมจะให้​ไป​เที่ยว​บ้าน​หรอ​5555,0.0,48
5,0.329796,17/08/20 13:57,ไป​เที่ยว​กัน​ไหม​เจ้า​เหมียว\n Titichaya Tak...,0.0,49
6,0.323319,17/08/20 11:14,เฟริส สี๋สี๋\n พา​ไป​เที่ยว​หน่อยยยยยย,0.0,38
7,0.314003,17/08/20 14:26,Aumarin Shuengharuethai Cha'may\n Begin ไป​เท...,0.0,53
8,0.313257,17/08/20 22:06,Mas Mas เที่ยว​ไหนดี ตอบมมสิ😁,0.0,29
9,0.311895,17/08/20 13:19,Thong Krubpom ปายยยยยยยยยยยยยยยยยยยยเที่ยว​ว​ว...,0.0,62
