# Import

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pythainlp.util import normalize
from pythainlp.ulmfit.utils import ThaiTokenizer
from pythainlp.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

import numpy
numpy.set_printoptions(threshold=numpy.nan)

import re,string
import pythainlp
import pandas as pd

# Function

In [2]:
def text_to_tokens(text, stop_words=[]):
    # nomalize
    text = pythainlp.util.normalize(text)
    
    # remove duplicate ending characters
    for m in re.finditer(r'([\u0E00-\u0E7F])(\1{2,})', text):
        text = text.replace(m.group(0),m.group(1),1)
        
    # remove url
    text = re.sub(r'http\S+', '', text)
    
    # remove CR
    text = text.replace('CR', '').replace('SR', '').replace('\xa0', '')
        
    # remove unwanted character
    pattern = re.compile(r"[^\u0E00-\u0E7Fa-zA-Z ]|ๆ")
    text = pattern.sub('',text)
    
    # lowercase
    text = text.lower()
    
    # tokenization
    tokens = pythainlp.tokenize.word_tokenize(text, engine='newmm', whitespaces=False)
    
    # remove stopwords
    tokens = [t for t in tokens if not t in stop_words] 

    return tokens

def pos_filter_noun(tokens):
    postags =  pos_tag(tokens, engine = 'artagger')
    filterpos = []
    noun = ['NPRP','NCMN']
    for w in postags:
        
        # filter noun
        if w[1] in noun:
            filterpos.append(w[0])
        
    return filterpos

def find_stopwords(df):
    stopwords = []
    
    for index,row in df.iterrows():
        if row['label'] == 0:
            stopwords.append(row['Stopword'])
    return stopwords



# Generate Stopwords

This part of code is used to create array of stopword.

In [3]:
df = pd.read_csv('./data/stopwords.csv',encoding='utf-8-sig')
df.fillna(0,inplace = True)

stopwords = find_stopwords(df)

# Generate Corpus
This part of code is used to create corpus which is a array of term and document.

In [4]:
corpus = []

# load second corpus 
df = pd.read_csv('./data/predicted-non-questions - question.csv',encoding='utf-8-sig')
df = df.drop(['label'],axis = 1)

# load first corpus
df2 = pd.read_csv('./data/predited-questions - question and _0.75.csv',encoding='utf-8-sig')

# merge both corpus
all_question = df.append(df2,ignore_index=True)

# loop tokenize each sentence in the corpus
for w in tqdm(all_question['text']):
    corpus.append(text_to_tokens(w,stopwords))
    
# join each tokenize in each sentence with "<some_space>"
corpus = ["<some_space>".join(x) for x in corpus]

# add tokenizer my own
vectorizer = TfidfVectorizer(tokenizer = lambda x: x.split("<some_space>"), analyzer="word")

# fit create a array of term and document.
courpus_vector = vectorizer.fit_transform(corpus).toarray()


100%|██████████| 19356/19356 [00:07<00:00, 2567.01it/s]


# Test Find Similarity Function

In [5]:
def jaccard_similarity(list1, list2):
    
    # find intersection
    intersection = len(list(set(list1).intersection(list2)))
    
    # find union
    union = (len(list1) + len(list2)) - intersection
    
    # find ratio
    return float(intersection / union)

# vectorizer is used to create a vector from a text, corpus vector is a array of terms and documents
# thres is cosine similarity score, n is number of result, jac is jaccard_similarity score

def find_sim_of_corpus(text,vectorizer,courpus_vector,thres = 0.5,n=10,jac = 0.1): 
    
    # create the vector from the text
    vector = "<some_space>".join(text_to_tokens(text))
    vector = vectorizer.transform([vector]).toarray()[0]

    # caculate score of the vector with the corpus
    socres = []
    i = 0
    for s in courpus_vector:
        score = cosine_similarity(s.reshape(1, -1),vector.reshape(1, -1))
        socres.append({
            "text":all_question['text'][i],
            "score":score[0][0]
        })
        i+=1

    df = pd.DataFrame(socres)
    df = df.sort_values(by=['score'],ascending=False)
    
    # get keywords from the text
    tokens_text = text_to_tokens(text)
    topics_text = pos_filter_noun(tokens_text)
    
    # get top rank that fit with the setting condition
    top = []
    for index,row in tqdm(df.iterrows()):
        
        # get keywords from the sentence in the corpus
        tokens = text_to_tokens(row['text'])
        topics = pos_filter_noun(tokens)
        
        # find jaccard_similarity
        jacard_score = jaccard_similarity(topics_text,topics)
        
        # check condition
        if(row['score'] >= thres) and jacard_score >= jac:
            top.append({
                "text":row['text'],
                "score":row['score'],
                "jacard": jacard_score
            })
        
        # check number of result
        if len(top) == n:
            break
    return top

In [6]:
sims = find_sim_of_corpus('ใครเคยไปเจาะเลือดที่จุฬาบ้าง',vectorizer,courpus_vector,thres= 0.1)
result_df = pd.DataFrame(sims)
result_df

10it [00:00, 26.12it/s]


Unnamed: 0,jacard,score,text
0,0.333333,0.782177,ใครไปเจาะเลือดมั่งงงง
1,0.111111,0.506322,มีใครบ้างมั้ยที่แบบเจาะหูจิวละไม่เลือดสาด...อ่อก
2,0.333333,0.50094,จุฬาเป็นไงบ้าง?
3,0.333333,0.45775,อยากเข้าจุฬาบ้างมันยากไหมมม
4,0.5,0.373428,ใครมีที่ถ่ายรูปในจุฬาแนะนำบ้าง
5,0.111111,0.363304,อยากมีแฟนเรียนจุฬาต้องทำไงอะครับ เด็กจุฬาช่วยบ...
6,0.25,0.344686,ทำไงให้ง่วงเร็วๆนี่จะไปบริจาคเลือด🏩
7,0.25,0.338383,อยากเข้าจุฬาแต่เข้าไม่ได้ เพราะจุฬาลงกลอน
8,0.333333,0.335696,ทำไมเราถึงเข้าจุฬาไม่ได้
9,0.333333,0.333829,อยากมีแฟนเรียนจุฬา5555
