In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from pythainlp.util import normalize
from pythainlp.ulmfit.utils import ThaiTokenizer
from pythainlp.corpus import stopwords
from pythainlp.tag import pos_tag

import pandas as pd
import numpy as np
import os, json, sys
import copy, glob
import pythainlp
import tqdm
import re, string

# Text_to_tokens
- normalize the text
- collapse repeated characters
- remove URLs
- remove `cr` / `sr` markers
- remove unwanted characters
- remove spaces
- tokenize
- remove stopwords
- mark upper-case tokens with an `xxup` prefix

In [23]:
def text_to_tokens(text):
    """Clean a raw Thai/English string and split it into lower-cased tokens.

    Processing steps, in order:

    1. Run pythainlp's ``normalize`` on the text.
    2. Collapse any Thai character repeated three or more times in a row
       down to a single occurrence.
    3. Remove URLs (``http`` followed by any run of non-whitespace).
    4. Remove stand-alone ``cr`` / ``sr`` markers and non-breaking spaces.
    5. Remove every character that is not Thai, an ASCII letter or a space.
    6. Tokenize with the ``newmm`` engine, strip spaces out of the tokens
       and drop the ones that end up empty.
    7. Drop Thai stopwords, prefix all-uppercase tokens with ``xxup`` and
       lower-case every remaining token.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    text = normalize(text)

    # Collapse runs of 3+ identical Thai characters in a single pass.
    text = re.sub(r'([\u0E00-\u0E7F])\1{2,}', r'\1', text)

    # Remove URLs.
    text = re.sub(r'http\S+', '', text)

    # Remove "cr" / "sr" only when they are not part of a longer Latin word;
    # a plain str.replace would also mangle words such as "screen" -> "seen".
    text = re.sub(r'(?<![a-zA-Z])(?:cr|sr)(?![a-zA-Z])', '', text)
    text = text.replace('\xa0', '')

    # Keep only Thai characters, ASCII letters and spaces.
    text = re.sub(r"[^\u0E00-\u0E7Fa-zA-Z ]", '', text)

    # Tokenize, then strip spaces and discard tokens that become empty.
    tokens = pythainlp.tokenize.word_tokenize(text, engine='newmm')
    tokens = [token.replace(' ', '') for token in tokens]
    tokens = [token for token in tokens if token]

    # A set gives O(1) membership tests in the loop below.
    stop_words = set(stopwords.words('thai'))

    filtered_tokens = []
    for token in tokens:
        if token in stop_words:
            continue

        # Only ASCII letters are cased after the character filter above, so
        # isupper() is True exactly for tokens whose Latin letters are all
        # upper-case.
        if token.isupper():
            token = "xxup" + token
        filtered_tokens.append(token.lower())

    return filtered_tokens

In [24]:
# Smoke test: fully upper-case words come back lower-cased with an "xxup" prefix.
text_to_tokens('THIS is A BOOK')

['xxupthis', 'is', 'xxupa', 'xxupbook']

## pos_filter_noun
- keep only the tokens tagged as nouns (`NPRP`, `NCMN`) to find topics

In [3]:
def pos_filter_noun(tokens):
    """Return the POS-tagged entries of ``tokens`` whose tag is a noun tag.

    The tokens are tagged with ``pos_tag`` using the ``artagger`` engine and
    only the entries whose tag (second element) is ``NPRP`` or ``NCMN`` are
    kept, in their original order. Each kept entry is returned exactly as
    ``pos_tag`` produced it.
    """
    noun_tags = ('NPRP', 'NCMN')
    tagged = pos_tag(tokens, engine='artagger')
    return [entry for entry in tagged if entry[1] in noun_tags]


## fine_ques_type
- find the type(s) of a question sentence


In [4]:
def fine_ques_type(text):
    """Return the question categories whose keywords occur in ``text``.

    The text is normalized first, then each category is checked in a fixed
    order by plain substring search; a category is reported at most once.
    Two categories are suppressed by earlier hits:

    * ``what`` is skipped when ``which`` or ``where`` was already found;
    * ``yes_no`` is skipped when ``when`` was already found.

    Returns a list of category names (possibly empty) in checking order.
    """
    text = normalize(text)

    # (category, keywords) pairs, listed in the order they are checked.
    keyword_table = (
        ('where', ['‡∏ó‡∏µ‡πà‡πÑ‡∏´‡∏ô','‡∏≠‡∏¢‡∏π‡πà‡πÑ‡∏´‡∏ô','‡∏ï‡∏£‡∏á‡πÑ‡∏´‡∏ô','‡πÑ‡∏£‡∏î‡∏µ']),
        ('when', ['‡πÄ‡∏°‡∏∑‡πà‡∏≠‡πÑ‡∏´‡∏£‡πà','‡πÄ‡∏õ‡∏¥‡∏î‡πÑ‡∏´‡∏°','‡∏Å‡∏µ‡πà‡πÇ‡∏°‡∏á','‡∏Å‡∏µ‡πà‡∏ó‡∏∏‡πà‡∏°','‡∏Å‡∏µ‡πà‡∏ô‡∏≤‡∏ó‡∏µ','‡∏Å‡∏µ‡πà‡∏ß‡∏±‡∏ô','‡∏Å‡∏µ‡πà‡πÄ‡∏î‡∏∑‡∏≠‡∏ô','‡∏Å‡∏µ‡πà‡∏õ‡∏µ']),
        ('why', ['‡∏ó‡∏≥‡πÑ‡∏°']),
        ('who', ['‡πÉ‡∏Ñ‡∏£']),
        ('whose', ['‡∏Ç‡∏≠‡∏á‡πÉ‡∏Ñ‡∏£']),
        ('which', ['‡∏≠‡∏±‡∏ô‡πÑ‡∏´‡∏ô','‡∏´‡∏£‡∏∑‡∏≠']),
        ('what', ['‡∏≠‡∏∞‡πÑ‡∏£','‡πÅ‡∏ö‡∏ö‡πÑ‡∏´‡∏ô','‡∏ó‡∏≥‡πÑ‡∏£','‡πÑ‡∏´‡∏ô']),
        ('how', ['‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÑ‡∏£','‡∏¢‡∏±‡∏á‡πÑ‡∏á','‡∏ó‡∏≥‡πÑ‡∏á','‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏´‡∏£‡πà','‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏£','‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏á','‡∏°‡∏µ‡∏Å‡∏µ‡πà']),
        ('rec', ['‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥','‡∏£‡∏µ‡∏ß‡∏¥‡∏ß']),
        ('yes_no', ['‡πÉ‡∏ä‡πà‡πÑ‡∏´‡∏°','‡πÉ‡∏ä‡πà‡∏°‡∏±‡πâ‡∏¢','‡πÑ‡∏î‡πâ‡πÑ‡∏´‡∏°','‡∏´‡∏£‡∏∑‡∏≠‡∏¢‡∏±‡∏á','‡πÑ‡∏î‡πâ‡πÑ‡∏´‡∏°','‡πÑ‡∏õ‡πÑ‡∏´‡∏°','‡πÉ‡∏ä‡πà‡πÑ‡∏´‡∏°','‡πÑ‡∏î‡πâ‡∏°‡∏±‡πâ‡∏¢','‡∏°‡∏±‡πâ‡∏¢','‡∏´‡∏£‡∏∑‡∏≠‡πÄ‡∏õ‡∏•‡πà‡∏≤','‡πÑ‡∏´‡∏°','‡πÑ‡∏î‡πâ‡∏õ‡πà‡∏≤‡∏ß','‡πÑ‡∏î‡πâ‡∏õ‡∏∞','‡πÑ‡∏î‡πâ‡∏£‡∏∂‡πÄ‡∏õ‡∏•‡πà‡∏≤','‡∏°‡∏±‡πä‡∏¢','‡∏Å‡∏±‡∏ô‡∏¢‡∏±‡∏á']),
    )

    # A category on the left is not reported once any category on the right
    # has already been found.
    suppressed_by = {
        'what': ('which', 'where'),
        'yes_no': ('when',),
    }

    ques_types = []
    for category, keywords in keyword_table:
        blockers = suppressed_by.get(category, ())
        if any(blocker in ques_types for blocker in blockers):
            continue
        if any(keyword in text for keyword in keywords):
            ques_types.append(category)

    return ques_types
        

In [5]:
# Load the question dataset and preview only the first rows instead of
# rendering the whole frame.
df = pd.read_csv('./data/question.csv')
df.head(10)

Unnamed: 0,text,probability,label
0,‡∏ß‡∏±‡∏î‡∏ö‡∏±‡∏ß‡∏Ç‡∏ß‡∏±‡∏ç‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡∏Ñ‡∏ô‡πÄ‡∏£‡∏¥‡πà‡∏°‡πÄ‡∏¢‡∏≠‡∏∞‡∏´‡∏£‡∏∑‡∏≠‡∏¢‡∏±‡∏á‡∏Ñ‡∏∞,0.5001,1
1,‡∏™‡∏£‡∏∏‡∏õ‡πÇ‡∏Ñ‡πÇ‡∏Ñ‡πà‡πÄ‡∏ü‡∏£‡∏ä‡∏´‡∏£‡∏∑‡∏≠‡πÇ‡∏ï‡πÇ‡∏ï‡πâ‡πÄ‡∏ü‡∏£‡∏ä‡∏≠‡∏∞,0.5001,1
2,‡∏õ‡∏Å‡∏ï‡∏¥‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏ß‡∏•‡∏≤‡∏Å‡∏¥‡∏ô‡∏ã‡∏µ‡πÄ‡∏£‡∏µ‡∏¢‡∏•‡∏ô‡∏µ‡πà‡πÉ‡∏™‡πà‡∏ô‡∏°‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏™‡πà‡∏ã‡∏µ‡πÄ‡∏£‡∏µ...,0.5002,1
3,‡∏á‡πà‡∏ß‡∏á‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ‡∏´‡∏ô‡∏π‡∏à‡∏∞‡∏ó‡∏≥‡πÅ‡∏•‡∏ö‡∏¢‡∏±‡∏á‡πÑ‡∏á‡πÑ‡∏°‡πà‡πÉ‡∏´‡πâ‡∏ú‡∏¥‡∏î‡∏Ñ‡∏∞? ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏™‡∏î...,0.5003,1
4,‡πÄ‡∏≠‡∏≠ ‡∏≠‡∏¢‡∏≤‡∏Å‡∏£‡∏π‡πâ‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏Å‡∏±‡∏ô ‡∏°‡∏±‡∏ô‡πÇ‡∏õ‡πä‡∏´‡∏£‡∏≠ ‡∏´‡∏£‡∏∑‡∏≠‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏Ñ‡∏ô‡∏ß‡∏¥‡∏à‡∏≤...,0.5003,1
5,‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏Ç‡∏∏‡∏î‡∏ó‡∏≠‡∏á‡πÄ‡∏õ‡∏¥‡∏î‡πÑ‡∏´‡∏°‡∏Ñ‡∏∞,0.5004,1
6,‡∏î‡∏π‡∏´‡∏ô‡∏±‡∏á‡∏Å‡∏±‡∏ô‡∏°‡∏±‡πâ‡∏¢ üé¨,0.5004,1
7,‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡πÑ‡∏õ‡∏£‡πà‡∏ß‡∏°‡∏Å‡∏¥‡∏à‡∏Å‡∏£‡∏£‡∏°‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏à‡∏∞‡∏°‡∏µ‡∏ú‡∏•‡∏≠‡∏≤‡∏£‡∏≤‡∏¢‡∏°‡∏±‡πâ‡∏¢‡∏¢‡∏á‡∏∞,0.5005,1
8,‡∏ù‡∏ô‡∏ï‡∏Å‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ‡∏Å‡∏¥‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡∏Å‡∏±‡∏ô‡πÑ‡∏´‡∏° :) #‡∏´‡∏≤‡∏Ñ‡∏ô‡∏Å‡∏¥‡∏ô‡∏Ç‡πâ‡∏≤‡∏ß‡πÄ‡∏õ‡πá‡∏ô‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô,0.5005,1
9,‡∏ñ‡πâ‡∏≤‡∏Å‡∏π‡∏Ç‡∏≥‡∏Å‡∏π‡∏à‡∏∞‡∏ö‡∏≤‡∏õ‡πÑ‡∏´‡∏°5555,0.5005,1


In [6]:
# Print every question for which fine_ques_type finds no category, then the
# number of such questions.
count = 0
for t in df['text']:
    if not fine_ques_type(t):
        print(t)
        count += 1

print(count)

‡∏°‡∏±‡∏ò‡∏¢‡∏°‡πÄ‡∏•‡πà‡∏ô‡πÅ‡∏≠‡∏û‡∏ô‡∏µ‡πâ‡πÑ‡∏î‡πâ‡∏õ‡πâ‡∏∞555
‡∏£‡πâ‡∏≤‡∏ô‡πÑ‡∏°‡πà‡πÄ‡∏õ‡∏¥‡∏î‡∏ß‡∏±‡∏ô‡∏≠‡∏≤‡∏ó‡∏¥‡∏ï‡∏¢‡πå‡πÄ‡∏´‡∏£‡∏≠
‡∏Ñ‡∏¥‡∏î‡∏ñ‡∏∂‡∏á‡πÄ‡∏£‡∏≤‡∏°‡πâ‡∏∞üòÇüíó
‡πÄ‡∏ä‡πâ‡∏≤‡∏ô‡∏µ‡πâ‡∏™‡∏î‡πÉ‡∏™ ‡∏´‡∏£‡∏≠‡∏ß‡∏∞
‡∏Ñ‡∏¥‡∏î‡∏ñ‡∏∂‡∏á‡∏Å‡∏±‡∏ô‡πÅ‡∏ö‡∏ö‡∏ó‡∏µ‡πà‡∏â‡∏±‡∏ô‡∏Ñ‡∏¥‡∏î‡∏ñ‡∏∂‡∏á‡∏ö‡πâ‡∏≤‡∏á‡∏£‡∏∂‡∏õ‡πà‡∏≤‡∏ß‡∏Ñ‡∏∏‡∏ì
‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÑ‡∏õ‡πÑ‡∏î‡πâ‡∏Ç‡∏ô‡∏≤‡∏î‡∏ô‡∏µ‡πâ‡πÄ‡∏•‡∏¢‡∏´‡∏£‡∏≠
‡πÄ‡∏ß‡∏•‡∏≤‡πÄ‡∏ó‡∏µ‡πà‡∏¢‡∏á‡∏£‡∏≠‡∏≠‡∏≤‡∏à‡∏≤‡∏£‡πå‡∏õ‡∏•‡πà‡∏≠‡∏¢‡πÄ‡∏õ‡πá‡∏ô‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ‡∏Å‡∏±‡∏ô‡∏°‡∏±‡πâ‡∏á‡∏£‡∏∂‡∏õ‡πà‡∏≤‡∏ß‡∏ß üòÇüê∞üíòüíò ‡∏´‡∏¥‡∏ß‡∏à‡∏ô‡∏£‡∏≠‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡πÅ‡∏•‡πâ‡∏ß‡∏ß #‡πÄ‡∏õ‡∏µ‡∏¢‡∏Å‡∏õ‡∏π‡∏ô‡πÇ‡∏ã‡∏Ñ‡∏¥‡πâ‡∏ß‡∏ó‡πå
Will a pretty face make it better?
‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏°‡∏±‡∏ô‡∏ó‡∏≥‡πÉ‡∏´‡πâ‡∏Ñ‡∏ô‡∏£‡πâ‡∏≠‡∏á‡πÑ‡∏´‡πâ‡∏Ç‡∏ô‡∏≤‡∏î‡∏ô‡∏µ‡πâ‡πÄ‡∏•‡∏¢‡∏´‡∏£‡∏≠‡∏ß‡∏∞..
‡∏¢‡∏≥‡∏°‡∏≤‡∏°‡πà‡∏≤‡∏ï‡∏£‡∏á‡πÄ‡∏™‡∏î‡∏™‡∏≤‡∏î‡πÄ‡∏õ‡∏¥‡∏î‡∏õ‡πà‡∏∞
‡∏ü‡∏¥‡∏ï‡πÄ‡∏ô‡∏™‡∏ó‡∏µ‡πà‡∏ó‡πà‡∏≤‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏ô‡∏ó‡∏£‡πå‡πÄ‡∏•‡πà‡

‡∏≠‡∏¢‡∏≤‡∏Å‡∏à‡∏∞‡∏ñ‡∏≤‡∏°‡πÑ‡∏£‡∏´‡∏ô‡πà‡∏≠‡∏¢
‡∏≠‡∏¢‡∏≤‡∏Å‡∏à‡∏∞‡∏ñ‡∏≤‡∏°‡πÑ‡∏£‡∏´‡∏ô‡πà‡∏≠‡∏¢
‡∏Ñ‡∏≠‡∏°‡∏ä‡∏±‡πâ‡∏ô‡∏™‡∏≠‡∏á‡∏ó‡∏µ‡πà‡∏®‡∏Å‡∏£‡∏õ‡∏£‡∏¥‡πâ‡∏ô‡πÑ‡∏î‡πâ‡∏ï‡∏•‡∏≠‡∏î‡∏õ‡πà‡∏≤‡∏ß‡∏Ñ‡∏£‡∏±‡∏ö ‡∏à‡∏∞‡∏≠‡∏≠‡∏Å‡πÑ‡∏õ‡∏õ‡∏£‡∏¥‡πâ‡∏ô‡∏á‡∏≤‡∏ô‡∏ï‡∏≠‡∏ô‡∏ï‡∏µ‡∏´‡πâ‡∏≤‡∏≠‡∏∞
‡∏ï‡∏µ4‡πÄ‡πÄ‡∏•‡πâ‡∏ß‡∏´‡∏£‡∏≠ ‡πÄ‡∏£‡∏µ‡∏¢‡∏ô8‡πÇ‡∏°‡∏á‡∏´‡∏£‡∏≠‡∏≠‡∏∑‡∏≠..
‡∏Ç‡∏≠ How to ‡πÄ‡∏•‡∏∑‡πà‡∏≠‡∏ô‡∏™‡∏ñ‡∏≤‡∏ô‡∏∞‡∏à‡∏≤‡∏Å‡∏Ñ‡∏ô‡∏Ñ‡∏∏‡∏¢ ‡πÄ‡∏õ‡πá‡∏ô‡πÅ‡∏ü‡∏ô‡∏´‡∏ô‡πà‡∏≠‡∏¢‡∏Ñ‡πà‡∏∞?
#‡∏Ñ‡∏ô‡∏Ñ‡∏∏‡∏¢2018
‡∏≠‡∏¢‡∏≤‡∏Å‡∏£‡∏π‡πâ‡∏ï‡∏•‡∏≠‡∏î‡πÄ‡∏ß‡∏•‡∏≤‡∏ó‡∏µ‡πà‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏±‡∏ô‡πÑ‡∏°‡πà‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡πÑ‡∏£‡∏ö‡πâ‡∏≤‡∏á‡∏≠‡πà‡∏≠
‡∏≠‡∏¢‡∏≤‡∏Å‡∏£‡∏π‡πâ‡∏ß‡πà‡∏≤‡∏ô‡∏∏‡πâ‡∏á‡∏´‡∏°‡∏µ ‡πÅ‡∏≠‡∏î ‡∏ú‡∏ç ‡πÉ‡∏ô‡πÄ‡∏ü‡∏™‡πÑ‡∏õ‡∏Å‡∏µ‡πà‡∏Ñ‡∏ô‡πÅ‡∏•‡πâ‡∏ß‡∏Ñ‡∏±‡∏ö
‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Ñ‡∏¥‡∏î‡∏ñ‡∏∂‡∏á‡∏Å‡∏≥‡πÄ‡∏£‡∏¥‡∏ö ‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡∏Å‡∏µ‡πà‡πÄ‡∏°‡πá‡∏î‡∏ñ‡∏∂‡∏á‡∏´‡∏≤‡∏¢ üòÅ
050 ‡∏Ç‡∏≤‡∏î‡∏Ñ‡∏≤‡∏ö‡∏ô‡∏∂‡∏á‡πÄ‡∏õ‡∏ô‡πÑ‡∏£‡∏õ‡∏∞
Filmmy Nobuna ‡∏õ‡∏∞‡∏Ñ‡∏±‡∏ö
‡∏Ç‡∏¥‡∏á‡πÄ‡∏ä‡∏µ‡πâ‡∏¢‡πÑ‡∏£‡∏ô‡∏±‡∏Å‡∏´‡∏ô‡∏≤‡∏Ñ‡∏

In [7]:
# Sanity check: plain substring matching finds the keyword inside a longer sentence.
print('‡∏¢‡∏±‡∏á‡πÑ‡∏á' in '‡∏≠‡∏¢‡∏≤‡∏Å‡∏£‡∏π‡πâ‡πÄ‡∏•‡πà‡∏ô‡∏¢‡∏±‡∏á‡πÑ‡∏á...ü§•')

True
