In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from pythainlp.util import normalize
from pythainlp.ulmfit.utils import ThaiTokenizer
from pythainlp.corpus import stopwords
from pythainlp.tag import pos_tag

import pandas as pd
import numpy as np
import os, json, sys
import copy, glob
import pythainlp
from tqdm import tqdm
import re, string

# Text_to_tokens
- normalize
- collapse repeated trailing characters
- remove URLs
- remove the "CR"/"SR" markers and non-breaking spaces
- remove unwanted characters
- remove stopwords
- convert to lowercase
- tokenize
- remove whitespace tokens

In [2]:
def text_to_tokens(text, stop_words=()):
    """Clean a raw string and split it into word tokens.

    Processing order: pythainlp normalisation, collapsing of runs of three or
    more identical Thai characters into one, URL removal, removal of the
    literal markers ``CR``/``SR`` and of non-breaking spaces, removal of every
    character that is not Thai, an ASCII letter or a space (plus the Thai
    repetition mark U+0E46), lower-casing, ``newmm`` tokenisation without
    whitespace tokens, and finally stop-word filtering.

    Args:
        text: the raw string to process.
        stop_words: iterable of tokens to drop from the result. Defaults to an
            empty tuple (nothing is dropped); an immutable default avoids the
            shared-mutable-default pitfall of ``[]``.

    Returns:
        list: the tokens returned by ``pythainlp.tokenize.word_tokenize`` with
        every token contained in ``stop_words`` removed.
    """
    # normalize
    text = pythainlp.util.normalize(text)

    # collapse runs of 3+ identical Thai characters into a single character;
    # a single re.sub pass replaces the former finditer + str.replace loop,
    # which rewrote the string while iterating over matches of the old one
    text = re.sub(r'([\u0E00-\u0E7F])\1{2,}', r'\1', text)

    # remove url
    text = re.sub(r'http\S+', '', text)

    # remove the literal 'CR' / 'SR' markers and non-breaking spaces
    # (must happen before lower-casing, the markers are matched in upper case)
    text = text.replace('CR', '').replace('SR', '').replace('\xa0', '')

    # remove unwanted characters: anything that is not Thai, an ASCII letter or
    # a space, plus the Thai repetition mark (U+0E46). The mark is written as an
    # escape so the pattern does not depend on the file's encoding; the
    # previous literal was mis-encoded and could never match.
    text = re.sub(r"[^\u0E00-\u0E7Fa-zA-Z ]|\u0E46", '', text)

    # lowercase
    text = text.lower()

    # tokenization
    tokens = pythainlp.tokenize.word_tokenize(text, engine='newmm', whitespaces=False)

    # remove stopwords; a set makes each membership test O(1) instead of
    # scanning the whole stop-word list for every token
    excluded = frozenset(stop_words)
    return [token for token in tokens if token not in excluded]

## pos_filter_noun
- keep only the tokens tagged as nouns by the POS tagger; they are used as topic keywords

In [3]:
def pos_filter_noun(tokens):
    """Return the tokens whose part-of-speech tag is a noun tag.

    Args:
        tokens: list of word tokens, passed unchanged to
            ``pos_tag(..., engine='artagger')``.

    Returns:
        list: the words (first element of each tagged pair) whose tag (second
        element) is ``'NPRP'`` or ``'NCMN'``, in their original order.
    """
    # tags treated as nouns (presumably proper / common noun - confirm
    # against the tag set used by the artagger engine)
    noun_tags = ('NPRP', 'NCMN')

    tagged = pos_tag(tokens, engine = 'artagger')
    return [pair[0] for pair in tagged if pair[1] in noun_tags]


## fine_ques_type
- find the type(s) of a question sentence by keyword matching


In [4]:
def fine_ques_type(text):
    """Classify a question sentence by substring keyword matching.

    The text is normalised, then each question type is checked in a fixed
    order; a type is reported when at least one of its keywords occurs
    anywhere in the text. Two types are suppressed by earlier matches:
    ``'what'`` is skipped when ``'which'`` or ``'where'`` was already found,
    and ``'yes_no'`` is skipped when ``'when'`` was already found.

    Args:
        text: the raw sentence.

    Returns:
        list: zero or more of ``'where'``, ``'when'``, ``'why'``, ``'who'``,
        ``'whose'``, ``'which'``, ``'what'``, ``'how'``, ``'rec'``,
        ``'yes_no'``, in that order, each at most once.
    """
    # normalize
    text = normalize(text)

    # One rule per question type, in evaluation order:
    #   (label, keywords, labels that suppress this one when already found)
    # NOTE(review): the keyword literals look mis-encoded in this copy of the
    # notebook - confirm they are valid Thai in the source file.
    rules = (
        ('where', ['‡∏ó‡∏µ‡πà‡πÑ‡∏´‡∏ô','‡∏≠‡∏¢‡∏π‡πà‡πÑ‡∏´‡∏ô','‡∏ï‡∏£‡∏á‡πÑ‡∏´‡∏ô','‡πÑ‡∏£‡∏î‡∏µ'], ()),
        ('when', ['‡πÄ‡∏°‡∏∑‡πà‡∏≠‡πÑ‡∏´‡∏£‡πà','‡πÄ‡∏õ‡∏¥‡∏î‡πÑ‡∏´‡∏°','‡∏Å‡∏µ‡πà‡πÇ‡∏°‡∏á','‡∏Å‡∏µ‡πà‡∏ó‡∏∏‡πà‡∏°','‡∏Å‡∏µ‡πà‡∏ô‡∏≤‡∏ó‡∏µ','‡∏Å‡∏µ‡πà‡∏ß‡∏±‡∏ô','‡∏Å‡∏µ‡πà‡πÄ‡∏î‡∏∑‡∏≠‡∏ô','‡∏Å‡∏µ‡πà‡∏õ‡∏µ'], ()),
        ('why', ['‡∏ó‡∏≥‡πÑ‡∏°'], ()),
        ('who', ['‡πÉ‡∏Ñ‡∏£'], ()),
        ('whose', ['‡∏Ç‡∏≠‡∏á‡πÉ‡∏Ñ‡∏£'], ()),
        ('which', ['‡∏≠‡∏±‡∏ô‡πÑ‡∏´‡∏ô','‡∏´‡∏£‡∏∑‡∏≠'], ()),
        ('what', ['‡∏≠‡∏∞‡πÑ‡∏£','‡πÅ‡∏ö‡∏ö‡πÑ‡∏´‡∏ô','‡∏ó‡∏≥‡πÑ‡∏£','‡πÑ‡∏´‡∏ô'], ('which', 'where')),
        ('how', ['‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÑ‡∏£','‡∏¢‡∏±‡∏á‡πÑ‡∏á','‡∏ó‡∏≥‡πÑ‡∏á','‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏´‡∏£‡πà','‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏£','‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏á','‡∏°‡∏µ‡∏Å‡∏µ‡πà'], ()),
        ('rec', ['‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥','‡∏£‡∏µ‡∏ß‡∏¥‡∏ß'], ()),
        ('yes_no', ['‡πÉ‡∏ä‡πà‡πÑ‡∏´‡∏°','‡πÉ‡∏ä‡πà‡∏°‡∏±‡πâ‡∏¢','‡πÑ‡∏î‡πâ‡πÑ‡∏´‡∏°','‡∏´‡∏£‡∏∑‡∏≠‡∏¢‡∏±‡∏á','‡πÑ‡∏î‡πâ‡πÑ‡∏´‡∏°','‡πÑ‡∏õ‡πÑ‡∏´‡∏°','‡πÉ‡∏ä‡πà‡πÑ‡∏´‡∏°','‡πÑ‡∏î‡πâ‡∏°‡∏±‡πâ‡∏¢','‡∏°‡∏±‡πâ‡∏¢','‡∏´‡∏£‡∏∑‡∏≠‡πÄ‡∏õ‡∏•‡πà‡∏≤','‡πÑ‡∏´‡∏°','‡πÑ‡∏î‡πâ‡∏õ‡πà‡∏≤‡∏ß','‡πÑ‡∏î‡πâ‡∏õ‡∏∞','‡πÑ‡∏î‡πâ‡∏£‡∏∂‡πÄ‡∏õ‡∏•‡πà‡∏≤','‡∏°‡∏±‡πä‡∏¢','‡∏Å‡∏±‡∏ô‡∏¢‡∏±‡∏á'], ('when',)),
    )

    ques_types = []
    for label, keywords, suppressed_by in rules:
        # a type already found earlier can veto this one
        if any(found in ques_types for found in suppressed_by):
            continue
        # one hit is enough; each label is added at most once
        if any(keyword in text for keyword in keywords):
            ques_types.append(label)

    return ques_types
        

In [5]:
# Load the first prediction export and drop its 'label' column.
df = pd.read_csv('./data/predicted-non-questions - question.csv',encoding='utf-8-sig')
df = df.drop(['label'],axis = 1)

# Load the second export and stack it under the first with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and also works on older versions.
df2 = pd.read_csv('./data/predited-questions - question and _0.75.csv',encoding='utf-8-sig')
result = pd.concat([df, df2], ignore_index=True)
result

Unnamed: 0,text,probability
0,‡∏°‡∏±‡∏ô‡πÅ‡∏õ‡∏•‡∏Å‡∏´‡∏£‡∏≠ ‡∏ñ‡πâ‡∏≤‡∏™‡∏°‡∏°‡∏ï‡∏¥‡πÄ‡∏£‡∏≤‡∏≠‡∏¢‡∏π‡πà‡∏Å‡∏±‡∏ö‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏Ñ‡∏ô‡∏ô‡∏µ‡πâ‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏£...,0.5000
1,‡∏ß‡∏¥‡πà‡∏á‡∏´‡∏ô‡∏µ‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏±‡πâ‡∏¢ ‡∏ß‡∏¥‡πà‡∏á‡∏ï‡∏≤‡∏°‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏≤‡∏Å,0.5001
2,‡πÄ‡∏Ñ‡∏¢‡πÄ‡∏à‡∏≠‡∏û‡∏µ‡πà‡∏™‡∏¥‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏≠‡∏°‡∏±‡πâ‡∏¢‡∏¢ (‡∏™‡∏¥‡∏á‡πÇ‡∏ï),0.5001
3,‡πÉ‡∏Ñ‡∏£‡πÑ‡∏õ‡πÄ‡∏à‡∏≤‡∏∞‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏°‡∏±‡πà‡∏á‡∏á‡∏á‡∏á,0.5001
4,‡∏ß‡∏¥‡πà‡∏á‡∏´‡∏ô‡∏µ‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏±‡πâ‡∏¢\n‡∏ß‡∏¥‡πà‡∏á‡∏ï‡∏≤‡∏°‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏≤‡∏Å..,0.5001
5,‡πÑ‡∏´‡∏ô‡∏ö‡∏≠‡∏Å‡πÄ‡∏£‡∏≤‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∏‡∏ì‡πÑ‡∏á\n‡πÅ‡∏•‡πâ‡∏ß‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ...,0.5001
6,‡πÄ‡∏à‡πâ‡∏≤‡πÅ‡∏≠‡∏û‡∏ô‡∏µ‡πâ‡∏°‡∏±‡∏ô‡πÄ‡∏•‡πà‡∏ô‡∏¢‡∏±‡∏á‡πÑ‡∏á‡∏ô‡∏∞ ;__;,0.5002
7,40000 ‡∏Ñ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡πÄ‡∏ï‡πá‡∏°‡∏¢‡∏±‡∏á‡∏Ñ‡πà‡∏∞,0.5002
8,‡∏≠‡∏µ‡∏Å‡∏ô‡∏≤‡∏ô‡πÑ‡∏´‡∏° ‡∏Å‡∏ß‡πà‡∏≤‡πÉ‡∏à‡∏à‡∏∞‡∏•‡∏∑‡∏°,0.5002
9,‡πÄ‡∏£‡∏≤‡∏°‡∏±‡∏ô‡∏Å‡πá‡πÅ‡∏Ñ‡πà‡∏Ñ‡∏ô‡πÉ‡∏ô‡πÑ‡∏•‡∏ô‡πå\n‡∏à‡∏∞‡πÑ‡∏õ‡∏™‡∏π‡πâ‡∏Ñ‡∏ô‡∏Ç‡πâ‡∏≤‡∏á‡∏Å‡∏≤‡∏¢‡πÄ‡∏ò‡∏≠‡πÑ‡∏î‡πâ‡∏¢‡∏±‡∏á...,0.5002


In [6]:
def find_types(df,stopword):
    """Compute question types and noun keywords for every row of ``df``.

    For each value of ``df['text']`` the question types come from
    ``fine_ques_type`` and the keywords from ``pos_filter_noun`` applied to
    the output of ``text_to_tokens``; both lists are stored as comma-joined
    strings.

    Args:
        df: DataFrame with a ``'text'`` column.
        stopword: stop-word collection forwarded to ``text_to_tokens``.

    Returns:
        pandas.DataFrame: one row per input text with the columns ``'text'``,
        ``'types'`` and ``'keywords'`` (in that order). An empty input yields
        an empty frame that still has these three columns.
    """
    columns = ['text', 'types', 'keywords']

    data = []
    for t in tqdm(df['text']):
        types = ','.join(fine_ques_type(t))

        tokens = text_to_tokens(t, stopword)
        topics = ','.join(pos_filter_noun(tokens))

        data.append({
            'text': t,
            'types': types,
            'keywords': topics,
        })

    # Passing columns= fixes the column order and keeps the columns present
    # when `data` is empty; selecting them afterwards raised KeyError on an
    # empty frame.
    return pd.DataFrame(data, columns=columns)


In [7]:
def find_stopwords(df):
    """Collect the ``'Stopword'`` value of every row whose ``'label'`` is 0.

    Args:
        df: DataFrame with the columns ``'Stopword'`` and ``'label'``.

    Returns:
        list: the selected ``'Stopword'`` values, in row order.
    """
    # iterate row by row (as the rows come out of iterrows) and keep the
    # entries labelled 0
    return [
        record['Stopword']
        for _, record in df.iterrows()
        if record['label'] == 0
    ]

## Experiment

In [8]:
# Load the stop-word list (version 2).
# Missing cells become 0, so a row without a label counts as label 0 and is
# therefore selected by find_stopwords.
df = pd.read_csv('./data/stopwords.2.csv',encoding='utf-8-sig').fillna(0)

# NOTE: this name shadows the `stopwords` object imported from pythainlp.corpus.
stopwords = find_stopwords(df)
stopwords

['‡∏ô‡∏µ‡πâ',
 '‡∏ô‡πç‡∏≤',
 '‡∏ô‡∏±‡πâ‡∏ô',
 '‡∏ô‡∏±‡∏Å',
 '‡∏ó‡∏µ‡πà',
 '‡∏ó‡∏±‡πâ‡∏á‡∏ô‡∏µ‡πâ',
 '‡∏à‡∏≤‡∏Å',
 '‡∏à‡∏∞',
 '‡∏Ñ‡∏ß‡∏≤‡∏°',
 '‡∏Ñ‡∏£‡∏±‡πâ‡∏á',
 '‡∏Ñ‡∏á',
 '‡∏ô‡πà‡∏≤',
 '‡πÄ‡∏Ç‡πâ‡∏≤',
 '‡∏ñ‡∏∂‡∏á',
 '‡∏ï‡πà‡∏≠',
 '‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà',
 '‡∏ï‡∏±‡πâ‡∏á',
 '‡∏î‡πâ‡∏≤‡∏ô',
 '‡∏î‡πâ‡∏ß‡∏¢',
 '‡∏≠‡∏µ‡∏Å',
 '‡∏≠‡∏≤‡∏à',
 '‡∏≠‡∏¢‡πà‡∏≤‡∏á',
 '‡πÅ‡∏ï‡πà',
 '‡πÄ‡∏°‡∏∑‡πà‡∏≠',
 '‡πÄ‡∏û‡∏∑‡πà‡∏≠',
 '‡∏´‡∏•‡∏±‡∏á',
 '‡∏´‡∏ô‡∏∂‡πà‡∏á',
 '‡∏™‡πà‡∏ß‡∏ô',
 '‡∏™‡∏∏‡∏î',
 '‡∏£‡∏≤‡∏¢',
 '‡∏Ç‡∏ì‡∏∞',
 '‡∏Å‡πá',
 '‡∏Å‡∏≤‡∏£',
 '‡∏Å‡∏±‡∏ô',
 '‡∏Å‡∏ß‡πà‡∏≤',
 '‡∏à‡∏∂‡∏á',
 '‡πÑ‡∏ß‡πâ',
 '‡πÑ‡∏õ',
 '‡πÑ‡∏î‡πâ',
 '‡πÇ‡∏î‡∏¢',
 '‡πÅ‡∏´‡πà‡∏á',
 '‡πÅ‡∏•‡πâ‡∏ß',
 '‡πÅ‡∏•‡∏∞',
 '‡∏ó‡∏±‡πâ‡∏á',
 '‡πÄ‡∏Ñ‡∏¢',
 '‡∏Ç‡∏±‡πâ‡∏ô',
 '‡πÑ‡∏á',
 '‡∏à‡∏á',
 '‡∏ã‡∏∂‡πà‡∏á‡∏Å‡πá',
 '‡∏ã‡∏∂‡πà‡∏á‡∏Å‡πá‡∏Ñ‡∏∑‡∏≠',
 '‡∏ã‡∏∂‡πà‡∏á‡πÑ‡∏î‡πâ‡πÅ‡∏Å‡πà',
 '‡∏ì',
 '‡∏î‡πâ‡∏ß‡∏¢',
 '‡∏î‡πâ‡∏ß‡∏¢‡πÄ‡∏ä‡πà‡∏ô‡∏Å‡∏±‡∏ô',
 '‡∏î‡πâ‡∏ß‡∏¢‡∏ó‡∏µ‡πà',
 '‡∏î‡πâ‡∏ß‡∏¢‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏£‡∏â‡∏∞‡∏ô‡∏µ‡πâ',
 '‡∏î‡πâ‡∏ß‡∏¢‡πÄ‡∏û‡∏£‡

In [9]:

# Tag every text with its question types and noun keywords
# (slow: the recorded run took about 14 minutes for 19356 rows).
result_df = find_types(df=result, stopword=stopwords)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 19356/19356 [14:01<00:00, 23.01it/s]


In [10]:
# Display the tagged frame (columns: text, types, keywords).
result_df

Unnamed: 0,text,types,keywords
0,‡∏°‡∏±‡∏ô‡πÅ‡∏õ‡∏•‡∏Å‡∏´‡∏£‡∏≠ ‡∏ñ‡πâ‡∏≤‡∏™‡∏°‡∏°‡∏ï‡∏¥‡πÄ‡∏£‡∏≤‡∏≠‡∏¢‡∏π‡πà‡∏Å‡∏±‡∏ö‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô‡∏Ñ‡∏ô‡∏ô‡∏µ‡πâ‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏£...,yes_no,"‡∏´‡∏£‡∏≠,‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô,‡πÇ‡∏•‡∏Å,‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç,‡πÑ‡∏´‡∏°,‡∏ó‡∏±‡πâ‡∏á‡πÇ‡∏•‡∏Å,‡∏õ‡πà‡∏∞,‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢"
1,‡∏ß‡∏¥‡πà‡∏á‡∏´‡∏ô‡∏µ‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏±‡πâ‡∏¢ ‡∏ß‡∏¥‡πà‡∏á‡∏ï‡∏≤‡∏°‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏≤‡∏Å,yes_no,"‡∏ß‡∏¥‡πà‡∏á‡∏´‡∏ô‡∏µ,‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢,‡∏°‡∏±‡πâ‡∏¢,‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢"
2,‡πÄ‡∏Ñ‡∏¢‡πÄ‡∏à‡∏≠‡∏û‡∏µ‡πà‡∏™‡∏¥‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏≠‡∏°‡∏±‡πâ‡∏¢‡∏¢ (‡∏™‡∏¥‡∏á‡πÇ‡∏ï),yes_no,"‡∏û‡∏µ‡πà,‡∏™‡∏¥‡∏á,‡∏°‡∏≠,‡∏°‡∏±‡πâ‡∏¢,‡∏¢,‡∏™‡∏¥‡∏á‡πÇ‡∏ï"
3,‡πÉ‡∏Ñ‡∏£‡πÑ‡∏õ‡πÄ‡∏à‡∏≤‡∏∞‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏°‡∏±‡πà‡∏á‡∏á‡∏á‡∏á,who,"‡πÄ‡∏•‡∏∑‡∏≠‡∏î,‡∏°‡∏±‡πà‡∏á"
4,‡∏ß‡∏¥‡πà‡∏á‡∏´‡∏ô‡∏µ‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏±‡πâ‡∏¢\n‡∏ß‡∏¥‡πà‡∏á‡∏ï‡∏≤‡∏°‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏°‡∏≤‡∏Å..,yes_no,"‡∏ß‡∏¥‡πà‡∏á‡∏´‡∏ô‡∏µ,‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢,‡∏°‡∏±‡πâ‡∏¢,‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢"
5,‡πÑ‡∏´‡∏ô‡∏ö‡∏≠‡∏Å‡πÄ‡∏£‡∏≤‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∏‡∏ì‡πÑ‡∏á\n‡πÅ‡∏•‡πâ‡∏ß‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ...,what,"‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç,‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ,‡∏•‡πà‡∏∞"
6,‡πÄ‡∏à‡πâ‡∏≤‡πÅ‡∏≠‡∏û‡∏ô‡∏µ‡πâ‡∏°‡∏±‡∏ô‡πÄ‡∏•‡πà‡∏ô‡∏¢‡∏±‡∏á‡πÑ‡∏á‡∏ô‡∏∞ ;__;,how,"‡πÄ‡∏à‡πâ‡∏≤,‡πÅ‡∏≠,‡∏û,‡∏¢‡∏±‡∏á‡πÑ‡∏á"
7,40000 ‡∏Ñ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡πÄ‡∏ï‡πá‡∏°‡∏¢‡∏±‡∏á‡∏Ñ‡πà‡∏∞,,‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ
8,‡∏≠‡∏µ‡∏Å‡∏ô‡∏≤‡∏ô‡πÑ‡∏´‡∏° ‡∏Å‡∏ß‡πà‡∏≤‡πÉ‡∏à‡∏à‡∏∞‡∏•‡∏∑‡∏°,yes_no,"‡πÑ‡∏´‡∏°,‡πÉ‡∏à,‡∏•‡∏∑‡∏°"
9,‡πÄ‡∏£‡∏≤‡∏°‡∏±‡∏ô‡∏Å‡πá‡πÅ‡∏Ñ‡πà‡∏Ñ‡∏ô‡πÉ‡∏ô‡πÑ‡∏•‡∏ô‡πå\n‡∏à‡∏∞‡πÑ‡∏õ‡∏™‡∏π‡πâ‡∏Ñ‡∏ô‡∏Ç‡πâ‡∏≤‡∏á‡∏Å‡∏≤‡∏¢‡πÄ‡∏ò‡∏≠‡πÑ‡∏î‡πâ‡∏¢‡∏±‡∏á...,how,"‡πÑ‡∏•‡∏ô‡πå,‡∏Ç‡πâ‡∏≤‡∏á,‡∏Å‡∏≤‡∏¢"
