In [1]:
import os
import re
import glob
import shutil
import numpy as np
import pandas as pd
from pyvi import ViTokenizer, ViPosTagger
from const import STOPWORDS, NON_VIETNAMESE

In [2]:
feature_data = None
label = None
all_f = None
x_train = None

In [3]:
def preprocessing(quest):
    #loại bỏ các từ không phải tiếng việt
    quest = quest.replace(u'\xa0', u' ').replace("\t", " ").lower()
    quest = re.sub(NON_VIETNAMESE, " ",quest)
    #phân mảnh từ
    quest = ViTokenizer.tokenize(quest).split(" ")
    #loại bỏ stopword
    for i in range(quest.__len__()):
        if quest[i] in STOPWORDS:
            quest[i] = ""
        #nối lại thành 1 câu
    quest = " ".join(quest)
    quest = re.sub(r'\d', ' ', quest)
    quest = re.sub(r' +', ' ', quest)
    return quest

In [4]:
def extract_feature(text):
    feature = list()
    freq = list()
    
    ls_word = text.split(' ')
    
    for word in ls_word:
        if word in feature:
            freq[feature.index(word)] += 1
        else:
            feature.append(word)
            freq.append(1)
    
    return feature, freq

# For else

In [5]:
def optimal_feature(vector):
    skip_val = 0.25
    
    ls_feature = list()
    freq = list()
    
    for idx, feature in enumerate(vector[0]):
        if vector[1][idx] >= skip_val:
            ls_feature.append(feature)
            freq.append(vector[1][idx])
    
    return ls_feature, freq

In [6]:
def append_feature(src, add):
    new_feature = list()
    new_freq = list()
    
    for idx, feature in enumerate(src[0]):
        if feature not in add[0]:
            new_feature.append(feature)
            new_freq.append(src[1][idx]/2)
        else:
            new_feature.append(feature)
            new_freq.append((src[1][idx] + add[1][add[0].index(feature)])/2)
    
    for idx, feature in enumerate(add[0]):
        if feature not in new_feature:
            new_feature.append(feature)
            new_freq.append(add[1][idx]/2)
            
    return new_feature, new_freq

In [7]:
text = 'con cá'
text2 = 'con cua'
f1 = extract_feature(text)
f2 = extract_feature(text2)
print(f1)
print(f2)
f3 = append_feature(f1, f2)
print(f3)
f4 = append_feature(f1, f3)
print(f4)

(['con', 'cá'], [1, 1])
(['con', 'cua'], [1, 1])
(['con', 'cá', 'cua'], [1.0, 0.5, 0.5])
(['con', 'cá', 'cua'], [1.0, 0.75, 0.25])


In [8]:
# def get_ave_freq(freq):

In [9]:
def display_dataframe(feature):
    df = pd.DataFrame([feature[1]],columns=feature[0])
    display(df)

In [10]:
def prepare_data():
    feature_data = list()
    label = list()
    
    for foldername in glob.glob('database\\*'):
        if os.path.isdir('./' + foldername) and '.' not in foldername:
            for filename in glob.glob(foldername + '\\*.txt'):
                subject_feature = [[]]
                with open(filename, 'r', encoding='utf-8') as post:
                    doc = post.read()
                    if 'Content:' in doc and 'Author:' in doc: 
                        doc = doc[doc.index('Content:'):doc.index('Author:')].strip('Content:').strip()
                        
                        
                        segment_text = preprocessing(doc).strip()
                        feature = extract_feature(segment_text)
                        subject_feature = append_feature(subject_feature, feature)
                        
            optimal_val = optimal_feature(subject_feature)             
            feature_data.append(subject_feature)
            label.append(foldername.split('\\')[1])
    
    return feature_data, label

In [11]:
def show_ls_f():
    for train_set in feature_data:
        display_dataframe(train_set)

In [12]:
# show_ls_f()

In [13]:
def all_feature(feature_data):
    sum_feature = [[]]
    for feature_set in feature_data:
        sum_feature = append_feature(sum_feature, feature_set)
        
    return sum_feature

In [14]:
def get_data_train(feature_data, all_f):
    total = [0]*len(all_f[0])
    x_train = list()
    for train_set in feature_data:
        line = list(total)
        for idx, feature in enumerate(train_set[0]):
            line[all_f[0].index(feature)] = train_set[1][idx]
            
        x_train.append(line)
        
    return x_train

In [15]:
def show_table_feature(all_f, x_train, label):
    merge = [train + [label[idx]] for idx, train in enumerate(x_train)]
    df = pd.DataFrame(merge, columns=all_f[0] + ['label'])
    display(df)

In [16]:
# show_table_feature(all_f, x_train, label)

In [17]:
def main():
    global feature_data, label, all_f, x_train
    
    feature_data, label = prepare_data()
    all_f = all_feature(feature_data)
    display_dataframe(all_f)
    
    x_train = get_data_train(feature_data, all_f)
    show_table_feature(all_f, x_train, label)
    
if __name__ == '__main__':
    main()

Unnamed: 0,hạt,nếp,dẻo,mềm,kết_hợp,thịt,cá_rô,đồng,chắc_nịch,phảng_phất,...,phổ_biến,thực_trạng,ý_kiến,trái,cám_ơn,chấm_dứt,cảbệnh,kiên_quyết,tệ,một_chiều
0,3.1e-05,1.5e-05,3.1e-05,1.5e-05,0.001984,7.6e-05,0.000122,0.251114,1.5e-05,1.5e-05,...,0.25,0.25,0.25,0.25,0.25,0.5,0.25,0.25,0.25,0.25


Unnamed: 0,hạt,nếp,dẻo,mềm,kết_hợp,thịt,cá_rô,đồng,chắc_nịch,phảng_phất,...,thực_trạng,ý_kiến,trái,cám_ơn,chấm_dứt,cảbệnh,kiên_quyết,tệ,một_chiều,label
0,1.0,0.5,1.0,0.5,1.0,2.5,4.0,2.5,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Du lịch
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Giáo dục
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Giải trí
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Khoa học
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kinh doanh
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pháp luật
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Số hóa
7,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sức khỏe
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Thế giới
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Thể thao


In [20]:
# x_train