In [None]:
from collections import Counter
from random import choice
import jieba.posseg as pseg
import jieba
import snownlp
import os
import re

In [None]:
def get_file_list(path, max_files_per_dir=20000):
    """Recursively collect the paths of ``.txt`` files found under ``path``.

    Args:
        path: Root directory handed to ``os.walk``.
        max_files_per_dir: Maximum number of directory entries examined in
            each visited directory. The cap is applied to the raw file
            listing *before* the ``.txt`` filter, so non-text files count
            against it. Pass ``None`` to examine every entry. Defaults to
            20000, the value that used to be hard-coded.

    Returns:
        list[str]: ``os.path.join(root, name)`` for every selected file, in
        the order produced by ``os.walk`` (no sorting is applied here).
    """
    file_list = []
    for root, _dirs, files in os.walk(path):
        # Slicing with None keeps the whole listing, so None means "no cap".
        for name in files[:max_files_per_dir]:
            if name.endswith('.txt'):
                file_list.append(os.path.join(root, name))
    return file_list

def read_file(file_list):
    """Return the full UTF-8 text of every path in ``file_list``, in order.

    Args:
        file_list: Iterable of file paths to open for reading.

    Returns:
        list[str]: One string per input path, holding that file's contents.
    """
    def _slurp(path):
        # Read one file completely; the context manager closes the handle.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    return [_slurp(path) for path in file_list]

def read_stopwords(path='../source/cn_stopwords.txt'):
    """Load a stop-word list, one entry per line of a UTF-8 text file.

    Args:
        path: Location of the stop-word file. Defaults to the path that was
            previously hard-coded, so existing zero-argument calls behave
            as before.

    Returns:
        list[str]: Each line with surrounding whitespace stripped, in file
        order. Blank lines are kept as empty strings.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

# Build the corpus: collect the .txt paths under ../source/politics, read every
# file fully into memory, and load the stop-word list. Later cells consume
# `content` (the document texts) and `stopwords`.
file_list = get_file_list('../source/politics')
content = read_file(file_list)
stopwords = read_stopwords()

In [None]:
# Token tallies over the whole corpus. `words_count` keeps every token that
# survives the symbol filter; `words_count_stopwords` additionally drops
# stop words and single-character tokens.
words_count = Counter()
words_count_stopwords = Counter()

# Used with `.match`, which anchors at the start of the string, so a token is
# rejected when its FIRST character is not a word character.
symbol_pattern = re.compile(
    r'[^\w\u4e00-\u9fa5]'
)

# Runs of digits, dots and hyphens are deleted before segmentation. Compiled
# once here instead of being passed as a raw pattern on every iteration.
numeric_pattern = re.compile(r'[\d\.-]+')

# Membership tests on a list scan it linearly for every token; a set gives
# the same answers with constant-time lookups.
stopword_set = set(stopwords)

for text in content:
    words = [
        word
        for word in jieba.lcut(numeric_pattern.sub('', text))
        if not symbol_pattern.match(word)
    ]
    words_count.update(words)

    # Cheap length check first, then the stop-word lookup.
    words = [word for word in words if len(word) > 1 and word not in stopword_set]
    words_count_stopwords.update(words)

In [None]:
# Print the ten most frequent tokens of each tally, each list preceded by its
# heading line.
reports = (
    ('most_common_words_with_stopwords:', words_count),
    ('most_common_words_without_stopwords:', words_count_stopwords),
)
for heading, tally in reports:
    print(heading)
    for word, count in tally.most_common(10):
        print(f'{word}: {count}')

In [None]:
# Pick one document at random; the next cell tags this same `text` again.
text = choice(content)

# Part-of-speech tag the sample with jieba, skipping tokens that the symbol
# pattern matches.
words = []
for word, flag in pseg.cut(text):
    if symbol_pattern.match(word) is None:
        words.append((word, flag))

# One "token tag" pair per line.
for word, flag in words:
    print(f'{word} {flag}')

In [None]:
# Tag the previously sampled `text` with SnowNLP, applying the same symbol
# filter as the jieba cell, then print one "token tag" pair per line.
words = [
    (word, flag)
    for word, flag in snownlp.SnowNLP(text).tags
    if symbol_pattern.match(word) is None
]

for word, flag in words:
    print(f'{word} {flag}')