In [4]:
import re
from itertools import islice

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# Matches a token made up entirely of characters in the range U+4E00-U+9FFF.
# Compiled once at definition time instead of on every call.
_CHINESE_ONLY = re.compile(r'[\u4e00-\u9fff]+')


def chinese_tokenizer(text):
    """Segment ``text`` with jieba and keep only the purely-Chinese tokens.

    Args:
        text: The string to segment.

    Returns:
        A list, in segmentation order, of the tokens yielded by
        ``jieba.cut(text)`` that consist solely of characters in the
        range U+4E00-U+9FFF. Punctuation, digits, Latin text, whitespace
        and mixed tokens are dropped.
    """
    # fullmatch (not match): match() only anchors at the start, so a token that
    # merely *begins* with a Chinese character (a mixed token such as "卡拉OK")
    # would slip through even though it is not Chinese-only.
    return [token for token in jieba.cut(text) if _CHINESE_ONLY.fullmatch(token)]

# Relative path of the Chinese text file to analyse; extract_chinese_keywords
# opens it with UTF-8 encoding.
file_path = "199801.txt"

def extract_chinese_keywords(file_path, top_n=5, max_lines=16):
    """Return the highest-scoring TF-IDF keywords from the start of a text file.

    Each of the first ``max_lines`` lines is treated as one document: it is
    segmented with ``chinese_tokenizer`` and re-joined with spaces. The
    documents are then vectorized with ``TfidfVectorizer`` and each word's
    TF-IDF weights are summed across all documents to rank the vocabulary.

    Args:
        file_path: Path of the text file to read; it is opened as UTF-8.
        top_n: Maximum number of keywords to return.
        max_lines: How many leading lines to analyse. Defaults to 16, the
            value that used to be hard-coded; ``None`` reads the whole file.

    Returns:
        A list of at most ``top_n`` words, ordered by descending summed
        TF-IDF score. An empty list is returned when the lines read contain
        no Chinese tokens at all.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer.fit_transform`` when the
            documents are non-empty but still produce an empty vocabulary
            (for instance, only single-character words).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # islice simply stops at end-of-file; the previous fixed number of
        # next() calls raised StopIteration on files shorter than that count.
        lines = list(islice(file, max_lines))

    # One space-separated string of Chinese-only tokens per input line.
    documents = [' '.join(chinese_tokenizer(line)) for line in lines]

    # Empty file, or no Chinese text in the lines read: there is nothing to
    # score, and fit_transform would fail with an "empty vocabulary" error.
    if not any(documents):
        return []

    # NOTE: the vectorizer runs with its default settings on the pre-segmented,
    # space-joined text (no custom tokenizer is passed). Its default
    # token_pattern only keeps tokens of two or more word characters, so
    # single-character words are not scored.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Column sums = total TF-IDF weight of each word over all documents.
    summed_scores = tfidf_matrix.toarray().sum(axis=0)

    # Stable sort on the score only, so equally scored words keep vocabulary order.
    ranked = sorted(
        zip(summed_scores, feature_names),
        key=lambda pair: pair[0],
        reverse=True,
    )

    return [word for _, word in ranked[:top_n]]

# Print the top keywords for the configured file (top_n defaults to 5).
print(extract_chinese_keywords(file_path))


Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\xiaoma\AppData\Local\Temp\jieba.cache
Loading model cost 1.179 seconds.
Prefix dict has been built successfully.


['中国', '发展', '世纪', '新年', '国家']


In [3]:
# Explicit %pip magic (does not rely on automagic) so the package is installed
# into the running kernel's environment; pinned to the version this notebook
# was executed with.
%pip install scikit-learn==1.4.2

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.

  Downloading scikit_learn-1.4.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     -------------------------------------- 60.6/60.6 kB 535.5 kB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp312-cp312-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.1/10.6 MB 1.7