In [None]:
import re
from collections import Counter

import jieba
import jieba.analyse
from gensim import corpora, models
from nltk.corpus import sinica_treebank

In [None]:
# Pre-process
with open('../source/cn_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

# Set copy for constant-time membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)

s = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。"

# Strip runs of digits, dots and hyphens before segmentation, then keep only
# multi-character tokens that are not stopwords.
tokens = jieba.lcut(re.sub(r'[\d\.-]+', '', s))
words = [word for word in tokens if len(word) > 1 and word not in stopword_set]

print(words)

In [None]:
# Keyword Extraction using gensim LSA

# Single-document corpus: one bag-of-words vector built from the tokens in `words`.
dictionary = corpora.Dictionary([words])
corpus = [dictionary.doc2bow(words)]
lsa = models.LsiModel(corpus, id2word=dictionary, num_topics=1)

# Read the (word, weight) pairs of the only topic directly instead of parsing the
# formatted string from print_topics, which rounds weights to three decimals.
top_terms = lsa.show_topic(0, topn=5)
keywords = [term for term, _ in top_terms]
weight = [float(value) for _, value in top_terms]  # plain Python floats, as before

for term, value in zip(keywords, weight):
    print(f"{term} {value}")

In [None]:
# Keyword Extraction using gensim LDA

# Single-document corpus: one bag-of-words vector built from the tokens in `words`.
dictionary = corpora.Dictionary([words])
corpus = [dictionary.doc2bow(words)]
# LDA training is randomized; a fixed random_state makes reruns reproducible.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=1, random_state=42)

# Read the (word, weight) pairs of the only topic directly instead of parsing the
# formatted string from print_topics, which rounds weights to three decimals.
top_terms = lda.show_topic(0, topn=5)
keywords = [term for term, _ in top_terms]
weight = [float(value) for _, value in top_terms]  # plain Python floats, as before

for term, value in zip(keywords, weight):
    print(f"{term} {value}")

In [None]:
# Keyword Extraction using jieba TextRank

# `jieba.analyse` is a submodule: it is only available once `import jieba.analyse`
# has run; a bare `import jieba` does not load it.
# topK=5 asks for the five highest-weighted keywords directly.
for term, score in jieba.analyse.textrank(s, topK=5, withWeight=True):
    print(f"{term} {score}")

In [None]:
# Keyword Extraction using TF-IDF by hand

# Load the word -> IDF table; each non-blank line holds "<word> <idf value>".
idf = {}
with open("../source/idf.txt.big", 'r', encoding='utf-8') as f:
    # Stream the file line by line instead of reading it all into memory first.
    for line in f:
        fields = line.split()
        if not fields:
            continue  # skip blank lines rather than failing on tuple unpacking
        word, value = fields
        idf[word] = float(value)

def TF_IDF(words, idf, default_idf=14):
    """Compute a TF-IDF score for every distinct token in ``words``.

    Args:
        words: Sequence of hashable tokens (must support ``len``); repeated
            tokens raise that token's term frequency.
        idf: Mapping from token to its inverse document frequency.
        default_idf: IDF applied to tokens missing from ``idf``. Defaults to
            14, the value that was previously hard-coded.

    Returns:
        dict mapping each distinct token to ``(count / len(words)) * idf``,
        in order of first occurrence. Empty when ``words`` is empty.
    """
    total = len(words)
    counts = Counter(words)
    # An empty `words` yields an empty Counter, so the division never runs with total == 0.
    return {
        term: count / total * idf.get(term, default_idf)
        for term, count in counts.items()
    }

# Score every token, then show the five highest-scoring terms.
tfidf = TF_IDF(words, idf)
ranked_terms = sorted(tfidf.items(), key=lambda item: item[1], reverse=True)
for term, score in ranked_terms[:5]:
    print(term, score)