In [16]:
import re  
import nltk
import jieba 
import pyLDAvis   
import numpy as np
import pandas as pd  
from nltk import pos_tag
from nltk.util import ngrams
import matplotlib.pyplot as plt 
from nltk.corpus import stopwords   
from gensim import corpora, models  
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer  
import pyLDAvis.gensim_models as gensimvis

## 课堂案例——微博语料库

In [2]:
# 导入数据
df = pd.read_excel('text_analysis_weibo_sample.xlsx', index_col = 0)
df.head()

Unnamed: 0,index,标题/微博内容,点赞,转发,评论,账号昵称UID加密,粉丝数,关注数,地域
0,34121,国债：地产行业重磅利好提振风险偏好，期债低开低走 国债期货全线收跌，10年期主力...,0,0,0,e5df796860e68f403bcf9651bab4d42e,0,0,其他
1,40230,#喜迎二十大 忠诚保平安#,0,0,0,6e35cb69ad52f20de5e28197b2e85306,405444,252,广西
2,7714,注意！事关明日教资考试！福建省教育考试院发布补充公告 福建省2022年下半年全国中小学教师...,0,0,0,e6953217442e6c06a7af23eee5e185f2,53264,2177,福建
3,27378,近日，“千年大计”雄安新区迎来五周岁生日。从“一张白纸...,0,0,0,,0,0,北京
4,15435,樊振东牛逼！,0,0,0,344af41eac516375c04dee6325e763cc,8,51,山东


In [3]:
# 预处理
def data_preprocess_chi(text):
    # 剔除符号与数字
    nonums_text = re.sub('[^\u4e00-\u9fa5]+', '', str(text))
        
    # 分词
    words = jieba.cut(nonums_text)
    
    # 去除停用词
    words = [w for w in words if w not in list(stopwords) and w!='\u200b']
    
    return words

stopwords = open('stopwords.txt', encoding = 'utf-8').read()
df['clean_text'] = df['标题/微博内容'].apply(data_preprocess_chi)

Building prefix dict from C:\Users\lenovo\anaconda3\Lib\site-packages\jieba\dict.txt ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 0.4891774654388428 seconds.
Prefix dict has been built succesfully.


In [4]:
# LDA
dictionary = corpora.Dictionary(df['clean_text'])     # 根据分词结果创建字典
corpus = [dictionary.doc2bow(text) for text in df['clean_text']]     # 根据分词结果创建语料库

In [5]:
# 训练LDA模型
lda_model = models.LdaModel(corpus, num_topics=6, id2word=dictionary, passes=15)

In [6]:
# 查看主题
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.022*"郑州" + 0.010*"亿元" + 0.009*"城市" + 0.008*"长沙" + 0.006*"青岛" + 0.006*"上半年" + 0.006*"无锡" + 0.005*"优势" + 0.004*"增长" + 0.004*"我们"')
(1, '0.019*"月" + 0.016*"万吨" + 0.011*"市场" + 0.010*"日" + 0.010*"库存" + 0.007*"需求" + 0.007*"价格" + 0.007*"下降" + 0.007*"环比" + 0.006*"现货"')
(2, '0.009*"银行" + 0.008*"重庆" + 0.008*"亿元" + 0.007*"数字" + 0.007*"刘雨昕" + 0.006*"建设" + 0.006*"增长" + 0.005*"报告" + 0.005*"经济" + 0.005*"发展"')
(3, '0.008*"公司" + 0.008*"开户" + 0.007*"期货" + 0.006*"玉米" + 0.006*"产品" + 0.005*"重庆" + 0.005*"主要" + 0.005*"我们" + 0.004*"技术" + 0.004*"转基因"')
(4, '0.017*"电子" + 0.017*"烟" + 0.009*"考生" + 0.006*"中国" + 0.005*"考试" + 0.005*"政策" + 0.005*"数字" + 0.005*"市场" + 0.004*"发展" + 0.004*"考场"')
(5, '0.013*"月" + 0.008*"经济" + 0.008*"加息" + 0.008*"中国" + 0.007*"预期" + 0.007*"市场" + 0.007*"通胀" + 0.005*"数据" + 0.005*"增长" + 0.005*"美元"')


In [7]:
# 可视化
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary, n_jobs=1)
pyLDAvis.display(lda_vis)

In [8]:
# 导出可视化结果到html
# pyLDAvis.save_html(lda_vis, 'lda_visualization.html')

## 个人数据——美国故事语料库样本

In [12]:
# 导入数据
df = pd.read_csv('American_story_sample_100.csv')
df.head()

Unnamed: 0,Article_id,Date,Page_number,Newspaper_name,Headline,Author,Article_body
0,4_1920-01-09_p1_sn99062049_00415624992_1920010...,1920-01-09,p1,,Presbyterian ChurchDemobilizes Service Flag,,The services at the PresbyterianChurch Sunday ...
1,23_1920-01-05_p1_sn91066782_00415627932_192001...,1920-01-05,p1,,Lumber Exports VS.. Production.,,"Only Al per, cent of the total lum.ber product..."
2,16_1920-01-09_p1_sn86063730_00332894857_192001...,1920-01-09,p1,,WILSON AND BRYANfUIESSAGESSURPRISES OF JACKSON...,,"with Germany. was thus disclosed asa fact, alt..."
3,14_1920-01-24_p7_sn83025514_00340586384_192001...,1920-01-24,p7,The Colorado statesman.,,,Apple Snow.-Pare and quarter tartapples. add b...
4,40_1920-01-03_p4_sn83025458_00414217370_192001...,1920-01-03,p4,The daily morning oasis.,,,"Mr.. A. Ybarra, counsel for theSouthern Pacifi..."


In [13]:
# 下载
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
## Data Cleaning and Text Processing.
def preprocess_text(text):
    # 小写
    text = text.lower()
    
    # 移除URL
    # text = re.sub(r'http\S+', '',text)
    
    # 分词
    tokens = word_tokenize(text)
    
    # 去除停用词
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    
    # 去除标点符号和数字 和其他一些字符
    filtered_tokens = [word for word in filtered_tokens if word.isalpha() ]
    #filtered_tokens = [word for word in filtered_tokens if word not in string.punctuation]
    
    # 词性标注与筛选
    #pos_tags = pos_tag(filtered_tokens)
    #selected_tokens = [word  for word, pos in pos_tags if pos.startswith('N') or pos.startswith('J') or pos.startswith('V')]
    
    # 词性还原
    # lemmatizer = WordNetLemmatizer()
    # lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
  
    # 返回预处理后的文本
    return filtered_tokens


# 定义函数将词性标签映射到 WordNet 词性标签
# def get_wordnet_pos(treebank_tag):
#     if treebank_tag.startswith('J'):
#         return 'a'  # 形容词
#     elif treebank_tag.startswith('V'):
#         return 'v'  # 动词
#     elif treebank_tag.startswith('N'):
#         return 'n'  # 名词
#     else:
#         return None  # 默认为名词

df['clean_text'] = df['Article_body'].apply(preprocess_text)    

In [18]:
# LDA
dictionary = corpora.Dictionary(df['clean_text'])     # 根据分词结果创建字典
corpus = [dictionary.doc2bow(text) for text in df['clean_text']]     # 根据分词结果创建语料库

In [29]:
# 训练LDA模型
lda_model = models.LdaModel(corpus, num_topics=11, id2word=dictionary, passes=15)

In [30]:
# 查看主题
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.012*"nv" + 0.009*"room" + 0.009*"n" + 0.009*"e" + 0.008*"rooms" + 0.008*"st" + 0.008*"bath" + 0.007*"new" + 0.006*"h" + 0.005*"phone"')
(1, '0.004*"interment" + 0.004*"burial" + 0.002*"hunters" + 0.002*"gods" + 0.002*"tracy" + 0.002*"exciting" + 0.002*"rescue" + 0.002*"lucille" + 0.002*"officiating" + 0.002*"dizzy"')
(2, '0.009*"lbs" + 0.004*"cloudy" + 0.004*"site" + 0.003*"tonight" + 0.003*"sts" + 0.003*"tide" + 0.003*"jewish" + 0.003*"moon" + 0.002*"temperature" + 0.002*"airport"')
(3, '0.029*"game" + 0.024*"team" + 0.016*"league" + 0.015*"ball" + 0.013*"games" + 0.008*"base" + 0.007*"season" + 0.007*"teams" + 0.007*"football" + 0.006*"victory"')
(4, '0.015*"alaska" + 0.008*"district" + 0.008*"sale" + 0.008*"said" + 0.007*"juneau" + 0.006*"day" + 0.006*"estate" + 0.006*"seattle" + 0.005*"land" + 0.005*"court"')
(5, '0.006*"steady" + 0.006*"pounds" + 0.005*"coupe" + 0.005*"add" + 0.005*"sugar" + 0.004*"milk" + 0.004*"light" + 0.004*"market" + 0.004*"cream" + 0.004*"eggs"')
(6, 

In [31]:
# 可视化
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary, n_jobs=1)
pyLDAvis.display(lda_vis)

In [25]:
# 导出可视化结果到html
# pyLDAvis.save_html(lda_vis, 'lda_visualization.html')