In [None]:
import re
import os
import pandas as pd
import numpy as np
import csv
import jieba
import jieba.posseg as pseg

In [None]:
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
 
# spacy for lemmatization
import spacy
 
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the group 3 data. The path is a raw string because a Windows path
# contains backslashes, and sequences such as "\m" and "\g" are invalid
# escapes in a normal string literal.
group_3 = pd.read_csv(r'D:\my research\group\group_3\group_3.csv', encoding='utf-8')
group_3.columns = ['0', 'content', 'date', 'time', 'user_id']
# Discard the first column (the one named '0' just above).
group_3 = group_3.drop(columns='0')
group_3.head()

In [None]:
# Boilerplate phrases removed verbatim from every post, applied in this order.
_NOISE_PHRASES = ('转发微博', '网页链接', '的微博视频', '妞妞端午花草', 'bluepoint2006', '上海')

# Compiled once here rather than on every clean() call.
_URL_REGEX = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
    re.IGNORECASE)


def clean(text):
    """Strip markup and boilerplate from one post and return the remaining text.

    Steps, in order:
      1. replace @-mentions (optionally prefixed by "回复" or "//") with a space;
      2. remove bracketed emoticon codes such as ``[xx]``;
      3. remove ``#...#`` hashtag spans, including the text between the hashes;
      4. remove URLs;
      5. remove the fixed phrases listed in ``_NOISE_PHRASES``;
      6. collapse runs of whitespace to one space and trim both ends.

    Args:
        text: the raw post as a ``str``.

    Returns:
        The cleaned string; it may be empty.
    """
    # Mentions and the user names inside reply / repost prefixes.
    text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:| |$)", " ", text)
    # Emoticons. The character class cannot cross another bracket, so two
    # emoticons on an un-spaced line no longer swallow the text between them
    # (the former greedy \S+ matched from the first "[" to the last "]").
    text = re.sub(r"\[[^\s\[\]]+\]", "", text)
    # Hashtag spans are removed, not kept. Same non-crossing rule as above so
    # that "#a#body#b#" keeps "body".
    text = re.sub(r"#[^\s#]+#", "", text)
    text = _URL_REGEX.sub("", text)
    for phrase in _NOISE_PHRASES:
        text = text.replace(phrase, '')
    # Collapse whitespace left behind by the removals.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Clean every post. Missing values are dropped first, because str() on a
# pandas NaN yields the literal text "nan", which would then be treated as
# a real document.
count = [clean(post) for post in group_3['content'].dropna().astype(str)]

In [None]:
# Keep only the posts that are non-empty after cleaning.
res = [doc for doc in count if doc]

In [None]:
# Preview a few cleaned posts; displaying the whole list floods the notebook output.
res[:10]

In [None]:
def cut(x):
    """Segment ``x`` with jieba's part-of-speech tagger and filter by tag.

    Only words whose tag is exactly one of 'n', 'v', 'a', 'r' or 'i' are
    kept; the comparison is an exact match, so longer tags that merely start
    with one of these letters are dropped.

    Args:
        x: the text to segment.

    Returns:
        A list of the kept words, in the order they were produced.
    """
    wanted_flags = ['n', 'v', 'a', 'r', 'i']
    kept_words = []
    for word, flag in pseg.cut(x):
        if flag in wanted_flags:
            kept_words.append(word)
    return kept_words

# Tokenise every cleaned post; one word list per post.
texts = list(map(cut, res))

In [None]:
# Discard posts for which no word survived the part-of-speech filter.
text = [tokens for tokens in texts if tokens]

In [None]:
# Preview a few tokenised posts; displaying the whole list floods the notebook output.
text[:10]

In [None]:
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [None]:
# Build the token <-> id mapping from the tokenised posts, then convert each
# post to its bag-of-words form.
dictionary = corpora.Dictionary(text)
corpus = list(map(dictionary.doc2bow, text))

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that same corpus.
tfidf = TfidfModel(corpus=corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
# Fixed seed so that re-running the sweep reproduces the same models and curves.
SEED = 42

# X/Y series for the perplexity curve
x_perplexity = []
y_perplexity = []
# X/Y series for the coherence curve
x_coherence = []
y_coherence = []
# id2word: the dictionary built above
# passes: number of training passes over the corpus
for num_topics in range(1, 31):
    # Train one model per topic count. passes is kept small because the
    # data set is large.
    lda = gensim.models.ldamodel.LdaModel(
        corpus_tfidf,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=SEED,
    )
    # NOTE: gensim's log_perplexity returns the per-word likelihood bound,
    # not the perplexity itself (perplexity = 2 ** (-bound)). The raw bound
    # is stored here so the saved values keep their existing meaning.
    cur_perplexity = lda.log_perplexity(corpus_tfidf)
    x_perplexity.append(num_topics)
    y_perplexity.append(cur_perplexity)
    # c_v coherence of the current model, computed against the tokenised posts.
    cv_tmp = CoherenceModel(model=lda, texts=text, dictionary=dictionary, coherence='c_v')
    x_coherence.append(num_topics)
    y_coherence.append(cv_tmp.get_coherence())

In [None]:
# Plot the coherence score against the number of topics.
# The label is attached to the line itself: passing the bare string
# ('coherence_values') to legend() is read as a sequence of one-character
# labels, so the legend entry used to read "c".
plt.plot(x_coherence, y_coherence, label='coherence_values')
plt.xlabel('num topics')
plt.ylabel('coherence score')
plt.legend(loc='best')
# Raw string: "\m" and "\c" are invalid escapes in a normal string literal.
plt.savefig(r'D:\my research\coherenceLine_1.jpg')
plt.show()

In [None]:
# Save the coherence scores, indexed by topic count. The path is a raw string
# because "\m", "\g" and "\c" are invalid escapes in a normal string literal.
cohen = pd.DataFrame(y_coherence, index=x_coherence)
cohen.to_csv(r'D:\my research\group\coherence_3.csv', encoding="utf-8_sig")

In [None]:
# Save the values returned by log_perplexity, indexed by topic count. The path
# is a raw string because "\m", "\g" and "\p" are invalid escapes in a normal
# string literal.
perp = pd.DataFrame(y_perplexity, index=x_perplexity)
perp.to_csv(r'D:\my research\group\perplexity_3.csv', encoding="utf-8_sig")