In [1]:
import pandas as pd
import jieba
import numpy as np
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary

In [2]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

In [3]:
import re
import math
from gensim.models import TfidfModel

In [139]:
from src.nlp_preprocess import NLP_Preprocess, BOW_Filter
from src.lda_interpretation import LDA_WordAnalyzer

### Define Functions

In [5]:
def aggregate_preprocess(text):
    
    word_cut = jieba.cut(text,HMM=True,use_paddle=True,cut_all=False)
    
    word_list = list(word_cut)
    
    pre_processor = NLP_Preprocess(word_list)
    
    pre_processor.remove_punc()
    pre_processor.remove_number()
    pre_processor.remove_single_character()
    pre_processor.remove_stopwords(stopword_list=stopwords_list)
    
    return pre_processor.updated_word_list

In [6]:
def model_training(train_data,param):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=train_data,**param)
    
    return lda_model

In [7]:
def LDA_cross_validation(word_embedding,n_fold,random_seed=100):
    
    np.random.seed(random_seed) # set the random seed
    
    size_of_document = len(word_embedding)
    
    random_index_group = np.random.randint(0,n_fold,size_of_document) # create randome index
    
    
    data_list = [] ### the following loop creates different subsamples
    
    for fold in range(0,n_fold):
        
        test_data = []
        train_date = []
        
        for idx,document in zip(random_index_group,word_embedding):
            
            if idx == fold:
                
                test_data.append(document)
            
            else:
                
                train_date.append(document)
            
        data_list.append((train_date,test_data))
        
    
    perplexity_list = [] ### calculate the evaluation score for each fold
    
    for train_data,test_data in data_list:
        
        model = model_training(train_data,param)
        
        perplexity = model.log_perplexity(test_data)
        
        perplexity_list.append(perplexity)
    
    return perplexity_list

### Import Data

#### Import Text data

In [8]:
data_directory = "data/"
file = "content.csv"

df = pd.read_csv(data_directory+file)
df["date"] = pd.to_datetime(df["date"])

text = list(df["content"])

#### Import stopwords

In [9]:
stopwords_directory = "data/stopwords/"
file_1 = "aggregate_stopwords.txt"

with open(stopwords_directory+file_1,"r",encoding="utf-8") as f:
    
    stopwords_list = f.readlines()
    stopwords_list = [word.strip() for word in stopwords_list]
    

### Word Frequency Filter

#### Pre-process all the text

In [10]:
text_after_preprocess = []

for one_text in text:
    
    if one_text != one_text:
        
        word_list = []
        text_after_preprocess.append(word_list)
        
        continue
    
    word_list = aggregate_preprocess(one_text)
    text_after_preprocess.append(word_list)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Sunny\AppData\Local\Temp\jieba.cache
Loading model cost 0.518 seconds.
Prefix dict has been built successfully.


In [11]:
len(text_after_preprocess)

2800

#### Create BOW Model

In [12]:
frequency_filter = BOW_Filter(text_after_preprocess)

#### Filter out Low frequency and high frequent word

In [13]:
wf_low_bound = 2

df_low_bound = 5

In [14]:
low_list,high_list = frequency_filter.document_frequency_filter(lower_bound = df_low_bound)
low_list,high_list = frequency_filter.term_frequency_filter(lower_bound = wf_low_bound)

In [15]:
filter_word_list = frequency_filter.show_filter_word()

In [16]:
frequency_filter.manually_add_filter_words(["中国","总理","李克强","政府","国务院"])

In [17]:
frequency_filter.update_dictionary()

### Word Embedding

In [18]:
dictionary = frequency_filter.dictionary
len(dictionary.token2id.keys())

8826

#### BOW

In [19]:
word_embedding_bow = [dictionary.doc2bow(one_word_list) for one_word_list in text_after_preprocess]

#### TF-IDF Embedding

In [20]:
tf_idf_model = TfidfModel(word_embedding_bow)

In [21]:
word_embedding_tf_idf = [tf_idf_model[one_bow] for one_bow in word_embedding_bow]

#### Fast Text Word Embedding

In [None]:
### Skip For now

### Topic Model 

#### Simple Implementation

In [20]:
word_embedding = word_embedding_bow
dictionary = dictionary

In [21]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=word_embedding,
                id2word=dictionary,
                num_topics=5,
                random_state=10,
                # update_every=1,
                chunksize=1,
                passes=3,
                alpha='auto',
                eta="auto"
                # per_word_topics=True
                )

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


In [22]:
pprint(lda_model.print_topics(num_words=20))

[(0,
  '0.039*"合作" + 0.020*"世界" + 0.019*"关系" + 0.018*"领域" + 0.014*"中方" + 0.013*"加强" '
  '+ 0.013*"地区" + 0.012*"经济" + 0.010*"国家" + 0.010*"旅游" + 0.009*"稳定" + '
  '0.009*"和平" + 0.009*"交流" + 0.008*"国际" + 0.007*"国务院" + 0.007*"战略" + '
  '0.007*"推动" + 0.007*"愿同" + 0.007*"政治" + 0.007*"维护"'),
 (1,
  '0.062*"发展" + 0.031*"经济" + 0.023*"企业" + 0.020*"改革" + 0.017*"推进" + 0.014*"会议" '
  '+ 0.013*"促进" + 0.013*"推动" + 0.012*"政府" + 0.012*"投资" + 0.011*"创业" + '
  '0.011*"社会" + 0.011*"就业" + 0.010*"市场" + 0.010*"政策" + 0.010*"实现" + 0.009*"服务" '
  '+ 0.008*"支持" + 0.007*"扩大" + 0.007*"发挥"'),
 (2,
  '0.041*"创新" + 0.019*"就业" + 0.017*"考察" + 0.011*"教育" + 0.011*"成都" + 0.011*"建设" '
  '+ 0.009*"双创" + 0.009*"制造" + 0.008*"社会主义" + 0.008*"学生" + 0.008*"创客" + '
  '0.008*"技术" + 0.008*"大众" + 0.008*"互联网" + 0.007*"高校" + 0.007*"传统" + '
  '0.007*"国家" + 0.007*"中央政治局常委" + 0.007*"中共" + 0.006*"来到"'),
 (3,
  '0.026*"监管" + 0.023*"部门" + 0.021*"管理" + 0.018*"制度" + 0.018*"国务院" + '
  '0.015*"行政" + 0.014*"完善" + 0.010*"规定" + 0.010*"公开" + 0.009*"中

In [23]:
lda_model.log_perplexity(word_embedding)

-8.379870948856562

#### Cross Validation

##### Number of Topic CV result

In [63]:
topic_num_list = [2,3,4,5,6,7,8,9,10]
n_fold = 5
random_seed_list = list(np.random.randint(low=10000,size=len(topic_num_list)))

param = {"num_topics":5,
         "id2word":dictionary,
         "random_state":10,
         "chunksize":1,
         "passes":3,
         "alpha":'auto',
         "eta":"auto"}

In [67]:
evaluation_list = []

for topic_num,seed in zip(topic_num_list,random_seed_list):
    
    param["num_topics"] = topic_num
    
    perplexity_list = LDA_cross_validation(word_embedding = word_embedding,
                                           n_fold = n_fold,
                                           random_seed=seed)
    
    evaluation_list.append(perplexity_list)
    
    print("------- Topic Number : {}-------".format(topic_num))
    

2236
2218
2264
2245
2237
2270
2220
2274
2210
2226
2235
2245
2256
2216
2248
2260
2224
2248
2246
2222
2211
2229
2242
2256
2262
2224
2242
2220
2249
2265
2228
2248
2208
2263
2253
2236
2258
2212
2262
2232
2237
2238
2263
2261
2201


In [71]:
average_perplexity = [np.mean(perplexity) for perplexity in evaluation_list]

In [75]:
perplexity_df = pd.DataFrame([topic_num_list,average_perplexity]).T

In [76]:
perplexity_df

Unnamed: 0,0,1
0,2.0,-8.364623
1,3.0,-8.473494
2,4.0,-8.576652
3,5.0,-8.710798
4,6.0,-8.852686
5,7.0,-8.980463
6,8.0,-9.139739
7,9.0,-9.299387
8,10.0,-9.503175


### Model Evaluation

#### Word Interpretation

In [140]:
word_analyzer = LDA_WordAnalyzer(lda_model,dictionary)

In [141]:
word_analyzer.double_rank_representative_word()

NameError: name 'r' is not defined

In [142]:
word_analyzer.get_high_topic_frequency_word(0)

([414, 368, 172, 783, 1755, 24, 40, 510, 38, 75],
 ['合作', '世界', '关系', '领域', '中方', '加强', '地区', '经济', '国家', '旅游'])

In [143]:
word_analyzer.get_high_topic_frequency_word(1)

([185, 510, 160, 231, 226, 162, 684, 332, 234, 219],
 ['发展', '经济', '企业', '改革', '推进', '会议', '促进', '推动', '政府', '投资'])

In [144]:
word_analyzer.get_high_topic_frequency_word(2)

([307, 324, 1039, 72, 5386, 51, 313, 309, 99, 2625],
 ['创新', '就业', '考察', '教育', '成都', '建设', '双创', '制造', '社会主义', '学生'])

In [145]:
word_analyzer.get_high_topic_frequency_word(3)

([97, 131, 107, 22, 37, 584, 45, 1527, 604, 788],
 ['监管', '部门', '管理', '制度', '国务院', '行政', '完善', '规定', '公开', '中央'])

In [146]:
word_analyzer.get_high_topic_frequency_word(4)

([114, 51, 12, 13, 3652, 1486, 983, 1776, 1730, 38],
 ['群众', '建设', '保障', '做好', '宗教', '精神', '健康', '信息', '重建', '国家'])

In [147]:
word_analyzer.get_representative_topic([114, 51, 12, 13, 3652, 1486, 983, 1776, 1730, 38])

[(4, 0.034780517),
 (2, 0.010557064),
 (4, 0.017976282),
 (4, 0.0148258805),
 (4, 0.014233781),
 (4, 0.013480007),
 (4, 0.013316291),
 (4, 0.011693626),
 (4, 0.011563802),
 (0, 0.010279047)]

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num),                 round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    # print(contents)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    # print(sent_topics_df)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=text_after_preprocess)

In [None]:
topic_1 = df_topic_sents_keywords[df_topic_sents_keywords["Dominant_Topic"]==5.0]

In [None]:
topic_1.head()

In [None]:
representative_text = list(topic_1.sort_values("Perc_Contribution",ascending=False)[0][0:10])

In [None]:
representative_text[1]