In [1]:
import pandas as pd
import jieba
import numpy as np
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary

In [2]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

In [3]:
import re
import math
from gensim.models import TfidfModel

In [4]:
from src.nlp_preprocess import NLP_Preprocess, BOW_Filter
from src.lda_interpretation import LDA_WordAnalyzer,LDA_DocumentAnalyzer

### Define Functions

In [None]:
def aggregate_preprocess(text, stopword_list=None):
    """Tokenise one raw document with jieba and clean the resulting tokens.

    The text is segmented with ``jieba.cut`` (``cut_all=False``, ``HMM=True``,
    ``use_paddle=True``) and the tokens are handed to ``NLP_Preprocess``, on
    which the punctuation, number, single-character and stop-word removal
    steps are called in that order.

    Parameters
    ----------
    text : str
        Raw text of a single document.
    stopword_list : list of str, optional
        Stop words to remove. When omitted, the notebook-level
        ``stopwords_list`` is used, which is the original behaviour.

    Returns
    -------
    The ``updated_word_list`` attribute of the ``NLP_Preprocess`` instance
    after all removal steps have run.
    """
    if stopword_list is None:
        # Looked up at call time, so this function may be defined before the
        # stop words are loaded; an explicit argument avoids the global entirely.
        stopword_list = stopwords_list

    tokens = list(jieba.cut(text, cut_all=False, HMM=True, use_paddle=True))

    pre_processor = NLP_Preprocess(tokens)
    pre_processor.remove_punc()
    pre_processor.remove_number()
    pre_processor.remove_single_character()
    pre_processor.remove_stopwords(stopword_list=stopword_list)

    return pre_processor.updated_word_list

In [None]:
def model_training(train_data,param):
    """Fit and return a gensim LDA model.

    ``train_data`` is passed as the ``corpus`` argument and every entry of the
    ``param`` dict is forwarded unchanged as a keyword argument to ``LdaModel``.
    """
    return gensim.models.ldamodel.LdaModel(corpus=train_data, **param)

In [None]:
def LDA_cross_validation(word_embedding,n_fold,random_seed=100,param=None):
    """Score an LDA configuration with randomised n-fold cross-validation.

    Every document is assigned a fold id drawn uniformly from ``0..n_fold-1``.
    For each fold, a model is trained on the documents of all other folds via
    ``model_training`` and evaluated on the held-out documents with
    ``model.log_perplexity``.

    Parameters
    ----------
    word_embedding : sequence
        The corpus, one entry per document, in the form ``model_training``
        and ``log_perplexity`` accept.
    n_fold : int
        Number of folds.
    random_seed : int, default 100
        Seed for the fold assignment. The assignment matches what
        ``np.random.seed(random_seed)`` followed by ``np.random.randint``
        would produce, but NumPy's global random state is left untouched.
    param : dict, optional
        Keyword arguments forwarded to ``model_training``. When omitted, the
        notebook-level ``param`` dict is used, which is the original behaviour.

    Returns
    -------
    list
        One ``log_perplexity`` value per fold, in fold order. NOTE(review):
        according to the gensim documentation this is a per-word likelihood
        bound rather than the perplexity itself - confirm before comparing
        values as "perplexity".

    Notes
    -----
    Fold sizes are random, not balanced; with very few documents a fold can
    end up empty.
    """
    if param is None:
        # Backward compatibility: earlier callers relied on a global `param`.
        try:
            param = globals()["param"]
        except KeyError:
            raise NameError(
                "name 'param' is not defined; pass `param=` or define a global `param` dict"
            ) from None

    # A private generator keeps the split reproducible without reseeding
    # NumPy's global RNG for the rest of the notebook.
    rng = np.random.RandomState(random_seed)
    fold_of_document = rng.randint(0, n_fold, len(word_embedding))

    perplexity_list = []

    for fold in range(n_fold):

        # Build one train/test split at a time instead of holding all copies in memory.
        train_data = []
        test_data = []

        for idx, document in zip(fold_of_document, word_embedding):
            if idx == fold:
                test_data.append(document)
            else:
                train_data.append(document)

        model = model_training(train_data, param)

        perplexity_list.append(model.log_perplexity(test_data))

    return perplexity_list

### Import Data

#### Import Text data

In [None]:
data_directory = "data/"
file = "content.csv"

# Read the corpus and convert the "date" column to datetimes in one chain.
df = pd.read_csv(f"{data_directory}{file}").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# Raw documents, one per row of the "content" column (missing rows stay NaN).
text = [document for document in df["content"]]

#### Import stopwords

In [None]:
stopwords_directory = "data/stopwords/"
file_1 = "aggregate_stopwords.txt"

# The file holds one stop word per line; strip the newline and surrounding whitespace.
with open(stopwords_directory+file_1,"r",encoding="utf-8") as f:
    stopwords_list = [line.strip() for line in f]
    

### Word Frequency Filter

#### Pre-process all the text

In [None]:
# Tokenise every document, keeping the list aligned one-to-one with `text`.
text_after_preprocess = []

for one_text in text:

    # Only NaN is unequal to itself: a missing document becomes an empty token list.
    word_list = [] if one_text != one_text else aggregate_preprocess(one_text)

    text_after_preprocess.append(word_list)

In [None]:
len(text_after_preprocess)

#### Create BOW Model

In [None]:
frequency_filter = BOW_Filter(text_after_preprocess)

#### Filter out low-frequency and high-frequency words

In [None]:
wf_low_bound = 2

df_low_bound = 5

In [None]:
# Run the document-frequency filter and then the term-frequency filter with the
# lower bounds defined in the previous cell.
# NOTE(review): the second call rebinds low_list/high_list, so the lists returned
# by the document-frequency filter are discarded here - presumably the filter
# object keeps its own record of filtered words; confirm against BOW_Filter.
low_list,high_list = frequency_filter.document_frequency_filter(lower_bound = df_low_bound)
low_list,high_list = frequency_filter.term_frequency_filter(lower_bound = wf_low_bound)

In [None]:
filter_word_list = frequency_filter.show_filter_word()

In [None]:
frequency_filter.manually_add_filter_words(["中国","总理","李克强","政府","国务院"])

In [None]:
frequency_filter.update_dictionary()

### Word Embedding

In [None]:
# Vocabulary after filtering; the cell displays how many distinct tokens remain.
dictionary = frequency_filter.dictionary
len(dictionary.token2id)

#### BOW

In [None]:
# One bag-of-words vector per document, in the same order as `text_after_preprocess`.
word_embedding_bow = list(map(dictionary.doc2bow, text_after_preprocess))

#### TF-IDF Embedding

In [None]:
tf_idf_model = TfidfModel(word_embedding_bow)

In [None]:
# Re-weight each bag-of-words vector with the fitted TF-IDF model.
word_embedding_tf_idf = []
for one_bow in word_embedding_bow:
    word_embedding_tf_idf.append(tf_idf_model[one_bow])

#### Fast Text Word Embedding

In [None]:
### Skip For now

### Topic Model 

#### Simple Implementation

In [None]:
# Corpus representation used by every model below. Swap in
# `word_embedding_tf_idf` here to train on TF-IDF weights instead.
# `dictionary` is already bound by the Word Embedding section, so the former
# no-op `dictionary = dictionary` self-assignment is dropped.
word_embedding = word_embedding_bow

In [None]:
# Fit a single 5-topic LDA model on the selected corpus. The keyword values
# mirror the `param` dict defined in the cross-validation section.
# NOTE(review): chunksize=1 feeds one document per training chunk - presumably
# intentional for this small corpus, but likely slow; confirm.
lda_model = gensim.models.ldamodel.LdaModel(corpus=word_embedding,
                id2word=dictionary,
                num_topics=5,
                random_state=10,
                # update_every=1,
                chunksize=1,
                passes=3,
                alpha='auto',
                eta="auto"
                # per_word_topics=True
                )

In [None]:
pprint(lda_model.print_topics(num_words=20))

#### Cross Validation

##### Cross-validation results by number of topics

In [None]:
# Candidate topic counts and the number of cross-validation folds.
topic_num_list = [2,3,4,5,6,7,8,9,10]
n_fold = 5

# One fold-assignment seed per candidate, each in [0, 10000). The seeds come
# from a dedicated, seeded generator so that the splits - and therefore the
# scores - are identical on every "Restart & Run All"; the previous version
# drew them from NumPy's unseeded global RNG.
random_seed_list = np.random.RandomState(0).randint(low=10000,size=len(topic_num_list)).tolist()

# Base LDA settings; "num_topics" is overwritten for each candidate in the CV loop.
param = {"num_topics":5,
         "id2word":dictionary,
         "random_state":10,
         "chunksize":1,
         "passes":3,
         "alpha":'auto',
         "eta":"auto"}

In [None]:
# Per-fold scores for every candidate topic count, in `topic_num_list` order.
evaluation_list = []

for topic_num,seed in zip(topic_num_list,random_seed_list):

    # The shared `param` dict is what LDA_cross_validation trains with,
    # so the candidate topic count is patched into it before each run.
    param.update(num_topics=topic_num)

    perplexity_list = LDA_cross_validation(
        word_embedding=word_embedding,
        n_fold=n_fold,
        random_seed=seed,
    )
    evaluation_list.append(perplexity_list)

    print("------- Topic Number : {}-------".format(topic_num))
    

In [None]:
# Mean of the per-fold scores for each candidate topic count.
average_perplexity = list(map(np.mean, evaluation_list))

In [None]:
# Two-column table: column 0 is the topic count, column 1 the mean CV score.
score_rows = [topic_num_list, average_perplexity]
perplexity_df = pd.DataFrame(score_rows).transpose()

In [None]:
perplexity_df

### Model Evaluation

#### Word Interpretation

In [None]:
word_analyzer = LDA_WordAnalyzer(lda_model,dictionary)

In [None]:
word_analyzer.double_rank_representative_word()

In [None]:
word_analyzer.get_high_topic_frequency_word(0)

In [None]:
word_analyzer.get_high_topic_frequency_word(1)

In [None]:
word_analyzer.get_high_topic_frequency_word(2)

In [None]:
word_analyzer.get_high_topic_frequency_word(3)

In [None]:
word_analyzer.get_high_topic_frequency_word(4)

In [None]:
# NOTE(review): these look like hard-coded dictionary token ids, which depend on
# the dictionary built in this particular run - confirm they still map to the
# intended words after any change to preprocessing or filtering.
word_analyzer.get_representative_topic([1037, 113, 1484, 5384, 4264, 3, 4, 1992, 981])

##### Topic 0

The high-frequency words are related to the international order, international communication and cooperation.

In [None]:
high_freq_id,high_freq_word = word_analyzer.get_high_topic_frequency_word(0,topn=20)

Filter out the last two words.

In [None]:
word_analyzer.get_representative_topic(high_freq_id)

In [None]:
high_freq_word[0:-2]

The high-rank (double-rank) words are consistent with the high-frequency words above, and this word list describes the topic more appropriately.

In [None]:
rank_df = word_analyzer.double_rank_representative_word()
rank_series = rank_df.iloc[0,:]

In [None]:
double_rank_list = list(rank_series.nlargest(20).index)
high_rank_word = [dictionary[idx] for idx in double_rank_list]

In [None]:
high_rank_word = [dictionary[idx] for idx in double_rank_list]

In [None]:
high_rank_word

##### Topic 1

Based on the terms in topic 1, we can see that this topic is mostly related to different types of reform.

Examples include encouraging investment, marketization, the service industry and the environment.

In [None]:
high_freq_id,high_freq_word = word_analyzer.get_high_topic_frequency_word(1,topn=20)

In [None]:
high_freq_word

In [None]:
word_analyzer.get_representative_topic(high_freq_id)

In [None]:
high_freq_word[0:3]+high_freq_word[4::]

In [None]:
rank_df = word_analyzer.double_rank_representative_word()
rank_series = rank_df.iloc[1,:]

In [None]:
double_rank_list = list(rank_series.nlargest(20).index)
high_rank_word = [dictionary[idx] for idx in double_rank_list]

In [None]:
high_rank_word

##### Topic 2

This topic is mostly about regulation and management. For example, it covers:
1. new regulation
2. supervision
3. institutional setup

In [None]:
high_freq_id,high_freq_word = word_analyzer.get_high_topic_frequency_word(2,topn=20)

In [None]:
high_freq_word

In [None]:
word_analyzer.get_representative_topic(high_freq_id)

In [None]:
rank_df = word_analyzer.double_rank_representative_word()
rank_series = rank_df.iloc[2,:]

In [None]:
double_rank_list = list(rank_series.nlargest(20).index)
high_rank_word = [dictionary[idx] for idx in double_rank_list]

In [None]:
high_rank_word

##### Topic 3

This topic is mostly about innovation, technology, entrepreneurship and industrial policy.

In [None]:
high_freq_id,high_freq_word = word_analyzer.get_high_topic_frequency_word(3,topn=20)

In [None]:
high_freq_word

In [None]:
word_analyzer.get_representative_topic(high_freq_id)

In [None]:
high_freq_word[1:4]+high_freq_word[5:]

In [None]:
rank_df = word_analyzer.double_rank_representative_word()
rank_series = rank_df.iloc[3,:]

In [None]:
double_rank_list = list(rank_series.nlargest(20).index)
high_rank_word = [dictionary[idx] for idx in double_rank_list]

In [None]:
high_rank_word

##### Topic 4

This topic is more about the visit of the president.

In [None]:
high_freq_id,high_freq_word = word_analyzer.get_high_topic_frequency_word(4,topn=20)

In [None]:
high_freq_word

In [None]:
word_analyzer.get_representative_topic(high_freq_id)

In [None]:
rank_df = word_analyzer.double_rank_representative_word()
rank_series = rank_df.iloc[4,:]

In [None]:
double_rank_list = list(rank_series.nlargest(20).index)
high_rank_word = [dictionary[idx] for idx in double_rank_list]

In [None]:
high_rank_word

### Document Level Interpretation

In [None]:
doc_analyzer = LDA_DocumentAnalyzer(lda_model,word_embedding,text_after_preprocess,text)

In [None]:
doc_analyzer.create_deviation_ratio_df()

In [None]:
doc_analyzer.create_document_topic_distribution()

In [None]:
rank_df = doc_analyzer.create_double_rank()

In [None]:
word_list,doc_list = doc_analyzer.get_n_representative_document(rank_df.iloc[0,:])

In [None]:
doc_analyzer.get_n_representative_document_high_double_rank(topic_num = 1)

In [None]:
doc_analyzer.get_n_representative_document_high_frequency(topic_num = 1)