In [1]:
import pandas as pd
import jieba
import numpy as np
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary

In [2]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

In [3]:
import re
import math
from gensim.models import TfidfModel

In [4]:
from src.nlp_preprocess import NLP_Preprocess, BOW_Filter

### Define Functions

In [5]:
def aggregate_preprocess(text, stopword_list=None):
    """Segment one document with jieba and clean the resulting tokens.

    The text is cut with ``jieba.cut`` and the tokens are handed to
    ``NLP_Preprocess``, on which ``remove_punc``, ``remove_number``,
    ``remove_single_character`` and ``remove_stopwords`` are called in
    that order.

    Parameters
    ----------
    text : str
        Raw document text to segment.
    stopword_list : list of str, optional
        Stop words passed to ``remove_stopwords``. When omitted, the
        notebook-level ``stopwords_list`` is looked up at call time, so the
        stop-word cell must have been run before the first call.

    Returns
    -------
    The ``updated_word_list`` attribute of the ``NLP_Preprocess`` instance
    after the four cleaning steps (presumably a list of str tokens --
    confirm against ``src.nlp_preprocess``).
    """
    if stopword_list is None:
        # Backward-compatible default: use the global stop-word list.
        stopword_list = stopwords_list

    tokens = list(jieba.cut(text, cut_all=False, HMM=True, use_paddle=True))

    pre_processor = NLP_Preprocess(tokens)
    pre_processor.remove_punc()
    pre_processor.remove_number()
    pre_processor.remove_single_character()
    pre_processor.remove_stopwords(stopword_list=stopword_list)

    return pre_processor.updated_word_list

### Import Data

#### Import Text data

In [6]:
# Relative location of the input CSV.
data_directory = "data/"
file = "content.csv"

# Read the CSV and convert its "date" column to pandas datetimes.
df = pd.read_csv(data_directory + file).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# One entry per row of the "content" column, in row order.
text = df["content"].tolist()

#### Import stopwords

In [7]:
stopwords_directory = "data/stopwords/"
file_1 = "aggregate_stopwords.txt"

# Every line of the file becomes one entry, with surrounding whitespace
# (including the trailing newline) stripped.
with open(stopwords_directory + file_1, "r", encoding="utf-8") as stopword_file:
    stopwords_list = [line.strip() for line in stopword_file]
    

### Word Frequency Filter

#### Pre-process all the text

In [8]:
# Tokenise and clean every document. Exactly one token list is appended per
# entry of `text`, so positions stay aligned with the rows the text came from.
text_after_preprocess = []

for one_text in text:

    if pd.isna(one_text):
        # Missing content (NaN / None) has nothing to segment: keep an empty
        # token list as a placeholder instead of passing it to jieba.
        text_after_preprocess.append([])
        continue

    text_after_preprocess.append(aggregate_preprocess(one_text))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Sunny\AppData\Local\Temp\jieba.cache
Loading model cost 0.508 seconds.
Prefix dict has been built successfully.


In [9]:
# Sanity check: number of token lists produced (one was appended per entry of `text`).
len(text_after_preprocess)

2800

#### Create BOW Model

In [31]:
# Wrap the tokenised documents in the project's BOW_Filter helper.
# NOTE(review): BOW_Filter comes from src.nlp_preprocess and is not visible here;
# presumably it builds a gensim Dictionary (later read via `.dictionary`) -- confirm.
frequency_filter = BOW_Filter(text_after_preprocess)

#### Filter out low-frequency and high-frequency words

In [32]:
# Lower bound passed to `term_frequency_filter` in the next cell.
wf_low_bound = 2

# Lower bound passed to `document_frequency_filter` in the next cell.
df_low_bound = 5

In [33]:
# Run the document-frequency filter, then the term-frequency filter, each with its lower bound.
# NOTE(review): both calls bind their result to low_list/high_list, so the pair returned by the
# document-frequency call is overwritten; presumably the filter object keeps its own record
# of the words found -- confirm against BOW_Filter.
low_list,high_list = frequency_filter.document_frequency_filter(lower_bound = df_low_bound)
low_list,high_list = frequency_filter.term_frequency_filter(lower_bound = wf_low_bound)

In [34]:
# Keep whatever `show_filter_word` returns (presumably the words currently marked for removal -- confirm).
filter_word_list = frequency_filter.show_filter_word()

In [35]:
# Add three hand-picked words ("China", "Premier", "Li Keqiang") to the filter.
frequency_filter.manually_add_filter_words(["中国","总理","李克强"])

In [36]:
# Apply the collected filter words to the filter's dictionary
# (presumably in place -- confirm against BOW_Filter.update_dictionary).
frequency_filter.update_dictionary()

### Word Embedding

In [37]:
# Dictionary held by the filter after updating; the cell displays its vocabulary size.
dictionary = frequency_filter.dictionary
len(dictionary.token2id)

8826

#### BOW

In [38]:
# Bag-of-words vector for each tokenised document, in document order.
word_embedding_bow = [
    dictionary.doc2bow(tokens)
    for tokens in text_after_preprocess
]

#### TF-IDF Embedding

In [39]:
# Fit a TF-IDF weighting on the bag-of-words corpus.
tf_idf_model = TfidfModel(word_embedding_bow)

In [40]:
# Re-weight every bag-of-words vector with the fitted TF-IDF model.
word_embedding_tf_idf = [
    tf_idf_model[bow_vector]
    for bow_vector in word_embedding_bow
]

#### FastText Word Embedding

In [None]:
### Skip For now

### Topic Model 

#### Simple Implementation

In [41]:
# Select the document representation fed to the topic model (the raw bag-of-words corpus here).
word_embedding = word_embedding_bow
# Self-assignment: has no effect, `dictionary` is already bound by an earlier cell.
dictionary = dictionary

In [46]:
# Settings for the baseline LDA fit, kept together so they are easy to tune.
# (update_every and per_word_topics are intentionally left at their defaults.)
lda_settings = {
    "num_topics": 5,
    "random_state": 100,
    "chunksize": 1,
    "passes": 20,
    "alpha": 'auto',
    "eta": "auto",
}

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=word_embedding,
    id2word=dictionary,
    **lda_settings,
)

In [47]:
# Pretty-print the fitted topics, 20 words per topic.
pprint(lda_model.print_topics(num_words=20))

[(0,
  '0.055*"合作" + 0.027*"关系" + 0.022*"领域" + 0.020*"中方" + 0.017*"加强" + 0.017*"地区" '
  '+ 0.013*"旅游" + 0.013*"和平" + 0.011*"国家" + 0.010*"宗教" + 0.009*"愿同" + '
  '0.009*"政治" + 0.009*"交流" + 0.009*"国务院" + 0.008*"两国" + 0.008*"建设" + '
  '0.008*"下午" + 0.008*"会见" + 0.007*"深化" + 0.007*"维护"'),
 (1,
  '0.022*"监管" + 0.021*"部门" + 0.018*"管理" + 0.013*"制度" + 0.013*"行政" + '
  '0.012*"国务院" + 0.012*"健康" + 0.011*"强化" + 0.010*"完善" + 0.010*"责任" + '
  '0.010*"信息" + 0.009*"部署" + 0.008*"规定" + 0.008*"公开" + 0.008*"中央" + 0.008*"单位" '
  '+ 0.008*"宗教" + 0.007*"依法" + 0.007*"取消" + 0.007*"政务"'),
 (2,
  '0.085*"经济" + 0.030*"世界" + 0.016*"希望" + 0.014*"增长" + 0.011*"传统" + 0.010*"稳定" '
  '+ 0.010*"金融" + 0.009*"形势" + 0.009*"战略" + 0.008*"贸易" + 0.008*"国际" + '
  '0.008*"拓展" + 0.007*"开放" + 0.007*"代表" + 0.007*"创新" + 0.007*"各国" + 0.006*"指数" '
  '+ 0.006*"复苏" + 0.006*"升级" + 0.006*"优势"'),
 (3,
  '0.031*"创新" + 0.029*"就业" + 0.016*"创业" + 0.015*"考察" + 0.011*"建设" + 0.010*"教育" '
  '+ 0.009*"群众" + 0.008*"鼓励" + 0.008*"国家" + 0.008*"双创" + 0.0

In [None]:
# Build the coherence scorer in this cell so it runs on a fresh kernel:
# `coherence_model_lda` is not defined in any other visible cell.
coherence_model_lda = CoherenceModel(model=lda_model, texts=text_after_preprocess, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [None]:
# Report the coherence score computed in the previous cell.
print('Coherence Score: ', coherence_lda)

In [None]:
# Evaluate the fitted model on the corpus it was trained on. The corpus
# variable in this notebook is `word_embedding`; `corpus` is never defined.
lda_model.log_perplexity(word_embedding)

In [None]:
# Sweep the number of topics (2, 4, ..., 40), fit one LDA model per setting and
# record its c_v coherence rounded to 3 decimals. `model_list[i]` and
# `coherence_values[i]` both belong to the i-th value of the range.
coherence_values = []
model_list = []

for num_topics in range(2, 41, 2):

    # Loop-local name so the baseline `lda_model` fitted earlier is not overwritten.
    # `word_embedding` / `dictionary` are the corpus and vocabulary defined above
    # (`corpus` / `id2word` do not exist in this notebook).
    candidate_model = gensim.models.ldamodel.LdaModel(
        corpus=word_embedding,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,  # fixed seed so the sweep is reproducible on re-run
    )
    model_list.append(candidate_model)

    coherencemodel = CoherenceModel(model=candidate_model, texts=text_after_preprocess, dictionary=dictionary, coherence='c_v')
    coherence_values.append(round(coherencemodel.get_coherence(), 3))

In [None]:
# Coherence score against the number of topics used in the sweep.
x = range(2, 41, 2)
# Label the line itself: passing the bare string "coherence_values" to
# plt.legend() is treated as a sequence of labels, one per character.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# One line per swept topic count with its coherence value.
for topic_count, score in zip(x, coherence_values):
    print("Num Topics =", topic_count, " has Coherence Value of", round(score, 4))

In [None]:
# Hand-picked model: index 2 of the sweep over range(2,41,2), i.e. the model fitted with 6 topics.
optimal_model = model_list[2]
# Unformatted topic listing; not referenced again in the visible cells.
model_topics = optimal_model.show_topics(formatted=False)
# Pretty-print the chosen model's topics, 10 words per topic.
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel
        Fitted topic model. ``ldamodel[corpus]`` must yield, per document, a
        sequence of ``(topic_id, proportion)`` pairs, and
        ``ldamodel.show_topic(topic_id)`` a sequence of ``(word, weight)`` pairs.
    corpus
        Corpus to index ``ldamodel`` with.
    texts
        Per-document contents; wrapped in a ``pd.Series`` and attached as the
        last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (topic id as int), ``Perc_Contribution``
        (proportion rounded to 4 decimals), ``Topic_Keywords`` (the topic's
        words joined by ``", "``) and ``0`` (the unnamed ``texts`` series).
    """
    # Collect rows in a plain list and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # No topic reported for this document: it contributes no row.
            continue
        # max() returns the first pair with the highest proportion, the same
        # element a stable descending sort would place first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output (unnamed series -> column 0).
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [None]:
# Dominant-topic table for the chosen model. The corpus variable in this
# notebook is `word_embedding`; `corpus` is never defined.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=word_embedding, texts=text_after_preprocess)

In [None]:
# Keep the documents whose dominant topic id equals 5.
# NOTE(review): the variable is named topic_1 but selects topic id 5 -- confirm which topic is intended.
topic_1 = df_topic_sents_keywords[df_topic_sents_keywords["Dominant_Topic"]==5.0]

In [None]:
# Preview the first rows of the selection.
topic_1.head()

In [None]:
# Texts of the (up to) 10 selected documents with the highest Perc_Contribution.
# Column 0 is the unnamed text series that format_topics_sentences concatenates onto the table.
representative_text = list(topic_1.sort_values("Perc_Contribution",ascending=False)[0][0:10])

In [None]:
# Display the second entry (index 1) of the selected texts.
representative_text[1]