# LDA with truncated documents and POS


#### May 13

### Import Packages

In [3]:
import pandas as pd
from pyhanlp import *
from gensim import *
import numpy as np
from sklearn.model_selection import train_test_split
import os
import pickle
from datetime import datetime
import matplotlib.pyplot as plt 
from googletrans import Translator

In [2]:
import networkx  as nx
from itertools import combinations as comb

# Importing the dataset
# The CSV path is relative to the current working directory. The 'content'
# column of this frame is what gets segmented later in the notebook.
path = 'trade-news.csv'
dataset = pd.read_csv(path)

def text_preprocess(text):
    """Strip spaces, trim long documents, and segment the text with HanLP.

    Every ASCII space character is removed first. If more than 50 characters
    remain, the first 20 and the last 20 characters are dropped before the
    text is segmented.

    Args:
        text: Raw document text (a ``str``).

    Returns:
        A list of strings obtained by stringifying the ``HanLP.segment``
        result, deleting every ``[`` and ``]`` character, and splitting on
        ``', '``. Presumably each item has the form ``word/POS`` -- TODO
        confirm against the HanLP term format.
    """
    compact = text.replace(' ', '')
    if len(compact) > 50:
        # Keep only the middle part of long documents.
        compact = compact[20:-20]
    rendered = str(HanLP.segment(compact))
    # Drop the list brackets of the stringified result before splitting it.
    without_brackets = rendered.translate(str.maketrans('', '', '[]'))
    return without_brackets.split(', ')


# Segment every document of the 'content' column; one token list per document.
content_seg = [text_preprocess(document) for document in dataset['content']]



# Keep only the tokens whose POS tag is one of the tags selected in `words`.
def get_words(arr):
    """Filter segmented tokens down to those carrying a selected POS tag.

    The tag is read as the text after the LAST ``/`` of a token, so a word
    that itself contains a slash is still matched on its real tag. Tokens
    without any ``/`` (for example an empty string) are skipped instead of
    raising ``IndexError``.

    Args:
        arr: Iterable of token strings in ``word/POS`` form.

    Returns:
        A list with the matching tokens, unchanged (still ``word/POS``) and
        in input order. A tag matches when it appears in the module-level
        ``words`` list.
    """
    # A set gives constant-time membership tests for every token.
    wanted_tags = set(words)
    re_list = []
    for token in arr:
        _, separator, tag = token.rpartition("/")
        # An empty separator means the token had no "/" at all.
        if separator and tag in wanted_tags:
            re_list.append(token)
    return re_list

# English meaning of every POS tag we keep for the network, keyed by tag.
meaning = {
    'ni': 'organization',
    'nic': 'subordinate organization',
    'nit': 'educational institution',
    'nr': 'person',
    'nrf': 'person',
    'nt': 'organization',
    'ntc': 'company',
    'ntcb': 'bank',
    'ntcf': 'factory',
    'ntch': 'hotel',
    'nth': 'hospital',
    'nto': 'government',
    'nts': 'middle and primary school',
    'ntu': 'university',
    'n': 'noun',
}

# POS tags of the entities used in the network: exactly the keys of
# `meaning`, in the same order.
words = list(meaning)

# Map every selected word to the English meaning of its POS tag.
def get_words_and_POS(arr):
    """Build a ``word -> meaning`` mapping for tokens with a selected POS tag.

    Each token is split at its LAST ``/`` into word and tag, so a word that
    itself contains a slash keeps its full text and its real tag. Tokens
    without any ``/`` (for example an empty string) are skipped instead of
    raising ``IndexError``.

    Args:
        arr: Iterable of token strings in ``word/POS`` form.

    Returns:
        A dict mapping the word part of every matching token to the entry of
        the module-level ``meaning`` dict for its tag. A tag matches when it
        appears in the module-level ``words`` list. If a word occurs several
        times, the last occurrence wins.
    """
    # A set gives constant-time membership tests for every token.
    wanted_tags = set(words)
    result = {}
    for token in arr:
        word, separator, tag = token.rpartition("/")
        # An empty separator means the token had no "/" at all.
        if separator and tag in wanted_tags:
            result[word] = meaning[tag]
    return result

# One filtered token list per document (only tokens with a selected POS tag).
words_list = list(map(get_words, content_seg))

### Save and Load the Words List

In [3]:
# Save words_list to disk with pickle so that a later session can reload it
# without repeating the segmentation and filtering steps.
with open('words_list_with_POS_10_50', 'wb') as f:
    pickle.dump(words_list, f)

In [4]:
# Load words_list back from the pickle file written by the save step.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by this notebook, never one from an untrusted source.
with open('words_list_with_POS_10_50', 'rb') as f:
    words_list = pickle.load(f)

### Create dictionary and train, test BOW corpus

In [5]:
# Hold out 20% of the documents for testing; the seed keeps the split fixed.
words_list_train, words_list_test = train_test_split(
    words_list,
    test_size=0.2,
    random_state=2333,
)
# Build the gensim vocabulary from the training documents only.
dictionary = corpora.Dictionary(words_list_train)
# Encode every document as a bag-of-words vector over that vocabulary.
corpus_train = list(map(dictionary.doc2bow, words_list_train))
corpus_test = list(map(dictionary.doc2bow, words_list_test))

### Train different LDA models

In [5]:
# Topic counts to train. Other candidate counts (20, 30, 40, 75, 100, 150,
# 200, 300) are currently disabled.
topics = [10, 50]

In [7]:
# Train one LDA model per topic count and save each model right after it is
# trained, so a finished model is on disk even if a later one fails.
# The output directory is created once, up front; exist_ok avoids the
# check-then-create race of testing os.path.exists first.
os.makedirs('LDAmodelsNoun_POS10_50', exist_ok=True)
LDAmodels = {}
for i in topics:
    LDAmodels[i] = models.LdaModel(corpus_train, id2word=dictionary, num_topics=i)
    LDAmodels[i].save('LDAmodelsNoun_POS10_50/topic_bow_train{}'.format(i))

### Save and Load LDA models

In [None]:
# Save every trained LDA model, one file per topic count.
# exist_ok creates the directory only when it is missing, without the
# check-then-create race of testing os.path.exists first.
os.makedirs('LDAmodelsNoun_POS10_50', exist_ok=True)
for i in topics:
    LDAmodels[i].save('LDAmodelsNoun_POS10_50/topic_bow_train{}'.format(i))

In [7]:
LDAmodels

{10: <gensim.models.ldamodel.LdaModel at 0x7f605d2897b8>,
 20: <gensim.models.ldamodel.LdaModel at 0x7f605d289828>,
 30: <gensim.models.ldamodel.LdaModel at 0x7f605d2897f0>,
 40: <gensim.models.ldamodel.LdaModel at 0x7f605d289940>,
 50: <gensim.models.ldamodel.LdaModel at 0x7f5e7be2f6d8>,
 75: <gensim.models.ldamodel.LdaModel at 0x7f5e7be2f748>}

In [6]:
# Reload the saved LDA models from disk, keyed by their number of topics.
LDAmodels = {
    i: models.LdaModel.load('LDAmodelsNoun_POS10_50/topic_bow_train{}'.format(i))
    for i in topics
}

### Function for getting top words

In [1]:
from scipy.stats import norm
def _words_above(LDAmodel, topic, nwords, threshold):
    """Return the words among a topic's top ``nwords`` with probability > ``threshold``."""
    return [
        word
        for word, probability in LDAmodel.show_topic(topic, nwords)
        if probability > threshold
    ]


def get_top_words(LDAmodel, ntopics, method="top_words", nwords=100, npercentage=0.01, quantile=0.99):
    """Collect the most probable words of each topic of an LDA model.

    Args:
        LDAmodel: Trained model exposing ``show_topic(topic_id, topn)``, which
            returns up to ``topn`` ``(word, probability)`` pairs (a gensim
            ``LdaModel``). The ``"top_percentage_std"`` method also reads
            ``LDAmodel.num_terms`` as the vocabulary size.
        ntopics: Number of topics to read; topics ``0 .. ntopics - 1`` are used.
        method: One of

            * ``"top_words"`` -- the ``nwords`` most probable words per topic.
            * ``"top_percentage"`` -- among the ``nwords`` most probable
              words, those whose probability is strictly greater than
              ``npercentage``.
            * ``"top_percentage_std"`` -- like ``"top_percentage"``, but the
              threshold is computed per topic as
              ``mean + stdev * norm.ppf(quantile)`` over the probabilities of
              the whole vocabulary. This is the most expensive method because
              it lists the full vocabulary for every topic.
        nwords: Maximum number of words considered per topic. If the model
            returns fewer words, only those are used.
        npercentage: Probability threshold for ``"top_percentage"``; ignored
            by the other methods.
        quantile: Normal quantile used by ``"top_percentage_std"``; ignored by
            the other methods.

    Returns:
        A list with one entry per topic; each entry is the list of selected
        words for that topic, most probable first.

    Raises:
        ValueError: If ``method`` is not one of the three supported names.
        statistics.StatisticsError: For ``"top_percentage_std"`` when a topic
            lists fewer than two vocabulary words (no standard deviation).
    """
    topic_ids = range(ntopics)

    if method == "top_words":
        # One show_topic call per topic; iterating the result (instead of
        # indexing 0..nwords-1) also copes with fewer than nwords words.
        return [
            [word for word, _ in LDAmodel.show_topic(topic, topn=nwords)]
            for topic in topic_ids
        ]

    if method == "top_percentage":
        return [
            _words_above(LDAmodel, topic, nwords, npercentage)
            for topic in topic_ids
        ]

    if method == "top_percentage_std":
        # Imported here so the function does not rely on a module-level alias.
        import statistics

        keywords = []
        for topic in topic_ids:
            # List the whole vocabulary once per topic and convert to plain
            # floats so the statistics functions get built-in numbers.
            probabilities = [
                float(probability)
                for _, probability in LDAmodel.show_topic(topic, LDAmodel.num_terms)
            ]
            threshold = (
                statistics.mean(probabilities)
                + statistics.stdev(probabilities) * norm.ppf(quantile)
            )
            keywords.append(_words_above(LDAmodel, topic, nwords, threshold))
        return keywords

    raise ValueError(
        "unknown method {!r}; expected 'top_words', 'top_percentage' "
        "or 'top_percentage_std'".format(method)
    )
                          
                

In [8]:
# For the 10-topic model: among each topic's 10 most probable words, keep
# those with probability above 0.01.
topic_10_words = get_top_words(
    LDAmodels[10],
    10,
    method="top_percentage",
    nwords=10,
    npercentage=0.01,
)

In [9]:
topic_10_words

[['经济/n', '全球/n', '世界/n', '市场/n', '我国/n', '政策/n', '问题/n', '国家/n', '欧盟/n'],
 ['企业/n', '制度/n', '部门/n', '机制/n', '社会/n', '问题/n', '组织/n', '政策/n'],
 ['企业/n', '市场/n', '产品/n', '业务/n', '人民币/n', '金融/n', '行业/n', '平台/n', '品牌/n'],
 ['产业/n',
  '企业/n',
  '项目/n',
  '城市/n',
  '国际/n',
  '经济/n',
  '国家/n',
  '物流/n',
  '战略/n',
  '优势/n'],
 ['国家/n',
  '习近平/nr',
  '国/n',
  '双方/n',
  '世界/n',
  '关系/n',
  '国际/n',
  '一带/n',
  '领域/n',
  '中方/n'],
 ['同比/n',
  '汽车/n',
  '专业/n',
  '工业/n',
  '百分点/n',
  '本院/n',
  '增速/n',
  '人/n',
  '公告/n',
  '全省/n'],
 ['农业/n', '产业/n', '农村/n', '生态/n', '农产品/n', '项目/n', '质量/n', '企业/n', '特色/n'],
 ['项目/n', '人/n', '文件/n', '工程/n', '单位/n', '技术/n', '声明/n', '路/n'],
 ['习近平/nr',
  '人民/n',
  '社会主义/n',
  '党/n',
  '社会/n',
  '国家/n',
  '精神/n',
  '经济/n',
  '特色/n',
  '制度/n'],
 ['文化/n', '人/n', '历史/n']]