# 新闻学篇主题模型

In [1]:
import nltk
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
import numpy as np
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## 文献数据预处理

In [2]:
# Preprocessing pipeline: load the records, build one text per record
# (title + abstract), normalise selected terms, tokenise, filter by POS tag,
# remove stopwords/numbers/single characters, lemmatise, merge frequent
# bigrams, and finally build the gensim dictionary and bag-of-words corpus.

# Cell-local imports: `re` for boundary-aware term replacement; the two
# helpers below are only needed by this preprocessing cell.
import re
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases

# Load the exported records into a dataframe.
data = pd.read_csv("RCfiltered689-2.csv",encoding='unicode_escape')
print('# of records',len(data))

# Keep only the records that have an abstract.
df = data[data['AB'].notna()]
print('# of non-empty records:',len(df))

# One document per record: title followed by abstract.
# A missing title is treated as an empty string; otherwise the concatenation
# would produce NaN and the string operations below would fail on it.
docs = (df['TI'].fillna('') + '. ' + df['AB']).tolist()

# Step 0: replace specified terms in docs (applied in this order).
char_to_replace = {'US': 'United States',
                   'U.S.': 'United States',
                   'UK': 'United Kingdom',
                   'U.K.': 'United Kingdom',
                   'EU': 'European Union',
                   'TV': 'television',
                   '-methods': '-method',
                   '-Methods': '-method',
                   '(mis)information': 'misinformation',
                   'mis-information': 'misinformation',
                   'agenda setting': 'agenda-setting',
                   'President Trump': 'Trump',
                   'Donald Trump': 'Trump'}


def _replacement_pattern(term):
    """Compile ``term`` as a literal that is not part of a longer token.

    A plain ``str.replace`` also rewrites fragments of longer words
    ('USA' -> 'United StatesA', 'FOCUS' -> 'FOCUnited States').  The guards
    only apply on a side where ``term`` starts/ends with a letter or digit:
    on the left no letter/digit may precede it; on the right no upper-case
    letter or digit may follow it.  A lower-case continuation is still
    allowed so that plurals such as 'TVs' keep becoming 'televisions'.
    """
    left_guard = r'(?<![A-Za-z0-9])' if term[0].isalnum() else ''
    right_guard = r'(?![A-Z0-9])' if term[-1].isalnum() else ''
    return re.compile(left_guard + re.escape(term) + right_guard)


replacement_rules = [(_replacement_pattern(key), value)
                     for key, value in char_to_replace.items()]

for idx, text in enumerate(docs):
    for pattern, replacement in replacement_rules:
        # A callable replacement inserts the value literally (no escape processing).
        text = pattern.sub(lambda _match, _value=replacement: _value, text)
    docs[idx] = text
#print(docs[0])

# Step 1: lower-case and split the documents into tokens.
docs = [nltk.word_tokenize(doc.lower()) for doc in docs]

# Step 2: keep only tokens tagged as noun, gerund (VBG) or adjective.
# NOTE(review): tagging runs on lower-cased tokens, which may weaken
# proper-noun (NNP/NNPS) detection -- confirm this order is intended.
filtertag = ['NN','NNS','NNP','NNPS','VBG','JJ']
docs = [[token for token, tag in pos_tag(doc) if tag in filtertag] for doc in docs]

# Steps 3-5: drop stopwords, purely numeric tokens (words that merely contain
# digits are kept) and single-character tokens.
stopwords = get_stop_words('en')
stopwords = stopwords + ['e.g.','e.g','i.e.','i.e','and/or','mis']
stopword_set = set(stopwords)  # constant-time membership tests
docs = [[token for token in doc
         if token not in stopword_set
         and not token.isnumeric()
         and len(token) > 1]
        for doc in docs]

# Step 6: lemmatize the documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

# Step 7: merge bigrams, using a Phrases model built with min_count = 3.
# Only a single Phrases pass is applied.
bigram = Phrases(docs, min_count = 3)
docs = [bigram[doc] for doc in docs]

# Step 8: re-tag after merging and drop tokens tagged VBG or JJ.
filtertag = ['VBG','JJ']
docs = [[token for token, tag in pos_tag(doc) if tag not in filtertag] for doc in docs]

# Save the tokenised documents; the context manager closes the file even if
# the write fails, and the explicit encoding avoids platform-dependent errors
# on non-ASCII tokens.
with open('docs.txt', 'w', encoding='utf-8') as docs_file:
    docs_file.write(str(docs))

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
#print(len(dictionary))

# Filter out words that occur in fewer than 5 documents, or in more than 10% of the documents.
dictionary.filter_extremes(no_below = 5, no_above = 0.1)
dictionary.save_as_text('dist_5_0.10.txt')

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print('Done!')

# of records 687
# of non-empty records: 687
Number of unique tokens: 908
Number of documents: 687
Done!


## 模型参数和一致性计算

In [3]:
# Train the LDA topic model on the bag-of-words corpus built above.
# NOTE(review): the hyper-parameter values are presumably taken from the
# parameter screening at the end of the notebook -- confirm.
lda_settings = dict(
    num_topics=18,
    random_state=100,  # fixed seed
    chunksize=100,
    passes=10,
    alpha=0.91,
    eta=0.01,
)
model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, **lda_settings)

In [5]:
# To persist the current corpus, dictionary and model, uncomment:
# corpora.MmCorpus.serialize('savemodel/Corpus_001.mm', corpus)
# dictionary.save('savemodel/Dictionary_001.dict')
# model.save('savemodel/Model_001.model')

# Replace the in-memory corpus, dictionary and model with saved artifacts.
# This overwrites the objects created by the preprocessing and training cells.
# NOTE(review): the files loaded here carry the suffix "_002", while the save
# commands above write "_001" -- confirm which run the saved files belong to.
model = gensim.models.LdaModel.load('savemodel/Model_002.model')
dictionary = Dictionary.load('savemodel/Dictionary_002.dict')
corpus = gensim.corpora.MmCorpus('savemodel/Corpus_002.mm')

In [6]:
# Report topic coherence under two measures, c_v first and then UMass.
# Both lines are printed with the same label, so the order identifies them.
for coherence_measure in ('c_v', 'u_mass'):
    coherence_model_lda = CoherenceModel(model=model,
                                         texts=docs,
                                         dictionary=dictionary,
                                         coherence=coherence_measure)
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

# Show the 10 highest-weighted words of each of the 18 topics.
model.print_topics(num_topics=18, num_words=10)


Coherence Score:  0.3944205977899933

Coherence Score:  -11.277606393752029


[(0,
  '0.025*"advocacy" + 0.020*"communication" + 0.017*"accountability" + 0.017*"course" + 0.015*"tension" + 0.015*"university" + 0.011*"europe" + 0.011*"identity" + 0.010*"concept" + 0.010*"objectivity"'),
 (1,
  '0.053*"conflict" + 0.019*"video" + 0.015*"industry" + 0.014*"training" + 0.014*"interest" + 0.012*"cause" + 0.012*"habitus" + 0.011*"semi-structured_interview" + 0.011*"editor" + 0.010*"typology"'),
 (2,
  '0.039*"public_opinion" + 0.033*"bias" + 0.030*"disinformation" + 0.027*"threat" + 0.018*"explanation" + 0.017*"controversy" + 0.015*"awareness" + 0.013*"number" + 0.013*"framework" + 0.013*"solution"'),
 (3,
  '0.028*"market" + 0.028*"reader" + 0.025*"evaluation" + 0.022*"gender" + 0.021*"intention" + 0.018*"consumer" + 0.016*"solution_journalism" + 0.015*"change" + 0.014*"cue" + 0.014*"subscription"'),
 (4,
  '0.023*"strategy" + 0.017*"brand" + 0.016*"platform" + 0.015*"facebook" + 0.014*"business" + 0.013*"spain" + 0.013*"organisation" + 0.010*"employment" + 0.010*"st

## 结果可视化 

In [7]:
# Prepare the visualization data.
vis_data = gensimvis.prepare(model, corpus, dictionary)

# Visualize the topic model.
%matplotlib inline
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)
#pyLDAvis.show(vis_data)

## 话题分布

In [8]:
# Assign each document its dominant topic, attach the record metadata and
# export the result to Excel.

# Topic distribution per document with a 0.05 minimum probability.
# Not used further in this cell; kept for interactive inspection.
topicdist = model.get_document_topics(corpus, minimum_probability=0.05, minimum_phi_value=None, per_word_topics=False)

# Apply the model to the corpus: one list of (topic_id, probability) per document.
lda_corpus = model[corpus]

# Pair each document's topic probabilities with its tokens.
# NOTE(review): zip() stops at the shorter input, so this assumes `corpus`
# and `docs` describe the same documents in the same order -- confirm when
# the corpus was loaded from disk.
combine = list(zip(lda_corpus, docs))

# Topics of every document ordered by descending probability, and the id of
# the top-ranked topic.
Sortedlist = [sorted(topics, key=lambda tup: tup[1], reverse=True)
              for topics, _tokens in combine]
Firstlist = [ranked[0][0] for ranked in Sortedlist]

tpdf = pd.DataFrame(combine)

tpdf['sorted'] = Sortedlist
tpdf['first_cluster'] = Firstlist

# The documents were built from `df` (records with an abstract), so the
# metadata must come from `df` too.  Taking it from the unfiltered `data`
# would shift rows as soon as one record lacks an abstract.  The index is
# reset so that row i of the metadata lines up with document i.
records = df.reset_index(drop=True)
metadata_columns = [('authors', 'AF'),
                    ('title', 'TI'),
                    ('abstract', 'AB'),
                    ('source', 'SO'),
                    ('doi', 'DI'),
                    ('citation', 'Z9')]
for column_name, field in metadata_columns:
    tpdf[column_name] = records[field]

# Number of documents per dominant topic.
tpdfgb = tpdf.groupby('first_cluster')
print(tpdfgb.size())

# `encoding` is not passed: pandas 2.0 removed that argument from to_excel.
tpdf.to_excel('Journalism_topic_resluts_18.xlsx')
print("All done!")

first_cluster
0     32
1     25
2     25
3     45
4     64
5     40
6     43
7     43
8     20
9     42
10    40
11    44
12    28
13    56
14    36
15    34
16    45
17    25
dtype: int64
All done!


## 补充：模型参数筛查

In [214]:
import numpy as np
import tqdm

# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train an LDA model for one parameter combination and score it.

    Args:
        corpus: bag-of-words corpus used for training.
        dictionary: gensim dictionary passed as ``id2word`` and to the scorer.
        k: number of topics.
        a: value passed as ``alpha``.
        b: value passed as ``eta``.

    Returns:
        The c_v coherence of the trained model.  The score is always
        computed against the module-level ``docs``, whatever ``corpus`` is.
    """
    training_options = {
        'corpus': corpus,
        'id2word': dictionary,
        'num_topics': k,
        'random_state': 100,
        'chunksize': 100,
        'passes': 10,
        'alpha': a,
        'eta': b,
    }
    trained_model = gensim.models.LdaMulticore(**training_options)

    scorer = CoherenceModel(model=trained_model,
                            texts=docs,
                            dictionary=dictionary,
                            coherence='c_v')
    return scorer.get_coherence()



# Grid search over number of topics, alpha and beta (eta) on two validation
# corpora; every combination is scored with compute_coherence_values and the
# results are written to 'lda_tuning_results.csv'.

# Not referenced below; retained so the name keeps existing.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 6
max_topics = 24
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the documents and the full corpus.
# max_docs is converted to int; it was previously passed as a float.
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.25)),
               # gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.5)),
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run; set to False to skip the search.
RUN_GRID_SEARCH = True

if RUN_GRID_SEARCH:
    # Derive the progress total from the grid instead of hard-coding it.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpora
        for validation_title, validation_corpus in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # Train on the current validation corpus.  Previously
                        # the full corpus was passed for every set, so the
                        # '75% Corpus' rows repeated the '100% Corpus' runs.
                        cv = compute_coherence_values(corpus=validation_corpus,
                                                      dictionary=dictionary,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(validation_title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)

    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

 79%|█████████████████████████████████████████████████████████████▎                | 540/687 [3:00:03<49:00, 20.01s/it]
