# 新闻学篇主题模型

In [1]:
import re
import warnings

import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.coherencemodel import CoherenceModel
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## 文献数据预处理

In [3]:
# Read the raw export into a DataFrame.
data = pd.read_csv("journalism.csv", encoding='unicode_escape')
print('# of records', len(data))

# Keep only the records that actually have an abstract.
has_abstract = data['Abstract'].notna()
df = data[has_abstract]
print('# of non-empty records:', len(df))

# Keep only regular articles and early-access articles.
article_types = ["Article", "Article; Early Access"]
df = df[df["Document Type"].isin(article_types)]
print('# of article type:', len(df))

# Renumber the rows from 0 and persist the filtered table
# (to_csv also writes the row index as an unnamed first column).
df = df.reset_index(drop=True)
df.to_csv('journalism723.csv')

# of records 818
# of non-empty records: 739
# of article type: 723


In [4]:
# Read the filtered article table back from 'journalism723.csv' into df1.
df1 = pd.read_csv('journalism723.csv')

In [5]:
# Count the number of records per 'Source Title'; the resulting Series is the cell output.
df1.groupby('Source Title').size()

Source Title
DIGITAL JOURNALISM                           131
JOURNALISM                                   213
JOURNALISM & MASS COMMUNICATION QUARTERLY     67
JOURNALISM PRACTICE                          200
JOURNALISM STUDIES                           112
dtype: int64

In [6]:
# Reload the filtered article table from 'journalism723.csv'.
df1 = pd.read_csv('journalism723.csv')

# Build one text per article: the title followed by the abstract.
docs = list(df1['Article Title'] + '. ' + df1['Abstract'])

# Step 0: expand abbreviations and unify spelling variants before tokenizing.
char_to_replace = {'US': 'United States',
                   'U.S.': 'United States',
                   'UK': 'United Kingdom',
                   'U.K.': 'United Kingdom',
                   'EU': 'European Union',
                   'TV': 'television',
                   '-methods': '-method',
                   '-Methods': '-method',
                   '(mis)information': 'misinformation',
                   'mis-information': 'misinformation',
                   'agenda setting': 'agenda-setting',
                   'President Trump': 'Trump',
                   'Donald Trump': 'Trump'}


def _replace_terms(text, replacements):
    """Return `text` with every key of `replacements` swapped for its value.

    Replacements are applied one after another, in dictionary order.

    A key made only of capital letters (an acronym such as 'US' or 'TV') is
    replaced only where it is not preceded by a letter and not followed by a
    capital letter. This keeps 'USA', 'FOCUS' or 'EUROPE' intact, which a plain
    substring replace would corrupt, while 'TVs' and 'US-based' still match.
    Every other key is replaced wherever it occurs.
    """
    for old, new in replacements.items():
        if old.isalpha() and old.isupper():
            pattern = r'(?<![A-Za-z])' + re.escape(old) + r'(?![A-Z])'
            # A callable replacement keeps `new` literal (no backslash/group expansion).
            text = re.sub(pattern, lambda _match, new=new: new, text)
        else:
            text = text.replace(old, new)
    return text


docs = [_replace_terms(doc, char_to_replace) for doc in docs]

# Step 1: lowercase each document and split it into tokens.
docs = [nltk.word_tokenize(doc.lower()) for doc in docs]

# Step 2: keep only nouns, gerunds (VBG) and adjectives (JJ).
filtertag = ['NN', 'NNS', 'NNP', 'NNPS', 'VBG', 'JJ']
docs = [[word for word, tag in pos_tag(doc) if tag in filtertag] for doc in docs]

# Steps 3-5: drop stopwords, purely numeric tokens (tokens that merely contain
# digits are kept) and one-character tokens.
stopwords = get_stop_words('en')
stopwords = stopwords + ['e.g.', 'e.g', 'i.e.', 'i.e', 'and/or', 'mis']
stopword_set = set(stopwords)  # set membership instead of scanning the list per token
docs = [[token for token in doc
         if token not in stopword_set and not token.isnumeric() and len(token) > 1]
        for doc in docs]

# Step 6: lemmatize every token.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

# Step 7: merge frequent word pairs into single bigram tokens.
# A single Phrases pass is applied, so only bigrams (no trigrams) are produced;
# min_count=3 is the minimum count passed to the phrase detector.
bigram = Phrases(docs, min_count=3)
for idx in range(len(docs)):
    docs[idx] = bigram[docs[idx]]

# Step 8: tag the processed tokens again and drop those tagged as gerund or adjective.
filtertag = ['VBG', 'JJ']
docs = [[word for word, tag in pos_tag(doc) if tag not in filtertag] for doc in docs]

# Save the processed documents; the context manager closes the file even on
# error, and UTF-8 avoids failures on platforms whose default encoding cannot
# represent every character.
with open('docs.txt', 'w', encoding='utf-8') as docs_file:
    docs_file.write(str(docs))

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Drop tokens that occur in fewer than 5 documents or in more than 10% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.1)
dictionary.save_as_text('dist_5_0.10.txt')

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print('Done!')

Number of unique tokens: 985
Number of documents: 723
Done!


## 模型参数和一致性计算

In [7]:
# model = gensim.models.LdaMulticore(corpus=corpus,
#                                    id2word=dictionary,
#                                    num_topics=18, 
#                                    random_state=100,
#                                    chunksize=723,
#                                    passes=10,
#                                    alpha=0.91,
#                                    eta=0.01)

In [13]:
# One-off export of the corpus, dictionary and model (kept commented out).
# corpora.MmCorpus.serialize('savemodel/Corpus_jrn_01.mm', corpus)
# dictionary.save('savemodel/Dictionary_jrn_01.dict')
# model.save('savemodel/Model_jrn_01.model')

# Load the persisted corpus, dictionary and LDA model. This replaces the
# in-memory `corpus` and `dictionary` with the versions stored on disk.
corpus = corpora.MmCorpus('savemodel/Corpus_jrn_01.mm')
dictionary = corpora.Dictionary.load('savemodel//Dictionary_jrn_01.dict')
model =  models.LdaModel.load('savemodel/Model_jrn_01.model')

# The following cells pair each corpus row with `docs` and `df1` by position,
# so a stale saved corpus would silently misalign the topic assignments.
assert len(corpus) == len(docs), (
    'Saved corpus has %d documents but docs has %d; re-export the saved model files.'
    % (len(corpus), len(docs))
)

In [14]:
# Score the model with two coherence measures: c_v first, then UMass.
# After the loop the two variables hold the UMass model and score.
for measure in ('c_v', 'u_mass'):
    coherence_model_lda = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence=measure)
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

# Show the 20 highest-weighted words for each of the 18 topics (cell output).
model.print_topics(num_topics=18, num_words=20)


Coherence Score:  0.3706964676441505

Coherence Score:  -10.289237201030181


[(0,
  '0.028*"sport" + 0.018*"sport_journalism" + 0.018*"author" + 0.018*"change" + 0.017*"global_south" + 0.016*"news_avoidance" + 0.014*"reporter" + 0.014*"law" + 0.013*"state" + 0.012*"impact" + 0.012*"visibility" + 0.012*"gender" + 0.011*"editor" + 0.011*"newsroom" + 0.011*"special_issue" + 0.011*"affair" + 0.010*"china" + 0.010*"cynicism" + 0.010*"audience_perception" + 0.009*"autonomy"'),
 (1,
  '0.076*"science" + 0.024*"health" + 0.022*"trend" + 0.021*"orientation" + 0.018*"label" + 0.016*"textual_analysis" + 0.016*"state" + 0.014*"interplay" + 0.014*"interpretation" + 0.013*"wider" + 0.013*"group" + 0.013*"behaviour" + 0.012*"epistemology" + 0.012*"activism" + 0.012*"purpose" + 0.012*"model" + 0.011*"willingness" + 0.011*"standard" + 0.011*"world" + 0.010*"term"'),
 (2,
  '0.057*"crisis" + 0.026*"factor" + 0.018*"relation" + 0.017*"selection" + 0.017*"intention" + 0.017*"brand" + 0.016*"situation" + 0.016*"reaction" + 0.014*"government" + 0.014*"point" + 0.013*"user" + 0.012*"

## 结果可视化 

In [15]:
### Prepare the visualization data.
vis_data = gensimvis.prepare(model, corpus, dictionary)

# Visualize the topic model.
%matplotlib inline
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)
#pyLDAvis.show(vis_data)

## 话题分布

In [10]:
# Collect the top keywords of every topic as one comma-separated string.
# show_topic() returns (word, weight) pairs, so the words are taken directly
# instead of stripping digits and punctuation from the formatted topic string,
# which would also mangle tokens that themselves contain digits or dots.
tpid = list(range(model.num_topics))
tpkw = [','.join(word for word, _weight in model.show_topic(topic_id)) for topic_id in tpid]

# One row per topic: its id and its keyword string.
dftp = pd.DataFrame({'topicid': tpid, 'topic_words': tpkw})
#dftp

In [11]:
# Topic distribution of every document, requested with minimum_probability=0.05.
# This object is not used further in this cell.
topicdist = model.get_document_topics(corpus, minimum_probability=0.05, minimum_phi_value=None, per_word_topics=False)

# Topic mixture of every document in the corpus.
lda_corpus = model[corpus]

# Pair each document's topic mixture with its token list.
combine = list(zip(lda_corpus, docs))

# Rank each document's topics by descending probability; the id of the
# top-ranked topic becomes the document's cluster.
Sortedlist = [sorted(topics, key=lambda tup: tup[1], reverse=True) for topics, _tokens in combine]
Firstlist = [ranked[0][0] for ranked in Sortedlist]

# Columns 0 and 1 hold the raw topic mixture and the tokens of each document.
tpdf = pd.DataFrame(combine)

tpdf['sorted'] = Sortedlist
tpdf['first_cluster'] = Firstlist
# Attach the bibliographic fields from df1 (aligned by row position).
tpdf['authors'] = df1['Author Full Names']
tpdf['title'] = df1['Article Title']
tpdf['abstract'] = df1['Abstract']
tpdf['source'] = df1['Source Title']
tpdf['doi'] = df1['DOI']
tpdf['citation'] = df1['Times Cited, WoS Core']

# Number of documents assigned to each topic.
tpdfgb = tpdf.groupby('first_cluster')
print(tpdfgb.size())

# Add each document's topic keywords.
tpdf0 = pd.merge(tpdf, dftp, left_on = 'first_cluster',right_on = 'topicid')

# The `encoding` argument is no longer passed: DataFrame.to_excel dropped it in
# pandas 2.0, where passing it raises a TypeError.
tpdf0.to_excel('Journalism_topic_resluts_18.xlsx')
print("All done!")

first_cluster
0     47
1     18
2     38
3     41
4     50
5     34
6     49
7     36
8     46
9     36
10    47
11    39
12    39
13    27
14    44
15    42
16    37
17    53
dtype: int64
All done!


## 补充：模型参数筛查

In [None]:
# import numpy as np
# import tqdm

# # supporting function
# def compute_coherence_values(corpus, dictionary, k, a, b):
    
#     lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                            id2word=dictionary,
#                                            num_topics=k, 
#                                            random_state=100,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha=a,
#                                            eta=b)
    
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
    
#     return coherence_model_lda.get_coherence()



# grid = {}
# grid['Validation_Set'] = {}
# # Topics range
# min_topics = 6
# max_topics = 24
# step_size = 2
# topics_range = range(min_topics, max_topics, step_size)
# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')
# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')
# # Validation sets
# num_of_docs = len(corpus)
# corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
#                # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
#                gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
#                corpus]
# corpus_title = ['75% Corpus', '100% Corpus']
# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }
# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=540) #len(docs))
    
#     # iterate through validation corpuses
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
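#                 # iterate through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     # NOTE(review): this passes the full `corpus`, not corpus_sets[i],
#                     # so every validation set is scored on the same data -- confirm intent
#                     cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
#                                                   k=k, a=a, b=b)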
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
#     pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
#     pbar.close()

  0%|                                                                                          | 0/540 [00:00<?, ?it/s]