# 传播学篇主题模型

In [1]:
import os
import re
import warnings

import nltk
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim as gensimvis
import pyLDAvis
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pyLDAvis==3.2.1

warnings.filterwarnings("ignore", category=DeprecationWarning) 

##  数据预处理

In [2]:
# Load the raw export into a DataFrame.
# NOTE(review): 'unicode_escape' is kept from the original call; confirm it
# matches the real encoding of communication.csv.
data = pd.read_csv("communication.csv",encoding='unicode_escape')
print('# of records',len(data))

# Keep only the records that have an abstract.
df = data[data['Abstract'].notna()]
print('# of non-empty records:',len(df))

# Keep only regular and early-access articles.
df = df[df["Document Type"].isin(["Article","Article; Early Access"])]
print('# of article type:',len(df))

df = df.reset_index(drop=True)

# index=False: the index is a plain 0..n-1 range after reset_index, and writing
# it would come back as a spurious "Unnamed: 0" column when the file is re-read.
df.to_csv('communication1482.csv', index=False)

# of records 1710
# of non-empty records: 1537
# of article type: 1482


In [3]:
# Reload the filtered records saved by the previous step, then show how many
# articles each journal ("Source Title") contributes.
df1 = pd.read_csv('communication1482.csv')
journal_groups = df1.groupby("Source Title")
journal_groups.size()

Source Title
COMMUNICATION METHODS AND MEASURES              16
COMMUNICATION RESEARCH                          58
COMMUNICATION THEORY                            30
COMUNICAR                                       36
HUMAN COMMUNICATION RESEARCH                    36
INFORMATION COMMUNICATION & SOCIETY            185
INTERNATIONAL JOURNAL OF ADVERTISING           104
INTERNATIONAL JOURNAL OF PRESS-POLITICS         84
JOURNAL OF ADVERTISING                          58
JOURNAL OF COMMUNICATION                        38
JOURNAL OF COMPUTER-MEDIATED COMMUNICATION      21
JOURNAL OF PUBLIC RELATIONS RESEARCH            23
MEDIA PSYCHOLOGY                                54
NEW MEDIA & SOCIETY                            305
POLICY AND INTERNET                             50
POLITICAL COMMUNICATION                         50
PUBLIC OPINION QUARTERLY                        35
PUBLIC RELATIONS REVIEW                         66
RESEARCH ON LANGUAGE AND SOCIAL INTERACTION     18
SCIENCE COMMUNICAT

In [4]:
# read data
# df1 = pd.read_csv('communication1482.csv')

# Build one text per article: the title followed by the abstract.
# fillna('') keeps a missing title/abstract from turning the whole
# concatenation into NaN, which would crash on .lower() below.
docs = list(df1['Article Title'].fillna('') + '. ' + df1['Abstract'].fillna(''))

# Step 1: lowercase the documents and split them into tokens
docs = [nltk.word_tokenize(doc.lower()) for doc in docs]

# Step 2: keep only tokens tagged as noun (NN*), gerund (VBG) or adjective (JJ)
filtertag = ['NN','NNS','NNP','NNPS','VBG','JJ']
docs = [[token for token, tag in pos_tag(doc) if tag in filtertag] for doc in docs]

# Step 3: remove stopwords
stopwords = get_stop_words('en')
stopwords = stopwords + ['e.g.','e.g','i.e.','i.e','and/or','mis']
# A set gives constant-time membership tests; the list is kept under its
# original name in case later cells read it.
stopword_set = set(stopwords)
docs = [[token for token in doc if token not in stopword_set] for doc in docs]

# Step 4: remove numbers, but not words that contain numbers
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Step 5: remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

# Step 6: lemmatize the documents
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

# Step 7: compute bigrams
# Only a single Phrases pass is applied, so bigrams (not trigrams) are added;
# min_count = 3 is the minimum count a candidate needs to be considered.
bigram = Phrases(docs, min_count = 3)
docs = [bigram[doc] for doc in docs]

# Step 8: re-tag after the bigram merge and drop tokens tagged VBG or JJ
filtertag = ['VBG','JJ']
docs = [[token for token, tag in pos_tag(doc) if tag not in filtertag] for doc in docs]


# save document to file
# The context manager closes the file even if the write fails, and the
# explicit encoding avoids platform-default codecs that cannot encode
# every token.
with open('com_docs.txt', 'w', encoding='utf-8') as docs_file:
    docs_file.write(str(docs))

# remove rare and common tokens.
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
#print(len(dictionary))

# Filter out words that occur less than 5 documents, or more than 10% of the documents.
dictionary.filter_extremes(no_below = 5, no_above = 0.1)
dictionary.save_as_text('com_dist_5_0.1.txt')

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print('Done!')

Number of unique tokens: 1788
Number of documents: 1482
Done!


## 模型参数、一致性计算

In [5]:
# Hyper-parameters of the final LDA model, collected in one place.
# chunksize equals the corpus size reported above (1482 documents).
# NOTE(review): alpha/eta look like picks from the parameter sweep at the end
# of the notebook — confirm.
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=19,
    random_state=100,
    chunksize=1482,
    passes=10,
    alpha=0.91,
    eta=0.01,
)
model = gensim.models.LdaMulticore(**lda_params)

In [8]:
# Locations of the persisted corpus, dictionary and trained model.
corpus_path = 'savemodel/Corpus_com_001.mm'
dictionary_path = 'savemodel/Dictionary_com_001.dict'
model_path = 'savemodel/Model_com_001.model'

if all(os.path.exists(path) for path in (corpus_path, dictionary_path, model_path)):
    # A saved run exists: load it so that the results below refer to that run.
    # NOTE(review): `docs` still comes from the preprocessing cell; confirm it
    # matches the saved corpus.
    corpus = corpora.MmCorpus(corpus_path)
    dictionary = corpora.Dictionary.load(dictionary_path)
    model = models.LdaModel.load(model_path)
else:
    # No saved run yet (e.g. fresh checkout): persist the objects built above
    # instead of failing on the missing files.
    os.makedirs('savemodel', exist_ok=True)
    corpora.MmCorpus.serialize(corpus_path, corpus)
    dictionary.save(dictionary_path)
    model.save(model_path)

In [9]:
# Score the model with two coherence measures: c_v first, then UMass.
# Each score is printed right after it is computed, so the first printed
# value is c_v and the second is UMass.
for measure in ('c_v', "u_mass"):
    coherence_model_lda = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence=measure)
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

# Show the ten highest-weighted words of each of the 19 topics.
model.print_topics(num_topics=19, num_words=10)


Coherence Score:  0.35931437674948297

Coherence Score:  -8.979592322195638


[(0,
  '0.040*"identity" + 0.029*"interaction" + 0.021*"internet" + 0.015*"story" + 0.012*"community" + 0.012*"spectrum" + 0.011*"member" + 0.010*"need" + 0.010*"mother" + 0.009*"plan"'),
 (1,
  '0.060*"advertising" + 0.050*"brand" + 0.041*"consumer" + 0.034*"influencer" + 0.025*"message" + 0.021*"product" + 0.015*"ad" + 0.014*"appeal" + 0.013*"celebrity" + 0.012*"job"'),
 (2,
  '0.039*"audience" + 0.039*"woman" + 0.022*"motivation" + 0.022*"character" + 0.013*"value" + 0.012*"norm" + 0.012*"work" + 0.010*"video" + 0.010*"page" + 0.010*"youtube"'),
 (3,
  '0.020*"protest" + 0.017*"csr" + 0.017*"frame" + 0.016*"diversity" + 0.013*"association" + 0.013*"city" + 0.012*"space" + 0.011*"violence" + 0.010*"response" + 0.010*"legitimacy"'),
 (4,
  '0.049*"ad" + 0.030*"company" + 0.019*"image" + 0.015*"queer" + 0.012*"cause" + 0.010*"viewer" + 0.010*"video" + 0.010*"segment" + 0.010*"moment" + 0.010*"competition"'),
 (5,
  '0.047*"child" + 0.023*"service" + 0.019*"regulation" + 0.018*"governme

##  可视化 

In [10]:
# Prepare the visualization data.
vis_data = gensimvis.prepare(model, corpus, dictionary)

# Visualize the topic model.
%matplotlib inline
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)
#pyLDAvis.show(vis_data)

###  话题分布

In [11]:
# Build a lookup table: topic id -> comma-separated top keywords.
# show_topic() returns (word, probability) pairs, so the words are used as-is.
# The previous regex clean-up of print_topic() output also stripped digits and
# dots that belong to the keywords themselves. topn=10 matches the default
# number of words print_topic() returned.
# The topic count is read from the model instead of being hard-coded.
tpid = list(range(model.num_topics))
tpkw = [
    ','.join(word for word, _ in model.show_topic(topic_id, topn=10))
    for topic_id in tpid
]

dftp = pd.DataFrame({'topicid': tpid,'topic_words':tpkw})
#dftp

In [12]:
# Topic distribution of every document, keeping topics with probability >= 0.05.
topicdist = model.get_document_topics(corpus, minimum_probability=0.05, minimum_phi_value=None, per_word_topics=False)

# assigns the topics to the documents in corpus
lda_corpus = model[corpus]

# Everything below is matched row-by-row by position, and zip() would silently
# truncate to the shortest input, so fail loudly if the sizes disagree.
assert len(corpus) == len(docs) == len(df1), (
    'corpus, docs and df1 must describe the same documents: '
    '%d / %d / %d' % (len(corpus), len(docs), len(df1))
)

# pair corpus and topic probability
combine = list(zip(lda_corpus,docs))

# Topics of each document ordered from most to least probable, and the id of
# the most probable one.
Sortedlist = [sorted(topics, key=lambda tup: tup[1], reverse=True) for topics, _ in combine]
Firstlist = [ranked[0][0] for ranked in Sortedlist]

tpdf = pd.DataFrame(combine)

tpdf['sorted'] = Sortedlist
tpdf['first_cluster'] = Firstlist
tpdf['authors'] = df1['Author Full Names']
tpdf['title'] = df1['Article Title']
tpdf['abstract'] = df1['Abstract']
tpdf['source'] = df1['Source Title']
tpdf['doi'] = df1['DOI']
tpdf['citation'] = df1['Times Cited, WoS Core']

# Number of documents whose dominant topic is each topic id.
tpdfgb = tpdf.groupby('first_cluster')
print(tpdfgb.size())

# Attach the keyword string of the dominant topic to every document.
tpdf0 = pd.merge(tpdf, dftp, left_on = 'first_cluster',right_on = 'topicid')

# to_excel() no longer accepts an `encoding` argument in pandas 2.x, so it is
# not passed.
# NOTE(review): the file name keeps its original spelling ('resluts') so that
# anything reading this file keeps working.
tpdf0.to_excel('Communication_topic_resluts_19.xlsx')
print("All done!")

first_cluster
0      87
1     105
2      73
3      74
4      66
5      60
6      84
7      72
8      76
9      62
10    121
11     76
12     52
13     85
14     82
15     60
16     67
17     92
18     88
dtype: int64
All done!


## 补充：模型参数筛查

In [214]:
# NOTE(review): this parameter sweep is kept commented out; the progress bar
# recorded below shows it ran for about three hours.
# Things to check before re-enabling it:
#   - the inner call passes `corpus=corpus`, not `corpus_sets[i]`, so both
#     validation sets are scored on the full corpus;
#   - `chunksize=1453` differs from the 1482 used for the final model;
#   - `num_of_docs*0.75` is a float; ClippedCorpus presumably expects an
#     integer document count — confirm.

# import numpy as np
# import tqdm

# # supporting function
# def compute_coherence_values(corpus, dictionary, k, a, b):
    
#     lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                            id2word=dictionary,
#                                            num_topics=k, 
#                                            random_state=100,
#                                            chunksize=1453,
#                                            passes=10,
#                                            alpha=a,
#                                            eta=b)
    
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
    
#     return coherence_model_lda.get_coherence()



# grid = {}
# grid['Validation_Set'] = {}
# # Topics range
# min_topics = 6
# max_topics = 24
# step_size = 2
# topics_range = range(min_topics, max_topics, step_size)
# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')
# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')
# # Validation sets
# num_of_docs = len(corpus)
# corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
#                # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
#                gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
#                corpus]
# corpus_title = ['75% Corpus', '100% Corpus']
# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }
# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=540) #len(docs))
    
#     # iterate through validation corpuses
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
#                 # iterare through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
#                                                   k=k, a=a, b=b)
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
#     pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
#     pbar.close()

 79%|█████████████████████████████████████████████████████████████▎                | 540/687 [3:00:03<49:00, 20.01s/it]
