# 传播学篇主题模型

In [3]:
import nltk
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
import numpy as np
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim as gensimvis
import pyLDAvis
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pyLDAvis==3.2.1
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

##  数据预处理

In [5]:
# Cell-local imports, gathered at the top of the cell instead of mid-way through it.
from gensim.models import Phrases
from nltk.stem.wordnet import WordNetLemmatizer

data = pd.read_csv("RCfiltered1453.csv", encoding='unicode_escape')
print('# of records', len(data))

# Keep only the records that have an abstract.
df = data[data['AB'].notna()]
print('# of non-empty records:', len(df))

# One text per record: "<title>. <abstract>".
# A missing title is replaced by '' first: NaN + str would give NaN, and the
# .lower() call in step 1 would then fail on a float.
docs = list(df['TI'].fillna('') + '. ' + df['AB'])

# Step 1: lowercase each document and split it into tokens.
docs = [nltk.word_tokenize(text.lower()) for text in docs]

# Step 2: keep only tokens tagged as noun, gerund (VBG) or adjective (JJ).
filtertag = ['NN', 'NNS', 'NNP', 'NNPS', 'VBG', 'JJ']
docs = [[word for word, tag in pos_tag(doc) if tag in filtertag] for doc in docs]

# Steps 3-5, done in a single pass over the tokens:
#   3. remove stopwords (English list plus a few corpus-specific tokens),
#   4. remove purely numeric tokens (tokens that merely contain digits stay),
#   5. remove one-character tokens.
stopwords = get_stop_words('en')
stopwords = stopwords + ['e.g.', 'e.g', 'i.e.', 'i.e', 'and/or', 'mis']
stopword_set = set(stopwords)  # set membership instead of scanning a list per token
docs = [
    [
        token
        for token in doc
        if token not in stopword_set and not token.isnumeric() and len(token) > 1
    ]
    for doc in docs
]

# Step 6: lemmatize the documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

# Step 7: learn bigrams from the corpus (Phrases is given min_count = 3) and
# rewrite every document with the detected bigrams merged into single tokens.
bigram = Phrases(docs, min_count=3)
docs = [bigram[doc] for doc in docs]

# Step 8: re-tag the merged tokens and drop those tagged VBG or JJ.
filtertag = ['VBG', 'JJ']
docs = [[word for word, tag in pos_tag(doc) if tag not in filtertag] for doc in docs]

# # save document to file
# file = open('docs.txt','w');
# file.write(str(docs))
# file.close()

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
#print(len(dictionary))

# Remove rare and common tokens: no_below = 5 and no_above = 0.1 are the
# document-frequency bounds handed to filter_extremes.
dictionary.filter_extremes(no_below=5, no_above=0.1)
# dictionary.save_as_text('dist_5_0.1.txt')

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print('Done!')

# of records 1453
# of non-empty records: 1453
Number of unique tokens: 1761
Number of documents: 1453
Done!


## 模型参数、一致性计算

In [8]:
# Hyper-parameters of the LDA model, collected in one place.
# chunksize=1453 matches the document count printed by the preprocessing cell.
lda_settings = dict(
    num_topics=14,
    random_state=100,
    chunksize=1453,
    passes=10,
    alpha=0.61,
    eta=0.91,
)

# Train the topic model on the bag-of-words corpus.
model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, **lda_settings)

In [6]:
## Persisting the corpus, dictionary, and model (left disabled).
# corpora.MmCorpus.serialize('savemodel/Corpus_001.mm', corpus)
# dictionary.save('savemodel/Dictionary_001.dict')
# model.save('savemodel/Model_001.model')

# Restore the previously saved artefacts from the savemodel/ folder.
# NOTE: this rebinds `model`, `dictionary` and `corpus`, replacing whatever
# the preprocessing / training cells produced in this session.
model = gensim.models.LdaModel.load('savemodel/Model_001.model')
dictionary = Dictionary.load('savemodel/Dictionary_001.dict')
corpus = gensim.corpora.MmCorpus('savemodel/Corpus_001.mm')

In [7]:
# Compute the coherence of the model twice: first with the c_v measure,
# then with UMass. Each score is printed under the same label.
for measure in ('c_v', 'u_mass'):
    coherence_model_lda = CoherenceModel(
        model=model,
        texts=docs,
        dictionary=dictionary,
        coherence=measure,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

# Show the top 10 words of each of the 14 topics (cell output).
model.print_topics(num_topics=14, num_words=10)


Coherence Score:  0.3851314769007831

Coherence Score:  -7.25698282564196


[(0,
  '0.012*"debate" + 0.012*"development" + 0.011*"candidate" + 0.009*"loneliness" + 0.008*"concern" + 0.007*"body_image" + 0.006*"trust" + 0.006*"expectation" + 0.005*"future" + 0.005*"rate"'),
 (1,
  '0.035*"parent" + 0.028*"family" + 0.019*"child" + 0.009*"exposure" + 0.009*"identity" + 0.007*"adult" + 0.006*"program" + 0.006*"health" + 0.006*"family_member" + 0.006*"gap"'),
 (2,
  '0.034*"community" + 0.012*"disclosure" + 0.008*"researcher" + 0.007*"leader" + 0.006*"survey" + 0.006*"political_communication" + 0.006*"reddit" + 0.005*"leadership" + 0.005*"context_collapse" + 0.005*"network"'),
 (3,
  '0.019*"character" + 0.016*"game" + 0.013*"misinformation" + 0.012*"mobile_phone" + 0.011*"player" + 0.010*"space" + 0.009*"mobility" + 0.009*"video_game" + 0.007*"story" + 0.007*"consumption"'),
 (4,
  '0.037*"consumer" + 0.024*"message" + 0.017*"ad" + 0.014*"attitude" + 0.013*"influencers" + 0.008*"behavior" + 0.008*"meme" + 0.008*"influencer" + 0.007*"product" + 0.007*"health"'),
 

##  可视化 

In [8]:
# Prepare the visualization data.
vis_data = gensimvis.prepare(model, corpus, dictionary)

# Visualize the topic model.
%matplotlib inline
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)
#pyLDAvis.show(vis_data)

##  话题分布

In [9]:
# Topic distribution of every document with minimum_probability=0.05.
# Not used further in this cell; kept so the name stays available.
topicdist = model.get_document_topics(corpus, minimum_probability=0.05, minimum_phi_value=None, per_word_topics=False)

#model[corpus[100]]

# Assign topics to the documents in the corpus (model defaults).
lda_corpus = model[corpus]

# Pair each document's topic probabilities with its token list.
combine = list(zip(lda_corpus, docs))

# For every document: its (topic_id, probability) pairs sorted by descending probability.
Sortedlist = [
    sorted(topics, key=lambda tup: tup[1], reverse=True)
    for topics, _tokens in combine
]
# Id of the most probable topic; NaN if a document received no topic at all
# (indexing [0] on an empty list would otherwise raise IndexError).
Firstlist = [ranked[0][0] if ranked else np.nan for ranked in Sortedlist]

tpdf = pd.DataFrame(combine)
tpdf['sorted'] = Sortedlist
tpdf['first_cluster'] = Firstlist

# Attach the bibliographic fields. They are taken from `df` (the filtered
# records that `docs` was built from) with a fresh 0..n-1 index, so that row i
# of tpdf gets the metadata of document i. Reading them from the unfiltered
# `data` would misalign rows whenever a record without abstract was dropped.
records = df.reset_index(drop=True)
for column, field in [
    ('authors', 'AF'),
    ('title', 'TI'),
    ('abstract', 'AB'),
    ('source', 'SO'),
    ('doi', 'DI'),
    ('citation', 'Z9'),
]:
    tpdf[column] = records[field]

# Number of documents per dominant topic.
tpdfgb = tpdf.groupby('first_cluster')
print(tpdfgb.size())

# The `encoding` argument is omitted: recent pandas versions no longer accept it in to_excel.
tpdf.to_excel('Communication_topic_resluts_14.xlsx')
print("All done!")

first_cluster
0      44
1      51
2      63
3      74
4     117
5     114
6      99
7      61
8     135
9      87
10     57
11    250
12     92
13    209
dtype: int64
All done!


## 补充：模型参数筛查

In [214]:
import numpy as np
import tqdm

# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model with the given settings and return its c_v coherence.

    Parameters
    ----------
    corpus : bag-of-words corpus the model is trained on.
    dictionary : gensim dictionary, used as id2word and for the coherence model.
    k : number of topics.
    a : value passed as ``alpha`` to LdaMulticore.
    b : value passed as ``eta`` to LdaMulticore.
    texts : tokenized documents used by the coherence model. When omitted,
        the notebook-level ``docs`` list is used, as before.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()`` for ``coherence='c_v'``.
    """
    if texts is None:
        # Backward-compatible default: fall back to the global token lists.
        texts = docs

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=1453,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()



# NOTE(review): `grid` is not read anywhere in the visible cells; kept in case later cells use it.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 6
max_topics = 24
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the corpus, and the full corpus.
# max_docs is cast to int because a fractional document count cannot be used
# to clip the corpus.
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.25)),
               # gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.5)),
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run; set to False to skip the search.
RUN_GRID_SEARCH = True
if RUN_GRID_SEARCH:
    # One model is trained per (validation set, topic count, alpha, beta) combination.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # Train on the current validation set (the original
                        # always passed the full corpus here).
                        cv = compute_coherence_values(corpus=corpus_set, dictionary=dictionary,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)

    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

 79%|█████████████████████████████████████████████████████████████▎                | 540/687 [3:00:03<49:00, 20.01s/it]
