<a href="https://colab.research.google.com/github/workhardzy/K6312/blob/main/topic_modelling_COVID_news_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive, install dependencies and load the required libraries

In [None]:
# NOTE: the project folder must be reachable from your own Google Drive. If the folder is
# only shared with you, add a shortcut to it in "My Drive" first.
# See https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab
# When prompted, enter the authorisation code for the Google account you are linking and press enter.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install pyLDAvis



In [None]:
#open file
import os
#import display
from IPython.display import display

import pandas as pd
import re, pickle, os
import datetime 
import nltk
from nltk.util import ngrams
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords, wordnet 
from collections import Counter 
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.corpora import MmCorpus
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim

import matplotlib.pyplot as plt
%matplotlib inline
import math
import ast

In [None]:
# Fetch the NLTK resources by name: POS tagger, WordNet and the stop-word list.
# NOTE(review): none of them is used directly in this notebook; presumably they are needed
# by the preprocessing step that produced the pickled tokens -- confirm before removing.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Utils

In [None]:
def save_checkpoint(df,filepath):
    """Pickle ``df`` to ``filepath`` and print a one-line confirmation.

    Args:
        df: object exposing ``to_pickle`` (a pandas DataFrame in this notebook).
        filepath: destination path of the pickle file.
    """
    df.to_pickle(filepath)
    confirmation = 'saved dataframe at {}'.format(filepath)
    print(confirmation)


In [None]:
#these are the functions to train the model

def save_print_to_file(outfile, msg):
    """Overwrite ``outfile`` with the text form of ``msg`` followed by a newline.

    Args:
        outfile: path of the text file to (re)create.
        msg: object to write; it is converted with ``str``.
    """
    text = str(msg) + '\n'
    with open(outfile, 'w') as fp:
        fp.write(text)

def get_word_count(news_text, num_gram):
    '''
    Count n-gram frequencies in a flat token sequence and chart the 20 most common.

    news_text: flat sequence of tokens (all headlines concatenated).
    num_gram:  size of the n-grams to count.

    Side effects: saves a horizontal bar chart to WORD_COUNT_FILE and closes
    all open matplotlib figures.

    Returns a DataFrame with columns 'word' (space-joined n-gram) and
    'frequency', ordered from most to least common.
    '''
    # A stop-n-gram filter (e.g. dropping 'local news') could be applied to the n-grams here.
    ngram_frequencies = Counter(ngrams(news_text, num_gram))

    word_count = pd.DataFrame(data=ngram_frequencies.most_common(),
                              columns=['word', 'frequency'])
    # Each n-gram is a tuple of tokens; turn it into one space-separated string
    word_count['word'] = word_count['word'].apply(' '.join)

    # Chart the top 20, sorted ascending so the most frequent bar ends up on top
    top_twenty = word_count.head(20).sort_values('frequency')
    top_twenty.plot.barh(x='word', y='frequency',
                         title='Word Frequency', figsize=(19,10))
    plt.savefig(WORD_COUNT_FILE)
    print ('Word count saved\n')
    plt.close('all')

    return word_count

def word_grams(words, min=1, max=2):
    '''
    Build the list of space-joined n-grams of ``words``.

    Every n in ``range(min, max)`` is used, so ``max`` is exclusive: the
    defaults produce unigrams only, and (2, 3) produces bigrams only.
    Tokens are converted with ``str`` before joining.
    '''
    return [
        ' '.join(str(token) for token in gram)
        for size in range(min, max)
        for gram in ngrams(words, size)
    ]

def train_lda_model(token_news):
    """Train an LDA topic model on tokenised headlines and persist every artefact.

    Args:
        token_news: iterable of token lists, one per headline (the notebook
            passes the 'ngram_token' column).

    Side effects:
        * writes the bag-of-words corpus to CORPUS_FILE and the dictionary to DICT_FILE;
        * saves the trained model to LDA_MODEL_FILE;
        * writes all topics to LDA_TOPICS_FILE;
        * calls eval_lda (prints metrics) and vis_topics (writes TOPIC_VIS_FILE).

    Returns:
        The trained gensim LdaModel.

    Note: the corpus serialised to disk is the raw bag-of-words corpus, while
    the model itself is trained on the tf-idf weighted version of it.
    """
    print('Start LDA model training ...\n')
    # Build dictionary
    print('building dictionary')
    news_dict = corpora.Dictionary(token_news)
    # Drop tokens that occur in fewer than 10 documents or in more than 50% of the documents
    news_dict.filter_extremes(no_below=10, no_above=0.5)                       # TUNE
    # Transform doc to a vectorized form by computing frequency of each word
    bow_corpus = [news_dict.doc2bow(doc) for doc in token_news]
    # Save corpus and dictionary to file
    MmCorpus.serialize(CORPUS_FILE, bow_corpus)
    news_dict.save(DICT_FILE)
    print('saved corpus and dictionary to file')

    # Create tf-idf model and then apply transformation to the entire corpus
    print('create tf-idf model')
    tfidf = models.TfidfModel(bow_corpus)                                       # TUNE (optional)
    tfidf_corpus = tfidf[bow_corpus]                                            # TUNE (optional)

    print('training lda')
    # Train LDA model; this is the time bottleneck
    lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus,                   # TUNE (optional)
                                         num_topics=NUM_TOPICS, 
                                         id2word=news_dict, 
                                         passes=NUM_PASSES, 
                                         alpha=ALPHA, 
                                         eta=ETA,
                                         random_state=42)
    # Save LDA model to file
    lda_model.save(LDA_MODEL_FILE)
    print ('LDA model saved\n')

    # Save all generated topics to a file (-1 asks print_topics for every topic)
    msg = ''
    for idx, topic in lda_model.print_topics(-1):
        msg += 'Topic: {} \nWords: {}\n'.format(idx, topic)
    save_print_to_file(LDA_TOPICS_FILE, msg)

    # Evaluate LDA model performance
    eval_lda (lda_model, tfidf_corpus, news_dict, token_news)
    # Visualize topics
    vis_topics(lda_model, tfidf_corpus, news_dict)

    return lda_model

def eval_lda (lda_model, corpus, dict, token_text):
    '''
    Print evaluation metrics for a trained LDA model.

    lda_model:  trained gensim LdaModel.
    corpus:     corpus the likelihood bound is computed on.
    dict:       gensim Dictionary used to build the corpus.
    token_text: tokenised documents, used for the c_v coherence score.

    Prints the per-word likelihood bound, the perplexity derived from it,
    and the c_v coherence score. Returns nothing.
    '''
    # log_perplexity() returns the per-word likelihood bound (log scale, HIGHER is better),
    # not the perplexity itself. gensim derives perplexity as 2 ** (-bound), for which
    # LOWER is better -- printing both avoids reading the sign the wrong way round.
    per_word_bound = lda_model.log_perplexity(corpus)
    print('\nPer-word likelihood bound: ', per_word_bound)
    print('Perplexity: ', 2 ** (-per_word_bound))

    # Coherence score (c_v): higher is better
    coherence_model_lda = CoherenceModel(model=lda_model, texts=token_text,
                                         dictionary=dict, coherence='c_v')
    print('\nCoherence: ', coherence_model_lda.get_coherence())

def vis_topics(lda_model, corpus, dict):
    '''
    Plot generated topics on an interactive graph.

    lda_model: trained gensim LdaModel.
    corpus:    corpus the model was trained on.
    dict:      gensim Dictionary used to build the corpus.

    Renders the pyLDAvis view inline and also saves it as a standalone HTML
    page at TOPIC_VIS_FILE. Returns nothing.
    '''
    lda_data = pyLDAvis.gensim.prepare(lda_model, corpus, dict, mds='mmds')
    # pyLDAvis.display() only builds a displayable object; inside a function its value is
    # discarded unless it is handed to IPython's display(), so do that explicitly.
    display(pyLDAvis.display(lda_data))
    pyLDAvis.save_html(lda_data, TOPIC_VIS_FILE)
    print ('Topic visual saved\n')

def wordcloud(word_count_df):
    '''
    Build a word cloud from a two-column (word, frequency) DataFrame.

    The image is written to WORDCLOUD_FILE; all matplotlib figures are closed
    before returning. Returns the generated WordCloud object.
    '''
    # WordCloud needs a {word: frequency} mapping, so unpack the rows into a dict
    frequencies = {word: freq for word, freq in word_count_df.values}

    cloud = WordCloud(max_words=300, width=1400, height=900,
                      random_state=12, contour_width=3,
                      contour_color='firebrick')
    cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(10,10), facecolor='k')
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")

    # The image file is written by WordCloud itself, not via matplotlib
    cloud.to_file(WORDCLOUD_FILE)
    print ('Word cloud saved\n')
    plt.close('all')

    return cloud

In [None]:
# Disabled helper kept for reference: it sits inside a bare string literal, so it never runs.
'''
def clean_topics(df):
    print('CLEANING...')
    cleaned_tweets_df = preprocess_tweets(df,'tweet_text') ########调
    cleaned_tweets_df.to_pickle('/content/gdrive/My Drive/covid_mining/preprocessed_news_headlines.h5') ########调
    return cleaned_tweets_df
'''    
def generate_ngrams(cleaned_news_df):
    """Add an 'ngram_token' column and produce the word-count chart and word cloud.

    Args:
        cleaned_news_df: DataFrame with a 'token' column holding one token
            list per headline.

    Returns:
        The same DataFrame object, with the new 'ngram_token' column added
        in place.
    """
    # Flatten every headline's tokens into one long list for the frequency count
    print('creating news_text corpus')
    news_text = []
    for one_news in cleaned_news_df['token']:
        for word in one_news:
            news_text.append(word)

    # Count the most common n-grams (also saves the bar chart)
    print('getting ngram word count')
    word_count_df = get_word_count(news_text, num_gram=NUM_GRAMS)

    # Render and save the word cloud; the returned object is not needed here
    wordcloud(word_count_df)

    # Per-headline n-grams of exactly NUM_GRAMS tokens (upper bound is exclusive)
    per_headline_ngrams = []
    for tokens in cleaned_news_df['token']:
        per_headline_ngrams.append(word_grams(tokens, NUM_GRAMS, NUM_GRAMS+1))
    cleaned_news_df['ngram_token'] = per_headline_ngrams

    return cleaned_news_df

    
def trainingLDA(ngrams):
    """Train the LDA model on the tokenised documents, with start/end messages.

    Args:
        ngrams: iterable of token lists, passed straight to train_lda_model.

    Returns:
        The trained model returned by train_lda_model.
    """
    print('TRAINING...')
    # Training also saves the model and the topic visualisation
    trained_model = train_lda_model(ngrams)
    print('DONE!')
    return trained_model

Configuration: file paths and tunable parameters used for training and analysis

In [None]:
# ---- Path variables --------------------------------------------------------
# Paths are relative to the working directory (the drive is mounted under /content/gdrive).
# The three folders below must already exist.
mth = 6   # TUNE: month of headlines to model
FIGURE_PATH = r'gdrive/My Drive/covid_mining/figures/'                          # TUNE
DATA_PATH = r'gdrive/My Drive/covid_mining/data/'                               # TUNE
MODEL_PATH = r'gdrive/My Drive/covid_mining/models/'                            # TUNE

# ---- Input data (rename to match your own files) ---------------------------
ORIG_NEWS_FILE = DATA_PATH + 'all_news'
CLEANED_NEWS_FILE = DATA_PATH + 'news_cleaned_df'

# ---- Output artefacts, one set per month (rename as needed) ----------------
# Each constant is defined exactly once; the file names embed `mth`.
WORDCLOUD_FILE = FIGURE_PATH + 'wordcloud{}.png'.format(mth)
WORD_COUNT_FILE = FIGURE_PATH + 'common_words_freq{}.png'.format(mth)
TOPIC_VIS_FILE = FIGURE_PATH + 'lda{}.html'.format(mth)
CORPUS_FILE = MODEL_PATH + 'clean_news_corpus{}.mm'.format(mth)
DICT_FILE = MODEL_PATH + 'clean_news{}.dict'.format(mth)
LDA_MODEL_FILE = MODEL_PATH + 'news_lda{}.model'.format(mth)
LDA_TOPICS_FILE = MODEL_PATH + 'news_lda_topics{}.txt'.format(mth)

# ---- Tunable parameters ----------------------------------------------------
NUM_GRAMS = 2    # n-gram size                                                   (TUNE, optional)
NUM_TOPICS = 20  # initial guess; should be chosen through analysis              (TUNE)
NUM_PASSES = 5   # training passes over all headlines; higher is slower but
                 # should be more accurate                                       (TUNE, optional)
ALPHA = 'auto'   # the lower alpha is, the more likely that a news headline may contain
                 # a mixture of just a few of the topics; 'auto' is the usual choice
ETA = 'auto'     # word-topic density: the lower eta is, the more likely that a topic
                 # may contain a mixture of just a few of the words



In [None]:
# Load the preprocessed headlines for month `mth`. The file is read as a pickle despite its
# .h5 suffix. TUNE: adjust the path to your own drive layout.
df = pd.read_pickle('/content/gdrive/My Drive/covid_mining/preprocessed_news_headlines{}.h5'.format(mth))
df

Unnamed: 0,title,date,topic_area,token,ngram_token,month
136195,Thousands in New Zealand protest against George Floyd killing | World news | The Guardian,2020-06-01,general,"[thousand, new, zealand, protest, george, floyd, kill, world, news, guardian]","[thousand new, new zealand, zealand protest, protest george, george floyd, floyd kill, kill world, world news, news guardian]",6
136196,Pro-Trump donors in huge cash drive to boost doctors pushing states to reopen | US news | The Guardian,2020-06-01,general,"[pro, trump, donor, huge, cash, drive, boost, doctor, push, state, reopen, u, news, guardian]","[pro trump, trump donor, donor huge, huge cash, cash drive, drive boost, boost doctor, doctor push, push state, state reopen, reopen u, u news, news guardian]",6
136197,America's seniors ebb away from Trump as coronavirus response disappoints | US news | The Guardian,2020-06-01,general,"[america, senior, ebb, away, trump, coronavirus, response, disappoints, u, news, guardian]","[america senior, senior ebb, ebb away, away trump, trump coronavirus, coronavirus response, response disappoints, disappoints u, u news, news guardian]",6
136198,China says it's ditching GDP targets. That could be good news for the world | Keyu Jin | Opinion | The Guardian,2020-06-01,general,"[china, say, ditch, gdp, target, could, good, news, world, keyu, jin, opinion, guardian]","[china say, say ditch, ditch gdp, gdp target, target could, could good, good news, news world, world keyu, keyu jin, jin opinion, opinion guardian]",6
136199,"To prevent a chaotic end to lockdown, the public should be told the true risks | Simon Jenkins | Opinion | The Guardian",2020-06-01,general,"[prevent, chaotic, end, lockdown, public, told, true, risk, simon, jenkins, opinion, guardian]","[prevent chaotic, chaotic end, end lockdown, lockdown public, public told, told true, true risk, risk simon, simon jenkins, jenkins opinion, opinion guardian]",6
...,...,...,...,...,...,...
175526,IVD players partner with Department of Health in REACT study,2020-06-30,healthcare,"[ivd, player, partner, department, health, react, study]","[ivd player, player partner, partner department, department health, health react, react study]",6
175527,Protecting individual’s data in an automated reality shaped by pandemic,2020-06-30,healthcare,"[protect, individual, data, automate, reality, shape, pandemic]","[protect individual, individual data, data automate, automate reality, reality shape, shape pandemic]",6
175528,COVID-19 cough screening app in development,2020-06-30,healthcare,"[covid, cough, screen, app, development]","[covid cough, cough screen, screen app, app development]",6
175529,Cellular decoys distract coronavirus - Materials Today,2020-06-30,business,"[cellular, decoy, distract, coronavirus, material, today]","[cellular decoy, decoy distract, distract coronavirus, coronavirus material, material today]",6


# Training the model

This section can be skipped if a trained model has already been saved.

In [None]:
# Silence all library warnings for the rest of the session; they flood the cell output during training
import warnings
warnings.filterwarnings('ignore')


'''

Perplexity, the lower the better 

Coherence, the higher the better
'''

# Train on the per-headline n-gram tokens
tokens = df['ngram_token']

%time LDAmodel = trainingLDA(tokens) #training done, #this portion takes a lot of time 



TRAINING...
Start LDA model training ...

building dictionary
saved corpus and dictionary to file
create tf-idf model
training lda
LDA model saved


Perplexity:  -8.281617660830644

Coherence:  0.6781050840385493
Topic visual saved

DONE!
CPU times: user 39.4 s, sys: 91.4 ms, total: 39.4 s
Wall time: 42.2 s


# Analysing the trained model

In [None]:
#showing the topics and the keywords
LDA_MODEL_FILE = MODEL_PATH + 'news_lda{}.model'.format(mth) 
ldaModel = models.ldamodel.LdaModel.load(LDA_MODEL_FILE)
for j in ldaModel.print_topics(-1):
    print(j)

(0, '0.056*"best quarter" + 0.045*"wall street" + 0.040*"financial condition" + 0.040*"management discussion" + 0.040*"discussion analysis" + 0.039*"analysis financial" + 0.039*"operation form" + 0.039*"result operation" + 0.038*"condition result" + 0.038*"amid pandemic"')
(1, '0.043*"covid marketscreener" + 0.035*"economic recovery" + 0.031*"wear mask" + 0.030*"need know" + 0.028*"australia news" + 0.027*"pandemic marketscreener" + 0.027*"say cnn" + 0.023*"coronavirus spike" + 0.022*"fintech time" + 0.022*"security law"')
(2, '0.086*"covid case" + 0.077*"news guardian" + 0.062*"world news" + 0.031*"rise coronavirus" + 0.031*"covid drug" + 0.023*"third quarter" + 0.019*"thing know" + 0.016*"u news" + 0.016*"movie theater" + 0.016*"plan marketscreener"')
(3, '0.043*"hedge fund" + 0.035*"u k" + 0.029*"fund dump" + 0.027*"fund love" + 0.025*"job cut" + 0.023*"statement marketscreener" + 0.022*"united state" + 0.022*"holding inc" + 0.021*"pandemic marketwatch" + 0.020*"fund crazy"')
(4, '0

In [None]:
#This is the part where we suppress warnings as it is occupying too much memory
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from tqdm import tqdm

#creating a dataframe for creating topic 
#load model
#load dataframe

def create_topic_df(df):
    """Build the per-document topic distribution table from the saved model and corpus.

    The LDA model and the serialised corpus for the current ``mth`` are loaded
    from MODEL_PATH, and every document in the corpus is scored.

    Args:
        df: unused; kept so existing calls keep working. Rows of the result
            follow the order of the serialised corpus.

    Returns:
        DataFrame with one row per corpus document and the columns
        ``topic_0`` ... ``topic_<k-1>`` (probability of each topic id),
        ``topic_max_pred_value`` (highest probability),
        ``topic_pred_topic`` (topic id with the highest probability) and
        ``topic_topic_label`` (keyword string of that topic).
    """
    # Paths are rebuilt here so the function follows the current value of `mth`
    corpus_file = MODEL_PATH + 'clean_news_corpus{}.mm'.format(mth)
    lda_model_file = MODEL_PATH + 'news_lda{}.model'.format(mth)

    print(lda_model_file)
    ldaModel = models.ldamodel.LdaModel.load(lda_model_file)
    print(corpus_file)
    corpus = MmCorpus(corpus_file)
    # NOTE(review): this is the bag-of-words corpus saved by train_lda_model, whereas the
    # model was trained on the tf-idf weighted corpus -- confirm that scoring raw counts is intended.

    num_topics = ldaModel.num_topics
    # {topic id: keyword string}; -1 asks for every topic
    topic_labels = dict(ldaModel.print_topics(-1))

    # Fill a (documents x topics) matrix BY TOPIC ID. gensim omits topics whose probability
    # falls below the threshold, so the position in the returned list is not the topic id;
    # omitted topics stay at 0.0.
    topic_probs = np.zeros((len(corpus), num_topics))
    for row, bow in enumerate(tqdm(corpus, total=len(corpus))):
        for topic_id, prob in ldaModel.get_document_topics(bow, minimum_probability=0.0):
            topic_probs[row, topic_id] = prob

    # Assemble the frame once instead of appending row by row
    topic_LDA_df = pd.DataFrame(topic_probs, columns=range(num_topics))
    pred_topic = topic_probs.argmax(axis=1)
    topic_LDA_df['max_pred_value'] = topic_probs.max(axis=1)
    topic_LDA_df['pred_topic'] = pred_topic
    topic_LDA_df['topic_label'] = [topic_labels[int(topic_id)] for topic_id in pred_topic]

    return topic_LDA_df.add_prefix('topic_')


In [None]:
#execution
dd = create_topic_df(df) #this takes a really long time. 1:19:37. 1 Looking to optimise this for future work
display(dd)
save_checkpoint(dd,'/content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines{}.h5'.format(mth)) ########调


  0%|          | 0/39025 [00:00<?, ?it/s]

gdrive/My Drive/covid_mining/models/news_lda6.model
gdrive/My Drive/covid_mining/models/clean_news_corpus6.mm


100%|██████████| 39025/39025 [08:38<00:00, 75.27it/s]


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_max_pred_value,topic_pred_topic,topic_topic_label,topic_17,topic_18,topic_19
0,0.010413,0.323197,0.011107,0.167441,0.010307,0.011210,0.012647,0.011458,0.010340,0.168781,0.010850,0.010898,0.010975,0.011295,0.011182,0.012083,0.166949,0.323197,1,"0.043*""covid marketscreener"" + 0.035*""economic recovery"" + 0.031*""wear mask"" + 0.030*""need know"" + 0.028*""australia news"" + 0.027*""pandemic marketscreener"" + 0.027*""say cnn"" + 0.023*""coronavirus spike"" + 0.022*""fintech time"" + 0.022*""security law""",,,
1,0.014173,0.015153,0.470291,0.016162,0.016086,0.014997,0.016312,0.245964,0.016673,0.015046,0.018037,0.015788,0.014015,0.015858,0.015970,0.013816,0.016436,0.470291,2,"0.086*""covid case"" + 0.077*""news guardian"" + 0.062*""world news"" + 0.031*""rise coronavirus"" + 0.031*""covid drug"" + 0.023*""third quarter"" + 0.019*""thing know"" + 0.016*""u news"" + 0.016*""movie theater"" + 0.016*""plan marketscreener""",0.016272,0.017582,0.015370
2,0.011545,0.218969,0.361861,0.013166,0.198480,0.012217,0.013288,0.014991,0.013582,0.012257,0.014693,0.012861,0.011417,0.012918,0.013010,0.011255,0.013389,0.361861,2,"0.086*""covid case"" + 0.077*""news guardian"" + 0.062*""world news"" + 0.031*""rise coronavirus"" + 0.031*""covid drug"" + 0.023*""third quarter"" + 0.019*""thing know"" + 0.016*""u news"" + 0.016*""movie theater"" + 0.016*""plan marketscreener""",0.013255,0.014323,0.012521
3,0.014173,0.015153,0.015169,0.016162,0.016086,0.014997,0.243873,0.018403,0.016673,0.015046,0.018037,0.243349,0.014015,0.015858,0.015970,0.013816,0.016436,0.245143,18,"0.105*""hedge fund"" + 0.033*""opinion guardian"" + 0.032*""first time"" + 0.024*""american airline"" + 0.021*""announces new"" + 0.020*""first half"" + 0.018*""source reuters"" + 0.016*""bank england"" + 0.016*""trade deal"" + 0.016*""fund cash""",0.016272,0.245143,0.015370
4,0.026011,0.027809,0.027839,0.029662,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.449905,18,"0.105*""hedge fund"" + 0.033*""opinion guardian"" + 0.032*""first time"" + 0.024*""american airline"" + 0.021*""announces new"" + 0.020*""first half"" + 0.018*""source reuters"" + 0.016*""bank england"" + 0.016*""trade deal"" + 0.016*""fund cash""",0.029863,0.449905,0.028208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39020,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438
39021,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438
39022,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438
39023,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438


saved dataframe at /content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines6.h5


In [None]:
mth

6

In [None]:
# Load the per-document topic predictions. TUNE: adjust the path to your own drive layout.
pred = pd.read_pickle('/content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines{}.h5'.format(mth))

# The predictions are indexed 0..n-1, so the headlines need a matching positional index.
# Guarding the reset keeps the cell re-runnable: an unconditional reset_index() adds a
# 'level_0' column on the second run and raises on the third.
if 'index' not in df.columns:
    df = df.reset_index()

df2 = df.join(pred)
df2

Unnamed: 0,index,title,date,topic_area,token,ngram_token,month,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_max_pred_value,topic_pred_topic,topic_topic_label,topic_17,topic_18,topic_19
0,136195,Thousands in New Zealand protest against George Floyd killing | World news | The Guardian,2020-06-01,general,"[thousand, new, zealand, protest, george, floyd, kill, world, news, guardian]","[thousand new, new zealand, zealand protest, protest george, george floyd, floyd kill, kill world, world news, news guardian]",6,0.010413,0.323197,0.011107,0.167441,0.010307,0.011210,0.012647,0.011458,0.010340,0.168781,0.010850,0.010898,0.010975,0.011295,0.011182,0.012083,0.166949,0.323197,1,"0.043*""covid marketscreener"" + 0.035*""economic recovery"" + 0.031*""wear mask"" + 0.030*""need know"" + 0.028*""australia news"" + 0.027*""pandemic marketscreener"" + 0.027*""say cnn"" + 0.023*""coronavirus spike"" + 0.022*""fintech time"" + 0.022*""security law""",,,
1,136196,Pro-Trump donors in huge cash drive to boost doctors pushing states to reopen | US news | The Guardian,2020-06-01,general,"[pro, trump, donor, huge, cash, drive, boost, doctor, push, state, reopen, u, news, guardian]","[pro trump, trump donor, donor huge, huge cash, cash drive, drive boost, boost doctor, doctor push, push state, state reopen, reopen u, u news, news guardian]",6,0.014173,0.015153,0.470291,0.016162,0.016086,0.014997,0.016312,0.245964,0.016673,0.015046,0.018037,0.015788,0.014015,0.015858,0.015970,0.013816,0.016436,0.470291,2,"0.086*""covid case"" + 0.077*""news guardian"" + 0.062*""world news"" + 0.031*""rise coronavirus"" + 0.031*""covid drug"" + 0.023*""third quarter"" + 0.019*""thing know"" + 0.016*""u news"" + 0.016*""movie theater"" + 0.016*""plan marketscreener""",0.016272,0.017582,0.015370
2,136197,America's seniors ebb away from Trump as coronavirus response disappoints | US news | The Guardian,2020-06-01,general,"[america, senior, ebb, away, trump, coronavirus, response, disappoints, u, news, guardian]","[america senior, senior ebb, ebb away, away trump, trump coronavirus, coronavirus response, response disappoints, disappoints u, u news, news guardian]",6,0.011545,0.218969,0.361861,0.013166,0.198480,0.012217,0.013288,0.014991,0.013582,0.012257,0.014693,0.012861,0.011417,0.012918,0.013010,0.011255,0.013389,0.361861,2,"0.086*""covid case"" + 0.077*""news guardian"" + 0.062*""world news"" + 0.031*""rise coronavirus"" + 0.031*""covid drug"" + 0.023*""third quarter"" + 0.019*""thing know"" + 0.016*""u news"" + 0.016*""movie theater"" + 0.016*""plan marketscreener""",0.013255,0.014323,0.012521
3,136198,China says it's ditching GDP targets. That could be good news for the world | Keyu Jin | Opinion | The Guardian,2020-06-01,general,"[china, say, ditch, gdp, target, could, good, news, world, keyu, jin, opinion, guardian]","[china say, say ditch, ditch gdp, gdp target, target could, could good, good news, news world, world keyu, keyu jin, jin opinion, opinion guardian]",6,0.014173,0.015153,0.015169,0.016162,0.016086,0.014997,0.243873,0.018403,0.016673,0.015046,0.018037,0.243349,0.014015,0.015858,0.015970,0.013816,0.016436,0.245143,18,"0.105*""hedge fund"" + 0.033*""opinion guardian"" + 0.032*""first time"" + 0.024*""american airline"" + 0.021*""announces new"" + 0.020*""first half"" + 0.018*""source reuters"" + 0.016*""bank england"" + 0.016*""trade deal"" + 0.016*""fund cash""",0.016272,0.245143,0.015370
4,136199,"To prevent a chaotic end to lockdown, the public should be told the true risks | Simon Jenkins | Opinion | The Guardian",2020-06-01,general,"[prevent, chaotic, end, lockdown, public, told, true, risk, simon, jenkins, opinion, guardian]","[prevent chaotic, chaotic end, end lockdown, lockdown public, public told, told true, true risk, risk simon, simon jenkins, jenkins opinion, opinion guardian]",6,0.026011,0.027809,0.027839,0.029662,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.449905,18,"0.105*""hedge fund"" + 0.033*""opinion guardian"" + 0.032*""first time"" + 0.024*""american airline"" + 0.021*""announces new"" + 0.020*""first half"" + 0.018*""source reuters"" + 0.016*""bank england"" + 0.016*""trade deal"" + 0.016*""fund cash""",0.029863,0.449905,0.028208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39020,175526,IVD players partner with Department of Health in REACT study,2020-06-30,healthcare,"[ivd, player, partner, department, health, react, study]","[ivd player, player partner, partner department, department health, health react, react study]",6,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438
39021,175527,Protecting individual’s data in an automated reality shaped by pandemic,2020-06-30,healthcare,"[protect, individual, data, automate, reality, shape, pandemic]","[protect individual, individual data, data automate, automate reality, reality shape, shape pandemic]",6,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438
39022,175528,COVID-19 cough screening app in development,2020-06-30,healthcare,"[covid, cough, screen, app, development]","[covid cough, cough screen, screen app, app development]",6,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438
39023,175529,Cellular decoys distract coronavirus - Materials Today,2020-06-30,business,"[cellular, decoy, distract, coronavirus, material, today]","[cellular decoy, decoy distract, distract coronavirus, coronavirus material, material today]",6,0.044664,0.047752,0.047804,0.050935,0.050693,0.047263,0.051406,0.057994,0.052544,0.047416,0.056841,0.049755,0.044168,0.049975,0.050329,0.043541,0.051795,0.057994,7,"0.043*""second quarter"" + 0.041*""opinion cnn"" + 0.036*""hong kong"" + 0.036*""covid impact"" + 0.035*""growth technavio"" + 0.024*""boris johnson"" + 0.022*""case spike"" + 0.021*""pm johnson"" + 0.019*""augment growth"" + 0.017*""boost growth""",0.051280,0.055410,0.048438


In [None]:
# Work on a copy: df2 keeps its raw label strings, so this cell can be re-run without
# re.findall failing on labels that were already converted to lists.
df_most_rep = df2.copy()

# Turn each topic label, e.g. '0.105*"hedge fund" + 0.033*"opinion guardian"', into the list
# of its quoted keywords.
df_most_rep['topic_topic_label'] = df_most_rep['topic_topic_label'].apply(lambda x: re.findall(r'"(.*?)"', x))

# For each headline, find which keywords of its predicted topic also occur among the
# headline's own n-gram tokens. Keywords are kept in topic order so the result is deterministic.
shared_tokens = []
label_ngram_pairs = zip(df_most_rep['topic_topic_label'], df_most_rep['ngram_token'])
for topic_keywords, headline_ngrams in tqdm(label_ngram_pairs, total=len(df_most_rep)):
    headline_ngram_set = set(headline_ngrams)
    shared_tokens.append(
        [keyword for keyword in dict.fromkeys(topic_keywords) if keyword in headline_ngram_set]
    )

# Assign positionally (plain list) so the result does not depend on index alignment
df_most_rep['common tokens'] = shared_tokens
df_most_rep['common tokens count'] = df_most_rep['common tokens'].apply(len)

# NOTE: this overwrites the predictions checkpoint with the joined frame; later cells read
# the joined frame from this same path. TUNE: adjust the path to your own drive layout.
save_checkpoint(df_most_rep,'/content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines{}.h5'.format(mth))



100%|██████████| 39025/39025 [00:10<00:00, 3550.76it/s]


saved dataframe at /content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines6.h5


# Analysis and results reporting

In [None]:
# Show full cell contents; None is the supported "no limit" value (-1 is deprecated)
pd.set_option('display.max_colwidth', None)

# Joined headlines + topic predictions. TUNE: adjust the path to your own drive layout.
df_topic_sents_keywords = pd.read_pickle('/content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines{}.h5'.format(mth))

# One row per predicted topic with its keywords as a single comma-separated string.
# .copy() avoids assigning into a slice of the loaded frame.
topic_num_keywords = df_topic_sents_keywords[['topic_pred_topic', 'topic_topic_label']].copy()
topic_num_keywords['topic_topic_label'] = topic_num_keywords['topic_topic_label'].apply(', '.join)
topic_num_keywords = (
    topic_num_keywords
    .drop_duplicates()
    .sort_values(['topic_pred_topic', 'topic_topic_label'])
    .reset_index(drop=True)
)

# Number of headlines per dominant topic and their share of all headlines (in percent)
topic_counts = df_topic_sents_keywords['topic_pred_topic'].value_counts().reset_index()
topic_counts.columns = ['topic_pred_topic','counts']
topic_contribution = (100 * topic_counts['counts'] / topic_counts['counts'].sum()).round(1)
dfa = topic_counts.assign(percentage=topic_contribution)

df_dominant_topics = pd.merge(topic_num_keywords,dfa,on='topic_pred_topic')
df_dominant_topics.columns = ['Dominant Topic', 'Topic Keywords', 'Number of Documents', 'Percentage of news']

# 'Dominant Topic' already holds the real topic id from the merge; it must not be replaced
# by the row position, which would mislabel topics whenever some topic id is never dominant.
df_dominant_topics['Topic Keywords'] = (
    'Topic ' + df_dominant_topics['Dominant Topic'].astype(str) + ': '
    + df_dominant_topics['Topic Keywords']
)

display(df_dominant_topics)

Unnamed: 0,Dominant Topic,Topic Keywords,Number of Documents,Percentage of news
0,0,"Topic 0: best quarter, wall street, financial condition, management discussion, discussion analysis, analysis financial, operation form, result operation, condition result, amid pandemic",1661,4.3
1,1,"Topic 1: covid marketscreener, economic recovery, wear mask, need know, australia news, pandemic marketscreener, say cnn, coronavirus spike, fintech time, security law",1759,4.5
2,2,"Topic 2: covid case, news guardian, world news, rise coronavirus, covid drug, third quarter, thing know, u news, movie theater, plan marketscreener",1467,3.8
3,3,"Topic 3: hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy",1596,4.1
4,4,"Topic 4: central bank, cut job, coronavirus response, powell mnuchin, u china, pandemic cnn, european share, bet award, announces first, treasury yield",1411,3.6
5,5,"Topic 5: update marketscreener, first quarter, result marketscreener, form k, k marketscreener, financial result, financial statement, business update, quarter financial, event form",917,2.3
6,6,"Topic 6: coronavirus pandemic, say reuters, virus fear, case rise, business guardian, index marketscreener, south korea, board director, russell index, air france",1683,4.3
7,7,"Topic 7: second quarter, opinion cnn, hong kong, covid impact, growth technavio, boris johnson, case spike, pm johnson, augment growth, boost growth",12765,32.7
8,8,"Topic 8: coronavirus case, report marketscreener, donald trump, q earnings, stress test, q result, therapeutic announces, pandemic reuters, fund buying, national security",1565,4.0
9,9,"Topic 9: covid vaccine, coronavirus lockdown, amid covid, coronavirus cnnpolitics, annual meeting, due coronavirus, global market, vaccine candidate, inc announces, global stock",842,2.2


In [None]:
# Topic id whose headlines should be listed (adjust as needed).
topicN = 3
# Reload the topic-modelled headlines; `mth` fills the filename placeholder (adjust as needed).
dis = pd.read_pickle('/content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines{}.h5'.format(mth))
# Rows whose predicted topic equals topicN; as the cell's last expression the result is rendered as output.
dis.loc[dis['topic_pred_topic'] == topicN]


Unnamed: 0,index,title,date,topic_area,token,ngram_token,month,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_max_pred_value,topic_pred_topic,topic_topic_label,topic_17,topic_18,topic_19,common tokens,common tokens count
19,136214,Advisers urge ministers to review benefit cap in wake of coronavirus | Politics | The Guardian,2020-06-01,general,"[adviser, urge, minister, review, benefit, cap, wake, coronavirus, politics, guardian]","[adviser urge, urge minister, minister review, review benefit, benefit cap, cap wake, wake coronavirus, coronavirus politics, politics guardian]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[],0
33,136228,Rees-Mogg's plans for MPs to vote in person 'beyond a farce' | Politics | The Guardian,2020-06-01,general,"[rees, mogg, plan, mp, vote, person, beyond, farce, politics, guardian]","[rees mogg, mogg plan, plan mp, mp vote, vote person, person beyond, beyond farce, farce politics, politics guardian]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[],0
44,136239,Following the science is so over for Matt. Now fetch him his lucky tie | John Crace | Politics | The Guardian,2020-06-01,general,"[follow, science, matt, fetch, lucky, tie, john, crace, politics, guardian]","[follow science, science matt, matt fetch, fetch lucky, lucky tie, tie john, john crace, crace politics, politics guardian]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[],0
81,136276,Government fails to distance Dominic Cummings from sex discrimination case | Politics | The Guardian,2020-06-01,general,"[government, fails, distance, dominic, cummings, sex, discrimination, case, politics, guardian]","[government fails, fails distance, distance dominic, dominic cummings, cummings sex, sex discrimination, discrimination case, case politics, politics guardian]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[],0
122,136317,Black Americans homeschool for different reasons than whites,2020-06-01,business,"[black, american, homeschool, different, reason, white]","[black american, american homeschool, homeschool different, different reason, reason white]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38935,175441,Five things to know about Leicester: the U.K. city that’s key to preventing a second wave of coronavirus - MarketWatch,2020-06-30,business,"[five, thing, know, leicester, u, k, city, key, prevent, second, wave, coronavirus, marketwatch]","[five thing, thing know, know leicester, leicester u, u k, k city, city key, key prevent, prevent second, second wave, wave coronavirus, coronavirus marketwatch]",6,0.322512,0.010413,0.166811,0.323880,0.011055,0.010307,0.011210,0.012647,0.011458,0.010340,0.012395,0.010850,0.010898,0.010975,0.011295,0.011182,0.012083,0.323880,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.010563,,,[u k],1
38937,175443,"Rare coins, whiskey and even Lego. Brits search for niche investments during pandemic - MarketWatch",2020-06-30,business,"[rare, coin, whiskey, even, lego, brit, search, niche, investment, pandemic, marketwatch]","[rare coin, coin whiskey, whiskey even, even lego, lego brit, brit search, search niche, niche investment, investment pandemic, pandemic marketwatch]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[pandemic marketwatch],1
38942,175448,"Rare coins, whiskey and even Lego — some of the niche investments being considered during the pandemic - MarketWatch",2020-06-30,business,"[rare, coin, whiskey, even, lego, niche, investment, consider, pandemic, marketwatch]","[rare coin, coin whiskey, whiskey even, even lego, lego niche, niche investment, investment consider, consider pandemic, pandemic marketwatch]",6,0.026011,0.027809,0.027839,0.447299,0.029522,0.027524,0.029937,0.033774,0.030600,0.027613,0.033102,0.028975,0.025722,0.029104,0.029310,0.025356,0.030164,0.447299,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.029863,0.032268,0.028208,[pandemic marketwatch],1
38948,175454,"Tuesday Pre-Market Stock Movers:Uber Gains, Lululemon Rises, Royal Dutch Shell Falls - MarketWatch",2020-06-30,business,"[tuesday, pre, market, stock, mover, uber, gain, lululemon, rise, royal, dutch, shell, fall, marketwatch]","[tuesday pre, pre market, market stock, stock mover, mover uber, uber gain, gain lululemon, lululemon rise, rise royal, royal dutch, dutch shell, shell fall, fall marketwatch]",6,0.014173,0.015153,0.015169,0.243723,0.243647,0.014997,0.016312,0.018403,0.016673,0.015046,0.018037,0.015788,0.014015,0.243419,0.015970,0.013816,0.016436,0.243723,3,"[hedge fund, u k, fund dump, fund love, job cut, statement marketscreener, united state, holding inc, pandemic marketwatch, fund crazy]",0.016272,0.017582,0.015370,[],0


In [None]:
# List the headlines of one topic ordered by ascending topic_max_pred_value.
topicN = 16
dis = pd.read_pickle('/content/gdrive/My Drive/covid_mining/topic_modelled_news_headlines{}.h5'.format(mth))  # adjust as needed
(
    dis
    .loc[dis['topic_pred_topic'] == topicN]
    .sort_values(by='topic_max_pred_value')
)


Unnamed: 0,index,title,date,topic_area,token,ngram_token,month,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_max_pred_value,topic_pred_topic,topic_topic_label,topic_17,topic_18,topic_19,common tokens,common tokens count
24481,160847,First Trust Senior Floating Rate Income Fund II : Declares its Monthly Common Share Distribution of $0.105 Per Share for July | MarketScreener,2020-06-22,business,"[first, trust, senior, float, rate, income, fund, ii, declares, monthly, common, share, distribution, per, share, july, marketscreener]","[first trust, trust senior, senior float, float rate, rate income, income fund, fund ii, ii declares, declares monthly, monthly common, common share, share distribution, distribution per, per share, share july, july marketscreener]",6,0.010413,0.010424,0.011107,0.011055,0.010307,0.011210,0.012647,0.167844,0.166726,0.012395,0.010850,0.166018,0.010898,0.010975,0.011295,0.011182,0.168469,0.168469,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.166949,,,[],0
24479,160845,First Trust Intermediate Duration Preferred & Income Fund : Declares its Monthly Common Share Distribution of $0.1325 Per Share for July | MarketScreener,2020-06-22,business,"[first, trust, intermediate, duration, prefer, income, fund, declares, monthly, common, share, distribution, per, share, july, marketscreener]","[first trust, trust intermediate, intermediate duration, duration prefer, prefer income, income fund, fund declares, declares monthly, monthly common, common share, share distribution, distribution per, per share, share july, july marketscreener]",6,0.010413,0.010424,0.011107,0.011055,0.010307,0.011210,0.012647,0.167844,0.166726,0.012395,0.010850,0.166018,0.010898,0.010975,0.011295,0.011182,0.168469,0.168469,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.166949,,,[],0
25308,161685,US coronavirus: Young people in the South and West are increasingly getting coronavirus - CNN,2020-06-22,general,"[u, coronavirus, young, people, south, west, increasingly, get, coronavirus, cnn]","[u coronavirus, coronavirus young, young people, people south, south west, west increasingly, increasingly get, get coronavirus, coronavirus cnn]",6,0.166126,0.010413,0.010424,0.167493,0.011055,0.010307,0.011210,0.012647,0.167844,0.010340,0.012395,0.010850,0.010898,0.010975,0.011295,0.167569,0.168469,0.168469,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.010563,,,[],0
24478,160844,First Trust/Aberdeen Global Opportunity Income Fund : Declares its Monthly Common Share Distribution of $0.08 Per Share for July | MarketScreener,2020-06-22,business,"[first, trust, aberdeen, global, opportunity, income, fund, declares, monthly, common, share, distribution, per, share, july, marketscreener]","[first trust, trust aberdeen, aberdeen global, global opportunity, opportunity income, income fund, fund declares, declares monthly, monthly common, common share, share distribution, distribution per, per share, share july, july marketscreener]",6,0.010413,0.010424,0.011107,0.011055,0.010307,0.011210,0.012647,0.167844,0.166726,0.012395,0.010850,0.166018,0.010898,0.010975,0.011295,0.011182,0.168469,0.168469,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.166949,,,[],0
24444,160810,First Trust Mortgage Income Fund : Declares its Monthly Common Share Distribution of $0.06 Per Share for July | MarketScreener,2020-06-22,business,"[first, trust, mortgage, income, fund, declares, monthly, common, share, distribution, per, share, july, marketscreener]","[first trust, trust mortgage, mortgage income, income fund, fund declares, declares monthly, monthly common, common share, share distribution, distribution per, per share, share july, july marketscreener]",6,0.010413,0.010424,0.011107,0.011055,0.010307,0.011210,0.012647,0.167844,0.166726,0.012395,0.010850,0.166018,0.010898,0.010975,0.011295,0.011182,0.168469,0.168469,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.166949,,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2597,138802,5 Top-Ranked Nasdaq-Listed Tech Stocks to Buy Right Now,2020-06-02,business,"[top, ranked, nasdaq, list, tech, stock, buy, right]","[top ranked, ranked nasdaq, nasdaq list, list tech, tech stock, stock buy, buy right]",6,0.011545,0.012344,0.012357,0.013166,0.013104,0.012217,0.013288,0.014991,0.013582,0.012257,0.014693,0.012861,0.011417,0.012918,0.013010,0.011255,0.754895,0.754895,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.013255,0.014323,0.012521,[],0
1548,137748,Emergent BioSolutions : HHS Adds $628 Million to Contract with Emergent BioSolutions to Secure CDMO Manufacturing Capacity for Operation Warp Speed | MarketScreener,2020-06-02,business,"[emergent, biosolutions, hhs, add, million, contract, emergent, biosolutions, secure, cdmo, manufacturing, capacity, operation, warp, speed, marketscreener]","[emergent biosolutions, biosolutions hhs, hhs add, add million, million contract, contract emergent, emergent biosolutions, biosolutions secure, secure cdmo, cdmo manufacturing, manufacturing capacity, capacity operation, operation warp, warp speed, speed marketscreener]",6,0.011545,0.012344,0.012357,0.013166,0.013104,0.012217,0.013288,0.014991,0.013582,0.012257,0.014693,0.012861,0.011417,0.012918,0.013010,0.011255,0.754895,0.754895,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.013255,0.014323,0.012521,[],0
548,136743,Coca-Cola Bottlers Dedicate Production Capability to Support COVID-19 Test Kits | MarketScreener,2020-06-01,business,"[coca, cola, bottler, dedicate, production, capability, support, covid, test, kit, marketscreener]","[coca cola, cola bottler, bottler dedicate, dedicate production, production capability, capability support, support covid, covid test, test kit, kit marketscreener]",6,0.011545,0.012344,0.012357,0.013166,0.013104,0.012217,0.013288,0.014991,0.013582,0.012257,0.014693,0.012861,0.011417,0.012918,0.013010,0.011255,0.754895,0.754895,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.013255,0.014323,0.012521,[covid test],1
834,137030,Coca-Cola Bottlers Dedicate Production Capability to Support COVID-19 Test Kits,2020-06-01,business,"[coca, cola, bottler, dedicate, production, capability, support, covid, test, kit]","[coca cola, cola bottler, bottler dedicate, dedicate production, production capability, capability support, support covid, covid test, test kit]",6,0.011545,0.012344,0.012357,0.013166,0.013104,0.012217,0.013288,0.014991,0.013582,0.012257,0.014693,0.012861,0.011417,0.012918,0.013010,0.011255,0.754895,0.754895,16,"[supreme court, face mask, new coronavirus, south africa, coronavirus case, covid test, gold price, covid pandemic, coronavirus update, r cr]",0.013255,0.014323,0.012521,[covid test],1


In [None]:
# Write the frame currently held in `dis` to a CSV on Drive; `mth` fills the
# filename placeholder (adjust as needed).
dis.to_csv(
    path_or_buf='/content/gdrive/My Drive/covid_mining/topics{}.csv'.format(mth),
    encoding='utf-8',
)