In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

In [2]:
# Load the news-article table; the path is relative to the working directory.
news_csv_path = 'NewsArticles_Top10Keywords.csv'
news_df = pd.read_csv(news_csv_path)

In [3]:

def extract_candidate_chunk(text, max_words=3):
    """Extract lower-cased noun-phrase candidates from a piece of raw text.

    The text is cleaned (escape residue, HTML numeric entities and selected
    punctuation are removed), POS-tagged sentence by sentence, chunked with an
    "(adjective | noun)* noun" grammar, and the resulting phrases are filtered
    by length, stop words and punctuation before being lemmatized.

    Parameters
    ----------
    text : str
        Raw title or article body. Any non-string value (``None``, or the
        float ``NaN`` pandas uses for a missing cell) yields an empty list
        instead of raising ``AttributeError``.
    max_words : int, optional
        Longest phrase, counted in whitespace-separated words, that is kept.
        Defaults to 3.

    Returns
    -------
    list of str
        Candidate phrases in document order; duplicates are kept.
    """
    # Missing cells arrive as None/NaN when this function is applied to a column.
    if not isinstance(text, str):
        return []

    # Drop the literal backslash-xa0 sequence and apostrophes from every word.
    text = ' '.join(word.replace('\\xa0', ' ').replace("'", '') for word in text.split())
    # Remove the substring between "&#" and ";" (HTML numeric entities).
    text = re.sub(r'\&#.*?\;', "", text)
    # Drop any word that still contains a backslash.
    text = ' '.join(word for word in text.split() if '\\' not in word)
    text = re.sub(r'\ |\?|\!|\’|\‘|\/|\;|\:|\(|\)|\[|\]', ' ', text)

    # Nothing left to tag once the text has been cleaned.
    if not text.strip():
        return []

    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    grammar = r"""
          NP: {<NN.*|JJ>*<NN.*>}  # chunk noun(s), adjectives and noun(s)
          """
    chunker = nltk.RegexpParser(grammar)

    # Rebuild phrases from the (word, tag, IOB-tag) triples of each sentence.
    # Working per sentence and closing the running phrase on every tag that is
    # not a continuation ("I-...") keeps two adjacent chunks, or the last chunk
    # of one sentence and the first chunk of the next, from being fused.
    candidates = []
    for tagged_sent in tagged_sents:
        phrase = []
        for word, _pos, iob in nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)):
            if phrase and not iob.startswith('I-'):
                candidates.append(' '.join(phrase).lower())
                phrase = []
            if iob != 'O':
                phrase.append(word)
        if phrase:
            candidates.append(' '.join(phrase).lower())

    # Filter by maximum keyphrase length.
    candidates = [phrase for phrase in candidates if len(phrase.split()) <= max_words]

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Drop candidates that are a stop word or consist only of punctuation.
    candidates = [
        phrase for phrase in candidates
        if phrase not in stop_words and not all(char in punctuation for char in phrase)
    ]

    # Lemmatize. NOTE(review): the lemmatizer receives the whole phrase string;
    # multi-word phrases such as "new windows" show up unchanged in this
    # notebook's topic output, so this presumably only affects single words.
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    return [lemmatize(phrase) for phrase in candidates]

In [4]:

def extract_candidate_words(text):
    """Extract lower-cased, lemmatized adjective and noun words from raw text.

    The text is cleaned the same way as for phrase extraction (escape residue,
    HTML numeric entities and selected punctuation are removed), POS-tagged
    sentence by sentence, and only words tagged as adjectives or nouns that are
    neither stop words nor pure punctuation are kept.

    Parameters
    ----------
    text : str
        Raw title or article body. Any non-string value (``None``, or the
        float ``NaN`` pandas uses for a missing cell) yields an empty list
        instead of raising ``AttributeError``.

    Returns
    -------
    list of str
        Candidate words in document order; duplicates are kept.
    """
    # Missing cells arrive as None/NaN when this function is applied to a column.
    if not isinstance(text, str):
        return []

    # Drop the literal backslash-xa0 sequence and apostrophes from every word.
    text = ' '.join(word.replace('\\xa0', ' ').replace("'", '') for word in text.split())
    # Remove the substring between "&#" and ";" (HTML numeric entities).
    text = re.sub(r'\&#.*?\;', "", text)
    # Drop any word that still contains a backslash.
    text = ' '.join(word for word in text.split() if '\\' not in word)
    text = re.sub(r'\ |\?|\!|\’|\‘|\/|\;|\:|\(|\)|\[|\]', ' ', text)

    # Nothing left to tag once the text has been cleaned.
    if not text.strip():
        return []

    # Adjective and noun POS tags.
    tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'}

    # Tag every sentence and flatten the result into one stream of (word, tag).
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    )

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Keep adjectives/nouns, lower-cased, that are not stop words or punctuation.
    candidates = [
        word.lower() for word, tag in tagged_words
        if tag in tags and word.lower() not in stop_words
        and not all(char in punctuation for char in word)
    ]

    # Lemmatize each remaining word.
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    return [lemmatize(word) for word in candidates]

In [6]:

# Candidate phrases for every article title.
news_df['title_chunk'] = news_df['title'].apply(extract_candidate_chunk)

In [7]:
# Candidate phrases for every article body.
news_df['content_chunk'] = news_df['content'].apply(extract_candidate_chunk)

In [8]:
# Candidate adjective/noun words for every article title.
news_df['title_words'] = news_df['title'].apply(extract_candidate_words)

In [9]:
# Candidate adjective/noun words for every article body.
news_df['content_words'] = news_df['content'].apply(extract_candidate_words)

In [11]:
# Create the chunk vocabulary: one list of candidate phrases per article.
# .tolist() walks the column in row order, so it does not rely on the frame
# having a default 0..n-1 index the way news_df[col][i] does.
title_chunk_vocab = news_df['title_chunk'].tolist()
content_chunk_vocab = news_df['content_chunk'].tolist()

# NOTE(review): the two lines below are stale — title_vocab / content_vocab
# are not defined in the visible cells.
#pickle.dump(set(title_vocab), open( "title_phrases.p", "wb" ) )
#pickle.dump(set(content_vocab), open( "content_phrases.p", "wb" ) )

In [12]:
# Create the word vocabulary: one list of candidate words per article.
# .tolist() walks the column in row order, so it does not rely on the frame
# having a default 0..n-1 index the way news_df[col][i] does.
title_word_vocab = news_df['title_words'].tolist()
content_word_vocab = news_df['content_words'].tolist()


In [27]:
import gensim
from gensim import corpora, models
def lda_model(vocab, use_tfidf=False, num_topics=10, random_state=None):
    """Fit an LDA topic model on tokenized documents and print its topics.

    Parameters
    ----------
    vocab : list of list of str
        One list of tokens (words or phrases) per document.
    use_tfidf : bool, optional
        ``False`` (default) trains on raw bag-of-words counts with 50 passes
        and 2 workers and prints those topics, which is what a call with a
        single argument has always printed. ``True`` trains on TF-IDF weights
        with 2 passes and 4 workers instead.
    num_topics : int, optional
        Number of topics to fit. Defaults to 10.
    random_state : int or None, optional
        Forwarded to ``LdaMulticore``; pass an integer to make the topics
        repeatable between runs. Defaults to ``None``.

    Returns
    -------
    None
        The topics are printed, not returned.
    """
    dictionary = gensim.corpora.Dictionary(vocab)
    # no_below: keep tokens contained in at least that many documents.
    # no_above: keep tokens contained in no more than that fraction of the
    # corpus (a fraction of the total corpus size, not an absolute number).
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    # Bag-of-words format: a list of (token_id, token_count) tuples per document.
    bow_corpus = [dictionary.doc2bow(doc) for doc in vocab]

    # Only the model that is going to be printed is trained.
    if use_tfidf:
        tfidf = gensim.models.TfidfModel(bow_corpus)
        corpus, passes, workers = tfidf[bow_corpus], 2, 4
    else:
        corpus, passes, workers = bow_corpus, 50, 2

    topic_model = gensim.models.LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )
    for idx, topic in topic_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

### Count-based LDA results

In [28]:
# Fit LDA on the title phrase vocabulary and print the topics.
lda_model(title_chunk_vocab)

Topic: 0 
Words: 0.148*"samsung" + 0.074*"time" + 0.061*"million" + 0.052*"gamers" + 0.051*"netflix" + 0.049*"brand" + 0.046*"august" + 0.043*"ad" + 0.039*"marketer" + 0.036*"tool"
Topic: 1 
Words: 0.266*"microsoft" + 0.117*"vr" + 0.102*"pc" + 0.064*"preview" + 0.055*"developer" + 0.050*"sale" + 0.047*"new windows" + 0.042*"job" + 0.039*"machine learning" + 0.035*"day"
Topic: 2 
Words: 0.131*"year" + 0.093*"uber" + 0.091*"world" + 0.077*"china" + 0.063*"u.s." + 0.044*"playerunknown" + 0.043*"launch" + 0.042*"machine" + 0.038*"video" + 0.037*"war"
Topic: 3 
Words: 0.131*"week" + 0.107*"nintendo switch" + 0.060*"iphone" + 0.059*"fund" + 0.057*"platform" + 0.046*"support" + 0.044*"vb live" + 0.041*"customer" + 0.039*"alexa skills" + 0.037*"european tech stories"
Topic: 4 
Words: 0.088*"call" + 0.082*"android" + 0.072*"way" + 0.065*"business" + 0.049*"esports" + 0.045*"io" + 0.038*"car" + 0.038*"hearthstone" + 0.037*"hero" + 0.035*"india"
Topic: 5 
Words: 0.317*"ai" + 0.265*"google" + 0.05

In [20]:
# Fit LDA on the title word vocabulary and print the topics.
lda_model(title_word_vocab)

Topic: 0 
Words: 0.081*"apple" + 0.058*"tech" + 0.029*"ceo" + 0.026*"—" + 0.025*"iphone" + 0.024*"review" + 0.019*"day" + 0.017*"ipo" + 0.016*"company" + 0.015*"new"
Topic: 1 
Words: 0.072*"startup" + 0.046*"data" + 0.034*"ai" + 0.034*"business" + 0.029*"platform" + 0.022*"big" + 0.022*"new" + 0.022*"venture" + 0.021*"customer" + 0.020*"software"
Topic: 2 
Words: 0.233*"game" + 0.048*"switch" + 0.048*"nintendo" + 0.032*"mobile" + 0.032*"developer" + 0.030*"digital" + 0.027*"industry" + 0.027*"machine" + 0.024*"studio" + 0.017*"indie"
Topic: 3 
Words: 0.058*"reality" + 0.048*"pc" + 0.037*"xbox" + 0.033*"virtual" + 0.027*"valley" + 0.027*"way" + 0.026*"silicon" + 0.026*"playstation" + 0.025*"gaming" + 0.023*"people"
Topic: 4 
Words: 0.128*"vr" + 0.044*"live" + 0.040*"ar" + 0.038*"gamesbeat" + 0.034*"vb" + 0.032*"sale" + 0.025*"intel" + 0.022*"global" + 0.021*"weekly" + 0.019*"marketing"
Topic: 5 
Words: 0.046*"u.s." + 0.040*"samsung" + 0.038*"uber" + 0.030*"first" + 0.025*"time" + 0.024*

In [21]:
# Fit LDA on the content phrase vocabulary and print the topics.
lda_model(content_chunk_vocab)

Topic: 0 
Words: 0.041*"nintendo" + 0.031*"reuters" + 0.024*"internet" + 0.024*"uber" + 0.022*"switch" + 0.021*"car" + 0.021*"people" + 0.021*"thing" + 0.018*"friday" + 0.018*"tuesday"
Topic: 1 
Words: 0.075*"world" + 0.035*"android" + 0.034*"io" + 0.030*"event" + 0.027*"game" + 0.021*"los angeles" + 0.020*"something" + 0.020*"year" + 0.019*"gamers" + 0.015*"place"
Topic: 2 
Words: 0.040*"year" + 0.038*"amazon" + 0.033*"deal" + 0.029*"company" + 0.025*"ceo" + 0.022*"sale" + 0.019*"term" + 0.018*"week" + 0.017*"end" + 0.017*"sponsored presented"
Topic: 3 
Words: 0.067*"google" + 0.050*"microsoft" + 0.041*"company" + 0.040*"today" + 0.024*"window" + 0.024*"customer" + 0.023*"service" + 0.018*"year" + 0.014*"update" + 0.014*"samsung"
Topic: 4 
Words: 0.060*"company" + 0.042*"week" + 0.038*"funding" + 0.034*"round" + 0.031*"startup" + 0.026*"today" + 0.023*"business" + 0.022*"investor" + 0.021*"participation" + 0.017*"money"
Topic: 5 
Words: 0.089*"guest" + 0.050*"ai" + 0.037*"artificial i

In [22]:
# Fit LDA on the content word vocabulary and print the topics.
lda_model(content_word_vocab)

Topic: 0 
Words: 0.018*"ceo" + 0.015*"tech" + 0.014*"news" + 0.013*"startup" + 0.013*"year" + 0.012*"sponsored" + 0.012*"president" + 0.011*"city" + 0.011*"san" + 0.011*"presented"
Topic: 1 
Words: 0.047*"google" + 0.024*"device" + 0.021*"home" + 0.019*"new" + 0.018*"smart" + 0.018*"assistant" + 0.016*"amazon" + 0.015*"guest" + 0.013*"company" + 0.012*"today"
Topic: 2 
Words: 0.033*"today" + 0.031*"new" + 0.029*"app" + 0.029*"user" + 0.023*"google" + 0.023*"service" + 0.018*"company" + 0.018*"feature" + 0.016*"developer" + 0.015*"available"
Topic: 3 
Words: 0.037*"today" + 0.036*"company" + 0.032*"venture" + 0.029*"startup" + 0.027*"release" + 0.026*"capital" + 0.026*"round" + 0.025*"press" + 0.022*"funding" + 0.018*"investor"
Topic: 4 
Words: 0.035*"ai" + 0.031*"guest" + 0.028*"microsoft" + 0.027*"intelligence" + 0.024*"artificial" + 0.020*"machine" + 0.019*"bot" + 0.017*"new" + 0.015*"technology" + 0.013*"window"
Topic: 5 
Words: 0.050*"reality" + 0.044*"vr" + 0.040*"virtual" + 0.026

### TF-IDF-based LDA model results

In [14]:
# NOTE(review): this is the same call as in the count-based section, and with a
# single argument lda_model prints the count-based model's topics. The saved
# output below presumably came from a different version of the function — confirm.
lda_model(title_chunk_vocab)

Topic: 0 
Words: 0.299*"google" + 0.087*"world" + 0.040*"chatbots" + 0.037*"destiny" + 0.037*"india" + 0.035*"rise" + 0.030*"others" + 0.027*"kid" + 0.026*"vr" + 0.019*"zelda breath"
Topic: 1 
Words: 0.096*"company" + 0.071*"uber" + 0.065*"pc" + 0.055*"future" + 0.045*"overwatch" + 0.045*"player" + 0.038*"machine learning" + 0.034*"hearthstone" + 0.031*"platform" + 0.030*"netflix"
Topic: 2 
Words: 0.284*"ai" + 0.071*"year" + 0.066*"samsung" + 0.060*"call" + 0.041*"people" + 0.028*"machine" + 0.028*"europe" + 0.027*"publisher" + 0.022*"september" + 0.020*"november"
Topic: 3 
Words: 0.082*"nintendo switch" + 0.073*"switch" + 0.065*"developer" + 0.047*"user" + 0.044*"esports" + 0.042*"job" + 0.042*"android" + 0.036*"data" + 0.033*"nintendo" + 0.033*"softbank"
Topic: 4 
Words: 0.109*"game" + 0.063*"u.s." + 0.049*"fund" + 0.045*"time" + 0.036*"car" + 0.036*"tesla" + 0.035*"model" + 0.035*"revenue" + 0.031*"gamers" + 0.025*"alexa skills"
Topic: 5 
Words: 0.074*"october" + 0.066*"week" + 0.05

In [16]:
# NOTE(review): this is the same call as in the count-based section, and with a
# single argument lda_model prints the count-based model's topics. The saved
# output below presumably came from a different version of the function — confirm.
lda_model(title_word_vocab)

Topic: 0 
Words: 0.024*"ai" + 0.022*"game" + 0.015*"market" + 0.011*"new" + 0.009*"player" + 0.008*"ii" + 0.008*"fantasy" + 0.008*"office" + 0.008*"online" + 0.008*"mobile"
Topic: 1 
Words: 0.020*"google" + 0.018*"platform" + 0.016*"ai" + 0.016*"mobile" + 0.015*"twitter" + 0.015*"game" + 0.012*"weekly" + 0.011*"developer" + 0.011*"app" + 0.010*"new"
Topic: 2 
Words: 0.025*"vr" + 0.016*"google" + 0.014*"reality" + 0.013*"game" + 0.012*"ai" + 0.012*"startup" + 0.012*"virtual" + 0.011*"indie" + 0.010*"new" + 0.009*"facebook"
Topic: 3 
Words: 0.016*"new" + 0.015*"bot" + 0.014*"ai" + 0.014*"facebook" + 0.012*"amazon" + 0.012*"life" + 0.012*"microsoft" + 0.010*"launch" + 0.010*"google" + 0.010*"ar"
Topic: 4 
Words: 0.017*"playerunknown" + 0.016*"battleground" + 0.014*"apple" + 0.014*"business" + 0.013*"call" + 0.012*"duty" + 0.011*"data" + 0.010*"software" + 0.010*"year" + 0.010*"iphone"
Topic: 5 
Words: 0.015*"company" + 0.014*"switch" + 0.013*"chatbots" + 0.013*"live" + 0.012*"vb" + 0.012*

In [15]:
# NOTE(review): this is the same call as in the count-based section, and with a
# single argument lda_model prints the count-based model's topics. The saved
# output below presumably came from a different version of the function — confirm.
lda_model(content_chunk_vocab)

Topic: 0 
Words: 0.014*"time" + 0.013*"year" + 0.013*"technology" + 0.011*"game" + 0.008*"ai" + 0.008*"today" + 0.008*"month" + 0.008*"company" + 0.008*"guest" + 0.007*"developer"
Topic: 1 
Words: 0.014*"today" + 0.010*"guest" + 0.010*"pc" + 0.009*"company" + 0.009*"world" + 0.009*"people" + 0.008*"press release" + 0.008*"game" + 0.008*"playstation" + 0.008*"microsoft"
Topic: 2 
Words: 0.014*"video" + 0.010*"today" + 0.010*"game" + 0.010*"october" + 0.009*"brand" + 0.009*"company" + 0.009*"world" + 0.009*"people" + 0.009*"player" + 0.009*"artificial intelligence"
Topic: 3 
Words: 0.027*"today" + 0.017*"company" + 0.015*"microsoft" + 0.013*"service" + 0.009*"team" + 0.008*"game" + 0.008*"people" + 0.007*"year" + 0.007*"window" + 0.007*"pc"
Topic: 4 
Words: 0.020*"game" + 0.013*"today" + 0.012*"sponsored presented" + 0.011*"company" + 0.011*"week" + 0.010*"year" + 0.008*"round" + 0.008*"report" + 0.008*"vr" + 0.007*"software"
Topic: 5 
Words: 0.015*"company" + 0.012*"today" + 0.012*"year

In [17]:
# NOTE(review): this is the same call as in the count-based section, and with a
# single argument lda_model prints the count-based model's topics. The saved
# output below presumably came from a different version of the function — confirm.
lda_model(content_word_vocab)

Topic: 0 
Words: 0.007*"game" + 0.006*"company" + 0.006*"data" + 0.006*"new" + 0.005*"today" + 0.005*"vr" + 0.004*"ai" + 0.004*"google" + 0.004*"technology" + 0.004*"intelligence"
Topic: 1 
Words: 0.006*"company" + 0.006*"game" + 0.005*"venture" + 0.005*"round" + 0.005*"new" + 0.004*"funding" + 0.004*"today" + 0.004*"series" + 0.004*"ai" + 0.004*"technology"
Topic: 2 
Words: 0.008*"game" + 0.005*"company" + 0.004*"year" + 0.004*"new" + 0.004*"today" + 0.004*"way" + 0.004*"apple" + 0.004*"vr" + 0.004*"guest" + 0.003*"pc"
Topic: 3 
Words: 0.008*"game" + 0.005*"company" + 0.005*"vb" + 0.005*"new" + 0.005*"google" + 0.005*"event" + 0.005*"reality" + 0.004*"live" + 0.004*"year" + 0.004*"today"
Topic: 4 
Words: 0.009*"game" + 0.007*"google" + 0.006*"new" + 0.005*"today" + 0.005*"app" + 0.004*"developer" + 0.004*"alexa" + 0.004*"company" + 0.004*"ii" + 0.004*"studio"
Topic: 5 
Words: 0.006*"company" + 0.006*"release" + 0.005*"press" + 0.005*"today" + 0.005*"ai" + 0.005*"service" + 0.005*"plat