In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import collections
import math
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#scipy
import scipy 
# spacy for lemmatization
import spacy
import os
from sklearn.datasets import fetch_20newsgroups
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [3]:
# Fetch the train and test splits of the 20 Newsgroups corpus
# (the recorded output shows the dataset being downloaded on this run).
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
print(list(newsgroups_train.target_names))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [21]:
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)


(11314,) (11314,)


In [78]:
def lemmatization(text):
    """Lemmatize a sequence of tokens with the spaCy pipeline.

    The tokens are joined with single spaces, parsed by the module-level
    ``nlp`` pipeline, and the lemma of every token spaCy produces is returned.
    spaCy re-tokenizes the joined string, so the result can contain more items
    than the input (e.g. ``'e-mail'`` comes back as ``'e'``, ``'-'``, ``'mail'``).

    Annotation scheme: https://spacy.io/api/annotation

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per spaCy token of the joined input.

    Notes
    -----
    ``nlp`` is not defined in this cell; the spaCy pipeline has to be loaded
    before this function is called.
    """
    parsed = nlp(" ".join(text))
    return [tok.lemma_ for tok in parsed]


# Tokenize, filter and lemmatize
def preprocess(text):
    """Turn a raw document string into a list of lemmas.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stop words or have three characters or fewer are dropped, and
    the remaining tokens are passed to ``lemmatization``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmas returned by ``lemmatization`` for the retained tokens.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    kept_tokens = [
        tok
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in stopword_set
    ]
    return lemmatization(kept_tokens)
def candidate_label_lemmatized(label):
    """Return a candidate label with its words lemmatized.

    The label is split on whitespace, the words are run through
    ``lemmatization``, and the resulting lemmas are joined back together with
    single spaces (e.g. ``'a eating rabbit'`` -> ``'a eat rabbit'``).

    Parameters
    ----------
    label : str
        Candidate label, possibly made of several words.

    Returns
    -------
    str
        Space-joined lemmas.
    """
    return " ".join(lemmatization(label.split()))

    

In [82]:
candidate_label_lemmatized('a eating rabbit')

'a eat rabbit'

In [45]:
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

# Show the raw space-separated words next to the preprocessed tokens.
words = doc_sample.split(' ')
print("Original document: ")
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))


Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replace']


In [46]:
# Tokenize, filter and lemmatize every training document
# (preprocess() runs one spaCy parse per document).
processed_docs = [preprocess(raw_doc) for raw_doc in newsgroups_train.data]

In [47]:
newsgroups_train.data[1]

"From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"

In [48]:
# Vocabulary: maps every distinct token of the processed documents to an integer id
id2word = corpora.Dictionary(processed_docs)
# Bag-of-words corpus: one list of (token id, count) pairs per document
corpus = list(map(id2word.doc2bow, processed_docs))
corpus[:2]


[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1)],
 [(13, 1),
  (24, 1),
  (25, 1),
  (27, 1),
  (34, 1),
  (36, 1),
  (38, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 5),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 2),
  (59, 2),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 2

In [49]:
# Turn the bag-of-words corpus into a scipy.sparse.csc_matrix
# with terms as rows and documents as columns.
term_doc = gensim.matutils.corpus2csc(corpus)

# Sum each term over all documents, normalise so the values add up to 1,
# and transpose into a single row (one entry per term).
freq = term_doc.sum(axis=1)
freq = (freq / np.sum(freq)).T

# Train the LDA topic model; random_state is fixed so the run is repeatable.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    random_state=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)


# In[21]:




In [50]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.012*"team" + 0.010*"game" + 0.010*"play" + 0.008*"player" + '
  '0.008*"subject" + 0.007*"organization" + 0.007*"year" + 0.007*"write" + '
  '0.007*"good" + 0.006*"line"'),
 (1,
  '0.009*"year" + 0.007*"organization" + 0.007*"subject" + 0.007*"game" + '
  '0.006*"team" + 0.006*"university" + 0.006*"go" + 0.005*"line" + '
  '0.004*"time" + 0.004*"president"'),
 (2,
  '0.009*"people" + 0.009*"write" + 0.009*"think" + 0.007*"know" + '
  '0.007*"subject" + 0.006*"say" + 0.006*"believe" + 0.006*"organization" + '
  '0.005*"article" + 0.005*"thing"'),
 (3,
  '0.007*"people" + 0.007*"say" + 0.006*"israel" + 0.005*"armenian" + '
  '0.005*"know" + 0.004*"write" + 0.004*"israeli" + 0.004*"turkish" + '
  '0.004*"jews" + 0.004*"subject"'),
 (4,
  '0.012*"write" + 0.010*"article" + 0.009*"organization" + 0.009*"subject" + '
  '0.008*"people" + 0.007*"state" + 0.007*"right" + 0.006*"line" + '
  '0.005*"post" + 0.005*"think"'),
 (5,
  '0.016*"file" + 0.011*"line" + 0.011*"subject" + 0.011*"

In [52]:
# Per-word likelihood bound of the model on the training corpus.
# NOTE: per the gensim documentation, log_perplexity() returns this bound, not
# the perplexity itself (perplexity = 2 ** -bound). A higher (less negative)
# bound therefore means a better fit; it is the perplexity that is
# "lower is better".
log_perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', log_perplexity_bound)

# Topic coherence (c_v measure) computed against the tokenized documents.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Perplexity:  -8.77326327103528

Coherence Score:  0.5156555435443112


In [53]:
df=pd.DataFrame(data=lda_model.get_topics())


In [54]:
df.shape

(8, 72810)

In [55]:
theta=df.values

In [56]:
theta.shape


(8, 72810)

In [57]:
freq.shape


(1, 72810)

In [58]:
np.ones((8,1))

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [59]:
# Extract noun chunks from corpus
# Load the spaCy English pipeline. `nlp` is the module-level pipeline called by
# lemmatization() and by the noun-chunk extraction loop, so on a fresh kernel
# this cell has to run before either of them (note that lemmatization() is
# defined in an earlier cell of this notebook).
import spacy
nlp = spacy.load('en_core_web_sm')


In [60]:
# Function adapted from the Textacy library to identify noun, verb and prepositional phrases
def pos_regex_matches(doc, pattern):
    """
    Extract sequences of consecutive tokens from a spacy-parsed doc whose
    part-of-speech tags match the specified regex pattern.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``): any sequence
            that can be iterated to get tokens with a ``pos_`` attribute and
            that supports slicing.
        pattern (str): Pattern of consecutive POS tags whose corresponding words
            are to be extracted, inspired by the regex patterns used in NLTK's
            `nltk.chunk.regexp`. Tags are uppercase, from the universal tag set;
            delimited by < and >, which are basically converted to parentheses
            with spaces as needed to correctly extract matching word sequences;
            white space in the input doesn't matter. A tag may list any number
            of alternatives separated by ``|`` (``<ADP|CONJ>``,
            ``<ADJ|VERB|DET>``, ...).

            Examples (see ``constants.POS_REGEX_PATTERNS``):

            * noun phrase: r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'
            * compound nouns: r'<NOUN>+'
            * verb phrase: r'<VERB>?<ADV>*<VERB>+'
            * prepositional phrase: r'<PREP> <DET>? (<NOUN>+<ADP>)* <NOUN>+'

    Yields:
        ``spacy.Span``: the next span of consecutive tokens from ``doc`` whose
        parts-of-speech match ``pattern``, in order of appearance
    """
    def _expand_tag(match):
        # '<A>' -> '( A)'; '<A|B|...>' -> '( (A|B|...))'. The leading space
        # makes each tag consume exactly one " TAG" chunk of the tag string.
        alternatives = match.group(1)
        if '|' in alternatives:
            return '( (' + alternatives + '))'
        return '( ' + alternatives + ')'

    # standardize and transform the regular expression pattern...
    pattern = re.sub(r'\s', '', pattern)
    # Previously only exactly two alternatives were translated; a tag such as
    # <A|B|C> was left untouched and then interpreted as raw regex alternation.
    pattern = re.sub(r'<([A-Z]+(?:\|[A-Z]+)*)>', _expand_tag, pattern)

    # One " TAG" chunk per token, so the number of spaces before a character
    # offset equals the number of tokens that start before it.
    tags = ' ' + ' '.join(tok.pos_ for tok in doc)

    for m in re.finditer(pattern, tags):
        # Convert character offsets of the match back into token indices.
        yield doc[tags[0:m.start()].count(' '):tags[0:m.end()].count(' ')]

In [61]:
# Noun-phrase pattern passed to pos_regex_matches(): an optional determiner,
# then nouns optionally chained by an adposition or conjunction, ending in nouns.
# NOTE(review): recent spaCy models appear to tag conjunctions as CCONJ/SCONJ
# rather than CONJ — confirm whether the CONJ alternative can ever match.
pattern = r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'


In [62]:
all_noun_chunks =[]


In [63]:
# Parse every training document and collect the lower-cased, stripped text of
# each span matching the noun-phrase pattern.
# Note: this appends to the existing list, so re-running the cell without
# resetting all_noun_chunks duplicates the entries.
for raw_text in newsgroups_train.data:
    parsed = nlp(raw_text)
    all_noun_chunks.extend(
        span.text.lower().strip() for span in pos_regex_matches(parsed, pattern)
    )
        
   
    

In [64]:
len(all_noun_chunks)

517135

In [72]:
allnounchunks = pd.DataFrame({'NounChunks':all_noun_chunks})

In [75]:
#allnounchunks.to_csv('allnounchunks20NewsGroup.csv',index=False)

In [65]:
from collections import Counter

In [66]:
freq_all_noun_chunks = Counter(all_noun_chunks)

In [67]:
freq_all_noun_chunks.most_common(500)

[('lines', 11263),
 ('subject', 10916),
 ('article', 6258),
 ('host', 4841),
 ('people', 4234),
 ('distribution', 2625),
 ('|', 2085),
 ('time', 1898),
 ('%', 1810),
 ('x', 1725),
 ('years', 1618),
 ('things', 1324),
 ('thanks', 1188),
 ('way', 1136),
 ('world', 1089),
 ('thing', 1075),
 ('version', 1059),
 ('information', 1047),
 ('system', 1036),
 ('part', 1035),
 ('year', 956),
 ('question', 919),
 ('life', 778),
 ('example', 776),
 ('others', 765),
 ('government', 746),
 ('the way', 730),
 ('file', 724),
 ('today', 711),
 ('problem', 711),
 ('fact', 707),
 ('name', 703),
 ('#', 680),
 ('a lot', 637),
 ('day', 626),
 ('the problem', 615),
 ('use', 614),
 ('problems', 614),
 ('email', 600),
 ('the time', 586),
 ('times', 581),
 ('case', 577),
 ('course', 574),
 ('number', 573),
 ('the people', 565),
 ('point', 559),
 ('-newsreader', 558),
 ('files', 558),
 ('program', 557),
 ('questions', 556),
 ('place', 551),
 ('line', 540),
 ('order', 540),
 ('drive', 539),
 ('work', 538),
 ('days

In [68]:
# Take the 3000 most frequent noun phrases as candidate labels (candidate
# selection by frequency follows Mei et al.).
# NOTE(review): this comment previously said "top 1000" while the code takes
# 3000 — confirm which cutoff is intended.
top_3000_candid = freq_all_noun_chunks.most_common(3000)

In [69]:
# Keep only the phrase of each (phrase, count) pair, preserving frequency order.
candidate_labels = [phrase for phrase, _count in top_3000_candid]


In [83]:
candidate_labels

['lines',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'years',
 'things',
 'thanks',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'others',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problems',
 'email',
 'the time',
 'times',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'files',
 'program',
 'questions',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'days',
 'opinions',
 'children',
 'internet',
 'power',
 'data',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'games',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'men',
 'person',
 'e-mail',
 'phone',
 'help',
 'game',
 'mind',
 'rights',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip'

In [85]:
lemmatized_labels = [candidate_label_lemmatized(w) for w in candidate_labels]

In [86]:
lemmatized_labels

['line',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'year',
 'thing',
 'thank',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'other',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problem',
 'email',
 'the time',
 'time',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'file',
 'program',
 'question',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'day',
 'opinion',
 'child',
 'internet',
 'power',
 'datum',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'game',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'man',
 'person',
 'e - mail',
 'phone',
 'help',
 'game',
 'mind',
 'right',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip',
 'g)r',
 's

In [87]:
def zero_order(freq, theta, lCandidat, NumTopic, dictionary=None):
    """
    Calculate the Zero-Order Relevance of every candidate label for one topic.

    Each word w gets the score ``log(theta[NumTopic, w] / freq[w])``. The score
    of a candidate label is the sum of the scores of its space-separated words
    that are present in the dictionary; unknown words are ignored, so a label
    made only of unknown words scores 0.

    Parameters:
    ----------
    freq : Array with one entry per word holding its relative frequency in the
        whole corpus (the notebook passes an array of shape (1, n_words))
    theta : Array of shape (n_topics, n_words) with the weight of each word in
        each topic
    lCandidat : Sequence of candidate label strings
    NumTopic : Index of the topic (row of ``theta``) to score the labels for
    dictionary : Optional object exposing ``doc2idx(list_of_words)`` that
        returns one id per word and -1 for unknown words. Defaults to the
        module-level ``id2word`` dictionary.

    Returns:
    -------
    pandas.DataFrame with columns 'Labels' and 'Score', one row per candidate
    in the order given (not sorted, not truncated).
    """
    if dictionary is None:
        dictionary = id2word

    # W[t, w]: score of word w for topic t
    W = np.log(theta / freq)

    # Scores of all candidates for topic NumTopic
    score = []
    for label in lCandidat:
        word_ids = dictionary.doc2idx(label.split(" "))
        # Drop the -1 entries (word not found in the dictionary); the explicit
        # integer dtype keeps an empty selection a valid index.
        known_ids = np.array([v for v in word_ids if v != -1], dtype=np.intp)
        # Bug fix: index with the NumTopic argument, not the global `k`.
        score.append(np.sum(W[NumTopic, known_ids]))

    return pd.DataFrame({'Labels': lCandidat, 'Score': score})



In [104]:
k=7
candid_for_topic_k = zero_order(freq,theta,lemmatized_labels,k)

In [105]:
candid_for_topic_k.sort_values('Score',ascending=False)

Unnamed: 0,Labels,Score
1190,video card,3.212222
1365,disk drives,3.164413
2358,disk drive,3.135483
2617,tape drive,2.976875
135,thank in advance,2.292984
2425,jumper,2.023956
2976,vram,1.910497
1845,power supply,1.888887
2355,volts,1.856623
2506,meg,1.828051
