In [2]:
import re
from ast import literal_eval
import numpy as np
import pandas as pd
from pprint import pprint
import collections
import math
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#scipy
import scipy 
from sklearn.cluster import KMeans
# spacy for lemmatization
import spacy
import os
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extract noun chunks from corpus
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

In [4]:
print(list(newsgroups_train.target_names))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)


(11314,) (11314,)


In [33]:
def lemmatization(text):
    """https://spacy.io/api/annotation"""
    texts_out = []
    doc = nlp(" ".join(text)) 
    texts_out = [token.lemma_ for token in doc]
    return texts_out


# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    #print(result)
    lemmatized = lemmatization(result)
            
    return lemmatized
def candidate_label_lemmatized(label):
    w_labels = label.split()
    lemmatized = lemmatization(w_labels)
    return " ".join(lemmatized)

def news_group_target_names(k):
    return newsgroups_train.target_names[k]
def return_preprocessed_text(words):
    x = literal_eval(words)
    return " ".join(x)
def get_pd_of_cluster(dataset,cluster_idx):
    dataset_sub = dataset[dataset['k_means'] == cluster_idx]
    cnt = CountVectorizer()
    voc = cnt.fit_transform(dataset_sub.preprocessed_text)
    vocab = list(cnt.get_feature_names())
    counts = voc.sum(axis=0).A1
    freq_distribution = Counter(dict(zip(vocab, counts)))
    prob = {}
    for w in vocab:
        prob[w] = freq_distribution[w]/np.sum(counts)
    return prob


In [7]:
candidate_label_lemmatized('a eating rabbit')

'a eat rabbit'

In [8]:
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))


Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replace']


In [9]:
dataset = pd.read_csv('dataset_lda_20_newsgroup_withNounPhrases.csv',sep='\t')

In [14]:
dataset['preprocessed_text'] = dataset['preprocessed'].apply(return_preprocessed_text)

In [24]:
#apply k-means
n_clust=20
cnt = TfidfVectorizer(stop_words='english')
voc = cnt.fit_transform(dataset.preprocessed_text)


In [25]:
model = KMeans(n_clusters=n_clust, 
               init='k-means++', 
               random_state=0,
               max_iter=100, # Maximum number of iterations of the k-means algorithm for a single run.
               n_init=1)  # Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia.

model.fit(voc)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=20, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [26]:
dataset['k_means'] = model.labels_

In [27]:
ct = pd.crosstab(dataset['Topic'], dataset['k_means'])


In [29]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = cnt.get_feature_names()

In [30]:
for i in range(n_clust):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 jesus
 bible
 christ
 church
 christian
 people
 believe
 christians
 faith
 say
Cluster 1:
 government
 people
 gun
 weapon
 right
 firearm
 crime
 batf
 state
 think
Cluster 2:
 game
 team
 player
 play
 year
 hockey
 season
 baseball
 score
 league
Cluster 3:
 card
 driver
 video
 diamond
 port
 mode
 graphic
 vesa
 monitor
 nubus
Cluster 4:
 drive
 scsi
 pitt
 gordon
 disk
 banks
 controller
 hard
 floppy
 problem
Cluster 5:
 sale
 ohio
 price
 state
 magnus
 offer
 simms
 sell
 university
 line
Cluster 6:
 nasa
 space
 henry
 alaska
 moon
 toronto
 launch
 orbit
 shuttle
 lunar
Cluster 7:
 clipper
 chip
 encryption
 escrow
 key
 government
 phone
 crypto
 algorithm
 secure
Cluster 8:
 access
 digex
 online
 communication
 express
 greenbelt
 nntp
 host
 post
 steve
Cluster 9:
 gatech
 prism
 georgia
 institute
 hrivnak
 technology
 atlanta
 internet
 mule
 hplabs
Cluster 10:
 printer
 print
 font
 deskjet
 laser
 driver
 bubblejet
 canon
 grayscale
 postscript
Cluster 

In [31]:
vocab = list(terms)

In [32]:
len(vocab)

72688

In [34]:
prob_clusters = []
for i in range(0,n_clust):
    prob_clusters.append(get_pd_of_cluster(dataset,i))

In [35]:
theta=[]
for i in range(0,n_clust):
    p_w = []
    for w in vocab:
        if w in prob_clusters[i].keys():
            p_w.append(prob_clusters[i][w])
        else:
            p_w.append(0)
    theta.append(p_w)

In [36]:
x = pd.DataFrame(theta)
x.columns = vocab

In [37]:
x.shape

(20, 72688)

In [38]:
theta=x.values

In [39]:
theta.shape


(20, 72688)

In [40]:
freq = voc.sum(axis=0)


In [41]:
freq = (freq/np.sum(freq))

In [43]:
freq.shape

(1, 72688)

In [2]:
#function taken from Textacy library to identify noun, verb and prepositional phrases
def pos_regex_matches(doc, pattern):
    """
    Extract sequences of consecutive tokens from a spacy-parsed doc whose
    part-of-speech tags match the specified regex pattern.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)
        pattern (str): Pattern of consecutive POS tags whose corresponding words
            are to be extracted, inspired by the regex patterns used in NLTK's
            `nltk.chunk.regexp`. Tags are uppercase, from the universal tag set;
            delimited by < and >, which are basically converted to parentheses
            with spaces as needed to correctly extract matching word sequences;
            white space in the input doesn't matter.

            Examples (see ``constants.POS_REGEX_PATTERNS``):

            * noun phrase: r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'
            * compound nouns: r'<NOUN>+'
            * verb phrase: r'<VERB>?<ADV>*<VERB>+'
            * prepositional phrase: r'<PREP> <DET>? (<NOUN>+<ADP>)* <NOUN>+'

    Yields:
        ``spacy.Span``: the next span of consecutive tokens from ``doc`` whose
        parts-of-speech match ``pattern``, in order of apperance
    """
    # standardize and transform the regular expression pattern...
    pattern = re.sub(r'\s', '', pattern)
    pattern = re.sub(r'<([A-Z]+)\|([A-Z]+)>', r'( (\1|\2))', pattern)
    pattern = re.sub(r'<([A-Z]+)>', r'( \1)', pattern)

    tags = ' ' + ' '.join(tok.pos_ for tok in doc)

    for m in re.finditer(pattern, tags):
        yield doc[tags[0:m.start()].count(' '):tags[0:m.end()].count(' ')]

In [3]:
pattern = r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'


In [31]:
all_noun_chunks =[]


In [32]:
for x in newsgroups_train.data:
    doc = nlp(x)
    lists = pos_regex_matches(doc, pattern)
    for j in lists:
        all_noun_chunks.append(j.text.lower().strip())
        
   
    

KeyboardInterrupt: 

517135

In [72]:
allnounchunks = pd.DataFrame({'NounChunks':all_noun_chunks})

In [75]:
#allnounchunks.to_csv('allnounchunks20NewsGroup.csv',index=False)

In [44]:
allnounchunks = pd.read_csv('allnounchunks20NewsGroup.csv')

In [45]:
all_noun_chunks = allnounchunks.NounChunks
print(len(all_noun_chunks))

517135


In [46]:
from collections import Counter

In [47]:
freq_all_noun_chunks = Counter(all_noun_chunks)

In [48]:
freq_all_noun_chunks.most_common(500)

[('lines', 11263),
 ('subject', 10916),
 ('article', 6258),
 ('host', 4841),
 ('people', 4234),
 ('distribution', 2625),
 ('|', 2085),
 ('time', 1898),
 ('%', 1810),
 ('x', 1725),
 ('years', 1618),
 ('things', 1324),
 ('thanks', 1188),
 ('way', 1136),
 ('world', 1089),
 ('thing', 1075),
 ('version', 1059),
 ('information', 1047),
 ('system', 1036),
 ('part', 1035),
 ('year', 956),
 ('question', 919),
 ('life', 778),
 ('example', 776),
 ('others', 765),
 ('government', 746),
 ('the way', 730),
 ('file', 724),
 ('today', 711),
 ('problem', 711),
 ('fact', 707),
 ('name', 703),
 ('#', 680),
 ('a lot', 637),
 ('day', 626),
 ('the problem', 615),
 ('use', 614),
 ('problems', 614),
 ('email', 600),
 ('the time', 586),
 ('times', 581),
 ('case', 577),
 ('course', 574),
 ('number', 573),
 ('the people', 565),
 ('point', 559),
 ('-newsreader', 558),
 ('files', 558),
 ('program', 557),
 ('questions', 556),
 ('place', 551),
 ('line', 540),
 ('order', 540),
 ('drive', 539),
 ('work', 538),
 ('days

In [49]:
# Take top 1000 frequent noun phrases as Candidate labels as suggested by Mei et al.
top_3000_candid = freq_all_noun_chunks.most_common(3000)

In [50]:
candidate_labels = []
for u,v in top_3000_candid:
    candidate_labels.append(u)


In [51]:
candidate_labels

['lines',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'years',
 'things',
 'thanks',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'others',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problems',
 'email',
 'the time',
 'times',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'files',
 'program',
 'questions',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'days',
 'opinions',
 'children',
 'internet',
 'power',
 'data',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'games',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'men',
 'person',
 'e-mail',
 'phone',
 'help',
 'game',
 'mind',
 'rights',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip'

In [52]:
lemmatized_labels = [candidate_label_lemmatized(w) for w in candidate_labels]

In [53]:
lemmatized_labels

['line',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'year',
 'thing',
 'thank',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'other',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problem',
 'email',
 'the time',
 'time',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'file',
 'program',
 'question',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'day',
 'opinion',
 'child',
 'internet',
 'power',
 'datum',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'game',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'man',
 'person',
 'e - mail',
 'phone',
 'help',
 'game',
 'mind',
 'right',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip',
 'g)r',
 's

In [64]:
vocab.index("xcx")

ValueError: 'xcx' is not in list

In [67]:
def zero_order(freq,theta,lCandidat,NumTopic):
    """
    Calculate the Zero-Order Relevance

    Parameters:
    ----------
    freq : Array containing the frequency of occurrences of each word in the whole corpus
    theta : Array containing the frequency of occurrences of each word in each topic
    lcandidat: Array containing each label candidate
    NumTopic : The number of the topic
    
    Returns:
    -------
    topCandidate : Array containing the name of the top 10 score candidate for a given topic
    """
    
    #W matrice qui contient le score de chaque mot pour chaque topic
    W=np.log(theta/freq)
    
    # score des tous les candidats pour le topic NumTopic
    score=[]
    
    for indice in range (len(lCandidat)):
        candidat=lCandidat[indice].split(" ")
        sum_w=[]
        #print(candidat)
        for w in candidat:
            if w in vocab:
                i=vocab.index(w)
                sum_w.append(W[k,i])
            else:
                sum_w.append(0)
        #i=id2word.doc2idx(candidat)
        # supprime les -1 (qui signifie pas trouvé)
        #i[:] = [v for v in i if v != -1]
        
        score.append(np.sum(sum_w))
        
    #topValue, topCandidate = top10Score(score,lCandidat)
    dicti=pd.DataFrame({'Labels':lCandidat,'Score':score})
  
    return dicti



In [63]:
lemmatized_labels

['line',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'year',
 'thing',
 'thank',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'other',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problem',
 'email',
 'the time',
 'time',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'file',
 'program',
 'question',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'day',
 'opinion',
 'child',
 'internet',
 'power',
 'datum',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'game',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'man',
 'person',
 'e - mail',
 'phone',
 'help',
 'game',
 'mind',
 'right',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip',
 'g)r',
 's

In [68]:
for k in range(0,20):
    print("Topic %d"%k)
    candid_for_topic_k = zero_order(freq,theta,lemmatized_labels,k)
    candid_for_topic_k = candid_for_topic_k.sort_values('Score',ascending=False)
    print(candid_for_topic_k.head(3))
    print("=======================")

Topic 0




       Labels     Score
2726   schism  3.240709
2335  passage  2.900605
1642  passage  2.900605
Topic 1
                      Labels     Score
941              gun control  3.863974
2810  law enforcement agency  3.405538
657                  militia  2.744172
Topic 2
          Labels     Score
750   power play  3.407615
1493  defenseman  2.726862
73          team  2.678519
Topic 3
          Labels     Score
1190  video card  6.421991
1398         cpu  5.497844
1491          os  5.497844
Topic 4
           Labels     Score
2358   disk drive  6.261851
2617   tape drive  5.982684
1365  disk drives  5.657680
Topic 5
             Labels     Score
2225  amateur radio  2.952223
114            sale  2.765084
1714           sale  2.765084
Topic 6
         Labels     Score
1151  satellite  3.411588
1037  satellite  3.411588
2128     launch  3.226621
Topic 7
                      Labels     Score
2810  law enforcement agency  6.223605
1510         chip encryption  5.873976
2913        the wiretap

In [70]:
dataset.to_csv('dataset_20_newsgroup_kmeans.csv',sep='\t',index=False)

## Proposed Topic labeling


In [44]:
def filter_noun_adj_from_phrase(text):
    noun_phrases = []
    doc = nlp(text)
    #for np in doc.noun_chunks:
     #   noun_phrases.append(np.text.lower().strip())
    lists = pos_regex_matches(doc, pattern)
    for j in lists:
        noun_phrases.append(j.text.lower().strip())
    return noun_phrases
def list_phrases_cluster(cluster_idx,df,key):
    df_sub = df[df[key] == cluster_idx]
    hashs = []
    for index,row in df_sub.iterrows():
        row.Noun_Phrases = literal_eval(row.Noun_Phrases)
        for x in row.Noun_Phrases:
            hashs.append(x.lower())
    
    return collections.Counter(hashs)
def return_clusters_title(cluster_idx,dataset,col):
    hash_cluster1 = list_phrases_cluster(cluster_idx,dataset,col) 
    top10  = hash_cluster1.most_common(3)
    dataset_sub = dataset[dataset[col] == cluster_idx]
    cnt_vectorizer = TfidfVectorizer(ngram_range=(3,15))
    vec = cnt_vectorizer.fit_transform(dataset_sub['Text'])
    scores = (vec.toarray()) 
    # Getting top ranking features 
    sums = vec.sum(axis = 0) 
    data1 = [] 
    features = cnt_vectorizer.get_feature_names()
    for col, term in enumerate(features): 
        data1.append((term, sums[0, col] )) 
    ranking = pd.DataFrame(data1, columns = ['term', 'rank'])
    ranking['wCount'] = ranking['term'].apply(lambda x: len(x.split()))
    ranking = ranking.sort_values(['rank','wCount'],ascending=[False,True])
    print(ranking.head(5))
    my_top_4=[]
    for u,v in top10:
        my_top_4.append(u)
    my_top_4 = " ".join(my_top_4)
    my_top_4 = my_top_4.replace("_"," ")

    #print(my_top_4)
    test_list = (my_top_4.split())
    df = pd.DataFrame(columns=ranking.columns) #empty dataframe to add filtered rows
    for index,row in ranking.iterrows():
        # using list comprehension 
        # checking if string contains list element 
        res = all(ele in row['term'] for ele in test_list) 

        # print result 
        if res == True:
            df.loc[len(df)]=[row['term'],row['rank'],row['wCount']] 
    df = df.sort_values(['rank','wCount'],ascending=[False,True])
    #df = df.sort_values(['wCount'])
    if len(df)>0:
        return df.iloc[0]['term']
    else:
        return "No title constructed"


In [55]:
#Assign Topic to the dataset
dataset = pd.DataFrame({'Text':newsgroups_train.data})

In [57]:
dataset['preprocessed'] = processed_docs

In [66]:
dataset['Topic'] = newsgroups_train.target

In [72]:
dataset['Topic_label'] = dataset['Topic'].apply(news_group_target_names)

In [103]:
topic_list = []
prob_list = []
topic_distribution = lda_model.get_document_topics(lda_model[corpus])
for i,j in enumerate(topic_distribution):
    row = sorted(j, key=lambda x: (x[1]), reverse=True)
    topic_num,prop_topic = row[0]
    topic_list.append(topic_num)
    prob_list.append(prop_topic)

dataset['LDA_topic_list'] = topic_list
dataset['prob_list'] = prob_list

In [104]:
dataset

Unnamed: 0,Text,preprocessed,Topic,Topic_label,LDA_topic_list,prob_list
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,"[lerxst, thing, subject, nntp, post, host, org...",7,rec.autos,13,0.298798
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,"[guykuo, carson, washington, subject, clock, p...",4,comp.sys.mac.hardware,13,0.303605
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,"[twillis, purdue, thomas, willis, subject, que...",4,comp.sys.mac.hardware,13,0.286641
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,"[jgreen, amber, green, subject, weitek, organi...",1,comp.graphics,13,0.297614
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,"[head, harvard, jonathan, mcdowell, subject, s...",14,sci.space,13,0.297167
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,"[vttoulu, foxvog, douglas, subject, reword, se...",16,talk.politics.guns,15,0.297881
6,From: bmdelane@quads.uchicago.edu (brian manni...,"[bmdelane, quad, uchicago, brian, manning, del...",13,sci.med,13,0.307357
7,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...,"[bgrubb, dante, nmsu, grubb, subject, scsi, or...",3,comp.sys.ibm.pc.hardware,13,0.272423
8,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...,"[holmes, iscsvax, subject, icon, help, organiz...",2,comp.os.ms-windows.misc,13,0.298071
9,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...,"[kerr, uiuc, stan, kerr, subject, sigma, desig...",4,comp.sys.mac.hardware,13,0.286782


In [105]:
dataset.to_csv('dataset_lda_20_newsgroup.csv',sep='\t',index=False)

In [38]:
dataset = pd.read_csv('dataset_lda_20_newsgroup_withNounPhrases.csv',sep='\t')

In [39]:
dataset

Unnamed: 0,Text,preprocessed,Topic,Topic_label,LDA_topic_list,prob_list,Noun_Phrases
0,From: lerxst@wam.umd.edu (where's my thing)\r\...,"['lerxst', 'thing', 'subject', 'nntp', 'post',...",7,rec.autos,13,0.298798,"['lerxst@wam.umd.edu', 'thing', 'car', 'host',..."
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,"['guykuo', 'carson', 'washington', 'subject', ...",4,comp.sys.mac.hardware,13,0.303605,"['subject', 'call', 'summary', 'call', 'clock ..."
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,"['twillis', 'purdue', 'thomas', 'willis', 'sub...",4,comp.sys.mac.hardware,13,0.286641,"['subject', 'questions', 'distribution', 'folk..."
3,From: jgreen@amber (Joe Green)\r\r\nSubject: R...,"['jgreen', 'amber', 'green', 'subject', 'weite...",1,comp.graphics,13,0.297614,"['subject', 'p9000', 'lines', 'distribution', ..."
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,"['head', 'harvard', 'jonathan', 'mcdowell', 's...",14,sci.space,13,0.297167,"['article', 'article', 'c5jlwx.4h9.1@cs.cmu.ed..."
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,"['vttoulu', 'foxvog', 'douglas', 'subject', 'r...",16,talk.politics.guns,15,0.297881,"['ideas', 'lines', 'article', 'article', 'arti..."
6,From: bmdelane@quads.uchicago.edu (brian manni...,"['bmdelane', 'quad', 'uchicago', 'brian', 'man...",13,sci.med,13,0.307357,"['brain', 'thanks', 'people', 'request for inf..."
7,From: bgrubb@dante.nmsu.edu (GRUBB)\r\r\nSubje...,"['bgrubb', 'dante', 'nmsu', 'grubb', 'subject'...",3,comp.sys.ibm.pc.hardware,13,0.272423,"['subject', 'distribution', 'world', 'posting'..."
8,From: holmes7000@iscsvax.uni.edu\r\r\nSubject:...,"['holmes', 'iscsvax', 'subject', 'icon', 'help...",2,comp.os.ms-windows.misc,13,0.298071,"['win', 'icon help please', 'lines', 'win', 'i..."
9,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\r\r\nS...,"['kerr', 'uiuc', 'stan', 'kerr', 'subject', 's...",4,comp.sys.mac.hardware,13,0.286782,"['designs', 'any information', 'board', 'board..."


In [40]:
#dataset['Noun_Phrases'] = dataset['Text'].apply(filter_noun_adj_from_phrase)


In [41]:
type(dataset['Noun_Phrases'])

pandas.core.series.Series

In [14]:
dataset['LDA_topic_list'].value_counts()

13    7876
15    3438
Name: LDA_topic_list, dtype: int64

In [16]:
dataset.iloc[1]['Noun_Phrases']

"['subject', 'call', 'summary', 'call', 'clock reports', 'si', 'acceleration', 'clock', 'posting', 'host', 'number', 'souls', 'clock oscillator', 'experiences', 'this poll', 'message', 'experiences', 'the procedure', 'speed', 'speed', 'cards', 'adapters', 'heat sinks', 'hour of usage per day', 'disk', 'functionality', 'm floppies', 'days', 'the network', 'knowledge base', 'the clock', 'poll', 'thanks']"

In [92]:
top_elements_topics = []
for i in range(0,20):
    hash_cluster1 = list_phrases_cluster(i,dataset,'Topic')
    top_elements_topics.append(hash_cluster1.most_common(10))

In [93]:
top_elements_topics

[[('people', 451),
  ('article', 382),
  ('subject', 361),
  ('lines', 350),
  ('host', 234),
  ('posting', 198),
  ('atheism', 193),
  ('atheists', 176),
  ('things', 138),
  ('system', 127)],
 [('lines', 446),
  ('subject', 354),
  ('host', 276),
  ('article', 183),
  ('version', 160),
  ('files', 151),
  ('posting', 150),
  ('graphics', 140),
  ('images', 135),
  ('image', 124)],
 [('g)r', 437),
  ('lines', 376),
  ('subject', 357),
  ('%', 270),
  ('host', 242),
  ('article', 225),
  ('w', 208),
  ('windows', 168),
  ('#', 155),
  ('files', 150)],
 [('lines', 383),
  ('subject', 325),
  ('drive', 274),
  ('host', 263),
  ('article', 190),
  ('system', 150),
  ('drives', 147),
  ('bus', 146),
  ('card', 133),
  ('posting', 123)],
 [('lines', 373),
  ('subject', 295),
  ('host', 295),
  ('article', 191),
  ('posting', 146),
  ('distribution', 128),
  ('drive', 116),
  ('problem', 79),
  ('mb', 79),
  ('thanks', 73)],
 [('x', 3047),
  ('lines', 480),
  ('subject', 335),
  ('host', 331

In [58]:
dataset.to_csv('dataset_lda_20_newsgroup_withNounPhrases.csv',sep='\t',index=False)

In [97]:
# initializing list of lists 
test_list = top_elements_topics
single_list = []
for x in test_list:
    for u,v in x:
        single_list.append(u)
freq = collections.Counter(single_list)


In [98]:
freq

Counter({'people': 10,
         'article': 19,
         'subject': 20,
         'lines': 20,
         'host': 19,
         'posting': 11,
         'atheism': 1,
         'atheists': 1,
         'things': 2,
         'system': 2,
         'version': 2,
         'files': 2,
         'graphics': 1,
         'images': 1,
         'image': 1,
         'g)r': 1,
         '%': 4,
         'w': 1,
         'windows': 1,
         '#': 2,
         'drive': 2,
         'drives': 1,
         'bus': 1,
         'card': 1,
         'distribution': 6,
         'problem': 1,
         'mb': 1,
         'thanks': 1,
         'x': 1,
         'program': 1,
         'information': 2,
         'file': 2,
         'sale': 1,
         'shipping': 1,
         'condition': 1,
         'offer': 1,
         'price': 1,
         'car': 1,
         'cars': 1,
         'the car': 1,
         'bike': 1,
         'dod': 1,
         'the bike': 1,
         'bikes': 1,
         'year': 1,
         'games': 2,
         

In [99]:
common_element=[]
for u,v in freq.items():
    if v>1:
        common_element.append(u)
common_element

['people',
 'article',
 'subject',
 'lines',
 'host',
 'posting',
 'things',
 'system',
 'version',
 'files',
 '%',
 '#',
 'drive',
 'distribution',
 'information',
 'file',
 'games',
 'players',
 'team',
 'years',
 'life',
 'time',
 'government']

In [104]:
new_top_elements_topics = []
for top10_in_topic in top_elements_topics:
    new_list_topic = []
    for u,v in top10_in_topic:
        if u not in common_element:
            new_list_topic.append((u,v))
    new_top_elements_topics.append(new_list_topic)

In [130]:
new_top_elements_topics[12]

[('wire', 88), ('ground', 70), ('use', 67), ('power', 66)]

In [131]:
top_elements_topics[12]

[('lines', 429),
 ('subject', 378),
 ('host', 269),
 ('article', 231),
 ('posting', 124),
 ('distribution', 116),
 ('wire', 88),
 ('ground', 70),
 ('use', 67),
 ('power', 66)]

In [136]:
topic_id = 12
dataset_sub = dataset[dataset['Topic'] == topic_id]
cnt_vectorizer = TfidfVectorizer(ngram_range=(10,15))
vec = cnt_vectorizer.fit_transform(dataset_sub['Text'])
scores = (vec.toarray()) 
# Getting top ranking features 
sums = vec.sum(axis = 0) 
data1 = [] 
features = cnt_vectorizer.get_feature_names()
for col, term in enumerate(features): 
    data1.append((term, sums[0, col] )) 
ranking = pd.DataFrame(data1, columns = ['term', 'rank'])
ranking['wCount'] = ranking['term'].apply(lambda x: len(x.split()))
ranking = ranking.sort_values(['rank','wCount'],ascending=[False,True])
print(ranking.head(5))
my_top_4=[]
for u,v in new_top_elements_topics[topic_id]:
    my_top_4.append(u)


print(my_top_4)
test_list = my_top_4
df = pd.DataFrame(columns=ranking.columns) #empty dataframe to add filtered rows
for index,row in ranking.iterrows():
    # using list comprehension 
    # checking if string contains list element 
    res = all(ele in row['term'] for ele in test_list) 

    # print result 
    if res == True:
        df.loc[len(df)]=[row['term'],row['rank'],row['wCount']] 
df = df.sort_values(['rank','wCount'],ascending=[False,True])


                                                     term      rank  wCount
448593   subject re need to find out number to phone line  0.397750      10
400265  re need to find out number to phone line organ...  0.278176      10
448606  subject re need to find out number to phone li...  0.278176      11
41144       and don want to call up the operator to place  0.275399      10
96744   call up the operator to place trace on it ques...  0.275399      10
['wire', 'ground', 'use', 'power']


In [137]:
df

Unnamed: 0,term,rank,wCount
