In [8]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import collections
import math
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#scipy
import scipy 
# spacy for lemmatization
import spacy
import os
from sklearn.datasets import fetch_20newsgroups
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extract noun chunks from corpus
import spacy
nlp = spacy.load('en_core_web_sm')

In [9]:
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

In [10]:
print(list(newsgroups_train.target_names))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [11]:
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)


(11314,) (11314,)


In [69]:
def lemmatization(text):
    """https://spacy.io/api/annotation"""
    texts_out = []
    doc = nlp(" ".join(text)) 
    texts_out = [token.lemma_ for token in doc]
    return texts_out


# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    #print(result)
    lemmatized = lemmatization(result)
            
    return lemmatized
def candidate_label_lemmatized(label):
    w_labels = label.split()
    lemmatized = lemmatization(w_labels)
    return " ".join(lemmatized)

def news_group_target_names(k):
    return newsgroups_train.target_names[k]
    

In [13]:
candidate_label_lemmatized('a eating rabbit')

'a eat rabbit'

In [14]:
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))


Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replace']


In [15]:
processed_docs = []

for doc in newsgroups_train.data:
    processed_docs.append(preprocess(doc))

In [16]:
newsgroups_train.data[1]

"From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"

In [17]:
# Create Dictionary
id2word = corpora.Dictionary(processed_docs)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in processed_docs]
corpus[:2]


[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1)],
 [(13, 1),
  (24, 1),
  (25, 1),
  (27, 1),
  (34, 1),
  (36, 1),
  (38, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 5),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 2),
  (59, 2),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 2

In [74]:
#Convert a streamed corpus in bag-of-words format into a sparse matrix scipy.sparse.csc_matrix, with documents as columns.

term_doc = gensim.matutils.corpus2csc(corpus)
freq = term_doc.sum(axis=1)
freq = (freq/np.sum(freq)).T
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# In[21]:




In [75]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.111*"team" + 0.096*"game" + 0.053*"play" + 0.051*"player" + '
  '0.024*"division" + 0.021*"boston" + 0.019*"buffalo" + 0.016*"year" + '
  '0.016*"gatech" + 0.015*"kevin"'),
 (1,
  '0.051*"weapon" + 0.045*"gun" + 0.037*"car" + 0.033*"insurance" + '
  '0.031*"firearm" + 0.030*"safety" + 0.021*"police" + 0.020*"carry" + '
  '0.020*"auto" + 0.017*"vehicle"'),
 (2,
  '0.062*"_" + 0.034*"earth" + 0.030*"image" + 0.023*"moon" + 0.021*"comp" + '
  '0.019*"scientific" + 0.018*"scan" + 0.018*"science" + 0.016*"orbit" + '
  '0.015*"objective"'),
 (3,
  '0.029*"david" + 0.025*"article" + 0.022*"netcom" + 0.021*"washington" + '
  '0.017*"steve" + 0.016*"write" + 0.015*"andrew" + 0.014*"robert" + '
  '0.014*"patient" + 0.010*"scott"'),
 (4,
  '0.069*"chip" + 0.044*"clipper" + 0.039*"master" + 0.038*"encryption" + '
  '0.037*"uiuc" + 0.032*"key" + 0.028*"security" + 0.023*"algorithm" + '
  '0.020*"secret" + 0.019*"illinois"'),
 (5,
  '0.073*"color" + 0.052*"picture" + 0.043*"family" + 0.034

In [76]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Perplexity:  -15.495953331174443

Coherence Score:  0.4960214286263211


In [77]:
df=pd.DataFrame(data=lda_model.get_topics())


In [78]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,72800,72801,72802,72803,72804,72805,72806,72807,72808,72809
0,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,...,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088903e-06,3.088938e-06
1,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,...,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003671e-06,4.003819e-06
2,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,2.679623e-06,...,2.679647e-06,2.679623e-06,2.679779e-06,2.679779e-06,2.679779e-06,2.679936e-06,2.679779e-06,2.679623e-06,2.679623e-06,2.679623e-06
3,9.484425e-07,9.484425e-07,9.484425e-07,9.484425e-07,9.484425e-07,9.484425e-07,0.0003314277,9.484425e-07,9.484425e-07,9.484425e-07,...,9.48455e-07,9.484442e-07,9.484924e-07,9.484924e-07,9.484924e-07,9.485422e-07,9.484924e-07,9.484493e-07,9.48456e-07,9.484569e-07
4,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,...,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06,2.668073e-06
5,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,...,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992307e-06,4.992424e-06
6,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,...,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06,6.328162e-06
7,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,0.05644533,...,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357585e-06,6.357756e-06,6.357928e-06,6.357956e-06
8,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,...,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06,3.79472e-06
9,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,1.987204e-06,...,1.987223e-06,1.987205e-06,1.987205e-06,1.987205e-06,1.987205e-06,1.987205e-06,1.987205e-06,1.987368e-06,1.987532e-06,1.987205e-06


In [79]:
df.shape

(20, 72810)

In [80]:
theta=df.values

In [81]:
theta.shape


(20, 72810)

In [82]:
freq.shape


(1, 72810)

In [83]:
np.ones((8,1))

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [84]:
#function taken from Textacy library to identify noun, verb and prepositional phrases
def pos_regex_matches(doc, pattern):
    """
    Extract sequences of consecutive tokens from a spacy-parsed doc whose
    part-of-speech tags match the specified regex pattern.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)
        pattern (str): Pattern of consecutive POS tags whose corresponding words
            are to be extracted, inspired by the regex patterns used in NLTK's
            `nltk.chunk.regexp`. Tags are uppercase, from the universal tag set;
            delimited by < and >, which are basically converted to parentheses
            with spaces as needed to correctly extract matching word sequences;
            white space in the input doesn't matter.

            Examples (see ``constants.POS_REGEX_PATTERNS``):

            * noun phrase: r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'
            * compound nouns: r'<NOUN>+'
            * verb phrase: r'<VERB>?<ADV>*<VERB>+'
            * prepositional phrase: r'<PREP> <DET>? (<NOUN>+<ADP>)* <NOUN>+'

    Yields:
        ``spacy.Span``: the next span of consecutive tokens from ``doc`` whose
        parts-of-speech match ``pattern``, in order of apperance
    """
    # standardize and transform the regular expression pattern...
    pattern = re.sub(r'\s', '', pattern)
    pattern = re.sub(r'<([A-Z]+)\|([A-Z]+)>', r'( (\1|\2))', pattern)
    pattern = re.sub(r'<([A-Z]+)>', r'( \1)', pattern)

    tags = ' ' + ' '.join(tok.pos_ for tok in doc)

    for m in re.finditer(pattern, tags):
        yield doc[tags[0:m.start()].count(' '):tags[0:m.end()].count(' ')]

In [85]:
pattern = r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'


In [31]:
all_noun_chunks =[]


In [32]:
for x in newsgroups_train.data:
    doc = nlp(x)
    lists = pos_regex_matches(doc, pattern)
    for j in lists:
        all_noun_chunks.append(j.text.lower().strip())
        
   
    

KeyboardInterrupt: 

517135

In [72]:
allnounchunks = pd.DataFrame({'NounChunks':all_noun_chunks})

In [75]:
#allnounchunks.to_csv('allnounchunks20NewsGroup.csv',index=False)

In [35]:
allnounchunks = pd.read_csv('allnounchunks20NewsGroup.csv')

In [36]:
all_noun_chunks = allnounchunks.NounChunks
print(len(all_noun_chunks))

517135


In [37]:
from collections import Counter

In [38]:
freq_all_noun_chunks = Counter(all_noun_chunks)

In [39]:
freq_all_noun_chunks.most_common(500)

[('lines', 11263),
 ('subject', 10916),
 ('article', 6258),
 ('host', 4841),
 ('people', 4234),
 ('distribution', 2625),
 ('|', 2085),
 ('time', 1898),
 ('%', 1810),
 ('x', 1725),
 ('years', 1618),
 ('things', 1324),
 ('thanks', 1188),
 ('way', 1136),
 ('world', 1089),
 ('thing', 1075),
 ('version', 1059),
 ('information', 1047),
 ('system', 1036),
 ('part', 1035),
 ('year', 956),
 ('question', 919),
 ('life', 778),
 ('example', 776),
 ('others', 765),
 ('government', 746),
 ('the way', 730),
 ('file', 724),
 ('today', 711),
 ('problem', 711),
 ('fact', 707),
 ('name', 703),
 ('#', 680),
 ('a lot', 637),
 ('day', 626),
 ('the problem', 615),
 ('use', 614),
 ('problems', 614),
 ('email', 600),
 ('the time', 586),
 ('times', 581),
 ('case', 577),
 ('course', 574),
 ('number', 573),
 ('the people', 565),
 ('point', 559),
 ('-newsreader', 558),
 ('files', 558),
 ('program', 557),
 ('questions', 556),
 ('place', 551),
 ('line', 540),
 ('order', 540),
 ('drive', 539),
 ('work', 538),
 ('days

In [40]:
# Take top 1000 frequent noun phrases as Candidate labels as suggested by Mei et al.
top_3000_candid = freq_all_noun_chunks.most_common(3000)

In [41]:
candidate_labels = []
for u,v in top_3000_candid:
    candidate_labels.append(u)


In [42]:
candidate_labels

['lines',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'years',
 'things',
 'thanks',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'others',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problems',
 'email',
 'the time',
 'times',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'files',
 'program',
 'questions',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'days',
 'opinions',
 'children',
 'internet',
 'power',
 'data',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'games',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'men',
 'person',
 'e-mail',
 'phone',
 'help',
 'game',
 'mind',
 'rights',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip'

In [43]:
lemmatized_labels = [candidate_label_lemmatized(w) for w in candidate_labels]

In [44]:
lemmatized_labels

['line',
 'subject',
 'article',
 'host',
 'people',
 'distribution',
 '|',
 'time',
 '%',
 'x',
 'year',
 'thing',
 'thank',
 'way',
 'world',
 'thing',
 'version',
 'information',
 'system',
 'part',
 'year',
 'question',
 'life',
 'example',
 'other',
 'government',
 'the way',
 'file',
 'today',
 'problem',
 'fact',
 'name',
 '#',
 'a lot',
 'day',
 'the problem',
 'use',
 'problem',
 'email',
 'the time',
 'time',
 'case',
 'course',
 'number',
 'the people',
 'point',
 '-newsreader',
 'file',
 'program',
 'question',
 'place',
 'line',
 'order',
 'drive',
 'work',
 'day',
 'opinion',
 'child',
 'internet',
 'power',
 'datum',
 'the fact',
 '_',
 'reply',
 'one',
 'the world',
 'hand',
 'software',
 'mail',
 'evidence',
 'reason',
 'money',
 'game',
 'team',
 'car',
 'the government',
 'law',
 'message',
 'stuff',
 'a bit',
 'side',
 'man',
 'person',
 'e - mail',
 'phone',
 'help',
 'game',
 'mind',
 'right',
 'opinion',
 'faith',
 'man',
 'card',
 'the end',
 'chip',
 'g)r',
 's

In [86]:
def zero_order(freq,theta,lCandidat,NumTopic):
    """
    Calculate the Zero-Order Relevance

    Parameters:
    ----------
    freq : Array containing the frequency of occurrences of each word in the whole corpus
    theta : Array containing the frequency of occurrences of each word in each topic
    lcandidat: Array containing each label candidate
    NumTopic : The number of the topic
    
    Returns:
    -------
    topCandidate : Array containing the name of the top 10 score candidate for a given topic
    """
    
    #W matrice qui contient le score de chaque mot pour chaque topic
    W=np.log(theta/freq)
    
    # score des tous les candidats pour le topic NumTopic
    score=[]
    
    for indice in range (len(lCandidat)):
        candidat=lCandidat[indice].split(" ")
        i=id2word.doc2idx(candidat)
        # supprime les -1 (qui signifie pas trouvé)
        i[:] = [v for v in i if v != -1]
        
        score.append(np.sum(W[k,i]))
        
    #topValue, topCandidate = top10Score(score,lCandidat)
    dicti=pd.DataFrame({'Labels':lCandidat,'Score':score})
  
    return dicti



In [90]:
for k in range(0,20):
    print("Topic %d"%k)
    candid_for_topic_k = zero_order(freq,theta,lemmatized_labels,k)
    candid_for_topic_k = candid_for_topic_k.sort_values('Score',ascending=False)
    print(candid_for_topic_k.head(3))
    print("=======================")

Topic 0
          Labels     Score
2287    the team  5.451503
308     the team  5.451503
2563  the helmet  5.402439
Topic 1
           Labels     Score
941   gun control  6.505391
1018      the gun  5.880123
2671      the gun  5.880123
Topic 2
           Labels     Score
665      the moon  5.177251
1938   the planet  5.164918
1471  the surface  5.014376
Topic 3
       Labels     Score
498   patient  4.163988
2071    brain  3.805573
911     brain  3.805573
Topic 4
                 Labels     Score
1510    chip encryption  8.356317
1901  encryption device  7.412092
2683  government agency  6.366389
Topic 5
           Labels     Score
1760  the picture  6.420124
1945   the family  6.193415
2962    the color  5.966463
Topic 6
           Labels     Score
2218  the benefit  6.505139
2087  the benefit  6.505139
1210    processor  5.533729
Topic 7
          Labels     Score
764   the engine  6.670243
496     the bike  6.664844
2930   the noise  6.513925
Topic 8
           Labels     Score
2813

## Proposed Topic labeling


In [55]:
#Assign Topic to the dataset
dataset = pd.DataFrame({'Text':newsgroups_train.data})

In [57]:
dataset['preprocessed'] = processed_docs

In [66]:
dataset['Topic'] = newsgroups_train.target

In [72]:
dataset['Topic_label'] = dataset['Topic'].apply(news_group_target_names)

In [103]:
topic_list = []
prob_list = []
topic_distribution = lda_model.get_document_topics(lda_model[corpus])
for i,j in enumerate(topic_distribution):
    row = sorted(j, key=lambda x: (x[1]), reverse=True)
    topic_num,prop_topic = row[0]
    topic_list.append(topic_num)
    prob_list.append(prop_topic)

dataset['LDA_topic_list'] = topic_list
dataset['prob_list'] = prob_list

In [104]:
dataset

Unnamed: 0,Text,preprocessed,Topic,Topic_label,LDA_topic_list,prob_list
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,"[lerxst, thing, subject, nntp, post, host, org...",7,rec.autos,13,0.298798
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,"[guykuo, carson, washington, subject, clock, p...",4,comp.sys.mac.hardware,13,0.303605
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,"[twillis, purdue, thomas, willis, subject, que...",4,comp.sys.mac.hardware,13,0.286641
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,"[jgreen, amber, green, subject, weitek, organi...",1,comp.graphics,13,0.297614
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,"[head, harvard, jonathan, mcdowell, subject, s...",14,sci.space,13,0.297167
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,"[vttoulu, foxvog, douglas, subject, reword, se...",16,talk.politics.guns,15,0.297881
6,From: bmdelane@quads.uchicago.edu (brian manni...,"[bmdelane, quad, uchicago, brian, manning, del...",13,sci.med,13,0.307357
7,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...,"[bgrubb, dante, nmsu, grubb, subject, scsi, or...",3,comp.sys.ibm.pc.hardware,13,0.272423
8,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...,"[holmes, iscsvax, subject, icon, help, organiz...",2,comp.os.ms-windows.misc,13,0.298071
9,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...,"[kerr, uiuc, stan, kerr, subject, sigma, desig...",4,comp.sys.mac.hardware,13,0.286782


In [105]:
dataset.to_csv('dataset_lda_20_newsgroup.csv',sep='\t',index=False)

In [106]:
sf = pd.read_csv('dataset_lda_20_newsgroup.csv',sep='\t')

In [107]:
sf

Unnamed: 0,Text,preprocessed,Topic,Topic_label,LDA_topic_list,prob_list
0,From: lerxst@wam.umd.edu (where's my thing)\r\...,"['lerxst', 'thing', 'subject', 'nntp', 'post',...",7,rec.autos,13,0.298798
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,"['guykuo', 'carson', 'washington', 'subject', ...",4,comp.sys.mac.hardware,13,0.303605
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,"['twillis', 'purdue', 'thomas', 'willis', 'sub...",4,comp.sys.mac.hardware,13,0.286641
3,From: jgreen@amber (Joe Green)\r\nSubject: Re:...,"['jgreen', 'amber', 'green', 'subject', 'weite...",1,comp.graphics,13,0.297614
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,"['head', 'harvard', 'jonathan', 'mcdowell', 's...",14,sci.space,13,0.297167
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,"['vttoulu', 'foxvog', 'douglas', 'subject', 'r...",16,talk.politics.guns,15,0.297881
6,From: bmdelane@quads.uchicago.edu (brian manni...,"['bmdelane', 'quad', 'uchicago', 'brian', 'man...",13,sci.med,13,0.307357
7,From: bgrubb@dante.nmsu.edu (GRUBB)\r\nSubject...,"['bgrubb', 'dante', 'nmsu', 'grubb', 'subject'...",3,comp.sys.ibm.pc.hardware,13,0.272423
8,From: holmes7000@iscsvax.uni.edu\r\nSubject: W...,"['holmes', 'iscsvax', 'subject', 'icon', 'help...",2,comp.os.ms-windows.misc,13,0.298071
9,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\r\nSub...,"['kerr', 'uiuc', 'stan', 'kerr', 'subject', 's...",4,comp.sys.mac.hardware,13,0.286782
