In [11]:
!pip install lxml
!pip install bs4
from bs4 import BeautifulSoup, SoupStrainer
import os, sys, codecs, string, time
from gensim import corpora, models, similarities
from itertools import chain
from collections import Counter
from gensim.models import CoherenceModel, phrases

from gensim.models.phrases import Phraser

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  as gensimvis  


import re
import xml.sax.saxutils as saxutils

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np

# from keras.models import Sequential
# from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import GRU
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.layers import Dense, Dropout
# from keras.callbacks import ModelCheckpoint

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 20})
import matplotlib.ticker as mtick

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

import itertools

# use GPU
# import tensorflow as tf
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(physical_devices[2], 'GPU')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vgkortsas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/vgkortsas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import sys
print(sys.executable)

/home/vgkortsas/.conda/envs/VOI/bin/python


# Functions that help to extract classes

In [3]:
ver='LDA'

def remove_tags(text):
    return re.sub('<[^<]+>', '', text).strip() # reference: https://stackoverflow.com/questions/12823568/python-strip-xml-tags-from-document

folder = './reuters21578/'

# files that have the classes
# we save the files in a dictionary where key is the category (i.e. topics, places, people, organizations, exchanges)
# each categoty has different classes (labels)
classes_files = {
    'Topics_': 'all-topics-strings.lc.txt',
    'Places_': 'all-places-strings.lc.txt',
    'People_': 'all-people-strings.lc.txt',
    'Organizations_': 'all-orgs-strings.lc.txt',
    'Exchanges_': 'all-exchanges-strings.lc.txt'
}


# we save the categories and the classes and initiate (i.e 0) a counter, 
# which measures the number of documents thet each class appears. 
# We will use that for our dataframe
classes = []

for key in classes_files.keys():
    with open(folder + classes_files[key], 'r') as file:
        for class_ in file.readlines():
            classes.append([key[:-1], key.lower() + class_.strip().lower(), 0])  # strip() to remove the leading and trailing characters
  
# create a dataframe
classes_df = pd.DataFrame(data=classes, columns=['category', 'class','number_of_documents'])

# for each document we read the classes 
def create_doc_classes(doc_):
    
    topics = doc_.topics.contents
    places = doc_.places.contents
    people = doc_.people.contents
    orgs = doc_.orgs.contents
    exchanges = doc_.exchanges.contents
        
    doc_classes=[]
    for topic in topics:
        doc_classes.append('topics_' + remove_tags(str(topic)))
                
    for place in places:
        doc_classes.append('places_' + remove_tags(str(place)))
                
    for person in people:
        doc_classes.append('people_' + remove_tags(str(person)))
                
    for org in orgs:
        doc_classes.append('organizations_' + remove_tags(str(org)))
                
    for exchange in exchanges:
        doc_classes.append('exchanges_' + remove_tags(str(exchange)))
                
    return doc_classes
   
    
# populate the dataframe, i.e. count in how many documents each class appears
def populate_df(classes_):
    for class_ in classes_:
        idx = classes_df[classes_df['class'] == class_].index[0]
        count = classes_df._get_value(idx, 'number_of_documents')
        classes_df._set_value(idx, 'number_of_documents', count+1)
        

# since each document can belong to multiple classes, we have a multi-class, multi-label classification problem, 
# so we need to do multi-hot encoding
def multi_hot_encoding(classes_, target_classes):
    multi_hot_vector = np.zeros(len(target_classes)).astype(np.float32)
    
    for i in range(len(target_classes)):
        # we set as 1 the element of the vector which corresponds to one of the document classes
        if target_classes[i] in classes_:
            multi_hot_vector[i] = 1.0
    
    return multi_hot_vector

# Process the documents to extract features and classes

In [4]:
doc_X = []

# read the SGML files
number_of_sgml_files = 22
for i in range(number_of_sgml_files):
    file_name = 'reut2-{}.sgm'.format(str(i).zfill(3))
    print('file: %s' % file_name)
    
    with open(folder + file_name, 'rb') as file:
        content = BeautifulSoup(file.read().lower(), "lxml")
        
        for doc in content('reuters'):
            
            # Step 1: Extract the text from the document
            doc_body=str(doc('text'))
            # Step 2: Remove tags
            doc_body=remove_tags(str(doc('text')[0]))
            # Step 3: remove 'reuter'
            doc_body=doc_body.replace('reuter', '')
            # Step 4: Remove the next line character
            doc_body=doc_body.replace('\n','')
            # Step 5: remove the end of text character, i.e. &#3
            doc_body = doc_body.replace('&#3','')
            # Step 6: remove escape characters like &lt, &gt
            doc_body = saxutils.unescape(doc_body)
        
            doc_X.append(doc_body)


file: reut2-000.sgm
file: reut2-001.sgm
file: reut2-002.sgm
file: reut2-003.sgm
file: reut2-004.sgm
file: reut2-005.sgm
file: reut2-006.sgm
file: reut2-007.sgm
file: reut2-008.sgm
file: reut2-009.sgm
file: reut2-010.sgm
file: reut2-011.sgm
file: reut2-012.sgm
file: reut2-013.sgm
file: reut2-014.sgm
file: reut2-015.sgm
file: reut2-016.sgm
file: reut2-017.sgm
file: reut2-018.sgm
file: reut2-019.sgm
file: reut2-020.sgm
file: reut2-021.sgm


In [13]:
# remove most common words
common_words = set("a &#3; \x03 per - -- --- vs cts said. pct mln mlns dlr dlrs reuter about after again air all along also an and another any are around as at away back be because been before below between both but by came can come could day did different do does don't down each end even every few find first for found from get give go good great had has have he help her here him his home house how I if in into is it its just know large last left like line little long look made make man many may me men might more most Mr. Mr must y name never new next no not now number of off old on one only or other our out over own part people place put read right said same saw say see she sould show small so some something sound still such take tell than that the them then there these they thing think this those thought three through time to together too tow under up us use very want way we wel went were what when where which while who why will with word work world would write year you your was".split())
bigtexts = [[word for word in doc.split() if word not in common_words]
         for doc in doc_X]

REMOVING MOST COMMON WORDS...


In [14]:
# remove words that appear only once
c = Counter(chain.from_iterable(bigtexts))
texts = [[word for word in x if c[word]>1] for x in bigtexts]

REMOVING WORDS THAT APPEAR ONLY ONCE...


In [15]:
# Create Dictionary.
id2word = corpora.Dictionary(texts)
# Creates the Bag of Word corpus.
bow_corpus = [id2word.doc2bow(text) for text in texts]

CREATING DICTIONARY...


In [46]:
# function format_topics_sentences is taken from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#4whatdoesldado
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


# Train the LDA models
ntopics = 92
lda_model = models.ldamodel.LdaModel(corpus=bow_corpus, id2word=id2word, num_topics=ntopics)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(20)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3.0,0.2276,"fears, reverse, pursue, link, tightening, changes, cme, trading., eased, newyork","[bahia, cocoa, review, salvador,, feb, 26, showers, continued, throughout, week, inthe, bahia, cocoa, zone,, alleviating, drought, since, earlyjanuary, improving, prospects, coming, normal, humidity, levels, smith, weekly, review., dry, period, means, temporao, late, year., arrivals, week, ended, february, 22, bagsof, 60, kilos, making, cumulative, total, season, against, 5.81, stage, year., cocoa, delivered, earlier, consignment, included, figures., comissaria, smith, doubt, howmuch, crop, cocoa, available, harvesting, end., total, bahia, crop, 6.4, bags, sales, standing, almost, 6.2, thereare, hundred, thousand, bags, hands, exporters, processors., doubts, much, cocoa, export, shippers, experiencing, inobtaining, certificates., view, lower, quality, recent, weeks, farmers, havesold, their, cocoa, held, comissaria, smith, ...]"
1,1,63.0,0.1493,"drug, pennzoil, burlington, 9, appeals, april, texaco, getty, approximately, 49","[standard, oil, <srd>, form, financial, unit, cleveland,, feb, 26, standard, oil, co, bp, north, americainc, plan, form, venture, manage, money, investment, activities, companies., bp, north, america, subsidiary, british, petroleum, coplc, <bp>,, owns, 55, interest, standard, oil., venture, called, financial, tradingand, operated, standard, oil, oversight, ajoint, management, committee.]"
2,2,72.0,0.1856,"beet, programs, tuesday,, saleof, 1.03, issuance, stockmarket, 13.3, equitable, 1.30","[texas, commerce, bancshares, <tcb>, files, plan, houston,, feb, 26, texas, commerce, bancshares, inc's, texascommerce, filed, application, thecomptroller, currency, effort, create, network, harris, county., bank, network, link, 31, banks, billion, assets, 7.5, billion, deposits.]"
3,3,71.0,0.2808,"trade, billion, analysts, japan, june, market, japanese, u.s., against, money","[talking, <bac>, equity, offer, janie, gabbett,, s, los, angeles,, feb, 26, bankamerica, corp, underpressure, act, quickly, proposed, equity, offering, andwould, well, delay, stock's, recent, poorperformance,, banking, analysts, analysts, recommended, bankamerica, delayits, one-billion-dlr, equity, offering,, yet, beapproved, securities, exchange, commission., bankamerica, stock, fell, week,, bankingissues,, news, brazil, suspended, interest, paymentson, portion, foreign, debt., stock, traded, 12,, 1/8,, afternoon,after, falling, 11-1/2, earlier, week, news., banking, analysts, immediate, threat, thefirst, interstate, bancorp, <i>, takeover, bid, gone,, bankamerica, isunder, pressure, sell, securities, market, thatwill, nervous, bank, stocks, near, term., bankamerica, filed, offer, january, 26., seen, asone, major, factors, leading, interstatewithdrawing, takeover, bid, february, 9., ...]"
4,4,66.0,0.9016,"oct, cocoa, u.s., beef, indonesia, tons, short, import, mills, auction","[national, average, prices, farmer-owned, reserve, washington,, feb, 26, u.s., agriculture, departmentreported, farmer-owned, reserve, national, five-day, averageprice, february, 25, follows, (dlrs/bu-sorghum, cwt), natl, loan, release, call, avge, rate-x, level, price, price, wheat, 2.55, 2.40, iv, 4.65, v, 4.65, vi, 4.45, corn, 1.35, 1.92, iv, 3.15, 3.15, v, 3.25, x, 1986, rates., natl, loan, release, call, avge, rate-x, level, price, price, oats, 1.24, 0.99, v, 1.65, barley, n.a., 1.56, iv, 2.55, 2.55, v, 2.65, sorghum, 2.34, 3.25-y, iv, 5.36, 5.36, v, 5.54, reserves, i,, ii, iii, matured., level, iv, reflectsgrain, entered, oct, 6,, 1981, feedgrain, july23,, 1981, wheat., level, v, wheat/barley, 5/14/82,corn/sorghum, ...]"
5,5,9.0,0.7249,"apr, rice, field, prev, argentine, yugoslavia, mar, oats, total, yugoslav","[argentine, 1986/87, grain/oilseed, registrations, buenos, aires,, feb, 26, argentine, grain, board, figures, registrations, grains,, oilseeds, their, products, tofebruary, 11,, thousands, tonnes,, showing, month,, 1986/87, total, 1985/86, total, 1986,, brackets:, bread, wheat, prev, feb, march, maize, mar, total, 48.0, (nil)., sorghum, nil, (nil), oilseed, export, registrations, were:, sunflowerseed, total, 15.0, soybean, total, 20.0, (nil), board, detailed, export, registrations, follows,, subproducts, wheat, prev, feb, march, apr, 10.0,, ., linseed, prev, feb, mar, apr, total, soybean, prev, feb, mar, nil,, apr, nil,, sunflowerseed, prev, feb, mar, apr, 149.8, vegetable, oil, registrations, :, sunoil, prev, feb, mar, apr, 10.0,, total, linoil, prev, feb, mar, ...]"
6,6,15.0,0.7865,"shares, common, group, securities, stake, company, exchange, stock, inc, shares,","[red, lion, inns, files, plans, offering, portland,, ore.,, feb, 26, red, lion, inns, limited, filed, registration, statement, securities, andexchange, commission, covering, proposed, offering, limited, partnership, interests., company, expects, offering, priced, 20dlrs, unit., proceeds, offering,, 102.5, mlndlr, mortgage, loan,, used, finance, 10, red, lion, hotels.]"
7,7,8.0,0.2553,"reagan, u.s., administration, washington,, usx, bill, iran, fire, post, van","[usx, <x>, debt, moody's, york,, feb, 26, moody's, investors, service, inc, itlowered, debt, preferred, stock, ratings, usx, corp, andits, units., seven, billion, securities, affected., moody's, marathon, oil, co's, recent, establishment, upto, billion, production, payment, facilities, yates, field, significant, negative, implications, unsecured, creditors., company, appears, positioned, steel, return, profit, late, 1987,, moody's, added., ratings, lowered, include, usx's, senior, debt, baa-3.]"
8,8,24.0,0.5706,"stock, shares, repurchase, share, common, shareholders, june, board, 20, split","[champion, products, <ch>, approves, stock, split, rochester,, n.y.,, feb, 26, champion, products, inc, itsboard, directors, approved, two-for-one, stock, split, itscommon, shares, shareholders, record, april, 1,, 1987., company, board, voted, recommend, toshareholders, annual, meeting, april, 23, increase, theauthorized, capital, stock, five, 25, shares.]"
9,9,48.0,0.5475,"offer, tender, company, systems, bid, shares, inc, software, data, computer","[computer, terminal, systems, completes, sale, n.y.,, feb, 26, computer, terminal, systems, inc, saidit, completed, sale, 200,000, shares, commonstock,, warrants, acquire, additional, shares,, n.v.>, lugano,, switzerland, 50,000, dlrs., company, warrants, exercisable, fiveyears, purchase, price, share., computer, terminal, buyadditional, shares, increase, total, holdings, 40, pctof, computer, terminal's, outstanding, common, stock, undercertain, circumstances, involving, change, control, thecompany., company, conditions, occur, warrants, wouldbe, exercisable, price, equal, 75, common, price, time,, exceed, 1.50, share., computer, terminal, sold, technolgy, rights, toits, dot, matrix, impact, technology,, including, inc>, houston,, tex., but,, continue, licensee, technology, company, moves, reorganizationplan, pay, current, operation, costs, delivery., computer, terminal, makes, computer, generated, ...]"


In [25]:
pd.set_option("display.max_colwidth", -1)

  """Entry point for launching an IPython kernel.


In [51]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(optimal_model, bow_corpus, id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [78]:
dom_topic=0
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
312,312,0.0,0.9611,"tonnes, exports, imports, production, 1986/87, rose, fell, output, total, tonnes,","[dutch, imports, rose, 1986, hague,, march, 2, dutch, imports, fat-, materials,, fats, oils, rose, 2.16, tonnes, basis, 1986, 2.12, tonnes, 1985,, thecommodity, board, margarine,, fats, oils, exports, commodities, fell, 1.35, tonnesfrom, 1.38, basis., fat-, oil-bearing, raw, materials, imports, rose, tonnes, fat/oil, basis, 3.47, weight, 3.32, mln., main, imports, rising, tonnes, actual, weight, 2.75, mln., fell, tonnes, rose, 292,000, tonnes., exports, fat-, oil-bearing, raw, materials, rose, tonnes, 19,800, fat/oil, basis, 89,900, tonnes, actual, weight., soyabean, exports, rose, tonnes, actual, weight, tonnes., imports, vegetable, fats,, including, palm, oil,, rose, tonnes, exports, vegetable, oil, imports, fell, 227,500, tonnes, 1986, 1985,, exports, tonnes., soyabean, ...]"
326,326,0.0,0.9015,"tonnes, exports, imports, production, 1986/87, rose, fell, output, total, tonnes,","[dutch, animal, feed, usage, half, season, rotterdam,, march, 2, dutch, animal, feed, usage, firsthalf, current, season, july, december, 1986, 6.5, tonnes, 7.1, period, of1985,, figures, latest, newsletter, show., tapioca, usage, fell, 9.1, 1.4, tonnes, 1.6, mlnin, half, 1985/86, season., grain, usage, fell, 6.1, 1.1, tonnes, 1.2, mln,while, soymeal, usage, fell, 10, 967,000, tonnes, 1.1, mln., cornglutenfeed, usage, fell, 17.8, 729,000, tonnes, usage, nearly, doubled, citruspulp, usage, dropped, 62.8, 149,000, tonnes, tonnes,, rapeseed, meal, usage, rose, 9.6, tonnes, 198,000, sunmeal, rose, 25.6, tonnes, 172,000, tonnes., during, whole, season, july, 1985, june, soymeal, usage, fell, 12, 1.9, tonnes, 2.1, mlnthe, previous, season,, ...]"
1210,1210,0.0,0.8782,"tonnes, exports, imports, production, 1986/87, rose, fell, output, total, tonnes,","[oils/fats, stocks, seen, falling, sharply, 1986/87, hamburg,, march, 3, visible, stocks, 17, oils, fats, areprobably, peaking, likely, fall, sharply, year,, oil, newsletter, oil, forecast, stocks, oils, fats, cutto, 9.8, tonnes, season,, compared, earlier., survey, covered, 13, oils, cotton,, sesame,, corn,, coconut,, palm,, lin, four, animal, oils, fats, butter,, fish, oil, world's, analysis, predicted, slight, productionincrease, 0.5, tonnes, end-september, increases, 3.6, 4.0, tonnes, theprevious, two, seasons., consumption, continuing, rise., prices, prevailing, since, early, 1986, stimulated, demandfor, food, non-food, purposes,, consumption, increase, record, 2.8, mlntonnes, 71.8, tonnes, season, indian, government, artificially, curb, domesticdemand., oil, expect, european, community(ec), introduce, vegetable, oils, tax,, tax, ...]"
15862,15862,0.0,0.7739,"tonnes, exports, imports, production, 1986/87, rose, fell, output, total, tonnes,","[usda, estimates, soviet, wheat,, coarse, grains, washington,, april, 9, u.s., agriculture, departmentforecast, soviet, 1986/87, wheat, crop, 92.30, tonnes,, vs92.30, tonnes, month., 1985/86, crop, 78.10mln, tonnes,, 78.10, tonnes, month., soviet, 1986/87, coarse, grain, production, estimated, at103.30, tonnes,, 103.30, tonnes, month., productionin, 1985/86, projected, 99.99, tonnes,, 100.00, mlntonnes, month., ussr, wheat, imports, forecast, 15.00, tonnes, in1986/87,, 15.00, tonnes, month., imports, 1985/86, areput, 15.70, tonnes,, 15.70, tonnes, month., usdaestimated, soviet, 1986/87, coarse, grain, imports, 12.00, mlntonnes,, 10.00, tonnes, month,, 1985/86, imports, at13.70, tonnes,, 13.70, tonnes, month., usda, soviet, coarse, grain, imports, include, 1986/87, forecast, 8.00, tonnes,, 6.00, mlntonnes, month., corn, imports, 1985/86, estimated, ...]"
3313,3313,0.0,0.7677,"tonnes, exports, imports, production, 1986/87, rose, fell, output, total, tonnes,","[usda, estimates, soviet, wheat,, coarse, grains, washington,, march, 9, u.s., agriculture, departmentforecast, soviet, 1986/87, wheat, crop, 92.30, tonnes,, vs92.30, tonnes, month., 1985/86, crop, 78.10mln, tonnes,, 78.10, tonnes, month., soviet, 1986/87, coarse, grain, production, estimated, at103.30, tonnes,, 103.30, tonnes, month., productionin, 1985/86, projected, 100.00, tonnes,, 99.99, mlntonnes, month., ussr, wheat, imports, forecast, 15.00, tonnes, in1986/87,, 14.00, tonnes, month., imports, 1985/86, areput, 15.70, tonnes,, 15.70, tonnes, month., usdaestimated, soviet, 1986/87, coarse, grain, imports, 10.00, mlntonnes,, 8.00, tonnes, month,, 1985/86, imports, at13.70, tonnes,, 13.70, tonnes, month.]"


In [77]:
dom_topic=1
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
1958,1958,1.0,0.7682,"west, growth, german, germany, economic, industrial, industry, economy, oecd, u.s.","[german, economic, outlook, seen, fairly, bright, kiel,, west, germany,, march, 5, outlook, westgerman, economy, relatively, bright,, gross, nationalproduct, expected, expand, year,, institute, economy, gnp, forecast, institute,, five, leadingeconomic, research, bodies, west, germany,, institutes,, their, gnp, forecasts, two, 2.5, pct., report, kiel, institute, west, germany's, deteriorated, fundamentally, despite, against, dollar, major, currencies., ""the, danger, exports, slump, 1987, appears,, limited,"", report, ""on, contrary,, slight, risein, exports, expected."", institute, past, experience, shown, west, move, counterbalance, currency, factors, bycutting, costs,, trying, penetrate, markets, product, ranges., aided, 1987, expected, slight, rise, ineconomic, growth, industrial, countries., time,, thedecline, exports, oil, producing, countries, looks, set, year., west, ...]"
1291,1291,1.0,0.6566,"west, growth, german, germany, economic, industrial, industry, economy, oecd, u.s.","[goodyear, <gt>, chairman, criticizes, corporate, raids, miami,, march, 3, chairman, goodyear, tire, concern, recent, hostile, takeover, underminingthe, nation's, industrial, speech, meeting, south, florida, business, chairman, robert, mercer, lashed, corporate, takeover, specialists,, accusing, causing, serious, companies, target., ""their, interest, preserving, industrial, providing, simply, deals,, product, acountry, ..., base, future, on."", year,, mercer, fought, takeover, attempt, sir, james, goldsmith., goodyear's, independence, preserved, high, company, bought, goldsmith's, stock, 620, 93, profit., goodyear, paid, 37, mlndlrs, expenses, bought, 41, shares, overtwo, billion, dlrs., effort, trim, debt,, goodyear, closed, downthree, plants, mercer, believes, otherwise, motor, wheel, aerospace, units, reduced, 10, pct,, plans, testify, tomorrow, senate, ...]"
19003,19003,1.0,0.6552,"west, growth, german, germany, economic, industrial, industry, economy, oecd, u.s.","[******, oecd, sees, 1.5, west, german, real, gnp, growth, 1987]"
671,671,1.0,0.6056,"west, growth, german, germany, economic, industrial, industry, economy, oecd, u.s.","[ec, ministers, struggle, agree, dairy, cuts, brussels,, march, 2, european, community,, ec,, agricultureministers, struggled, today, finalise, rules, aimed, sales, public, cold, stores, unwanted, butter, guaranteed, ec, prices,, diplomats, plan, key, element, landmark, accord, output, 9.5, two, years, agreed, outline, lastdecember, virtually, nine, days, non-stop, negotiations., accord,, due, operate, start, thenew, milk, marketing, april, 1,, hailed, mostsignificant, step, on-going, campaign, reform, costly, ecfarm, policies, cut, embarrassing, food, surpluses., diplomats, december, agreement, itself, notthreatened, effect, considerably, proposals, altered, west, germany, ireland, opposed, proposed, limits, ona, farmer's, automatic, sell, surplus, butter, stores, market, prices, stocks, high., faced, butter, record, 1.2, two, store,, ec, ...]"
720,720,1.0,0.5863,"west, growth, german, germany, economic, industrial, industry, economy, oecd, u.s.","[ual, <ual>, unit, attacked, minority, hiring, chicago,, march, 2, ual, inc's, united, airlines, accused, ina, congressional, hearing, today, blacks, company, tremendous, progress, issue, aired, during, hearing, subcommittee, whose, chairman,, rep., treatment, minorities, country's, ""it, strikes, illinois, democrat,, ""that, thenumber, white, women, pilots, double, white, women, fairly, represented, management."", united, court, order, since, 1976, increaseits, minority, employment., david, united's, senior, vice, president, ""we, aggressive, approach, tremendous, progress, ..., continueto, pursue, wider, goals.""]"


In [76]:
dom_topic=2
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
1956,1956,2.0,0.7914,"tax, party, ruling, nakasone, takeover, democratic, bid, opposition, april, proposed","[japan, ruling, party, fixes, date, budget, hearing, tokyo,, march, 5, japan's, ruling, liberal, democratic, moves, push, delayed, draft, budget, for1987/88, parliament,, deepening, clash, theopposition, called, move, rash, absence, opposition,, ldp, members, meeting, lower, steering, committee, tohold, public, hearing, draft, budget, march, 13, officials, step, parliament, resumed, deliberations, lasttuesday, following, month-long, opposition, boycott, acontroversial, sales, tax, plan., four, opposition, parties, led, socialists, beendelaying, budget, deliberations, bid, shelve, sales, taxon, budget, plan, based., socialist, spokesman, opposition, parliament, unless, ldp, changed, mind., prime, minister, yasuhiro, nakasone,, vowed, pushthrough, tax, reforms,, told, reporters:, ruling, opposition, parties,, i, wouldlike, avoid, ..., passing, (the, five, tax,, ...]"
3152,3152,2.0,0.7184,"tax, party, ruling, nakasone, takeover, democratic, bid, opposition, april, proposed","[twa, <twa>, declines, comment, usair, <u>, york,, march, 9, transworld, airlines, inc, official, saidthe, airline, comment, usair, group, inc's, planned, buyoutof, piedmont, aviation, inc., twa,, however,, pursued, application, thedepartment, transportation, takeover, usair,, accordingto, mark, twa, general, counsel, vice, president., revised, application, today,, following, friday, incomplete, application, filed, week., earlier, usair, agreed, buy, piedmont, 69, dlrscash, share.]"
7772,7772,2.0,0.6789,"tax, party, ruling, nakasone, takeover, democratic, bid, opposition, april, proposed","[******u.s., agency, allow, u.s., buy, 51, piedmont, pending, final, okay, mergerblah, blah, blah.]"
1995,1995,2.0,0.66,"tax, party, ruling, nakasone, takeover, democratic, bid, opposition, april, proposed","[usair, <u>, rejects, twa, <twa>, takeover, bid, washington,, march, 5, usair, group, inc, board, hasrejected, trans, airlines, inc's, offer, acquire, usair, share, cash, grossly, adequate, thebest, interests, usair, shareholders,, employees, passengers., company, unsolicited, bid, carl, twa, ""highly, conditional."", usair, board, piedmont, aviation, met, separately, yesterday, consider, usair's, offer, toacquire, 50.1, piedmont, 71, share, andremaining, shares, 1.5, 1.9, common, shares, each,, valued, atabout, 73, share, based, average, closing, price, common, during, period, merger., company, continuing, talks, piedmont, definitive, merger, agreement, two, reach, shortly., usair, ""in, light, highly, conditional, nature, andother, terms, twa, offer,, timing, offer, thecircumstances, made,, usair, group, purpose, twa, ...]"
2449,2449,2.0,0.6489,"tax, party, ruling, nakasone, takeover, democratic, bid, opposition, april, proposed","[twa, <twa>, pilots, welcome, bid, usair, <u>, york,, march, 5, airline, pilots, association, saidmembers, working, trans, airlines, inc, welcomed, announced, proposal, acquire, usair, group, union, leaders, twa, pilot, group, offer, share, positive, indication, carl, icahn, intendsto, build, twa, viable, carrier.]"


In [75]:
dom_topic=4
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
2689,2689,4.0,0.9362,"inc, offering, co, corp, company, unit, notes, york,, inc., initial","[proposed, offerings, recently, filed, sec, washington,, march, 6, following, proposed, securitiesofferings, filed, recently, securities, exchangecommission:, evans, sutherland, computer, corp, offering, 50mln, 25-year, convertible, subordinated, debentures, quist, inc., gould, inc, offering, two, shares, convertibleexchangeable, preferred, stock, boston, corp, peabody, co, inc.]"
153,153,4.0,0.8985,"inc, offering, co, corp, company, unit, notes, york,, inc., initial","[proposed, offerings, recently, filed, sec, washington,, feb, 26, following, proposed, securitiesofferings, filed, recently, securities, exchangecommission:, general, corp, offering, 25, mlndlrs, convertible, senior, subordinated, notes, due, march, 1997through, drexel, burnham, lambert, inc.]"
7457,7457,4.0,0.8954,"inc, offering, co, corp, company, unit, notes, york,, inc., initial","[proposed, offerings, recently, filed, sec, washington,, march, 19, following, proposed, securitiesofferings, filed, recently, securities, exchangecommission:, ramada, inc, <ram>, offering, 100, subordinatednotes, due, 1999, salomon, brothers, inc., chock, full, o'nuts, corp, <chg>, offering, 60, ofconvertible, senior, subordinated, debentures, due, april, 15,, 2012through, group, led, drexel, burnham, lambert, inc.]"
996,996,4.0,0.889,"inc, offering, co, corp, company, unit, notes, york,, inc., initial","[keycorp, <key>, registers, subordinated, notes, albany,, n.y.,, marc, 3, keycorp, filed, thesecurities, exchange, commission, offering, 75, mlndlrs, subordinated, capital, notes, due, march, 1,, 1999., company, anticipated, notes, beoffered, week, underwriters, led, bostoncorp., proceeds, used, general, corporate, purposes,including, acquisition, trust, savings, scheduled, july, one,, keycorp]"
811,811,4.0,0.8846,"inc, offering, co, corp, company, unit, notes, york,, inc., initial","[commercial, credit, <ccc>, sells, 10-year, notes, york,, march, 2, commercial, credit, co, raising, 150, mlndlrs, offering, notes, due, 1997, yielding, pct,said, lead, manager, morgan, stanley, co, inc., notes, 8-1/8, coupon, priced, at99.375, yield, 105, basis, points, comparable, treasurysecurities., non-callable, life,, issue, rated, baa-2, moody'sand, bbb-plus, standard, poor's., boston, corp, andshearson, lehman, brothers, inc, co-managed, deal.]"


In [80]:
dom_topic=5
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
583,583,5.0,0.8222,"gas, texas, natural, defense, company, partners, pipeline, u.s., district, approval","[midway, <mdwy>, expand, service, chicago,, march, 2, midway, airlines, inc, begin, toserve, several, cities, united, states, april, five, flights, atlanta, chicago., carrier, fly, daily, atlanta, flights, midway, airport, using, two, airplanes]"
481,481,5.0,0.8108,"gas, texas, natural, defense, company, partners, pipeline, u.s., district, approval","[presidential, <pair>, start, service, washington,, march, 2, presidential, airways, inc, 12, cities, starts, operating, continental, joint, marketing, agreement, texas, corp's, airlines, march, 23., base, dulles, airport, washington,, serve, albany,, n.y.,, ohio,, daytona, beach,, melbourne, sarasota,, indianapolis,, york's, kennedy, portland,, me.,, savannah,, ga., agreement,, presidential, continue, aseparate, company, mileage, plan, services, willbe, combined, continental, airlines.]"
1618,1618,5.0,0.802,"gas, texas, natural, defense, company, partners, pipeline, u.s., district, approval","[standard, oil, <srd>, texas, natural, gas, houston,, march, 4, standard, oil, co, well, deep, montgomery, county,, texas,, flowed, 4,500,000, cubic, natural, gas, depths, inch, choke., contracted, perry, gas, cos, inc, gas, well,, perry, build, pipeline, connect, natural, gas, pipeline, line.]"
1804,1804,5.0,0.7652,"gas, texas, natural, defense, company, partners, pipeline, u.s., district, approval","[meridian, diagnostics, <kits>, gets, fda, approval, cincinnati,, march, 4, meridian, diagnostics, inc, itreceived, approval, food, drug, administration, tomarket, test, detect, disease, drains, fluids, aids, victims., company, test, detects, sp, disease, result, life, fluids,, company, added.]"
1863,1863,5.0,0.727,"gas, texas, natural, defense, company, partners, pipeline, u.s., district, approval","[former, employee, says, firm, unethical, washington,, march, 4, former, trw, inc, controller, told, acongressional, hearing, company, acted, inits, defense, contracts, government., ""it, my, contention, company, called, trw, highly, defense, contractor., trw, honest, citizen, scheme, best, financial, interests,"", larry, eagleye, testimony, ahouse, oversight, subcommittee, hearing., eagleye, controller, trw's, compressor, cleveland., subcommittee, chairman, john, dingell,, d-mich,, defense, department, taken, actionagainst, trw, though, admitted, 1984, report, ithad, substantially, overcharged, government, parts., ""in, 1984,, trw, officials, admitted, defense, general, evidence, two, sets, problems, company's, divisions, substantial, overcharging, ofthe, federal, government, various, military, aircraft, ""for, example,, falsifying, books, records,, price, military, engine, two, threetimes, higher, ...]"


In [81]:
dom_topic=6
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
764,764,6.0,0.8463,"bank, debt, banks, finance, foreign, brazil, bankers, loans, days, interest","[funaro, rejects, suggestion, imf, brazil, plan, london,, march, 2, brazilian, finance, minister, dilson, funaroflatly, rejected, u.k., suggestion, country, seekinternational, monetary, fund, (imf), order, negotiations, commercial, creditor, banks., talking, reporters,, funaro, attitude, nothelp, resolve, crisis, started, brazil, interest, payments, 68, billion, externaldebt, february, 20., funaro, britain, leg, tour, ofeuropean, capitals, explain, motives, brazilian, debtmoratorium, seek, support, intiatives, improvecapital, flows, developed, countries, third, worlddebtor, nations.]"
5132,5132,6.0,0.701,"bank, debt, banks, finance, foreign, brazil, bankers, loans, days, interest","[president, ratifies, unilateral, suspension, debt, payments, private, foreign, banksblah, blah, blah.]"
189,189,6.0,0.6729,"bank, debt, banks, finance, foreign, brazil, bankers, loans, days, interest","[banks, express, grave, concern, brazil, debt, move, york,, feb, 26, brazil's, 14-bank, advisory, committeeexpressed, ""grave, concern"", chief, debt, negotiator, antoniopadua, de, seixas, country's, suspension, interestpayments,, according, telex, committee, chairman, citibankto, creditor, banks, worldwide., bankers, diplomatic, phrase, belied, deep, angerand, frustration, committee, brazil's, unilateral, movelast, friday, subsequent, freeze, 15, billion, dlrsof, short-term, trade, interbank, lines., seixas,, director, brazilian, central, bank's, foreigndebt, department,, met, full, panel, tuesday, wednesday., seixas,, met, morning, senior, citibankexecutive, william, rhodes, representatives, committeevice-chairmen, morgan, guaranty, trust, co, lloyds, bank, plc,told, banks, government, preparing, telex, toexplain, clarify, freeze, short-term, credits., telex, sent, creditors, early, today,bankers, despite, rising, tempers,, bankers, ...]"
13253,13253,6.0,0.6614,"bank, debt, banks, finance, foreign, brazil, bankers, loans, days, interest","[argentine, minister, seeks, debt, rescheduling, buenos, aires,, april, 5, economy, minister, juan, heading, washington, talks, creditor, banks, onrescheduling, argentina's, foreign, debt., economy, ministry, sources, hoped, seal, 30, bln, private, foreign, debt., argentina,, seeking, 2.15, bln, fresh, loans, fromprivate, international, banks, meet, 1987, growth, targets,, hasbeen, negotiating, banks', steering, committee, sincefebruary., attend, international, monetary, fund, andworld, bank, meetings.]"
174,174,6.0,0.6437,"bank, debt, banks, finance, foreign, brazil, bankers, loans, days, interest","[banks, express, grave, concern, brazil, debt, moves, york,, feb, 26, brazil's, 14-bank, advisory, committeeexpressed, ""grave, concern"", chief, debt, negotiator, antoniopadua, de, seixas, country's, suspension, interestpayments,, according, telex, committee, chairman, citibankto, creditor, banks, worldwide., bankers, diplomatic, phrase, belied, deep, angerand, frustration, committee, brazil's, unilateral, movelast, friday, subsequent, freeze, 15, billion, dlrsof, short-term, trade, interbank, lines., seixas,, director, brazilian, central, bank's, foreigndebt, department,, met, full, panel, tuesday, wednesday., seixas,, met, morning, senior, citibankexecutive, william, rhodes, representatives, committeevice-chairmen, morgan, guaranty, trust, co, lloyds, bank, plc,told, banks, government, preparing, telex, toexplain, clarify, freeze, short-term, credits., telex, sent, creditors, early, today,bankers, despite, rising, tempers,, bankers, ...]"


In [151]:
dom_topic=54
df_dominant_topic[df_dominant_topic['Dominant_Topic']==dom_topic].sort_values(by=['Topic_Perc_Contrib'], ascending=False).head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
1937,1937,54.0,0.9415,"iranian, gulf, iran, plant, military, northern, iraq, war, forces, news","[iran, reports, heavy, fighting, iraqi, kurdistan, london,, march, 5, iran, troops, repulsed, heavy, iraqicounter-attacks, continued, their, advance, iraqi, kurdistan, overnight, fighting, thenorthern, war, front., iran, launched, offensive,, codenamed, ontuesday, night, among, snow-capped, peaks, haj, area, northeast, iraq., iranian, news, agency, irna,, received, london,, thetroops, ""continued, their, successful, advance, ..., enemy, positions."", iraqi, brigade, 604, shattered, personnel, killed, third, brigade, thrown, counter-attacks, 70, losses,, agency, added., 208, prisoners, taken, front., irna, iranian, forces, backed, heavy, artillery, continuing, advance., iranian, casualties, given., area, haj, omran, kurdish, town, 65, km, inside, iraq, scene, heavy, fighting, backed, dissident, area, attacks, ongovernment, positions, installations, northern, ...]"
233,233,54.0,0.7364,"iranian, gulf, iran, plant, military, northern, iraq, war, forces, news","[british, conservatives, ahead, labour, polls, london,, march, 1, ruling, conservatives, their, lead, opposition, labour, party,, accordingto, results, two, opinion, polls, released, saturday., market, &, opinion, research, international, sunday, times, showed, conservatives, asix, point, lead,, poll, telephone, surveys, limited, forthe, sunday, express, four, points, ahead., sunday, express, poll, conducted, since, thesocial, democratic, party, scored, upset, victory, thursday, ina, parliamentary, by-election, former, labour, stronghold, near, london., mori, poll,, conducted, six, days, leading, showed, conservatives, 41, 35, alliance, social, democrats, 21, pct., sunday, express, poll,, conducted, conservatives, ahead, 35.6, vote,, labourwith, 31.9, alliance, 31.4, pct., harris, poll, published, observer, newspaper, lastsunday, gave, conservatives, ...]"
13364,13364,54.0,0.7338,"iranian, gulf, iran, plant, military, northern, iraq, war, forces, news","[iran, reports, important, victories, southern, front, london,, april, 7, iran, achieved, importantvictories, against, iraq, southern, war, fronts, night., brief, iranian, news, agency, irna, report, ""importantvictories, achieved, southern, fronts, monday, night."", gave, nofurther, details., iran, launched, major, offensive,, codenamed, karbala-5,towards, major, southern, iraqi, port, basra, january,, butthere, reports, heavy, fighting, area, inrecent, weeks.]"
13528,13528,54.0,0.7335,"iranian, gulf, iran, plant, military, northern, iraq, war, forces, news","[iran, reports, important, victories, southern, front, london,, april, 7, iran, achieved, importantvictories, against, iraq, southern, war, fronts, night., brief, iranian, news, agency, irna, report, ""importantvictories, achieved, southern, fronts, monday, night."", gave, nofurther, details., iran, launched, major, offensive,, codenamed, karbala-5,towards, major, southern, iraqi, port, basra, january,, butthere, reports, heavy, fighting, area, inrecent, weeks.]"
293,293,54.0,0.692,"iranian, gulf, iran, plant, military, northern, iraq, war, forces, news","[international, leisure, boeing, lease, london,, march, 2, <international, leisure, group, plc>, saidits, <air, subsidiary, advanced, negotiations, banks, lease, 10, co>, ltd>, engines., deal, lease,, purchase, options,, offive, boeing, planes, five, engines, fitted, boeing, delivery, 1988, 1989.]"


In [152]:
filename = outputfile + str(ntopics) + ".txt"
file = open(filename, "w")
topiccounter = 0
for top in optimal_model.print_topics(ntopics):
    topiccounter +=1
    file.write("topic #" + str(topiccounter) + "\n" + top[1] + "\n\n")

file.close()