In [1]:
import pandas as pd
data = pd.read_csv('bbc-news-data.csv', sep="\t")
data

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


In [2]:
import nltk 
import numpy as np
import re
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):

    # leave only letters and spaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
norm_data = normalize_corpus(data.title)
data.title = norm_data

norm_data = normalize_corpus(data.content)
data.content = norm_data
data['text'] = data['title'] + ' ' + data['content']

In [3]:
data[['category', 'text']]
tokenized_docs = [wpt.tokenize(doc) for doc in data.text]
tokenized_docs

[['ad',
  'sales',
  'boost',
  'time',
  'warner',
  'profit',
  'quarterly',
  'profits',
  'us',
  'media',
  'giant',
  'timewarner',
  'jumped',
  'bn',
  'three',
  'months',
  'december',
  'yearearlier',
  'firm',
  'one',
  'biggest',
  'investors',
  'google',
  'benefited',
  'sales',
  'highspeed',
  'internet',
  'connections',
  'higher',
  'advert',
  'sales',
  'timewarner',
  'said',
  'fourth',
  'quarter',
  'sales',
  'rose',
  'bn',
  'bn',
  'profits',
  'buoyed',
  'oneoff',
  'gains',
  'offset',
  'profit',
  'dip',
  'warner',
  'bros',
  'less',
  'users',
  'aol',
  'time',
  'warner',
  'said',
  'friday',
  'owns',
  'searchengine',
  'google',
  'internet',
  'business',
  'aol',
  'mixed',
  'fortunes',
  'lost',
  'subscribers',
  'fourth',
  'quarter',
  'profits',
  'lower',
  'preceding',
  'three',
  'quarters',
  'however',
  'company',
  'said',
  'aols',
  'underlying',
  'profit',
  'exceptional',
  'items',
  'rose',
  'back',
  'stronger',
  '

In [4]:
# number of unique categories
all_categories = set(data.category)
all_categories = list(all_categories)
num_categories = len(all_categories)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=30, max_df=0.6, ngram_range=(1,2), token_pattern=None, tokenizer=nltk.word_tokenize)
docs_cv = cv.fit_transform(data.text)
docs_cv

<2225x2473 sparse matrix of type '<class 'numpy.int64'>'
	with 235105 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=num_categories, solver='cd', max_iter=500, random_state=42, alpha=.1, l1_ratio=.85)
doc_topics = nmf_model.fit_transform(docs_cv)
vocabulary = np.array(cv.get_feature_names())



In [16]:
topic_terms = nmf_model.components_ # topic-term matrix (W)
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :15]
topic_keyterms = vocabulary[topic_key_term_idxs]

topic_labels = [', '.join(topic) for topic in topic_keyterms]
words_dict = {}
for i in range(num_categories):
    words_dict["Topic {}".format(i + 1)] = topic_labels[i]

pd.DataFrame(words_dict.items(), columns=['', 'words'])

Unnamed: 0,Unnamed: 1,words
0,Topic 1,"game, games, first, new, time, one, world, eng..."
1,Topic 2,"mr, would, labour, blair, election, government..."
2,Topic 3,"best, song, music, years, awards, last, britis..."
3,Topic 4,"people, mobile, music, technology, one, digita..."
4,Topic 5,"us, bn, year, also, would, last, growth, marke..."


In [13]:
manually_themes = ['sport', 'politics', 'entertainment', 'tech', 'business']
words_dict = dict(zip(manually_themes, words_dict.values()))

pd.DataFrame(words_dict.items(), columns=['', 'words'])

Unnamed: 0,Unnamed: 1,words
0,sport,"game, first, games, new, time, one, world, eng..."
1,politics,"mr, would, labour, blair, election, government..."
2,entertainment,"best, song, music, years, awards, last, robbie..."
3,tech,"people, mobile, music, technology, one, digita..."
4,business,"us, bn, year, also, would, last, growth, marke..."


In [14]:
new_doc = ['The US president says he will not approve the deal if Chinese company Bytedance remains involved.', 
           'smartphones are the most used device for accessing digital news about the coronavirus, with 61 sing them for pandemic-related news.',
           'ball got past the goalkeeper and was rolling slowly towards the net when Douglas Costa rushed back and cleared it off the line.']
new_doc_cv = cv.transform(new_doc)
new_doc_topics = nmf_model.transform(new_doc_cv)

for j in range(len(new_doc_topics)):
    for i in range(len(new_doc_topics[j])):
        if new_doc_topics[j][i] > 0.0:
            print("Sentence:" + new_doc[j])
            print(manually_themes[i], new_doc_topics[j][i])
            print("")
    


Sentence:The US president says he will not approve the deal if Chinese company Bytedance remains involved.
business 0.055432076747463074

Sentence:smartphones are the most used device for accessing digital news about the coronavirus, with 61 sing them for pandemic-related news.
tech 0.027195249382117488

Sentence:ball got past the goalkeeper and was rolling slowly towards the net when Douglas Costa rushed back and cleared it off the line.
sport 0.018435766748663065

Sentence:ball got past the goalkeeper and was rolling slowly towards the net when Douglas Costa rushed back and cleared it off the line.
tech 0.004878276423090906





In [15]:
# het gutenberg corpus from nltk
from nltk.corpus import gutenberg

text = gutenberg.raw('edgeworth-parents.txt')
text = normalize_document(text)
text



In [16]:
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_words(text.split())
trigram_measures = TrigramAssocMeasures()
print("Raw Frequency:")
print(finder.nbest(trigram_measures.raw_freq, 10)) # most frequent trigrams
print("")
print("Likelihood Ratio:")
print(finder.nbest(trigram_measures.likelihood_ratio, 10)) # most possible trigrams
print("")
print("PMI:")
print(finder.nbest(trigram_measures.pmi, 10)) # most related (possible) based on pmi trigrams

Raw Frequency:
[('said', 'mr', 'gresham'), ('said', 'sir', 'arthur'), ('said', 'de', 'grey'), ('said', 'mrs', 'theresa'), ('said', 'mr', 'somerville'), ('lady', 'diana', 'sweepstakes'), ('mrs', 'theresa', 'tattle'), ('said', 'miss', 'somers'), ('mr', 'hopkins', 'agent'), ('said', 'dr', 'middleton')]

Likelihood Ratio:
[('said', 'de', 'grey'), ('replied', 'de', 'grey'), ('de', 'grey', 'suspect'), ('de', 'grey', 'shall'), ('congratulated', 'de', 'grey'), ('de', 'grey', 'fired'), ('de', 'grey', 'intercede'), ('answered', 'de', 'grey'), ('attachment', 'de', 'grey'), ('inwards', 'de', 'grey')]

PMI:
[('ab', 'origine', 'null'), ('affect', 'imitate', 'indiscriminately'), ('anecdotes', 'amusing', 'instructive'), ('angles', 'sines', 'cosines'), ('appreciated', 'numismatic', 'collectors'), ('arcs', 'centres', 'complements'), ('assistant', 'maria', 'edgeworth'), ('autre', 'mais', 'pas'), ('births', 'deaths', 'marriages'), ('bohns', 'recent', 'edition')]
