## Imports

In [7]:
import pandas as pd
import numpy as np
import re

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

## Data Input

In [2]:
# reading iteratively input files
input_df = pd.read_csv('BBCArticles.csv')
display(input_df.sample(5))
display(input_df['Category'].value_counts())

Unnamed: 0,ArticleId,Text,Category
427,930,casual gaming to take off games aimed at ca...,tech
60,464,dozens held over id fraud site twenty-eight pe...,tech
283,533,england s defensive crisis grows england s def...,sport
841,174,commons hunt protest charges eight protesters ...,politics
913,1792,choose hope over fear - kennedy voters will ha...,politics


sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

## Data Preprocessing

Steps:
1. Cutting docs to words
2. Removing stop-words
3. Lemmatization of words


In [3]:
# splitting and cleaning text
splitted_texts = [doc.split() for doc in input_df['Text']]
input_df['Splitted Text'] = splitted_texts
stripped = []
for i in range(len(input_df['Splitted Text'])):
    strip = [word.strip(' .,)(""-\'') for word in input_df['Splitted Text'][i]]
    stripped.append(strip)
input_df['Stripped Text'] = stripped
input_df.head(5)

Unnamed: 0,ArticleId,Text,Category,Splitted Text,Stripped Text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers..."
1,154,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, in, $168m, payout, eighteen, f..."


In [4]:
# removing stop words
filtered = []
stop_words = set(stopwords.words('english'))
for i in range(len(input_df['Stripped Text'])):
    filtered_sentence = [w for w in input_df['Stripped Text'][i] if not w in stop_words]
    filtered.append(filtered_sentence)
input_df['Filtered Text'] = filtered

In [5]:
input_df

Unnamed: 0,ArticleId,Text,Category,Splitted Text,Stripped Text,Filtered Text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers..."
1,154,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, $168m, payout, eighteen, forme..."
5,1582,howard truanted to play snooker conservative...,politics,"[howard, truanted, to, play, snooker, conserva...","[howard, truanted, to, play, snooker, conserva...","[howard, truanted, play, snooker, conservative..."
6,651,wales silent on grand slam talk rhys williams ...,sport,"[wales, silent, on, grand, slam, talk, rhys, w...","[wales, silent, on, grand, slam, talk, rhys, w...","[wales, silent, grand, slam, talk, rhys, willi..."
7,1797,french honour for director parker british film...,entertainment,"[french, honour, for, director, parker, britis...","[french, honour, for, director, parker, britis...","[french, honour, director, parker, british, fi..."
8,2034,car giant hit by mercedes slump a slump in pro...,business,"[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, mercedes, slump, slump, prof..."
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,"[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ..."


## Lemmatization

In [9]:
lemmatizer = WordNetLemmatizer() 
#docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
lemmatized = []
for i in range(len(input_df['Filtered Text'])):
    lemat = [lemmatizer.lemmatize(w) for w in input_df['Filtered Text'][i] ]
    lemmatized.append(filtered_sentence)
input_df['Lemmatized Text'] = filtered
  

# Vector representations

Methods to compare:
- Bert
- Fasttext
- Word2Vec
- LDA

## 1. LDA

In [29]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary(input_df['Lemmatized Text'])
# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in input_df['Lemmatized Text']]

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


In [26]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 2350
Number of documents: 1490


In [31]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.2f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -1.51.
[([(0.025131451, 'mr'),
   (0.011413275, 'government'),
   (0.009469184, 'labour'),
   (0.00843337, 'people'),
   (0.007945857, 'election'),
   (0.0075567155, 'blair'),
   (0.007098471, 'party'),
   (0.006230452, 'could'),
   (0.006193483, 'minister'),
   (0.0057286043, 'new'),
   (0.0054799556, 'brown'),
   (0.0050757974, 'told'),
   (0.005011373, 'tax'),
   (0.0049213143, 'public'),
   (0.0045534684, 'uk'),
   (0.0044175605, 'plans'),
   (0.004326394, 'say'),
   (0.004140786, 'said:'),
   (0.0041294466, 'prime'),
   (0.0038743634, 'says')],
  -1.2540906902650997),
 ([(0.011924738, 'film'),
   (0.010472071, 'best'),
   (0.00893881, 'one'),
   (0.0071669226, 'first'),
   (0.00698584, 'year'),
   (0.0068693953, 'new'),
   (0.0059013776, 'us'),
   (0.005668539, 'last'),
   (0.005366594, 'two'),
   (0.005213705, 'top'),
   (0.004744745, 'number'),
   (0.004736518, 'time'),
   (0.004715911, 'show'),
   (0.0045059584, 'play'),
   (0.0044639697, 'game'),
   (0

In [34]:
input_df

Unnamed: 0,ArticleId,Text,Category,Splitted Text,Stripped Text,Filtered Text,Lemmatized Text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers..."
1,154,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, $168m, payout, eighteen, forme...","[enron, bosses, $168m, payout, eighteen, forme..."
5,1582,howard truanted to play snooker conservative...,politics,"[howard, truanted, to, play, snooker, conserva...","[howard, truanted, to, play, snooker, conserva...","[howard, truanted, play, snooker, conservative...","[howard, truanted, play, snooker, conservative..."
6,651,wales silent on grand slam talk rhys williams ...,sport,"[wales, silent, on, grand, slam, talk, rhys, w...","[wales, silent, on, grand, slam, talk, rhys, w...","[wales, silent, grand, slam, talk, rhys, willi...","[wales, silent, grand, slam, talk, rhys, willi..."
7,1797,french honour for director parker british film...,entertainment,"[french, honour, for, director, parker, britis...","[french, honour, for, director, parker, britis...","[french, honour, director, parker, british, fi...","[french, honour, director, parker, british, fi..."
8,2034,car giant hit by mercedes slump a slump in pro...,business,"[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, mercedes, slump, slump, prof...","[car, giant, hit, mercedes, slump, slump, prof..."
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,"[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ..."


In [47]:
model.get_document_topics(corpus[1320])

[(1, 0.92560893), (2, 0.073680595)]

TypeError: get_topics() takes 1 positional argument but 2 were given