# Preprocessing and EDA

In [49]:
import pickle
import re
import string
from pprint import pprint

import gensim.corpora as corpora
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.corpora import MmCorpus
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [50]:
# Raw corpus; later cells read its 'text' and 'category' columns.
df = pd.read_csv("bbc-text.csv")

# NLTK's English stop words plus a few extra high-frequency words.
# Kept as a list because it is also passed to TfidfVectorizer(stop_words=...).
stopword_list = stopwords.words("english") + [
    "said", "also", "would", "first", "last", "one",
]

# Shared normalizers used by the preprocessing and tokenizer helpers.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

In [3]:
# List the distinct labels found in the 'category' column.
print(set(df['category'].values))

{'business', 'politics', 'tech', 'entertainment', 'sport'}


In [12]:
# Get most frequent words
def get_stats(words, num_words=200):
    """Count how often each item of ``words`` occurs.

    Args:
        words: Iterable of items to count (tokens or n-gram strings).
        num_words: Not used by this function; kept so existing calls that
            pass it keep working. Callers choose how many entries to show,
            e.g. with ``most_common``.

    Returns:
        A ``FreqDist`` built from ``words``.
    """
    return FreqDist(words)

In [4]:
def preprocess(df):
    """Tokenize, filter and lemmatize the ``text`` column of ``df``.

    Two columns are added to ``df`` in place, and the same object is returned:

    * ``word_list`` - tokens from ``nltk.tokenize.word_tokenize`` that are not
      in ``stopword_list`` and contain at least one ASCII letter.
    * ``lemmas`` - ``lemmatizer.lemmatize`` applied to each entry of
      ``word_list``.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The DataFrame that was passed in, with the two columns added.
    """
    print(df['text'])

    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_words = set(stopword_list)

    def _clean_tokens(text):
        # One pass: tokenize, drop stop words, drop tokens without any letter.
        return [
            token
            for token in nltk.tokenize.word_tokenize(text)
            if token not in stop_words and re.search("[A-Za-z]", token)
        ]

    df['word_list'] = df['text'].apply(_clean_tokens)
    df['lemmas'] = df['word_list'].apply(
        lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
    )
    return df

In [5]:
# Add the 'word_list' and 'lemmas' columns, then print the frame to inspect them.
df = preprocess(df)
print(df)

0       tv future in the hands of viewers with home th...
1       worldcom boss  left books alone  former worldc...
2       tigers wary of farrell  gamble  leicester say ...
3       yeading face newcastle in fa cup premiership s...
4       ocean s twelve raids box office ocean s twelve...
                              ...                        
2220    cars pull down us retail figures us retail sal...
2221    kilroy unveils immigration policy ex-chatshow ...
2222    rem announce new glasgow concert us band rem h...
2223    how political squabbles snowball it s become c...
2224    souness delight at euro progress boss graeme s...
Name: text, Length: 2225, dtype: object
           category                                               text  \
0              tech  tv future in the hands of viewers with home th...   
1          business  worldcom boss  left books alone  former worldc...   
2             sport  tigers wary of farrell  gamble  leicester say ...   
3             sport  yeadi

In [154]:
# Pool the filtered tokens of every article into one flat list and count them.
flattened = [word for article in df['word_list'].values for word in article]
fd = get_stats(flattened)
# NOTE(review): `num_words` is not defined at notebook level in the cells shown
# here; define it (or pass a literal) before re-enabling the line below.
#print(fd.most_common(num_words))

[('said', 7254), ('mr', 2994), ('would', 2577), ('also', 2156), ('people', 2044), ('new', 1970), ('us', 1932), ('year', 1829), ('one', 1763), ('could', 1511), ('last', 1381), ('first', 1282), ('world', 1186), ('two', 1181), ('government', 1154), ('time', 1147), ('uk', 1079), ('years', 1002), ('make', 927), ('best', 926), ('told', 911), ('get', 890), ('game', 857), ('made', 856), ('film', 855), ('like', 838), ('music', 835), ('many', 829), ('labour', 796), ('next', 780), ('bbc', 765), ('back', 764), ('three', 762), ('number', 760), ('take', 735), ('added', 731), ('way', 726), ('set', 714), ('well', 701), ('says', 687), ('market', 686), ('company', 685), ('may', 685), ('home', 648), ('good', 642), ('going', 624), ('still', 622), ('england', 616), ('games', 615), ('election', 613), ('party', 610), ('much', 609), ('win', 607), ('since', 607), ('firm', 606), ('work', 603), ('go', 598), ('blair', 598), ('show', 593), ('think', 586), ('use', 582), ('say', 581), ('week', 575), ('million', 570)

In [15]:
# N-grams
def get_ngrams(df, input_col_name, output_col_name, ngram_func):
    """Build space-joined n-gram strings for every row of a token column.

    For each row, ``ngram_func`` is applied to the token list. An n-gram is
    kept only when *every* one of its tokens is outside ``stopword_list`` and
    contains at least one ASCII letter. (Previously only the first two tokens
    were checked, so the last word of a trigram went unfiltered.)

    The result is written to ``df[output_col_name]`` in place.

    Args:
        df: DataFrame holding the token lists.
        input_col_name: Name of the column containing lists of tokens.
        output_col_name: Name of the column that receives the n-gram strings.
        ngram_func: Callable mapping a token list to an iterable of token
            tuples, e.g. ``nltk.bigrams`` or ``nltk.trigrams``.

    Returns:
        The DataFrame that was passed in, with ``output_col_name`` set.
    """
    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_words = set(stopword_list)

    def _keep(ngram):
        # Check all positions so the rule holds for any n, not only bigrams.
        return all(
            token not in stop_words and re.search("[A-Za-z]", token)
            for token in ngram
        )

    df[output_col_name] = df[input_col_name].apply(
        lambda tokens: [" ".join(ngram) for ngram in ngram_func(tokens) if _keep(ngram)]
    )
    return df

In [156]:
# Build bigrams from the filtered tokens and count them across all articles.
df = get_ngrams(df, "word_list", "bigrams", nltk.bigrams)
flattened_bigrams = [ngram for article in df['bigrams'].values for ngram in article]
fd = get_stats(flattened_bigrams)
# NOTE(review): `num_words` is not defined at notebook level in the cells shown here.
#print(fd.most_common(num_words))

[('last year', 454), ('told bbc', 362), ('said mr', 359), ('mr blair', 332), ('prime minister', 312), ('mr brown', 255), ('chief executive', 198), ('said would', 192), ('tony blair', 186), ('last week', 184), ('bbc news', 178), ('general election', 172), ('new york', 167), ('six nations', 162), ('mr howard', 155), ('number one', 141), ('first time', 140), ('michael howard', 130), ('years ago', 128), ('human rights', 118), ('next year', 116), ('gordon brown', 113), ('bbc radio', 112), ('also said', 105), ('mobile phone', 104), ('home secretary', 104), ('two years', 102), ('lib dems', 102), ('new zealand', 99), ('news website', 98), ('five years', 93), ('three years', 93), ('liberal democrats', 93), ('three months', 91), ('spokesman said', 91), ('last month', 90), ('box office', 89), ('manchester united', 87), ('world cup', 87), ('interest rates', 80), ('economic growth', 80), ('mobile phones', 79), ('grand slam', 79), ('lib dem', 78), ('blair said', 76), ('tory leader', 75), ('labour pa

In [157]:
# Build trigrams from the filtered tokens and count them across all articles.
df = get_ngrams(df, "word_list", "trigrams", nltk.trigrams)
flattened_trigrams = [ngram for article in df['trigrams'].values for ngram in article]
fd = get_stats(flattened_trigrams)
# NOTE(review): `num_words` is not defined at notebook level in the cells shown here.
#print(fd.most_common(num_words))

[('told bbc news', 147), ('bbc news website', 97), ('told bbc radio', 76), ('leader michael howard', 58), ('mr blair said', 54), ('million dollar baby', 53), ('radio today programme', 49), ('told bbc sport', 48), ('bbc radio today', 47), ('mr howard said', 41), ('tory leader michael', 41), ('next general election', 39), ('chancellor gordon brown', 33), ('leader charles kennedy', 32), ('prime minister tony', 32), ('minister tony blair', 32), ('two years ago', 28), ('george w bush', 28), ('world number one', 27), ('mr blair told', 27), ('coach andy robinson', 27), ('bbc world service', 26), ('london stock exchange', 25), ('said mr blair', 25), ('deputy prime minister', 23), ('sir alex ferguson', 23), ('digital music players', 23), ('radio five live', 23), ('foreign secretary jack', 22), ('secretary jack straw', 22), ('secretary charles clarke', 22), ('consumer electronics show', 21), ('uk independence party', 21), ('rbs six nations', 21), ('securities exchange commission', 21), ('mr brow

In [158]:
#print(df['lemmas'])
# Count lemma frequencies across all articles (uses the 'lemmas' column from preprocess).
flattened_lemmas = [lemma for article in df['lemmas'].values for lemma in article]
fd = get_stats(flattened_lemmas)
# NOTE(review): `num_words` is not defined at notebook level in the cells shown here.
#print(fd.most_common(num_words))

[('said', 7254), ('mr', 3035), ('year', 2831), ('would', 2577), ('also', 2156), ('people', 2045), ('new', 1970), ('u', 1935), ('one', 1809), ('could', 1511), ('game', 1472), ('time', 1449), ('last', 1381), ('first', 1283), ('say', 1268), ('world', 1214), ('government', 1190), ('two', 1181), ('company', 1113), ('film', 1113), ('uk', 1079), ('make', 1072), ('firm', 1002), ('get', 967), ('best', 930), ('told', 911), ('number', 893), ('service', 880), ('like', 879), ('take', 861), ('made', 856), ('way', 841), ('music', 835), ('month', 831), ('many', 829), ('country', 826), ('player', 820), ('market', 812), ('back', 805), ('labour', 796), ('week', 785), ('next', 780), ('party', 779), ('bbc', 765), ('set', 764), ('three', 762), ('show', 761), ('minister', 737), ('want', 736), ('sale', 734), ('home', 731), ('added', 731), ('well', 722), ('win', 706), ('election', 702), ('good', 700), ('plan', 695), ('go', 692), ('work', 691), ('may', 685), ('million', 681), ('day', 680), ('technology', 669), 

# Vectorizing the data

In [6]:
def tokenize_and_lemmatize(sentence):
    """Tokenize ``sentence`` and return the lemmas of its content tokens.

    A token is kept when it is not in ``stopword_list`` and contains at least
    one ASCII letter. The letter requirement already rejects anything made up
    only of punctuation, so no separate punctuation test is needed.

    Args:
        sentence: Text to tokenize with ``nltk.tokenize.word_tokenize``.

    Returns:
        List of ``lemmatizer.lemmatize(token)`` for every kept token, in
        their original order.
    """
    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_words = set(stopword_list)
    lemmas = []
    for token in nltk.tokenize.word_tokenize(sentence):
        if token in stop_words or not re.search('[a-zA-Z]', token):
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [7]:
def tokenize_and_stem(sentence):
    """Tokenize ``sentence`` and return the stems of its content tokens.

    A token is kept when it is not in ``stopword_list`` and contains at least
    one ASCII letter. The letter requirement already rejects anything made up
    only of punctuation, so no separate punctuation test is needed.

    Args:
        sentence: Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        List of ``stemmer.stem(token)`` for every kept token, in their
        original order.
    """
    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_words = set(stopword_list)
    stems = []
    for token in nltk.word_tokenize(sentence):
        if token in stop_words or not re.search('[a-zA-Z]', token):
            continue
        stems.append(stemmer.stem(token))
    return stems

In [8]:
def create_vectorizer(text_list):
    """Fit a TF-IDF vectorizer on ``text_list`` and transform it.

    The vectorizer uses unigrams and bigrams, tokenizes with
    ``tokenize_and_stem``, and receives ``stopword_list`` as its stop words.
    ``max_df=0.9`` and ``min_df=0.02`` bound the document frequency of the
    terms that are kept.

    Args:
        text_list: Iterable of raw document strings.

    Returns:
        Tuple ``(data, tfidf_vectorizer)``: the result of ``fit_transform``
        and the fitted vectorizer.
    """
    options = dict(
        use_idf=True,
        max_df=0.9,
        min_df=0.02,
        ngram_range=(1, 2),
        stop_words=stopword_list,
        tokenizer=tokenize_and_stem,
    )
    tfidf_vectorizer = TfidfVectorizer(**options)
    return tfidf_vectorizer.fit_transform(text_list), tfidf_vectorizer

In [9]:
# Fit TF-IDF on the raw article text; stop-word removal and stemming happen
# inside the vectorizer's tokenizer (see create_vectorizer).
(vect_data, vectorizer) = create_vectorizer(df['text'].values)
#print(list(vectorizer.vocabulary_.keys())[0:100])

  'stop_words.' % sorted(inconsistent))


# Building the model

In [10]:
def create_and_fit_lda_sklearn(data, num_topics, random_state=None):
    """Create a scikit-learn LDA model and fit it on ``data``.

    Args:
        data: Document-term matrix to fit on (here, the output of the
            vectorizer's ``fit_transform``).
        num_topics: Number of topics, passed as ``n_components``.
        random_state: Optional seed forwarded to the model so that repeated
            runs yield the same topics. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        The fitted LDA model.
    """
    lda = LDA(n_components=num_topics, random_state=random_state)
    lda.fit(data)
    return lda

In [14]:
# Fit LDA with 5 topics - presumably chosen to match the five category labels
# printed earlier; confirm if the dataset changes.
lda = create_and_fit_lda_sklearn(vect_data, 5)

In [11]:
def get_most_common_words_for_topics(model, vectorizer, n_top_words):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted topic model exposing ``components_``, one row of term
            weights per topic.
        vectorizer: Fitted vectorizer whose feature order matches the columns
            of ``model.components_``.
        n_top_words: Number of terms to return per topic.

    Returns:
        Dict mapping topic index to a list of terms, ordered from highest to
        lowest weight.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both so the notebook runs on either.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Plain str keeps printed output free of numpy scalar reprs.
    words = [str(word) for word in get_names()]

    word_dict = {}
    for topic_index, topic in enumerate(model.components_):
        # argsort is ascending; this slice walks the last n_top_words indices
        # backwards, i.e. from the largest weight down.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        word_dict[topic_index] = [words[i] for i in top_indices]
    return word_dict

In [12]:
def print_topic_words(word_dict):
    """Print each topic key followed by its list of words.

    Args:
        word_dict: Mapping from topic key to a list of words, e.g. the result
            of ``get_most_common_words_for_topics``.
    """
    for topic_id, top_words in word_dict.items():
        print(f"Topic {topic_id}")
        print("\t", top_words)

In [15]:
# Show the 10 highest-weighted terms of each fitted topic.
topic_words = get_most_common_words_for_topics(lda, vectorizer, 10)
print_topic_words(topic_words)

Topic 0
	 ['said', 'compani', 'firm', 'use', 'mr', 'peopl', 'us', 'new', 'mobil', 'technolog']
Topic 1
	 ['growth', 'economi', 'rate', 'price', 'econom', 'bank', 'profit', 'rise', 'year', 'sale']
Topic 2
	 ['mr', 'labour', 'elect', 'parti', 'said', 'blair', 'tori', 'govern', 'minist', 'would']
Topic 3
	 ['film', 'game', 'play', 'said', 'win', 'best', 'award', 'star', 'year', 'england']
Topic 4
	 ['club', 'chelsea', 'arsenal', 'liverpool', 'unit', 'leagu', 'manchest', 'manchest unit', 'striker', 'footbal']


# Testing with a new example

In [72]:
# Unseen article text used to try out the fitted vectorizer and LDA model.
new_example = """Manchester United players slumped to the turf 
at full-time in Germany on Tuesday in acknowledgement of what their 
latest pedestrian first-half display had cost them. The 3-2 loss at 
RB Leipzig means United will not be one of the 16 teams in the draw 
for the knockout stages of the Champions League. And this is not the 
only price for failure. The damage will be felt in the accounts, in 
the dealings they have with current and potentially future players 
and in the faith the fans have placed in manager Ole Gunnar Solskjaer. 
With Paul Pogba's agent angling for a move for his client and ex-United 
defender Phil Neville speaking of a "witchhunt" against his former team-mate 
Solskjaer, BBC Sport looks at the ramifications and reaction to a big loss for United."""

In [70]:
def test_new_example(lda, vect, example):
    vectorized = vect.transform([example])
    topic = lda.transform(vectorized)
    print(topic)
    return topic

In [73]:
# Score the unseen article; the function prints the result and, as the last
# expression of the cell, the returned array is echoed as well.
test_new_example(lda, vectorizer, new_example)

[[0.74556144 0.16589133 0.02962948 0.02917745 0.0297403 ]]


array([[0.74556144, 0.16589133, 0.02962948, 0.02917745, 0.0297403 ]])

# Save model

In [None]:
def save_model(lda, lda_path, vect, vect_path):
    """Pickle the LDA model and the vectorizer to two files.

    Args:
        lda: Model object to serialize.
        lda_path: Destination path for the pickled model.
        vect: Vectorizer object to serialize.
        vect_path: Destination path for the pickled vectorizer.

    Note:
        Only unpickle these files from a trusted location; loading a pickle
        can execute arbitrary code.
    """
    for obj, path in ((lda, lda_path), (vect, vect_path)):
        # The context manager flushes and closes the file even if dump fails.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)