# Library Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

# Load feeds into DF

In [2]:
import json
google_json=open("/Github/google_deduplicated.json").readlines()

In [3]:
feed_text = []

for feed in google_json:
    a = json.loads(feed)
    feed_text.append(a['text'])

In [4]:
print("Total number of text: " + str(len(feed_text)))

Total number of text: 18116


# Word Tokenizer

In [6]:
def tokenize_stories(text):
    tokens = nltk.word_tokenize(text)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

# Training LDA Model

In [None]:
# Through multiple testings, the best results for topic modeling are the below parameters
#max_df = 0.15
#min_df = 0.01
#max_features = 1000
#max_iter = 500

In [7]:
def test_lda_model(tf, tf_vectorizer, num_topics, max_iter, n_top_words):
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=max_iter, learning_method='batch', learning_offset=10, random_state=1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()

    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]

    return topics

In [8]:
tf_vectorizer = CountVectorizer(max_df=0.15, min_df=0.01, max_features=1000, tokenizer=tokenize_stories, ngram_range=(1, 1))
tf = tf_vectorizer.fit_transform(feed_text)

In [9]:
lda = LatentDirichletAllocation(n_components=8, max_iter=500, learning_method='batch', learning_offset=10, random_state=1)
lda_model = lda.fit(tf)

In [10]:
topics = test_lda_model(tf, tf_vectorizer, 8, 500, 10)
print(topics)

{0: ['cloud', 'technology', 'team', 'digital', 'health', 'network', 'platform', 'design', 'tool', 'develop'], 1: ['page', 'https', 'website', 'web', 'site', 'chrome', 'browser', 'link', 'file', 'user'], 2: ['trump', 'podcast', 'president', 'tech', 'law', 'privacy', 'government', 'order', 'tweet', 'claim'], 3: ['police', 'black', 'coronavirus', 'city', 'health', 'case', 'officer', 'floyd', 'protest', 'pm'], 4: ['million', 'india', 'per', 'pay', 'digital', 'increase', 'billion', 'stock', 'businesses', 'revenue'], 5: ['game', 'good', 'really', 'lot', 'school', 'nt', 'things', 'students', 'something', 'every'], 6: ['android', 'phone', 'apps', 'apple', 'de', 'game', 'device', 'pixel', 'store', 'devices'], 7: ['log', 'smart', 'tv', 'amazon', 'music', 'voice', 'stream', 'assistant', 'youtube', 'never']}


# LDA on 10 random articles

In [11]:
import random
sample = random.sample(range(1, len(feed_text)), 10)

random_10_text = [feed_text[i] for i in sample]

In [12]:
lda_results = lda.fit_transform(tf)
sample_text_results = lda_results[sample,]

In [18]:
import pandas as pd
df = pd.DataFrame(sample_text_results, index=sample)
df

Unnamed: 0,0,1,2,3,4,5,6,7
3387,0.001625,0.001625,0.001626,0.001626,0.040522,0.001626,0.949724,0.001626
7547,0.00272,0.002722,0.002723,0.520908,0.00272,0.407885,0.002725,0.057595
16954,0.005012,0.005002,0.005006,0.225241,0.005014,0.610612,0.139092,0.00502
15396,0.3817,0.136717,0.001509,0.001507,0.001507,0.201019,0.274533,0.001508
4546,0.20827,0.000513,0.196529,0.000513,0.254618,0.058234,0.28081,0.000513
10721,0.00115,0.001148,0.200276,0.588801,0.14551,0.060818,0.001148,0.001148
6700,0.256704,0.000921,0.033379,0.000921,0.492184,0.188476,0.00092,0.026496
12455,0.013889,0.013892,0.013891,0.013889,0.013889,0.013891,0.013889,0.90277
13220,0.62143,0.345943,0.005442,0.005436,0.005438,0.005437,0.005438,0.005436
15689,0.008415,5.7e-05,0.458526,0.076257,0.066695,0.389936,5.7e-05,5.7e-05


In [19]:
for x in range(len(sample)):
    print("For index " + str(sample[x])+
         ", the max value comes from topics " + str(int(df.iloc[[x]].idxmax(1)))+
         ", with a max value of " + str(round(float(max(sample_text_results[x])),4)))

For index 3387, the max value comes from topics 6, with a max value of 0.9497
For index 7547, the max value comes from topics 3, with a max value of 0.5209
For index 16954, the max value comes from topics 5, with a max value of 0.6106
For index 15396, the max value comes from topics 0, with a max value of 0.3817
For index 4546, the max value comes from topics 6, with a max value of 0.2808
For index 10721, the max value comes from topics 3, with a max value of 0.5888
For index 6700, the max value comes from topics 4, with a max value of 0.4922
For index 12455, the max value comes from topics 7, with a max value of 0.9028
For index 13220, the max value comes from topics 0, with a max value of 0.6214
For index 15689, the max value comes from topics 2, with a max value of 0.4585
