In [1]:
import pandas as pd
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

In [2]:
print(data.head(5))

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [3]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
text['headline_text'] = text['headline_text'].apply(lambda row: row.split())
print(text.head(5))

                                       headline_text
0  [aba, decides, against, community, broadcastin...
1  [act, fire, witnesses, must, be, aware, of, de...
2  [a, g, calls, for, infrastructure, protection,...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda row: row.split())


In [6]:
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(5))

                                       headline_text
0  [aba, decide, against, community, broadcast, l...
1  [act, fire, witness, must, be, aware, of, defa...
2  [a, g, call, for, infrastructure, protection, ...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


In [7]:
tokenized_doc = text['headline_text'].apply(lambda x:[word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0    [decide, against, community, broadcast, licence]
1            [fire, witness, must, aware, defamation]
2          [call, infrastructure, protection, summit]
3                         [staff, aust, strike, rise]
4            [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [8]:
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
text['headline_text'] = detokenized_doc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc


In [17]:
text['headline_text'][:5]
text['headline_text'][0]

'decide against community broadcast licence'

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(text['headline_text'])
X.shape

(1082168, 1000)

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=777,max_iter=1)

In [13]:
lda_top=lda_model.fit_transform(X)

In [14]:
print(lda_model.components_)
print(lda_model.components_.shape)

[[1.00001532e-01 1.00001275e-01 1.00004202e-01 ... 1.00006119e-01
  1.00003104e-01 1.00002829e-01]
 [1.00001194e-01 1.13512235e+03 3.50221619e+03 ... 1.00009295e-01
  1.00001882e-01 1.00002882e-01]
 [1.00001812e-01 1.00001156e-01 1.00003586e-01 ... 1.00002692e-01
  1.00002057e-01 7.49009676e+02]
 ...
 [1.00001065e-01 1.00001698e-01 1.00003298e-01 ... 1.00006718e-01
  1.00004891e-01 1.00004771e-01]
 [1.00002401e-01 1.00000736e-01 1.00003006e-01 ... 1.00003514e-01
  1.00001425e-01 1.00005284e-01]
 [1.00003416e-01 1.00002315e-01 1.00007341e-01 ... 1.00003729e-01
  1.00001201e-01 1.00005258e-01]]
(10, 1000)


In [16]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1 : -1]])
get_topics(lda_model.components_,terms)

Topic 1: [('government', 8727.11), ('sydney', 8394.94), ('queensland', 7721.74), ('change', 5874.15), ('home', 5670.7)]
Topic 2: [('australia', 13699.12), ('australian', 11101.13), ('melbourne', 7530.16), ('world', 6707.97), ('south', 6679.8)]
Topic 3: [('death', 5937.02), ('interview', 5924.99), ('kill', 5808.97), ('jail', 4635.04), ('life', 4276.22)]
Topic 4: [('house', 6104.92), ('2016', 5488.01), ('state', 4924.26), ('brisbane', 4858.1), ('tasmania', 4611.19)]
Topic 5: [('court', 7545.41), ('attack', 6941.83), ('open', 5665.2), ('face', 5195.37), ('warn', 5091.83)]
Topic 6: [('market', 5544.78), ('rural', 5502.84), ('plan', 4814.24), ('indigenous', 4223.28), ('power', 3969.15)]
Topic 7: [('charge', 8427.66), ('election', 7565.69), ('adelaide', 6761.93), ('make', 5653.0), ('test', 5056.52)]
Topic 8: [('police', 12098.05), ('crash', 5272.65), ('drug', 4272.48), ('beat', 3256.8), ('rise', 2943.36)]
Topic 9: [('fund', 4686.61), ('labor', 4048.76), ('national', 4039.24), ('council', 400