In [29]:
#all imports
import json
import nltk
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans
import spacy
import re
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from gensim import corpora
from nltk.corpus import stopwords

In [30]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jayaram/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
data=pd.read_json(r'News_Category_Dataset_v2.json',lines=True)

In [6]:
environment=(data["category"]=="ENVIRONMENT")

In [37]:
dataset=data[environment]
dataset=dataset["headline"]


In [69]:

#based on part of speech, the tokenize function would lemmatize the word
spacy_obj=spacy.load('en',disable=['parser', 'ner'])
punc = {'.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',"//",".org","'s", '{', '}',"%","$","&","+","=","-","--","-PRON-","-pron-","..","...","/"}
stop_words = text.ENGLISH_STOP_WORDS.union(punc)
def tokenize(text):
    text=re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text)
    text = re.sub("\d+", "", text)
    lemmatized=spacy_obj(text.lower())
    return [token.lemma_ for token in lemmatized]
def tokenize_lsa(text,stop_words):
    text=tokenize(text)
    
    result=[]
    for w in text:
        if w not in stop_words:
            w.encode(encoding='UTF-8',errors='strict')
            result.append(w)
    return result

In [56]:
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',"//",".org","'s", '{', '}',"%","$","&","+","=","-","--","-PRON-","-pron-","..","...","/"]
stop_words = text.ENGLISH_STOP_WORDS.union(punc)
vectorizer = TfidfVectorizer(stop_words = stop_words,tokenizer=tokenize,max_features=2000)
vectorized=vectorizer.fit_transform(dataset)

  'stop_words.' % sorted(inconsistent))


In [57]:
words=vectorizer.get_feature_names()
kmeans = KMeans(n_clusters = 23, n_init = 10, n_jobs = -1)
kmeans.fit(vectorized)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=23, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [58]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : sound, metal, backyard, recording, hear, music, audio, cicada, scrap, temperature
1 : climate, change, talk, week, need, say, study, ice, action, carbon
2 : new, specie, york, farmer, discover, photo, energy, normal, animal, nyc
3 : sea, rise, level, video, study, fall, photo, underwater, deep, melting
4 : day, earth, mother, world, gift, photo, fall, green, celebrate, endanger
5 : baby, photo, animal, week, panda, picture, zoo, elephant, gorilla, giraffes
6 : animal, week, photo, picture, tiger, elephant, monkey, panda, lion, cub
7 :  , use, park, photo, national, amazing, animal, odd, weather.com, way
8 : time, celebrate, lapse, year, video, week, cool, wildlife, puppy, draw
9 : keystone, xl, pipeline, sand, tar, obama, protester, arrest, student, department
10 : extreme, weather, week, photo, hero, hill, hilarious, highlight, high, zoonose
11 : photo, animal, life, nature, thing, pollution, world, big, storm, free
12 : spill, bp, oil, trial, gulf, exxon, molasse, testify, hallib

cluster 17,cluster 3,cluster 12,cluster 21,cluster 22, cluster 20. relevant water words and some of the irrelevant words with some connection to environment.

## LSA MODEL

In [70]:
clean_data_for_lsa=[]
for sentence in dataset:
    clean_data_for_lsa+=(tokenize_lsa(sentence,stop_words))
clean_data_for_lsa[20:50]

['pacific',
 'northwest',
 'salmon',
 'trout',
 'river',
 'america',
 'exotic',
 'pet',
 'obsession',
 'control',
 'photo',
 'video',
 'time',
 'leave',
 'bad',
 'carbon',
 'boyfriend',
 'landslide',
 'oso',
 'washington',
 'blame',
 'nature',
 'act',
 'god',
 'reckless',
 'logging',
 'float',
 'farm',
 'city',
 'harvest']

In [84]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary([doc_clean])
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = dictionary.doc2bow(doc_clean)
    # generate LDA model
    return dictionary,doc_term_matrix
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, number_of_topics, id2word = dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

In [85]:
start,stop,step=2,15,1
plot_graph(clean_data_for_lsa,start,stop,step)

TypeError: zip argument #1 must support iteration