In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import string
import json
import pickle
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models, similarities, matutils
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def load_borowitz(path='final_borowitz.json'):
    """Load the scraped articles from a JSON file into four parallel lists.

    The file is parsed as a JSON sequence of records; positions 0, 1, 2 and 3
    of every record are taken as the title, date, URL and article text.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to 'final_borowitz.json' in the
        current working directory.

    Returns
    -------
    tuple of list
        (titles, dates, urls, article_text); the lists are index-aligned.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'r') as fp:
        records = json.load(fp)

    titles = [record[0] for record in records]
    dates = [record[1] for record in records]
    urls = [record[2] for record in records]
    article_text = [record[3] for record in records]
    return titles, dates, urls, article_text

In [4]:
def get_onion_text(page_url):
    """Fetch a page and return the normalised text of its class-less <p> tags.

    Parameters
    ----------
    page_url : str
        URL passed to urlopen.

    Returns
    -------
    str
        The stripped text of every <p> element without a class attribute,
        each followed by a single space, lower-cased, with ASCII punctuation
        removed and typographic quotes/dashes/ellipses replaced or removed.
    """
    # Parse inside the with-block so the HTTP response is always closed.
    with urlopen(page_url) as page:
        soup = BeautifulSoup(page, "html.parser")

    paragraphs = soup.find_all("p", attrs={"class": None})
    article_text = "".join(p.text.strip() + ' ' for p in paragraphs)

    # Remove ASCII punctuation and lower-case character by character.
    # ASCII quotes, apostrophes and hyphens are deleted by this step
    # ("well-known" -> "wellknown"), so only non-ASCII marks remain below.
    # NOTE(review): confirm hyphenated words should be fused rather than split.
    article_text = "".join(ch.lower() for ch in article_text
                           if ch not in string.punctuation)

    # Curly double quotes, em dashes and ellipses become spaces;
    # the curly apostrophe is dropped so contractions stay one token.
    for mark, replacement in (('“', ' '), ('”', ' '), ('’', ''),
                              ('—', ' '), ('…', ' ')):
        article_text = article_text.replace(mark, replacement)
    return article_text

In [5]:
def get_nytimes_text(page_url):
    """Fetch a page and return the normalised text of its story-body <div>s.

    Parameters
    ----------
    page_url : str
        URL passed to urlopen.

    Returns
    -------
    str
        The stripped text of every matching <div>, each followed by a single
        space, lower-cased, with ASCII punctuation removed and typographic
        quotes/dashes/ellipses replaced or removed.
    """
    # Parse inside the with-block so the HTTP response is always closed.
    with urlopen(page_url) as page:
        soup = BeautifulSoup(page, "html.parser")

    # The class string is tied to the site's markup (Updated August 9, 2018).
    sections = soup.find_all("div", attrs={"class": "css-18sbwfn StoryBodyCompanionColumn"})
    article_text = "".join(section.text.strip() + ' ' for section in sections)

    # Remove ASCII punctuation and lower-case character by character.
    # ASCII quotes, apostrophes and hyphens are deleted by this step
    # ("well-known" -> "wellknown"), so only non-ASCII marks remain below.
    # NOTE(review): confirm hyphenated words should be fused rather than split.
    article_text = "".join(ch.lower() for ch in article_text
                           if ch not in string.punctuation)

    # Curly double quotes, em dashes and ellipses become spaces;
    # the curly apostrophe is dropped so contractions stay one token.
    for mark, replacement in (('“', ' '), ('”', ' '), ('’', ''),
                              ('—', ' '), ('…', ' ')):
        article_text = article_text.replace(mark, replacement)
    return article_text

In [33]:
def find_similar_borowitz(new_text):
    """Return the title and URL of the corpus article most similar to `new_text`.

    A TF-IDF model (word 1- to 4-grams, notebook-level `stop_words`) is fitted
    on the notebook-level `article_text` plus `new_text`, and the new text is
    compared with every corpus article by cosine similarity.

    Parameters
    ----------
    new_text : str
        Text to match against the corpus.

    Returns
    -------
    tuple
        (titles[i], urls[i]) for the best-scoring corpus index i.

    Raises
    ------
    ValueError
        If the corpus is empty, so there is nothing to compare against.
    """
    if not article_text:
        raise ValueError("article_text is empty; nothing to compare against")

    full_text = article_text + [new_text]
    vectorizer = TfidfVectorizer(analyzer='word',
                                 ngram_range=(1, 4), stop_words=stop_words,
                                 min_df=1)
    tfidf_matrix = vectorizer.fit_transform(full_text)

    # Only the new text's row is needed, so compare that single row with the
    # corpus rows instead of building the full n x n similarity matrix.
    # The new text itself (last row) is excluded: it would always score 1.
    scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
    best = int(np.argmax(scores))
    return titles[best], urls[best]

In [3]:
# Load the corpus once; the four lists are index-aligned and are read as
# notebook-level globals by the functions and cells in this notebook.
titles, dates, urls, article_text = load_borowitz()

In [4]:
# Stop words passed as `stop_words=` to the vectorizers in this notebook:
# common English function words and contraction fragments, plus a few extra
# terms at the end ('said', 'mr', 'obama', 'would', 'president').
stop_words = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself',
    'she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this',
    'that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did',
    'doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against',
    'between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under',
    'again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most',
    'other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don',
    'should','now','d','ll','m','o','re','ve','y','ain','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn',
    'mustn','needn','shan','shouldn','wasn','weren','won','wouldn','said','mr', 'obama', 'would', 'president']

In [5]:
# Number of LDA topics; used when fitting the model and when collecting the
# top terms of each topic.
number_topics = 20

In [6]:
# TF-IDF over word 1- to 4-grams; terms appearing in more than 90% of the
# articles are ignored (max_df=0.9).
tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 4), stop_words=stop_words,
                                  min_df=1, max_df=0.9)
# fit_transform is equivalent to fit() followed by transform() on the same
# data. The result is transposed because Sparse2Corpus reads one document per
# column by default.
doc_vecs = tfidf_vectorizer.fit_transform(article_text).transpose()
corpus = matutils.Sparse2Corpus(doc_vecs)
# gensim needs the inverse of sklearn's term -> column-index vocabulary.
id2word = {index: term for term, index in tfidf_vectorizer.vocabulary_.items()}
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across kernel restarts.
lda = models.LdaModel(corpus, id2word=id2word, num_topics=number_topics, passes=10,
                      alpha='auto', eta='auto', random_state=1)

In [7]:
# Apply the fitted LDA model to every corpus document and materialise the
# lazily transformed corpus as a list.
lda_corpus = lda[corpus]
lda_docs = list(lda_corpus)
# Pair each title with the LDA output of the document at the same index.
doc_topics = [[titles[idx], lda_docs[idx]] for idx, _ in enumerate(article_text)]
#doc_topics

In [8]:
# For every topic index, take the 10 entries returned by show_topic and order
# them by their second element, descending (presumably (term, weight) pairs
# -- TODO confirm against the gensim version in use).
topics = [sorted(lda.show_topic(i, topn=10), key=lambda x: x[1], reverse=True) [:10] for i in range(number_topics)]

In [9]:
# Persist the per-document LDA output and the per-topic top terms.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('final_stuff.pickle', 'wb') as handle:
    pickle.dump([lda_docs, topics], handle)

In [99]:
# Read back the two objects stored in final_stuff.pickle.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open("final_stuff.pickle", 'rb') as file:
    b, c = pickle.load(file)

In [471]:
#lda.print_topics(num_words=20, num_topics=12)

In [639]:
# First element of the entry in lda_docs[0] whose second element is largest;
# used below as an index into `topics` (assumes entries are
# (topic_id, weight) pairs -- TODO confirm).
m = max(lda_docs[0], key=lambda x: x[1])[0]

In [641]:
# First element of every entry in the selected topic's top-term list.
important = [entry[0] for entry in topics[m]]
#[topics[m][i][0] for i in len(topics[m])]

In [642]:
# Join however many terms the topic has; a fixed run of ten %s placeholders
# raises TypeError whenever the list is not exactly ten items long.
result = 'Most important LDA topic composed of: ' + ', '.join(str(term) for term in important)

In [643]:
# Show the summary string for the selected topic.
print(result)

Most important LDA topic composed of: kochs, apple, shiites, customers, sunnis, sunnis shiites, format, cook, new format, cheney


In [629]:
# Scratch example: %-formatting a template with a list converted to a tuple.
# Note this rebinds the notebook-level name `x`.
s = ['language', 'Python', 'rocks']
some_text = "There is a %s called %s which %s."
x = some_text % tuple(s)

In [630]:
# Show the formatted scratch string.
print(x)

There is a language called Python which rocks.


In [48]:
# Same per-document topic extraction as for `lda`, but for a second model.
# NOTE(review): `lda2` and `corpus2` are not defined anywhere in the visible
# notebook -- this cell depends on kernel state from a cell that is not
# shown; confirm where they come from or remove the cell.
lda_corpus2 = lda2[corpus2]
lda_docs2 = [doc for doc in lda_corpus2]
doc_topics2 = [[titles[i],lda_docs2[i]] for i in range(len(article_text))]
#doc_topics2

In [223]:
#lda.print_topics(num_words=20, num_topics=12)

In [364]:
# Sample input text (an election-night news excerpt) bound to `news_text`;
# a later cell rebinds the same name to a different sample.
news_text = 'dispatched the following telegram to Dewey: “I thank you for your statement which I have heard over the air a few minutes ago.”  Earlier on Election Day, Dewey told his staff that "whatever the result, I think we have made a mighty contribution toward the unity of our country, toward the war effort and the peace to come.”  At the time, the Democratic national headquarters was in the Biltmore Hotel, on New York’s Madison Avenue.  This time, Donald J. Trump, the Republican nominee, will be watching from the New York Hilton Midtown, a few blocks from his Trump Tower home. Hillary Clinton, the Democratic nominee, will be at the Jacob K. Javits Convention Center on the West Side of Manhattan.'

In [107]:
# count_vectorizer = CountVectorizer(analyzer='word',
#                                   ngram_range=(1, 4), stop_words=stop_words,
#                                   min_df=1)

In [113]:
# Separate TF-IDF vectorizer for the NMF experiment: word 1- to 4-grams,
# ignoring terms that occur in more than 95% of the documents.
tfidf_vectorizer_nmf = TfidfVectorizer(max_df=0.95, min_df=1,
                                   stop_words=stop_words, ngram_range=(1, 4))

In [114]:
# Fit the NMF vectorizer on the corpus and keep the resulting matrix.
tfidf_nmf = tfidf_vectorizer_nmf.fit_transform(article_text)

In [118]:
# Fit a 12-component NMF on the TF-IDF matrix with a fixed seed.
# NOTE(review): the `alpha` keyword of NMF is version-dependent in
# scikit-learn -- confirm it is accepted by the installed release.
nmf = NMF(n_components=12, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf_nmf)

In [119]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every model component.

    For each row of `model.components_` a "Topic #<index>:" header is printed,
    followed by one line with the `n_top_words` feature names whose weights
    are largest, in descending order of weight. A single blank line is
    printed after the last topic.

    Parameters
    ----------
    model : object exposing `components_`
        Each row must support `argsort()`.
    feature_names : sequence of str
        Names indexed by feature position.
    n_top_words : int
        How many names to print per component.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #{:d}:".format(topic_idx))
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        names = [feature_names[position] for position in strongest]
        print(" ".join(names))
    print()

In [17]:
# Feature names of the NMF vectorizer, for use with print_top_words.
# NOTE(review): this cell needs `tfidf_vectorizer_nmf` to exist in the kernel
# (the recorded output is a NameError from running it before the defining
# cell); `get_feature_names` availability also depends on the scikit-learn
# version -- confirm.
tfidf_feature_names = tfidf_vectorizer_nmf.get_feature_names()
#print_top_words(nmf, tfidf_feature_names, 20)

NameError: name 'tfidf_vectorizer_nmf' is not defined

In [123]:
# Bag-of-words counts using scikit-learn's built-in English stop words (not
# the notebook's custom `stop_words` list), reduced to 2 NMF components.
# No random_state is set on this NMF instance.
vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
dtm = vectorizer.fit_transform(article_text)
nmf_model = NMF(2)

dtm_nmf = nmf_model.fit_transform(dtm)
# Normalise the reduced rows in place (copy=False).
dtm_nmf = Normalizer(copy=False).fit_transform(dtm_nmf)

In [136]:
# tfidf_vectorizer = TfidfVectorizer(analyzer='word',
#                                   ngram_range=(1, 4), stop_words=stop_words,
#                                   min_df=1)

# tfidf_vectorizer.fit(article_text)
# doc_vecs = tfidf_vectorizer.transform(article_text).transpose()
# corpus = matutils.Sparse2Corpus(doc_vecs)

# id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.iteritems())

In [137]:
#lda = models.LdaModel(corpus, id2word=id2word, num_topics=12, passes=10, alpha='auto', eta='auto')

In [264]:
# transform() expects an iterable of documents, so the single text is wrapped
# in a list (a bare string is not a valid collection of documents).
# `news_text` is already a str, so no decode step applies.
new = tfidf_vectorizer.transform([news_text])

In [265]:
# Transposed for Sparse2Corpus, mirroring how the training corpus was built.
doc_vecs_new = new.transpose()
corpus_new = matutils.Sparse2Corpus(doc_vecs_new)

# Column-index -> term mapping; dict.items() is used because
# dict.iteritems() does not exist in Python 3.
id2word_new = {index: term for term, index in tfidf_vectorizer.vocabulary_.items()}

In [266]:
# Apply the already-fitted LDA model to the new text's corpus.
new_lda = lda[corpus_new]

In [267]:
# Materialise the transformed corpus as a list, one entry per document.
new_lda_docs = [doc for doc in new_lda]

In [255]:
#lda.update(corpus_new)

In [16]:
# for doc in lda[corpus]:
#     print(doc)   

In [430]:
def which_newspaper(text):
    """Resolve `text` to article text, scraping it when it is a known URL.

    Parameters
    ----------
    text : str
        Either raw article text or a URL starting with
        'http(s)://www.nytimes' or 'http(s)://www.theonion'.

    Returns
    -------
    str
        The result of get_nytimes_text / get_onion_text for a recognised URL,
        otherwise `text` unchanged.
    """
    # Accept both schemes: an https:// link would otherwise fall through and
    # the URL string itself would be treated as the article text.
    if text.startswith(('http://www.nytimes', 'https://www.nytimes')):
        return get_nytimes_text(text)
    if text.startswith(('http://www.theonion', 'https://www.theonion')):
        return get_onion_text(text)
    return text

In [14]:
#print(M)

In [441]:
# Row 694 of M without its own diagonal entry, sorted in descending order.
# NOTE(review): in the visible notebook `M` is only a local variable of
# find_similar_borowitz, and 694 is a hard-coded row index -- this cell
# depends on kernel state that is not shown; confirm.
sorted_M = sorted(M[694][0:694], reverse=True)

In [13]:
#print(sorted_M)

In [456]:
# Last row of M without (last_row) and with (blah) its final self-similarity
# entry, plus a descending-sorted copy of the former.
# NOTE(review): `M` and `n` are only locals of find_similar_borowitz in the
# visible notebook -- this cell depends on kernel state that is not shown.
last_row = M[n-1][0:n-1]
blah = M[n-1][0:n]
sorted_last_row = sorted(last_row, reverse=True)
#i, j = last_row.index(sorted_last_row[0]), last_row.index(sorted_last_row[1])

In [450]:
# Inspect the container type produced by sorted().
type(sorted_last_row)

list

In [467]:
# Index of the first entry equal to the second-largest value of last_row.
# Note this rebinds the notebook-level name `x`.
x = np.where(last_row==sorted_last_row[1])[0][0]

In [462]:
# Positions in the full last row whose value equals exactly 1.0; the result
# is the tuple returned by np.where, not a single index.
x = np.where(blah==float(1))

In [468]:
# Look up the similarity value(s) at the position(s) currently held in `x`.
last_row[x]

0.030100124524445795

In [515]:
from sklearn.decomposition import TruncatedSVD

In [528]:
# Rebind `news_text` to a second sample (a satirical news item) used as the
# new document in the cells that follow.
news_text = 'Cringing at the mere thought of the ceremonial rite she would have to perform, Queen Elizabeth II told reporters Thursday she hopes to die before having to knight any DJs. “God willing, I’ll pass away long before I’m ever called upon to bestow an honorary knighthood on Calvin Harris or Grooverider,” said the queen, adding that she would rather be entombed in the royal burial grounds than endure a ceremony in which she grants the highest honor in the British Empire to any club DJ in recognition of their contributions to dubstep, electro house, big beat, trip-hop, dance pop, or nu-funk. “It’s only a matter of time before the requests to knight all these trance and rave DJs start pouring in. I just pray I’m a goner and worms are eating away at my decaying corpse, because there’s simply no way I’m saying ‘I dub thee Sir Jackmaster.’” The queen went on to confirm that the complete collapse of the British monarchy was far more preferable than any member of the British Royal Family having to knight Fatboy Slim.'

In [12]:
# Corpus plus the new text as the final document. `full_text` is the name
# the vectorizer call and the later similarity cells read; `full_text2` is
# kept as an alias so both names are bound at notebook level.
full_text = article_text + [news_text]
full_text2 = full_text
vectorizer = CountVectorizer(min_df = 1, stop_words = stop_words)
dtm = vectorizer.fit_transform(full_text)
# LSA: project the count matrix onto 100 SVD components, then normalise each
# row in place (copy=False).
lsa = TruncatedSVD(100, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
#pd.DataFrame(dtm_lsa.round(5), index = range(len(full_text)), columns = ["component_"+str(i) for i in range(1,101)])

NameError: name 'news_text' is not defined

In [11]:
#[[i, dtm_lsa[i][0].round(5), dtm_lsa[i][1].round(5), dtm_lsa[i][2].round(5), dtm_lsa[i][3].round(5), dtm_lsa[i][4].round(5), dtm_lsa[i][5].round(5), dtm_lsa[i][6].round(5), dtm_lsa[i][7].round(5), dtm_lsa[i][8].round(5), dtm_lsa[i][9].round(5)] for i in range(len(full_text))]

In [545]:
# Pairwise dot products between the rows of dtm_lsa, as a plain ndarray.
lsa_rows = np.atleast_2d(np.asarray(dtm_lsa))
similarity = np.dot(lsa_rows, lsa_rows.T)
# Display the rows from position 690 onwards, rounded to six decimals.
doc_index = range(len(full_text))
pd.DataFrame(similarity.round(6), index=doc_index, columns=doc_index)[690:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,685,686,687,688,689,690,691,692,693,694
690,0.054407,0.05658,0.127381,0.178711,0.079304,0.080273,0.044679,0.084182,0.044891,0.172808,...,0.013617,0.094923,0.029002,0.038443,0.206758,1.0,0.029182,0.110324,0.065013,0.473127
691,0.029434,0.196643,0.044026,0.126383,0.232523,0.143647,0.120874,0.178603,0.087766,0.175625,...,0.045086,0.025184,0.043477,0.067761,0.123255,0.029182,1.0,0.128478,0.217874,0.126703
692,0.262827,0.085699,0.083664,0.152583,0.367697,0.128876,0.194694,0.216182,0.250153,0.178353,...,0.090397,0.096538,0.202078,0.011268,0.116986,0.110324,0.128478,1.0,0.282627,0.016985
693,0.196692,0.125427,0.024427,0.082065,0.086871,0.094853,0.0463,0.125554,0.051417,0.081392,...,0.07807,0.032756,0.046685,0.107938,0.018203,0.065013,0.217874,0.282627,1.0,0.026129
694,0.073635,0.120988,0.137149,0.16728,0.319255,0.073393,0.200284,0.084309,0.194826,0.381593,...,0.1102,0.174477,0.051854,0.255068,0.061701,0.473127,0.126703,0.016985,0.026129,1.0


In [546]:
# Similarities between the last document and every other one; the final
# entry (the document compared with itself) is excluded. The size is taken
# from the matrix rather than hard-coded.
n_docs = len(similarity)
last_row = similarity[n_docs - 1][0:n_docs - 1]
sorted_last_row = sorted(last_row, reverse=True)
# Rank positions directly (stable sort) instead of searching for the sorted
# values: a value search returns the same index twice when the two best
# scores are equal.
ranking = np.argsort(-np.asarray(last_row), kind='mergesort')
i = ranking[0]
j = ranking[1]

In [547]:
# Show the indices of the two best matches.
print(i,j)

(64, 392)


In [548]:
# Similarity between row 694 and row 64 (hard-coded indices).
similarity[694][64]

0.60525607315580709

In [549]:
# Title of the article at index 64.
titles[64]

u'Poll: Unconscious Clinton More Fit to Be President Than Conscious Trump'

In [550]:
# Title of the article at index 392.
titles[392]

u'Trump: Obama Vacationing Instead of Running ISIS'

In [7]:
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score

In [19]:
# KMeans with 10 clusters, random initialisation, a fixed seed, a single
# initialisation (n_init=1) and a single iteration (max_iter=1).
km = KMeans(n_clusters=10, init='random', random_state=1, n_init=1, max_iter=1)

In [20]:
# NOTE(review): `doc_vecs` is assigned as the transpose of the TF-IDF matrix,
# so each row passed to KMeans here is a term, not an article -- confirm that
# clustering terms is intended.
km.fit(doc_vecs)

KMeans(copy_x=True, init='random', max_iter=1, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

In [21]:
# Cluster index assigned to every row of doc_vecs by the fitted model.
y = km.predict(doc_vecs)

In [None]:
# Refit KMeans with k_num (= 10) clusters and keep the fitted labels.
# NOTE(review): `doc_vecs` is assigned as the transpose of the TF-IDF matrix,
# so the rows clustered here are terms, not articles -- confirm intent.
k_num = 10
km = KMeans(n_clusters=k_num, random_state=1)
km.fit(doc_vecs)
labels = km.labels_

# Start by calculating the mean of each cluster;
# the result can be checked against the fitted centres below.
# NOTE: this is not the last expression of its cell, so it is not displayed.
km.cluster_centers_

def get_cluster_centers(X, labels, k_num):
    """Return the per-cluster mean of the rows of `X`.

    Parameters
    ----------
    X : array-like indexable by a boolean row mask
        One sample per row.
    labels : array
        Cluster index of every row; compared element-wise with each cluster
        number.
    k_num : int
        Number of clusters; means are computed for labels 0 .. k_num - 1.

    Returns
    -------
    list
        Entry k is the column-wise mean of the rows labelled k.
    """
    return [np.mean(X[labels == cluster], axis=0) for cluster in range(k_num)]

# For each cluster, subtract the cluster mean from each data point to get the
# error, then square the magnitude of each error and sum.
def get_SSE(X, labels):
    """Return the total within-cluster sum of squared errors.

    For every distinct label, the mean of the rows carrying that label is
    subtracted from those rows, the differences are squared element-wise and
    summed; the per-cluster sums are then added together.

    Parameters
    ----------
    X : 2-D array-like indexable by a boolean row mask
        One sample per row. Plain ndarrays, numpy.matrix objects and objects
        whose row selection exposes `toarray()` (e.g. scipy sparse matrices)
        are accepted.
    labels : array-like
        Cluster label of every row of `X`.

    Returns
    -------
    float
        Sum over clusters of the squared distances to the cluster mean
        (0.0 when there are no labels).
    """
    labels = np.asarray(labels)
    total = 0.0
    # Iterate over the labels actually present, so gaps in the numbering
    # (e.g. labels {0, 2}) neither skip a cluster nor average an empty one.
    for label in np.unique(labels):
        members = X[labels == label]
        # Densify one cluster at a time. On sparse or numpy.matrix data
        # `** 2` is a matrix power, not an element-wise square, so the
        # arithmetic below must run on a plain ndarray.
        if hasattr(members, 'toarray'):
            members = members.toarray()
        members = np.asarray(members, dtype=float)
        errors = members - members.mean(axis=0)
        total += np.sum(errors ** 2)
    return total

In [None]:
# Sweep over candidate cluster counts: fit and label the data for each k, then
# record the silhouette coefficient and the sum of squared errors.
# range(10, 11) yields only k = 10, so a single setting is evaluated.
# NOTE(review): `doc_vecs` is assigned as the transpose of the TF-IDF matrix,
# so the rows clustered and scored here are terms, not articles -- confirm.

SSEs = []
Sil_coefs = []
for k in range(10,11):
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(doc_vecs)
    labels = km.labels_
    Sil_coefs.append(metrics.silhouette_score(doc_vecs, labels, metric='euclidean'))
    SSEs.append(get_SSE(doc_vecs, labels))