In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import requests

In [5]:
# Source document: transcript page for the "Unfulfilled Dreams" sermon.
unfulfilled_dreams = "https://kinginstitute.stanford.edu/king-papers/documents/unfulfilled-dreams"
# requests has no default timeout, so without one a stalled connection would
# hang this cell (and any Restart & Run All) indefinitely.
response = requests.get(unfulfilled_dreams, timeout=30)
# Fail here on a 4xx/5xx response instead of vectorizing an error page later.
response.raise_for_status()

In [33]:
# Raw HTML body of the fetched page.
page = response.text
# Drop the markup, split the remaining text on newlines, and skip the first
# 170 lines. NOTE(review): 170 is a hard-coded offset, presumably the page
# chrome that precedes the sermon text -- confirm against the live page.
soup = BeautifulSoup(page, "lxml").get_text().split("\n")[170:]
# `example` is the list of text lines used as the corpus (one document per line).
example = soup

In [34]:
# Bag-of-words term counts: one row per entry of `example`, one column per
# vocabulary term, with scikit-learn's built-in English stop words removed.
vectorizer = CountVectorizer(stop_words="english")
doc_word = vectorizer.fit_transform(raw_documents=example)
# (documents, vocabulary size)
doc_word.shape

(176, 557)

In [35]:
# Preview of the document-term matrix: rows are labelled with the source text
# line, columns with the vocabulary term. Only the first 10 rows are kept.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
dtm_lsa = pd.DataFrame(
    doc_word.toarray(),
    index=example,
    columns=vectorizer.get_feature_names_out(),
).head(10)
dtm_lsa

Unnamed: 0,10a,1968how,2092,2093,4146,466,650,723,78,80,...,wings,woman,wonderful,woodrow,words,wrong,yeah,years,yes,zoroastrianism
"""Unfulfilled Dreams""",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Author: King, Martin Luther, Jr.Date: March 3, 1968How do we determine conjectured information?&nbsp?Location: Atlanta, Ga.How do we determine conjectured information?&nbsp?Genre: AudioSermonTopic: Martin Luther King, Jr. - Career in MinistryMartin Luther King, Jr. - Political and Social ViewsNonviolenceDetailsI want to preach this morning from the subject: ""Unfulfilled Dreams."" ""Unfulfilled Dreams."" My text is taken from the eighth chapter of First Kings. Sometimes it’s overlooked. It is not one of the most familiar passages in the Old Testament. But I never will forget when I first came across it. It struck me as a passage having cosmic significance because it says so much in so few words about things that we all experience in life. David, as you know, was a great king. And the one thing that was foremost in David’s mind and in his heart was to build a great temple. The building of the temple was considered to be the most significant thing facing the Hebrew people, and the king was expected to bring this into being. David had the desire; he started.",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
"And then we come to that passage over in the eighth chapter of First Kings, which reads, ""And it was in the heart of David my father to build an house for the name of the Lord God of Israel. And the Lord said unto David my father, ‘Whereas it was in thine heart to build an house unto my name, thou didst well that it was within thine heart.’"" And that’s really what I want to talk about this morning: it is well that it was within thine heart. As if to say, ""David, you will not be able to finish the temple. You will not be able to build it. But I just want to bless you, because it was within thine heart. Your dream will not be fulfilled. The majestic hopes that guided your days will not be carried out in terms of an actual temple coming into being that you were able to build. But I bless you, David, because it was within thine heart. You had the desire to do it; you had the intention to do it; you tried to do it; you started to do it. And I bless you for having the desire and the intention in your heart. It is well that it was within thine heart.""",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"So many of us in life start out building temples: temples of character, temples of justice, temples of peace. And so often we don’t finish them. Because life is like Schubert’s ""Unfinished Symphony."" At so many points we start, we try, we set out to build our various temples. And I guess one of the great agonies of life is that we are constantly trying to finish that which is unfinishable. We are commanded to do that. And so we, like David, find ourselves in so many instances having to face the fact that our dreams are not fulfilled.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Now let us notice first that life is a continual story of shattered dreams. Mahatma Gandhi labored for years and years for the independence of his people. And through a powerful nonviolent revolution he was able to win that independence. For years the Indian people had been dominated politically, exploited economically, segregated and humiliated by foreign powers, and Gandhi struggled against it. He struggled to unite his own people, and nothing was greater in his mind than to have India’s one great, united country moving toward a higher destiny. This was his dream.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
"But Gandhi had to face the fact that he was assassinated and died with a broken heart, because that nation that he wanted to unite ended up being divided between India and Pakistan as a result of the conflict between the Hindus and the Moslems. Life is a long, continual story of setting out to build a great temple and not being able to finish it.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Woodrow Wilson dreamed a dream of a League of Nations, but he died before the promise was delivered.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [36]:
# Latent semantic analysis: project the count matrix onto 2 components.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make
# the components (and every table derived from them) reproducible on re-run.
lsa = TruncatedSVD(n_components=2, random_state=42)
doc_topic = lsa.fit_transform(doc_word)
# Share of variance captured by each of the two components.
lsa.explained_variance_ratio_

array([0.22339979, 0.10940063])

In [37]:
# Component-by-term loadings from the fitted SVD, rounded to 3 decimals for
# display. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is its replacement.
topic_word = pd.DataFrame(
    lsa.components_.round(3),
    index=["component_1", "component_2"],
    columns=vectorizer.get_feature_names_out(),
)
topic_word

Unnamed: 0,10a,1968how,2092,2093,4146,466,650,723,78,80,...,wings,woman,wonderful,woodrow,words,wrong,yeah,years,yes,zoroastrianism
component_1,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.008,0.016,...,0.004,0.004,0.018,0.0,0.005,0.025,0.034,0.003,0.569,0.007
component_2,0.001,0.019,0.0,0.0,0.0,0.0,0.0,0.0,-0.001,-0.002,...,-0.001,-0.0,0.011,-0.0,0.019,-0.004,-0.038,0.003,-0.138,0.001


In [38]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of each component of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D arrays
        (one weight per feature), e.g. a fitted ``TruncatedSVD`` or ``NMF``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    no_top_words : int
        Number of terms to print per topic, in descending weight order.
    topic_names : sequence of str, optional
        Labels for the topics. A topic whose label is missing (the sequence
        is too short), empty, or ``None`` is labelled with its index instead.

    Returns
    -------
    None
        The output is written to stdout.
    """
    for ix, topic in enumerate(model.components_):
        # Previously a `topic_names` shorter than the number of components
        # raised IndexError; now such topics fall back to the index label.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")

        # argsort is ascending, so walk it backwards to take the
        # `no_top_words` largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

In [41]:
# Top 10 terms per LSA component. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (available since 1.0) replaces it.
display_topics(lsa, vectorizer.get_feature_names_out(), 10)


Topic  0
yes, heart, sir, preach, god, going, morning, right, life, good

Topic  1
heart, thine, david, build, able, bless, want, temple, desire, king


In [46]:
# NOTE: this rebinds `vectorizer` from the CountVectorizer fitted above to a
# TfidfVectorizer (default settings, so no stop-word removal). Re-running an
# earlier cell that reads `vectorizer` after this one would therefore pick up
# this vocabulary instead of the count vocabulary.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(example)
# Inverse-document-frequency weight learned for each vocabulary term.
idf = vectorizer.idf_
# term -> idf weight. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is its replacement.
terms = dict(zip(vectorizer.get_feature_names_out(), idf))

In [49]:
# Terms ordered from highest to lowest idf weight. The sort is stable, so
# terms with equal weight keep their order from `terms`.
sorted_terms = dict(
    sorted(terms.items(), key=lambda pair: pair[1], reverse=True)
)
sorted_terms

{'10a': 5.483002552013883,
 '1968how': 5.483002552013883,
 '2092': 5.483002552013883,
 '2093': 5.483002552013883,
 '4146': 5.483002552013883,
 '466': 5.483002552013883,
 '650': 5.483002552013883,
 '723': 5.483002552013883,
 '78': 5.483002552013883,
 '80': 5.483002552013883,
 'abernathy': 5.483002552013883,
 'absolute': 5.483002552013883,
 'accept': 5.483002552013883,
 'access': 5.483002552013883,
 'accessibility': 5.483002552013883,
 'across': 5.483002552013883,
 'actual': 5.483002552013883,
 'ago': 5.483002552013883,
 'agree': 5.483002552013883,
 'air': 5.483002552013883,
 'alabama': 5.483002552013883,
 'alone': 5.483002552013883,
 'also': 5.483002552013883,
 'amid': 5.483002552013883,
 'anchor': 5.483002552013883,
 'anguishes': 5.483002552013883,
 'another': 5.483002552013883,
 'anything': 5.483002552013883,
 'approve': 5.483002552013883,
 'arrived': 5.483002552013883,
 'ascend': 5.483002552013883,
 'assassinated': 5.483002552013883,
 'atlanta': 5.483002552013883,
 'audiosermontopic'