In [1]:
import pandas as pd

# Load the speech text. Ten column labels are supplied, so every row is parsed
# into ten fields; "text" is the first of them.
df_idf = pd.read_csv(
    "state-of-the-union.csv",
    low_memory=False,
    names=["text", "B", "C", "D", "E", "F", "G", "H", "I", "J"],
)


In [2]:
# Sanity-check the size of the freshly loaded frame.
print("Number of rows,columns=", df_idf.shape)

Number of rows,columns= (167814, 10)


In [3]:
# NOTE: the former `df_idf.replace('NaN', 0)` call was removed. It only matched
# cells holding the literal string 'NaN', which read_csv has already parsed
# into real missing values, so it never changed anything (the sample printed in
# the next cell still shows NaN). Missing rows are removed later with dropna();
# writing an integer 0 into the text column would also break the string
# clean-up step, which calls .lower() on every value.
print("Number of rows,columns=", df_idf.shape)

Number of rows,columns= (167814, 10)


In [4]:
 # Peek at 100 randomly chosen rows of the raw frame (all columns).
 print(df_idf.sample(n=100))

                                                     text  \
75363                 it difficult and complicated duties   
81657   confidence of the victim into an opportunity t...   
129813  agricultural crop. It will place every produce...   
69866   general an introduction of this feature of mai...   
117444                                           business   
...                                                   ...   
98033   was promptly recognized by the Government of t...   
121016                                    Nation's needs.   
110288  The necessary statistics are now being gathere...   
40425                              grades in the service.   
90309                        who commit the crime of rape   

                                              B  \
75363        and requires the exaction from the   
81657                                       NaN   
129813                                      NaN   
69866                                       NaN   
117444      

In [5]:
# Keep only the "text" column. The labels are passed via `columns=` because a
# positional axis argument (`drop(labels, 1)`) is deprecated and rejected by
# pandas 2.x.
# NOTE(review): the sample output above shows sentence fragments landing in
# column B, so this presumably throws away whatever followed the first comma
# of a line - confirm that losing that text is acceptable.
df_idf = df_idf.drop(columns=['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'])

In [6]:
# Report the column dtypes and the frame size.
print("Schema:\n\n", df_idf.dtypes)
print("Number of rows,columns=", df_idf.shape)

Schema:

 text    object
dtype: object
Number of rows,columns= (167814, 1)


In [7]:
# Ten random rows; missing text shows up as NaN.
print(df_idf.sample(n=10))

                                                     text
149792                                                NaN
115950  relationship that we have shared during these ...
70051   related to a sacred duty of the Government and...
157521                             For the last two years
41539               beyond that period? Our abundant room
52745   and of such as may hereafter arise. While by e...
114136  directed the temporary transfer of the Army Di...
10269                                                 NaN
35082   From this statement it is easy to account for ...
99614                                                 NaN


In [8]:
# Discard every row that contains a missing value (the pandas defaults, spelled out).
df_idf = df_idf.dropna(axis=0, how="any")

In [9]:
# Ten random rows after the missing-value rows were removed.
print(df_idf.sample(n=10))

                                                     text
41752   trust that in view of the great responsibility...
83216   their allotments. The effort should be steadil...
13863   themselves from the injurious effects of a sup...
90946   have the same chance to rise and develop as ot...
100917  admirals under his orders. This is not as it s...
103395  exhibition and protect foreign exhibitors agai...
20993                                            the same
102850  the conduct of his department have proven to b...
151965  never again permit wild currency swings to cri...
91506                 Republics of the southern continent


In [10]:
import re
def pre_process(text):
    """Normalise one raw text fragment before vectorisation.

    Steps, in order: lowercase the text, strip HTML comments and tags, then
    collapse every run of digits / non-word characters into a single space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Separators at the start or end are collapsed to a
        single space rather than stripped.
    """
    # lowercase
    text = text.lower()

    # remove HTML comments (the only markup the earlier pattern matched) ...
    text = re.sub(r"<!--?.*?-->", "", text)
    # ... and ordinary opening/closing tags such as <p> or </p>, which this
    # step is meant to remove but which previously survived as stray words
    text = re.sub(r"</?[a-z][^<>]*>", "", text)

    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)

    return text

In [11]:
# Ten random rows of the text column before it is cleaned.
print(df_idf.sample(n=10))

                                                     text
90395   instincts and passions in order to arouse one ...
42119   Navy in regard to the policy of fostering and ...
126179  can get a good idea of how much our country sh...
94214   power has prepared best in time of peace. The ...
64601                                      Samoan Islands
93740                                                mice
29689   agents among these tribes in all treaties to m...
20225                                     a barbarous age
44126   and industry. It would alleviate the present t...
147115  The talents of women should continue to be use...


In [12]:
# Clean every row of the text column with pre_process.
df_idf["text"] = df_idf["text"].apply(pre_process)


In [13]:
# Show ten random rows of the cleaned text, just for a quick look.
print(df_idf.sample(n=10))

                                                     text
78727   commissioners proposed that the subject of the...
101411  republics for the purpose of giving evidence o...
99713                                   britain of april 
98654   in may last the supreme court handed down deci...
10623                                         authorities
34693   france having consented to observe them for th...
101303  administration to demand for american citizens...
89807   which converted the island of cuba from a pest...
13515   in the question could be obtained to this prop...
98716   terms of the statute this is wholly untrue a r...


In [14]:
# Report the column dtypes and the frame size once more.
print("Schema:\n\n", df_idf.dtypes)
print("Number of rowss,columns=", df_idf.shape)

Schema:

 text    object
dtype: object
Number of rowss,columns= (147710, 1)


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import re
 
# Stop-word setting handed to CountVectorizer ('english' selects its built-in list).
stopwords = 'english'

# One document per row of the cleaned text column.
docs = df_idf['text'].tolist()

# Build the vocabulary and the document-term count matrix, leaving out stop
# words and any term whose document frequency is above 85%.
cv = CountVectorizer(stop_words=stopwords, max_df=0.85)
word_count_vector = cv.fit_transform(docs)
 

In [16]:
# (number of documents, vocabulary size)
print(word_count_vector.shape)

(147710, 20669)


In [18]:
# Refit the vectorizer with the vocabulary capped at 20,000 terms.
cv = CountVectorizer(stop_words=stopwords, max_df=0.85, max_features=20000)
word_count_vector = cv.fit_transform(docs)
print(word_count_vector.shape)

(147710, 20000)


In [29]:
# First ten terms as stored in the vocabulary mapping (not alphabetical).
list(cv.vocabulary_)[:10]

['protection',
 'important',
 'interests',
 'citizens',
 'engaged',
 'commerce',
 'fisheries',
 'sea',
 'vessels',
 'likewise']

In [32]:
# Peek at a slice of the vocabulary in feature-index order (for testing only).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back to the old name on older
# installs.
_feature_name_getter = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
list(_feature_name_getter())[2000:2015]

['blackburn',
 'blacken',
 'blacker',
 'blackford',
 'blackmail',
 'blackmailed',
 'blackouts',
 'blacks',
 'blacksmith',
 'blacksmiths',
 'blackwell',
 'blaine',
 'blair',
 'blake',
 'blamable']

In [33]:
#TfidfTransformer to Compute Inverse Document Frequency (IDF)

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the term-count matrix. The fit call stays last so
# the fitted transformer is echoed as the cell output.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [35]:
# Show the learned IDF weights (the printed array is abbreviated).
print(tfidf_transformer.idf_)

[12.20986576 12.20986576 12.20986576 ... 12.20986576 12.20986576
 11.29357503]


In [48]:
# Take the cleaned documents at positions 2000-9999 as the test set.
# (Original author's note: no separate sample of text from the 1900s decade
# could be found, so a slice of the same corpus is used instead.)
docs_test = df_idf['text'].iloc[2000:10000].tolist()

In [49]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, largest value first.

    Pairs are ordered by value and then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` (index, score) pairs to {feature name: score}.

    Args:
        feature_names: sequence indexed by feature id, giving each term.
        sorted_items: (feature index, score) pairs, already in the order in
            which they should be reported.
        topn: number of leading pairs to keep.

    Returns:
        A dict from feature name to its score rounded to 3 decimal places,
        in the same order as the kept pairs.
    """
    leading_pairs = sorted_items[:topn]
    return {
        feature_names[col]: round(weight, 3)
        for col, weight in leading_pairs
    }

In [54]:
# Only needs to be done once: index -> term lookup for the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back to the old name on older
# installs. list() keeps the result a plain list under either API.
feature_names = list(
    (getattr(cv, "get_feature_names_out", None) or cv.get_feature_names)()
)

# get the document that we want to extract keywords from
doc = docs_test[250]

# generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

# sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

# extract at most the top 100 terms (a short document yields fewer)
keywords = extract_topn_from_vector(feature_names, sorted_items, 100)

In [55]:
# Print every keyword next to its rounded tf-idf score.
print("\n==Keywords===\n")
for term, score in keywords.items():
    print(term, score)


==Keywords===

probable 0.456
extensive 0.427
society 0.409
human 0.375
condition 0.331
commerce 0.328
present 0.291


In [None]:
# put the common code into several methods
def get_keywords(idx, topn=10):
    """Return the top tf-idf keywords for one document of ``docs_test``.

    Relies on the notebook-level ``cv``, ``tfidf_transformer``,
    ``feature_names`` and ``docs_test`` objects built in earlier cells.

    Args:
        idx: position of the document inside ``docs_test``.
        topn: maximum number of keywords to return. Defaults to 10, the value
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        A dict mapping each keyword to its tf-idf score rounded to 3 decimal
        places, as produced by ``extract_topn_from_vector``.
    """
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs_test[idx]]))

    # sort the tf-idf vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # keep only the `topn` highest-scoring terms
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn)

    return keywords

def print_results(idx,keywords):
    # now print the results
    print("\n=====Title=====")
    print(docs_title[idx])
    print("\n=====Body=====")
    print(docs_body[idx])
    print("\n===Keywords===")