In [19]:
import numpy as np
import pandas as pd
import sklearn

import nltk
from nltk.corpus import opinion_lexicon

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Read the bill descriptions table; the first CSV column becomes the row index.
# NOTE(review): this is an absolute, machine-specific path - adjust it to run elsewhere.
bill_info = pd.read_csv(
    "/Users/sundipta/Insight_notebooks/bill_descriptions.csv",
    index_col=0,
)

In [3]:
# Preview the last rows of the loaded table as a sanity check.
bill_info.tail()

Unnamed: 0,index,Active,Agency,Bill_ID,LongDescription,ShortDescription,ID
2059,2080,True,Senate,9336,,MONICA A. ALEXANDER,2060
2060,2081,True,Senate,9338,,GREG SZABO,2061
2061,2082,True,Senate,9339,,SUSAN BIRCH,2062
2062,2083,True,Senate,9340,,ROSS HUNTER,2063
2063,2084,True,Senate,9800,,CHRISTOPHER R. POULOS,2064


In [4]:
# Keep only the bills that have a long description (rows where it is missing are dropped).
has_long_description = bill_info['LongDescription'].notnull()
bill_info_subset = bill_info[has_long_description]

In [36]:
# Number of bills that remain after dropping rows without a long description.
len(bill_info_subset)

1946

In [5]:
# Bag-of-words vectorizer with default settings. It is fitted on the
# 'LongDescription' column in the next cell, so every word (two or more
# characters) that occurs at least once in that text ends up in the vocabulary.
vect = CountVectorizer()

In [6]:
# Learn the vocabulary from the long bill descriptions.
vect.fit(bill_info_subset['LongDescription'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [60]:
len(vect.vocabulary_)
# Number of unique words found in the 'LongDescription' text of the bills.

2696

In [11]:
# NOTE: vocabulary_ maps each word to its column index in the document-term
# matrix; the values are positions, not occurrence counts.
vocab_dist = vect.vocabulary_

In [21]:
# Quick inspection of the container type holding the vocabulary.
type(vocab_dist)

dict

In [25]:
# vocabulary_ maps word -> column index, NOT word -> frequency, so using its
# values as 'Count' would only rank words alphabetically. Compute the real
# corpus-wide frequency of every word by summing the columns of the
# document-term matrix, then look each word's total up by its column index.
term_counts = np.asarray(
    vect.transform(bill_info_subset['LongDescription']).sum(axis=0)
).ravel()
vocab_dist_df = pd.DataFrame(
    [(word, term_counts[col]) for word, col in vocab_dist.items()],
    columns=['Word', 'Count'],
)

In [32]:
# Order the words by their 'Count' value, largest first.
vocab_dist_df_sorted = vocab_dist_df.sort_values(by='Count', ascending=False)

In [61]:
# Build an inverted index: column position in X -> the word that column counts.
ivoc = {column: word for word, column in vect.vocabulary_.items()}

In [63]:
# With the vectorizer fitted, turn each description into a row of word counts.
# NOTE(review): the saved output below reports 1209 rows, while
# len(bill_info_subset) printed 1946 in an earlier cell - the cells appear to
# have been run out of order or against different data. Restart the kernel and
# run all cells top to bottom to confirm the numbers agree.
X = vect.transform(bill_info_subset['LongDescription'])
X

<1209x2696 sparse matrix of type '<class 'numpy.int64'>'
	with 13074 stored elements in Compressed Sparse Row format>

In [71]:
# Topic model with 20 topics over the bag-of-words matrix.
# `n_topics` was renamed to `n_components` (deprecated in scikit-learn 0.19,
# removed in 0.21), so the old keyword fails on current releases.
# A fixed random_state makes the fitted topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=20, random_state=0)

In [72]:
# Fit the topic model on X and keep the transformed data. Per the scikit-learn
# docs this is the document-topic matrix (one row per description, one column
# per topic).
theta = lda.fit_transform(X)



In [73]:
def show_topics(lda, ivoc, n_words=7):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    lda : object
        Fitted topic model exposing ``components_``; each row is treated as
        the per-column weights of one topic and must support ``argsort``.
    ivoc : dict
        Maps a column index to the word that column represents.
    n_words : int, default 7
        How many top words to print per topic. The default matches the value
        that used to be hard-coded.

    Returns
    -------
    None
        One line is printed per topic: the topic number followed by the list
        of its top words in descending weight order.
    """
    for k, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it before taking the leading entries.
        top_columns = topic.argsort()[::-1][:n_words]
        print(k, [ivoc[i] for i in top_columns])

In [74]:
# Print the top words of every fitted topic.
show_topics(lda,ivoc)

0 ['the', 'of', 'and', 'to', 'for', 'concerning', 'in']
1 ['assisted', 'population', 'living', 'park', 'with', 'thousand', 'located']
2 ['program', 'the', 'concerning', 'in', 'creating', 'health', 'for']
3 ['the', 'act', 'of', 'concerning', 'sexual', 'washington', 'addressing']
4 ['license', 'concerning', 'family', 'allowing', 'property', 'to', 'driver']
5 ['the', 'and', 'system', 'of', 'to', 'concerning', 'requirements']
6 ['property', 'tax', 'from', 'that', 'concerning', 'land', 'offenses']
7 ['concerning', 'fire', 'protection', 'district', 'institutions', 'on', 'and']
8 ['updating', 'youth', 'and', 'between', 'fairs', 'concerning', 'laws']
9 ['for', 'concerning', 'the', 'and', 'of', 'to', 'law']
10 ['products', 'voting', 'definition', 'day', 'religious', 'contractors', 'purposes']
11 ['housing', 'income', 'low', 'revolving', 'loan', 'finance', 'creating']
12 ['high', 'graduation', 'success', 'supporting', 'karen', 'honoring', 'fraser']
13 ['concerning', 'of', 'the', 'or', 'to', 'rec