In [1]:
import pandas as pd
import re
import spacy
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
import datetime
import gensim
from gensim import corpora, models, similarities

In [2]:
def clean_text(text):
    """Reduce raw text to space-separated alphabetic words.

    Steps, in order: newlines and form feeds become spaces; URLs are removed;
    hyphens become spaces; slash-separated dates, clock times and e-mail
    addresses are removed; every character that is neither a letter nor a
    space is dropped; finally one-letter words are discarded.

    Args:
        text (str): The raw string to clean.

    Returns:
        str: The surviving words joined by single spaces.
    """
    text = text.replace('\n', " ")
    text = text.replace('\x0c', " ")
    # Take out the websites. This runs before the hyphen split so hyphenated
    # hosts/paths are removed in one piece. The earlier pattern was written as
    # a JavaScript literal ("/.../i"); in Python that demands a literal "/" in
    # front and "/i" behind, so real URLs were never matched. Only links with
    # an explicit scheme ("http://", "ftp://", ...) or a "www." prefix are
    # removed; bare domains are left alone.
    text = re.sub(
        r"\b(?:[a-zA-Z][a-zA-Z0-9+.\-]*://|www\.)[^\s<>]+",
        "",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"-", " ", text)  # Split the words with "-" (for example: pre-processing ==> pre processing)
    text = re.sub(r"\d+/\d+/\d+", "", text)  # Take out the dates
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # Take out the time
    text = re.sub(r"[\w]+@[\.\w]+", "", text)  # Take out the emails
    # Keep only letters and spaces (str.join avoids repeated string concatenation).
    pure_text = ''.join(
        letter for letter in text if letter.isalpha() or letter == ' '
    )
    # Re-join the words, skipping stand-alone single letters.
    return ' '.join(word for word in pure_text.split() if len(word) > 1)

# ASCII punctuation characters as one string; spacy_tokenizer uses it to drop
# punctuation tokens.
punctuations = string.punctuation
# spaCy pipeline loaded from the 'en_core_web_sm' model package; spacy_tokenizer
# calls it on every input string.
parser = spacy.load('en_core_web_sm')
# spaCy's English stop-word collection; spacy_tokenizer filters tokens against it.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the module-level spaCy ``parser``.

    Every token is replaced by its lower-cased, stripped lemma; when the lemma
    is the "-PRON-" placeholder, the token's own lower-cased text is used
    instead. Entries found in ``stop_words`` or in ``punctuations`` are dropped.

    Args:
        sentence (str): Text to run through the spaCy pipeline.

    Returns:
        list of str: The remaining normalized tokens, in document order.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ != "-PRON-":
            normalized = token.lemma_.lower().strip()
        else:
            normalized = token.lower_
        # Skip stop words and anything contained in the punctuation collection.
        if normalized in stop_words or normalized in punctuations:
            continue
        kept.append(normalized)
    return kept

def n_topic_df(n):
    """Fit an ``n``-topic LDA model on ``word_matrix`` and join it to ``df_subID``.

    The model is fitted on the module-level ``word_matrix`` with a fixed
    ``random_state`` of 0. Its transformed output becomes one ``topic_<k>``
    column per topic, and a ``topic`` column names the column holding each
    row's largest value.

    Args:
        n (int): Number of LDA components (topics).

    Returns:
        pandas.DataFrame: ``df_subID`` with the topic columns appended
        column-wise.
    """
    model = LatentDirichletAllocation(n_components=n, random_state=0)
    model.fit(word_matrix)
    doc_topics = pd.DataFrame(model.transform(word_matrix)).add_prefix('topic_')
    # The right-hand side is evaluated before "topic" is added, so idxmax only
    # ever sees the topic_<k> columns.
    doc_topics["topic"] = doc_topics.idxmax(axis=1)
    return pd.concat([df_subID, doc_topics], axis=1)

def n_topic_word(n):
    """Fit an ``n``-topic LDA model on ``word_matrix`` and return its word weights.

    The fitted model's ``components_`` array is labelled with the module-level
    ``vocab`` and transposed, so each row is a vocabulary entry and each
    column is ``topic_<k>``.

    Args:
        n (int): Number of LDA components (topics).

    Returns:
        pandas.DataFrame: Word-by-topic table built from ``components_``.
    """
    model = LatentDirichletAllocation(n_components=n, random_state=0)
    model.fit(word_matrix)
    topic_by_word = pd.DataFrame(model.components_, columns=vocab)
    return topic_by_word.T.add_prefix('topic_')

In [3]:
# Load the minutes table and drop its first column.
df = pd.read_csv("data/city_SanJose_Minutes.csv")

df = df.iloc[:, 1:]

df['date'] = pd.to_datetime(df['date'])

# Sum subID per artID. The column is selected before aggregating: the earlier
# .sum('content') passed 'content' positionally into sum()'s numeric_only slot,
# where it only acted as a truthy flag.
art_df = df.groupby('artID')['subID'].sum().to_frame()

# Keep the artIDs whose subID total exceeds 3.
art_df = art_df.loc[art_df.subID>3]

# Both frames carry a subID column, so the merge produces subID_x (from df)
# and subID_y (the per-artID total from art_df).
df = df.merge(art_df, on='artID', how = 'inner')

df_subID = df[df['subID_x']!=0]

# The fresh 0..n-1 index lets n_topic_df concatenate topic columns row by row.
df_subID = df_subID.reset_index()

text = df_subID['content']

text = text.apply(clean_text)

text = text.apply(lambda x: ' '.join(spacy_tokenizer(x)))

# stop_words is passed as a list: newer scikit-learn releases validate this
# parameter and do not accept the frozenset ENGLISH_STOP_WORDS directly.
tfidf_vectorizer = TfidfVectorizer(min_df=0.0085, max_df=0.9, stop_words=list(ENGLISH_STOP_WORDS))

word_matrix = tfidf_vectorizer.fit_transform(text)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    vocab = list(tfidf_vectorizer.get_feature_names_out())
else:
    vocab = tfidf_vectorizer.get_feature_names()

all_df = n_topic_df(10)

word_topic_matrix_df = n_topic_word(10)

# NOTE(review): `text` has already been through clean_text and spacy_tokenizer
# above; both steps are applied a second time here to obtain token lists.
clean_list = [clean_text(i) for i in text]

spacy_list =[spacy_tokenizer(i) for i in clean_list]

# gensim 4 renamed Word2Vec's `size` keyword to `vector_size`; choose the
# keyword that matches the installed major version.
w2v_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
w2v = gensim.models.Word2Vec(spacy_list, window=5, min_count=1, workers=2, sg=1, **{w2v_dim_kwarg: 100})

In [36]:
# Show the raw content of row 167, the sample item parsed in the cells below.
all_df.content[167]

'6.4 20-179 Second Amendment to the Consultant Agreement with Brown and Caldwell for \nEngineering Services for the Digester and Thickener Facilities Upgrade  \n\nProject. \n\nApprove the Second Amendment to the Consultant Agreement with  \n\nBrown and Caldwell for engineering services for the Digester and  \n\nThickener Facilities Upgrade project at the San José-Santa Clara  \n\nRegional Wastewater Facility, modifying the scope of services,  \n\nextending the term of agreement from June 30, 2020 to June 30, 2022,  \n\nand increasing the amount of compensation by $2,530,734 for a total  \n\nagreement amount not to exceed $16,548,144, subject to the  \n\nappropriation of funds. \n\nCEQA:  San José-Santa Clara Regional Wastewater Facility Digester  \n\nand Thickener Facilities Upgrade Project Mitigated Negative  \n\nDeclaration, File No. PP15-055. (Environmental Services) \n\nAction:  Upon motion by Vice Mayor Chappie Jones, seconded by Councilmember Lan Diep \n\nand carried unanimously,

In [6]:
from pprint import pprint
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the pipeline through the en_core_web_sm package's own load() entry point.
nlp = en_core_web_sm.load()

In [37]:
# Parse the sample item and list every recognised entity with its label.
doc = nlp(all_df.content[167])
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('6.4', 'CARDINAL'),
 ('Second Amendment', 'LAW'),
 ('the Consultant Agreement', 'LAW'),
 ('Brown', 'PERSON'),
 ('Caldwell', 'PERSON'),
 ('Digester', 'PERSON'),
 ('Thickener Facilities', 'ORG'),
 ('the Second Amendment', 'LAW'),
 ('the Consultant Agreement', 'LAW'),
 ('Brown', 'PERSON'),
 ('Caldwell', 'PERSON'),
 ('Digester', 'PERSON'),
 ('Thickener Facilities', 'ORG'),
 ('the San José-Santa Clara', 'FAC'),
 ('Regional Wastewater Facility', 'PERSON'),
 ('June 30, 2020 to June 30, 2022', 'DATE'),
 ('2,530,734', 'MONEY'),
 ('16,548,144', 'MONEY'),
 ('San José', 'GPE'),
 ('Digester', 'PERSON'),
 ('Thickener Facilities Upgrade Project Mitigated Negative  \n\nDeclaration,',
  'ORG'),
 ('Environmental Services', 'ORG'),
 ('Chappie Jones', 'PERSON'),
 ('Councilmember Lan', 'PERSON'),
 ('the Second Amendment', 'LAW'),
 ('the Consultant Agreement', 'LAW'),
 ('Brown', 'PERSON'),
 ('Caldwell', 'PERSON'),
 ('Carrasco', 'GPE'),
 ('Davis', 'PERSON'),
 ('San José', 'GPE')]


In [38]:
# Per-token view: the token, its IOB entity tag and its entity type.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(6.4, 'B', 'CARDINAL'),
 (20, 'O', ''),
 (-, 'O', ''),
 (179, 'O', ''),
 (Second, 'B', 'LAW'),
 (Amendment, 'I', 'LAW'),
 (to, 'O', ''),
 (the, 'B', 'LAW'),
 (Consultant, 'I', 'LAW'),
 (Agreement, 'I', 'LAW'),
 (with, 'O', ''),
 (Brown, 'B', 'PERSON'),
 (and, 'O', ''),
 (Caldwell, 'B', 'PERSON'),
 (for, 'O', ''),
 (
, 'O', ''),
 (Engineering, 'O', ''),
 (Services, 'O', ''),
 (for, 'O', ''),
 (the, 'O', ''),
 (Digester, 'B', 'PERSON'),
 (and, 'O', ''),
 (Thickener, 'B', 'ORG'),
 (Facilities, 'I', 'ORG'),
 (Upgrade, 'O', ''),
 ( 

, 'O', ''),
 (Project, 'O', ''),
 (., 'O', ''),
 (

, 'O', ''),
 (Approve, 'O', ''),
 (the, 'B', 'LAW'),
 (Second, 'I', 'LAW'),
 (Amendment, 'I', 'LAW'),
 (to, 'O', ''),
 (the, 'B', 'LAW'),
 (Consultant, 'I', 'LAW'),
 (Agreement, 'I', 'LAW'),
 (with, 'O', ''),
 ( 

, 'O', ''),
 (Brown, 'B', 'PERSON'),
 (and, 'O', ''),
 (Caldwell, 'B', 'PERSON'),
 (for, 'O', ''),
 (engineering, 'O', ''),
 (services, 'O', ''),
 (for, 'O', ''),
 (the, 'O', ''),
 (Digester, 'B', '

In [39]:
# Number of entities found in the parsed sample item.
len(doc.ents)

31

In [40]:
# Tally how often each entity label occurs in the document.
labels = [ent.label_ for ent in doc.ents]
Counter(labels)

Counter({'CARDINAL': 1,
         'LAW': 6,
         'PERSON': 13,
         'ORG': 4,
         'FAC': 1,
         'DATE': 1,
         'MONEY': 2,
         'GPE': 3})

In [41]:
# The ten most frequent entity strings in the document.
items = [ent.text for ent in doc.ents]
Counter(items).most_common(10)

[('the Consultant Agreement', 3),
 ('Brown', 3),
 ('Caldwell', 3),
 ('Digester', 3),
 ('Thickener Facilities', 2),
 ('the Second Amendment', 2),
 ('San José', 2),
 ('6.4', 1),
 ('Second Amendment', 1),
 ('the San José-Santa Clara', 1)]

In [42]:
# Materialise the sentence spans and show the second one.
sentences = list(doc.sents)
print(sentences[1])

Approve the Second Amendment to the Consultant Agreement with  

Brown and Caldwell for engineering services for the Digester and  

Thickener Facilities Upgrade project at the San José-Santa Clara  

Regional Wastewater Facility, modifying the scope of services,  

extending the term of agreement from June 30, 2020 to June 30, 2022,  

and increasing the amount of compensation by $2,530,734 for a total  

agreement amount not to exceed $16,548,144, subject to the  

appropriation of funds. 




In [43]:
# Re-parse the second sentence's text on its own and draw its entities inline.
displacy.render(nlp(str(sentences[1])), jupyter=True, style='ent')

In [46]:
# Draw the dependency parse of the second sentence (re-parsed on its own).
# NOTE(review): 'distance' presumably sets the spacing between words in the
# drawing - confirm against the displaCy options documentation.
displacy.render(nlp(str(sentences[1])), style='dep', jupyter = True, options = {'distance': 100})

In [47]:
# (text, part of speech, lemma) for each token of the second sentence that is
# neither a stop word nor tagged as punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[1]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Approve', 'VERB', 'approve'),
 ('Second', 'PROPN', 'Second'),
 ('Amendment', 'PROPN', 'Amendment'),
 ('Consultant', 'PROPN', 'Consultant'),
 ('Agreement', 'PROPN', 'Agreement'),
 (' \n\n', 'SPACE', ' \n\n'),
 ('Brown', 'PROPN', 'Brown'),
 ('Caldwell', 'PROPN', 'Caldwell'),
 ('engineering', 'NOUN', 'engineering'),
 ('services', 'NOUN', 'service'),
 ('Digester', 'NOUN', 'digester'),
 (' \n\n', 'SPACE', ' \n\n'),
 ('Thickener', 'PROPN', 'Thickener'),
 ('Facilities', 'PROPN', 'Facilities'),
 ('Upgrade', 'PROPN', 'Upgrade'),
 ('project', 'NOUN', 'project'),
 ('San', 'PROPN', 'San'),
 ('José', 'PROPN', 'José'),
 ('Santa', 'PROPN', 'Santa'),
 ('Clara', 'PROPN', 'Clara'),
 (' \n\n', 'SPACE', ' \n\n'),
 ('Regional', 'PROPN', 'Regional'),
 ('Wastewater', 'PROPN', 'Wastewater'),
 ('Facility', 'PROPN', 'Facility'),
 ('modifying', 'VERB', 'modify'),
 ('scope', 'NOUN', 'scope'),
 ('services', 'NOUN', 'service'),
 (' \n\n', 'SPACE', ' \n\n'),
 ('extending', 'VERB', 'extend'),
 ('term', 'NOUN', 'te

In [49]:
# Map each entity string in the second sentence to its label (a repeated
# string keeps the label of its last occurrence).
{str(ent): ent.label_ for ent in nlp(str(sentences[1])).ents}


{'the Second Amendment': 'LAW',
 'the Consultant Agreement': 'LAW',
 'Brown': 'PERSON',
 'Caldwell': 'PERSON',
 'Digester': 'PERSON',
 'Thickener Facilities': 'ORG',
 'the San José-Santa Clara': 'FAC',
 'Regional Wastewater Facility': 'PERSON',
 'June 30, 2020 to June 30, 2022': 'DATE',
 '2,530,734': 'MONEY',
 '16,548,144': 'MONEY'}

In [50]:
# Token / IOB tag / entity type triples for the second sentence span.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[1]])

[(Approve, 'O', ''), (the, 'B', 'LAW'), (Second, 'I', 'LAW'), (Amendment, 'I', 'LAW'), (to, 'O', ''), (the, 'B', 'LAW'), (Consultant, 'I', 'LAW'), (Agreement, 'I', 'LAW'), (with, 'O', ''), ( 

, 'O', ''), (Brown, 'B', 'PERSON'), (and, 'O', ''), (Caldwell, 'B', 'PERSON'), (for, 'O', ''), (engineering, 'O', ''), (services, 'O', ''), (for, 'O', ''), (the, 'O', ''), (Digester, 'B', 'PERSON'), (and, 'O', ''), ( 

, 'O', ''), (Thickener, 'B', 'ORG'), (Facilities, 'I', 'ORG'), (Upgrade, 'O', ''), (project, 'O', ''), (at, 'O', ''), (the, 'B', 'FAC'), (San, 'I', 'FAC'), (José, 'I', 'FAC'), (-, 'I', 'FAC'), (Santa, 'I', 'FAC'), (Clara, 'I', 'FAC'), ( 

, 'O', ''), (Regional, 'B', 'PERSON'), (Wastewater, 'I', 'PERSON'), (Facility, 'I', 'PERSON'), (,, 'O', ''), (modifying, 'O', ''), (the, 'O', ''), (scope, 'O', ''), (of, 'O', ''), (services, 'O', ''), (,, 'O', ''), ( 

, 'O', ''), (extending, 'O', ''), (the, 'O', ''), (term, 'O', ''), (of, 'O', ''), (agreement, 'O', ''), (from, 'O', ''), (June, 'B

In [60]:
# Draw the entities of the whole parsed item. The document is rendered
# directly: str(sentences[:]) is the repr of a Python list, so re-parsing it
# fed the model a string wrapped in "[...]" with ", " inserted between
# sentences instead of the original text.
displacy.render(doc, jupyter=True, style='ent')

## Putting it together

In [61]:
from pprint import pprint
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the pipeline, parse the sample item, split it into sentences and draw
# its entities in one cell.
nlp = en_core_web_sm.load()
doc = nlp(all_df.content[167])
sentences = [x for x in doc.sents]
# Render the parsed document itself: str(sentences[:]) is the repr of a Python
# list (brackets plus ", " separators), not the item's text.
displacy.render(doc, jupyter=True, style='ent')