## Load Dataset

In [1]:
import pickle

In [2]:
def load_obj(name):
    """Return the object unpickled from ``data/<name>.pkl``.

    Args:
        name: Base file name (without directory or ``.pkl`` extension).
            The path is resolved relative to the current working directory.

    Returns:
        Whatever object was stored in the pickle file.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    pickle_path = 'data/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [3]:
# Load the pickled object stored at data/council_minutes_dict.pkl.
d = load_obj("council_minutes_dict")

## Pre-processing

In [6]:
import re

In [7]:
def clean_text(text):
    """Reduce a raw document to space-separated alphabetic words.

    Args:
        text: The document as UTF-8 encoded ``bytes`` (or ``bytearray``), or
            as an already-decoded ``str``.

    Returns:
        A ``str`` of the remaining words joined by single spaces. URLs and
        e-mail addresses are dropped, hyphenated words are split, every
        character that is not a letter or a space is removed, and
        one-letter words are discarded.

    Raises:
        UnicodeDecodeError: If ``text`` is bytes that are not valid UTF-8.
    """
    # Only raw bytes need decoding; a str is used as-is.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("UTF-8")
    text = text.replace('\n', " ")
    text = text.replace('\x0c', " ")
    # Remove websites and e-mails BEFORE splitting on "-", otherwise
    # hyphenated addresses are broken apart and only partly removed.
    # Take out the websites (scheme-prefixed or starting with "www.")
    text = re.sub(r"(?:(?:https?|ftp)://|www\.)\S+", "", text, flags=re.IGNORECASE)
    # Take out the emails
    text = re.sub(r"[\w.+\-]+@[\w.\-]+", "", text)
    # Split the words with "-" (for example: pre-processing ==> pre processing)
    text = text.replace("-", " ")
    text = re.sub(r"\d+/\d+/\d+", "", text)  # Take out the dates
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # Take out the time
    # Keep only letters and spaces
    letters_only = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    # Re-join the words, dropping stand-alone letters
    return ' '.join(word for word in letters_only.split() if len(word) > 1)

In [16]:
# stop_words = ENGLISH_STOP_WORDS.union(word for word in ['docx','fyi','fw','get','see','ok','pm','whose','would','pls','thx','yes','print','okay','pis'])

In [8]:
# Run the cleaning step on the entry stored under key '10' of the loaded object.
text = clean_text(d['10'])

## Lemmatization

In [9]:
import spacy
import string
# Create our list of punctuation marks
# NOTE: this is a single string, so `x in punctuations` is a substring test
# (the empty string is therefore always considered "in" it).
punctuations = string.punctuation
# Load English tokenizer, tagger, parser, NER and word vectors
# (the `en_core_web_sm` model package must be installed for this to succeed)
parser = spacy.load('en_core_web_sm')
# Create our list of stopwords
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Lemmatize ``sentence`` and drop stop words and punctuation.

    Relies on the module-level ``parser``, ``stop_words`` and
    ``punctuations`` objects.

    Args:
        sentence: Text handed to ``parser``.

    Returns:
        A single string of the surviving lowercase lemmas joined by spaces.
    """
    doc = parser(sentence)
    kept = []
    for token in doc:
        # The "-PRON-" placeholder lemma is replaced by the token's own
        # lowercase text; every other lemma is lowercased and stripped.
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()
        # `punctuations` is a string, so the second check is a substring test.
        if normalized in stop_words or normalized in punctuations:
            continue
        kept.append(normalized)
    return ' '.join(kept)

In [10]:
# Rebind `text` to its lemmatized, stop-word-free form.
# Because the input name is overwritten, re-running this cell alone feeds it its own output.
text = spacy_tokenizer(text)

## Stemming

In [12]:
import nltk
from nltk.stem.porter import PorterStemmer

def nltk_stemmer(text):
    """Tokenize ``text`` with NLTK and Porter-stem every token.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed string per token, in token order.
    """
    stemmer = PorterStemmer()
    # Tokenize first, then reduce each token to its stem.
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [14]:
# From here on `text` is a list of stems (nltk_stemmer returns a list, not a string).
text = nltk_stemmer(text)

## TF-IDF

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

In [77]:
# text = [text]

In [16]:
# Build the TF-IDF vectorizer.
# The stop words are passed as a list: recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list, or None (a frozenset is rejected).
tfidf_vectorizer = TfidfVectorizer(stop_words=sorted(ENGLISH_STOP_WORDS))

# NOTE(review): `text` is the list of stems returned by nltk_stemmer, so every
# stem is vectorized as its own one-word document -- confirm this is intended.
word_matrix = tfidf_vectorizer.fit_transform(text)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that do not have it yet.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    vocab = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    vocab = tfidf_vectorizer.get_feature_names()

## LDA

In [19]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
# Turn on pyLDAvis's notebook display mode so prepared visualizations render inline.
pyLDAvis.enable_notebook()

In [27]:
# Fit a 4-topic LDA model on the TF-IDF matrix; random_state is pinned so re-runs are repeatable.
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tfidf.fit(word_matrix)

  and should_run_async(code)


LatentDirichletAllocation(n_components=4, random_state=0)

In [28]:
# Build the pyLDAvis view from the fitted model, the matrix it was fit on, and the vectorizer.
# NOTE(review): pyLDAvis 3.4 renamed `pyLDAvis.sklearn` to `pyLDAvis.lda_model` -- confirm against the installed version.
pyLDAvis.sklearn.prepare(lda_tfidf, word_matrix, tfidf_vectorizer)

  and should_run_async(code)
