In [3]:
import json
import pandas as pd
from nltk.corpus import stopwords
import nltk
import re
import time
import datetime
from gensim import corpora, models, similarities, matutils
import os
import operator
import pprint
from sklearn.decomposition import RandomizedPCA, PCA
from scipy import io

In [4]:
comments = pd.read_json('./data/comments_small.json')
comments

Unnamed: 0,author_id,author_name,content,date,house
0,/people/mr-charles-grant-1,,"begged to be indulged, while he stated in just...","[1803, nov, 25]",commons
1,/people/baron-charles-abbot,,observed that there were two distinct question...,"[1803, nov, 25]",commons
2,/people/mr-george-tierney,,did not think the case of Mr. Grey applicable ...,"[1803, nov, 25]",commons
3,/people/mr-charles-bragge,,agreed with his hon. friend (Mr. Tierney) that...,"[1803, nov, 25]",commons
4,/people/baron-charles-abbot,,"acquainted the House that the House h3d, in ob...","[1803, nov, 22]",commons
5,/people/mr-john-burland,,"&#x2014;Sir, In rising to second the motion of...","[1803, nov, 22]",commons
6,/people/mr-charles-fox-1,,"&#x2014;Sir, I do not rise with the intention ...","[1803, nov, 22]",commons
7,/people/sir-francis-burdett,,"&#x2014;I do not rise, Sir, for the purpose of...","[1803, nov, 22]",commons
8,/people/mr-robert-jenkinson,,moved that Lord Walsingham be appointed chairm...,"[1803, nov, 22]",lords
9,/people/mr-robert-jenkinson,,"replied, that he was not then prepared to stat...","[1803, dec, 12]",lords


In [5]:
def clean(text):
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))

     # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]

    # 6. Join the words back into one string separated by space,
    # and return the result.
    return( " ".join( meaningful_words ))

authors = comments['author_id'].unique()
print(len(authors), "speakers in total")
print()
comments_by_author = []
for author in authors:
    comments_by_author.append(clean(' '.join(comments[comments['author_id']==author]['content'])))
    
corpus = ' '.join(comments_by_author).split(' ')
# Remove words which orrur less thatn twice in the corpus. As these are not duplicated between
# documents they do not contribute to the differences between documents.
# see https://radimrehurek.com/gensim/tut1.html
import collections
counter=collections.Counter(corpus)
corpus_words_non_singleton = [word for word, count in counter.items() if count > 1]

documents = []

print("each document represents one speaker")
print()

for i, doc in enumerate(comments_by_author[:3]):
    start_time = time.time()
    print("processing document %s" % (i + 1), "of %s" % len(comments_by_author))
    doc = doc.split(' ')
    doc = [word for word in doc if word in corpus_words_non_singleton]
    documents.append(doc)
    if start_time is not None:
        print("took %s" % (time.time() - start_time))
        
comments_by_author =  pd.DataFrame(comments_by_author, index=authors , columns=['comments'])
comments_by_author

170 speakers in total

each document represents one speaker

processing document 1 of 170
took 0.011493921279907227
processing document 2 of 170
took 0.24931097030639648
processing document 3 of 170
took 0.6408729553222656


Unnamed: 0,comments
/people/mr-charles-grant-1,begged indulged stated justification hon membe...
/people/baron-charles-abbot,observed two distinct questions house consider...
/people/mr-george-tierney,think case mr grey applicable present instance...
/people/mr-charles-bragge,agreed hon friend mr tierney proceeding appear...
/people/mr-john-burland,x sir rising second motion honourable friend f...
/people/mr-charles-fox-1,x sir rise intention objecting particularly pr...
/people/sir-francis-burdett,x rise sir purpose disturbing unanimity house ...
/people/mr-robert-jenkinson,moved lord walsingham appointed chairman commi...
/people/mr-french-laurence,said misapprehension existed exemptions respec...
/people/mr-william-baldwin,presented petition prisoners confined debt cou...


In [6]:
# create the dictionary object used by gensim

# repeat the process using memory efficiant method and not loading corpus into memory
def tokenize_raw(doc):
    letters_only = re.sub("[^a-zA-Z]", " ", doc)
    return letters_only.lower().split()

DICT_PATH = './data/hansard_corpus.dict'
if os.path.exists(DICT_PATH):
    # load dictionary
    dictionary = corpora.Dictionary.load(DICT_PATH)
else:
    # load dictionary
    dictionary = corpora.Dictionary(tokenize_raw(json.loads(line[:-2])['content']) for line in open('../comments_small.json'))
    # remove stop words and words that appear only once
    stop_ids = [dictionary.token2id[stopword] for stopword in stopwords.words("english") if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
    dictionary.compactify() # remove gaps in id sequence after words that were removed
    dictionary.save(DICT_PATH)

print(dictionary.num_docs, 'documents')
print(len(dictionary.values()), 'unique tokens')

dict_sorted_by_occurances = sorted(dictionary.dfs.items(), key=operator.itemgetter(1))
print('common_words:\n', pprint.pprint({dictionary.get(id):count for id, count in dict_sorted_by_occurances[-30:]}))
dictionary.filter_extremes()
print('common_words:\n', pprint.pprint({dictionary.get(id):count for id, count in dict_sorted_by_occurances[-30:]}))

315358 documents
70996 unique tokens
{'bill': 83707,
 'case': 59044,
 'could': 100032,
 'country': 68190,
 'gentleman': 71331,
 'government': 85957,
 'great': 72976,
 'hon': 132723,
 'house': 121332,
 'lord': 63131,
 'made': 85462,
 'member': 84261,
 'might': 81933,
 'much': 56994,
 'must': 59971,
 'noble': 57951,
 'one': 94617,
 'present': 77259,
 'question': 81435,
 'right': 81908,
 'said': 150456,
 'say': 67151,
 'state': 58885,
 'subject': 64425,
 'thought': 74820,
 'time': 73383,
 'upon': 90248,
 'whether': 70365,
 'would': 173308,
 'x': 88314}
common_words:
 None
{None: 150456,
 'attrition': 74820,
 'auxiliaries': 63131,
 'burghley': 64425,
 'deficiencies': 85957,
 'establishing': 81435,
 'glocestershire': 59044,
 'imitate': 70365,
 'memorialise': 57951,
 'percival': 84261,
 'robust': 56994,
 'sandgate': 173308,
 'sizarships': 83707,
 'tend': 94617,
 'thefree': 100032}
common_words:
 None


In [None]:
## TOPIC MODELLING - unsupervised clustering of documents

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

CORPUS_PATH = './data/large_files/corpus.mm'

class Corpus(object):
    def __iter__(self):
        for line in open('./data/large_files/comments.json'):
            # assume there's one json per line, remove the comma at the end of the line and only use the 'content' section
            yield dictionary.doc2bow(tokenize_raw(json.loads(line[:-2])['content']))

# try loading from file othewise load from iterator.
if os.path.exists(CORPUS_PATH):
    print('loading existing corpus')
    BOW_corpus = corpora.MmCorpus(CORPUS_PATH)
else:
    print('building corpus')
    BOW_corpus = Corpus()
    corpora.MmCorpus.serialize(CORPUS_PATH, BOW_corpus)

print(BOW_corpus)

# once you have a BOW corpus you can convert it to different model formats

TFIDF = models.TfidfModel(BOW_corpus) # step 1 -- initialize a model
TFIDF_corpus = TFIDF[BOW_corpus]
corpora.MmCorpus.serialize('./data/large_files/tfidf_corpus.mm', TFIDF_corpus)

print(TFIDF_corpus)
#lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=) # initialize an LSI transformation

#lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
#corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

#lsi.print_topics(5)

loading existing corpus
MmCorpus(315358 documents, 45481 features, 29827150 non-zero entries)


In [None]:
# documents in the BOW corpus are sparse vectors where each element represents term_id, count; respectively.
print(BOW_corpus[2])

In [None]:
print TFIDF_corpus[2]

In [None]:
# SCIPY can read the matrix format from disk
#from scipy import io
#sparse_corpus = scipy.io.mmread('./data/corpus.mm')

# sparse matrix can also be generated using gensim matrix utils
# https://radimrehurek.com/gensim/tut1.html#compatibility-with-numpy-and-scipy
scipy_csc_matrix = matutils.corpus2csc(TFIDF_corpus)
dense_scipy_TFIDF = scipy_csc_matrix.toarray()