In [1]:
%load_ext autoreload
%matplotlib inline

%autoreload 2

from glob import glob
from dask.delayed import delayed
import dask.dataframe as dd
import pandas as pd
import numpy as np
from functools import reduce
from tqdm import tqdm
from spacy.lang.en.stop_words import STOP_WORDS
from wordcloud import WordCloud

import matplotlib.pylab as plt
plt.style.use('ggplot')

from utils import read_ngrams, save_ngrams, process, ARTICLES, AMENDMENTS, QUERIES

In [2]:
# Default (width, height) in inches for every figure created after this cell.
from pylab import rcParams
rcParams['figure.figsize'] = (12, 6)

In [3]:
# Read the parsed n-gram shards, name the columns, then materialise the result
# with .compute() so the rest of the notebook works on the computed frame.
df_raw = read_ngrams("/mnt/volume_sfo2_03/downloads/google_ngrams/5/constitution-parsed-gz/part-*")
df_raw.columns = ['query', 'r1', 'r2', 'r3', 'year', 'total', 'distinct']
df_raw = df_raw.compute()

df = process(df_raw)

# The query terms themselves are treated as stop words in addition to spaCy's list.
stop_words = STOP_WORDS.union({'constitution', 'amendment'})

# Keep rows from 1788 onwards whose word is alphanumeric and not a stop word.
df = df.loc[
    (df['year'] >= 1788)
    & ~df['w'].isin(stop_words)
    & df['w'].str.isalnum()
]

# Fixed seed so the displayed sample is reproducible.
df.sample(10, random_state=42)

Unnamed: 0,w,query,year,total,distinct,decade
318458,question,fourteenth amendment,1999,20,20,1990
159039,enforce,fourth amendment,1977,15,9,1970
441321,years,fifteenth amendment,1978,1,1,1970
374335,suggested,first amendment,1996,21,19,1990
225863,interpretation,fourteenth amendment,2004,143,111,2000
208527,ignore,first amendment,1995,3,3,1990
253277,measurement,fourth amendment,1986,17,13,1980
328553,refers,second amendment,1985,1,1,1980
97542,case,sixth amendment,1984,10,10,1980
294927,precludes,eighth amendment,2009,4,3,2000


In [4]:
def draw_word_cloud(freq_dict):
    """Render a word cloud for ``freq_dict`` on a new 12x6 matplotlib figure.

    ``freq_dict`` is handed unchanged to ``WordCloud.fit_words``; the caller in
    this notebook builds it as a word -> ``total`` mapping.  The cloud is drawn
    on a white 800x400 canvas with at most 20 words and the axes are hidden.

    Nothing is returned: the new figure is left as pyplot's current figure so
    the caller can add a title or save it.

    NOTE(review): ``fit_words`` presumably raises ``ValueError`` for an empty
    mapping (the plotting loop catches that exception) -- confirm.
    NOTE(review): the ``stopwords`` argument is documented by wordcloud as
    ignored when generating from frequencies -- confirm it has any effect here.
    """
    cloud_builder = WordCloud(
        width=800,
        height=400,
        stopwords=STOP_WORDS,
        max_words=20,
        background_color='white',
    )
    rendered_cloud = cloud_builder.fit_words(freq_dict)

    plt.figure(figsize=(12, 6))
    plt.imshow(rendered_cloud, interpolation="bilinear")
    plt.axis("off")
    
def plot_amendment_decade(amendment, decade):
    """Draw a titled word cloud of the words recorded for ``amendment`` in ``decade``.

    Parameters
    ----------
    amendment : value matched against the ``query`` column of the global ``df``
        (e.g. ``"fourth amendment"``); also used, title-cased, in the plot title.
    decade : value matched against the ``decade`` column of ``df`` (e.g. ``1990``).

    The word weights are the sum of ``total`` over every matching row.  ``df``
    carries a ``year`` column alongside ``decade``, so the same word can occur
    on several rows within one decade; building the mapping with
    ``set_index("w")["total"].to_dict()`` would silently keep only the last of
    those rows instead of the decade-wide count.

    Any exception raised by ``draw_word_cloud`` (e.g. when no rows match)
    propagates to the caller.
    """
    in_scope = (df['query'] == amendment) & (df['decade'] == decade)
    # Aggregate per word so each word's weight reflects the whole decade.
    decade_totals = df.loc[in_scope].groupby('w')['total'].sum()
    draw_word_cloud(decade_totals.to_dict())
    plt.title("{} in {}s".format(amendment.title(), decade))
    
# Render one word cloud per decade for a single amendment and save each as a PNG.
decades = range(1800, 2020, 10)

# Interactive mode off: the figures are written to disk with savefig below.
plt.ioff()

amendment = AMENDMENTS[2-1]  # index 1 of AMENDMENTS (presumably the second amendment -- confirm)
for decade in decades:
    # Remember which figures already exist so only the ones created by this
    # iteration are closed afterwards.
    figs_before = set(plt.get_fignums())
    try:
        plot_amendment_decade(amendment, decade)
        plt.savefig('../plots/{}_{}.png'.format(amendment.replace(' ', '_'), decade))
    except ValueError as err:
        # Best-effort: a decade that cannot be plotted is skipped (presumably
        # because there are no words for it -- confirm), but say so instead of
        # hiding it.
        print("Skipping {} in {}s: {}".format(amendment, decade, err))
    finally:
        # Close this iteration's figures; otherwise every decade leaves a
        # figure open and they accumulate in memory for the whole loop.
        for fig_num in set(plt.get_fignums()) - figs_before:
            plt.close(fig_num)


  
