In [38]:
import pandas as pd
import pymongo
import pprint
import re
import operator
import nltk
import numpy as np
import pylab
import matplotlib.pyplot as plt
import os
from collections import Counter
from scipy.stats import binom

client = pymongo.MongoClient (host="da1.eecs.utk.edu")
db = client ['GDA']
coll = db ['NYT']

stop_words = nltk.corpus.stopwords.words('english') + [
 'ut', '\'re','.', ',', '--', '\'s', '?', ')', '(', ':', '\'',
 '\"', '-', '}', '{', '&', '|', u'\u2014','one','','im' ]

# We also want to remove special characters, quotes, etc. from each word
def cleanWord (w):
    # r in r'[.,"\']' tells to treat \ as a regular character 
    # but we need to escape ' with \'
    # any character between the brackets [] is to be removed 
    wn = re.sub('[,"\.\'&\|:@>*;/=��]', "", w)
    # get rid of numbers
    re.sub('^[0-9\.]*$', "", wn)
    return wn.lower()

def get_total_count():
    print('Total NYT articles {0}'.format(str(coll.count_documents({}))))

def article_count_by_generation():
    boom=[]
    genX=[]
    genY=[]
    genZ=[]

    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 1960, '$lte' : 1964 } } },{'$count': 'Article Count'}] )
    boom = list(cursor)
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 1965, '$lte' : 1979 } } },{'$count': 'Article Count'}] )
    genX = list(cursor)
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 1980, '$lte' : 1999 } } },{'$count': 'Article Count'}] )
    genY = list(cursor)
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 2000, '$lte' : 2019 } } },{'$count': 'Article Count'}] )
    genZ = list(cursor)

    df = pd.DataFrame(boom+genX+genY+genZ, index =['Boomers: 1960 to 1964',
                                                   'Gen X:   1965 to 1979',
                                                   'Gen Y:   1980 to 2000',
                                                   'Gen Z:   2000 to 2019'],
                      columns =['Article Count']) 
    display(df)
    display(df.sum())

def get_dataframe_by_year(year):
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$eq' : year} } }] )
    df = pd.DataFrame(list(cursor))
    return df

def Diff(list1, list2): 
    return list(set(list1) - set(list2)) 

def Same(list1, list2, list3, list4):
    return list(set(list1) & set(list2) & set(list3) & set(list4)) 

def Sym(list1, list2):
    return list(set(list1) ^ set(list2))

def get_text_by_generation():
    boom = ''
    genX = ''
    genY = ''
    genZ = ''

    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 1960, '$lte' : 1964 } } }] )
    for record in cursor:
        boom = boom + record ['text']
      
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 1965, '$lte' : 1979 } } }] )
    for record in cursor:
        genX = genX + record ['text']
        
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 1980, '$lte' : 1999 } } }] )
    for record in cursor:
        genY = genY + record ['text']
        
    cursor = coll.aggregate( [{ '$match': { 'year' : { '$gte' : 2000, '$lte' : 2019 } } }] )
    for record in cursor:
        genZ = genZ + record ['text']
        
    return boom, genX, genY, genZ, boom+genX+genY+genZ

In [45]:
def process_data(text, size): 
    # split string into an array of words using any sequence of spaces "\s+" 
    wds = re .split('\s+',text)
    
    # remove periods, commas, etc stuck to the edges of words
    for i in range(len(wds)):
        wds [i] = cleanWord (wds [i])
    
    # If satisfied with results, lets go to the next step: calculate frequencies
    # We can write a loop to create a dictionary, but 
    # there is a special function for everything in python
    # in particular for counting frequencies (like function table() in R)
    wf = Counter (wds)
    
    # Remove stop words from the dictionary wf
    for k in stop_words:
        wf. pop(k, None)
        
    #how many regular words in the document?
    tw = 0
    for w in wf:
        tw += wf[w]   
    
    # Get ordered list
    wfs = sorted (wf .items(), key = operator.itemgetter(1), reverse=True)
    ml = min(len(wfs),size)
    
    only = []
    for i in range (size):
        only.append(wfs[i][0])  
    
    #Reverse the list
    return (wfs [0:ml][::-1], tw, only)

%matplotlib inline
def plotTwoLists (women_total, men_total, title, subtitle1, subtitle2):
    plt.style.use('seaborn-dark')
    f = plt.figure (figsize=(10, 6))
    # this is painfully tedious....
    f .suptitle (title, fontsize=20)
    ax = f.add_subplot(111)
    ax .spines ['top'] .set_color ('none')
    ax .spines ['bottom'] .set_color ('none')
    ax .spines ['left'] .set_color ('none')
    ax .spines ['right'] .set_color ('none')
    ax .tick_params (labelcolor='w', top=False, bottom=False, left=False, right=False, labelsize=20)

    # Create two subplots, this is the first one
    ax1 = f .add_subplot (121)
    plt .subplots_adjust (wspace=.5)

    pos = np .arange (len(women_total)+1) 
    ax1.title.set_text(subtitle1)
    ax1 .tick_params (axis='both', which='major', labelsize=14)
    pylab .yticks (pos, [ x [0] for x in women_total ])
    ax1 .barh (range(len(women_total)), [ x [1] for x in women_total ], align='center',color='#1D1A1D')

    ax2 = f .add_subplot (122)
    ax2 .tick_params (axis='both', which='major', labelsize=14)
    pos = np .arange (len(men_total)+1)
    ax2.title.set_text(subtitle2)
    pylab .yticks (pos, [ x [0] for x in men_total ])
    ax2 .barh (range (len(men_total)), [ x [1] for x in men_total ], align='center',color='#1D1A1D')
    
    plt.savefig(title +'.png')

def binomial_plot(n_values,p_values,title,lables,colors,low,high):
    fig, ax = plt.subplots(1, 1)
    #x = np.arange(low,high)
    for (n, p, lb, clr) in zip(n_values, p_values, lables, colors):
        dist = binom(n, p)
        plt.plot(x, dist.pmf(x), color=clr, label=lb,)
    vals = ax.get_yticks()
    ax.set_yticklabels(['{:,.2%}'.format(x) for x in vals])
    plt.title(title)
    plt.legend()
    plt.savefig(title +'.png')

In [46]:
# Populate data 
data_size=30
print_size=15
bm, X, Y, Z, all_text  = get_text_by_generation()
#All years
(bm_both, bm_count, bm_words) = process_data(bm,data_size)
(X_both, X_count, X_words) = process_data(X,data_size)
(Y_both, Y_count, Y_words) = process_data(Y,data_size)
(Z_both, Z_count, Z_words) = process_data(Z,data_size)
(allyears_both, allyears_count, allyears_words) = process_data(all_text,data_size)

In [53]:
same = Same(bm_words, X_words, Y_words, Z_words)
print(same)
#Binomial distributions
for k in same:
    n=[]
    p=[]
    lables=[]
    colors=[]
    for v1 in bm_both[:]:
        print(v1[0])
        if v1[0]==k:
            n.append(bm_count)
            p.append(bm_both[i][1]/bm_count)
            lables.append('Ok Boomer')
            colors.append('red')

['first', 'year', 'would', 'york', 'college', 'also', 'school', 'mr', 'said', 'years', 'new', 'students']
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
original
view
full
education
home
last
may
president
million
article
program
mrs
state
1964
beloved
st
text
timesmachine
origina

In [None]:
    for v1 in X_both[:]:
        if v1[0]==k:
            n.append(X_count)
            p.append(X_both[i][1]/X_count)
            lables.append('GenX')
            colors.append('blue')
    for v1 in Y_words[:]:
        if v1[0]==k:
            n.append(Y_count)
            p.append(Y_words[i][1]/Y_count)
            lables.append('GenY')
            colors.append('green')            
    for v1 in Z_words[:]:
        if v1[0]==k:
            n.append(Z_count)
            p.append(Z_words[i][1]/Z_count)  
            lables.append('GenZ')
            colors.append('purple')            
    binomial_plot(n, p, 'Binomial Distribution of ' + k,lables,colors,500,5000)