### NLTK Brown Corpus
This corpus contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on.<br>
The purpose of this notebook is to extract the occurences of certain words in the Brown corpus.

In [1]:
import scipy as sp
from nltk.corpus import brown
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def initial_clean(text):
    """
    Function to clean text of websites, email addressess and any punctuation
    We also lower case the text
    Args:
        text: raw corpus

    Returns: tokenized corpus

    """
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    text = text.lower()
    #text = strip_short(text, minsize=3)
    #text = nltk.word_tokenize(text)
    if text.strip():
        return text

In [4]:
#join sentences into docs
docs = []
for fileid in brown.fileids():
    sents = []
    for sent in brown.sents(fileids=fileid):
        sents.extend([initial_clean(s) for s in sent]) #do basic pre-processing

    docs.append(sents)
    

In [5]:
#make dataframe to write results to
df = pd.DataFrame()
df_norm = pd.DataFrame()

said_count = []
showed_count = []
kennedy_count = []

said_norm = []
showed_norm = []
kennedy_norm = []

d_len = []
#three_word_len = []
N = 10
for d in docs:
    said_count.append(d.count("said"))
    showed_count.append(d.count("showed"))
    kennedy_count.append(d.count("kennedy"))
    d_len.append(len(d)) #document length
    three_word_len = d.count("said") + d.count("showed") + d.count("kennedy")
    if three_word_len > 0:
        said_norm.append(np.round((N * d.count("said"))/three_word_len))
        showed_norm.append(np.round((N * d.count("showed"))/three_word_len))
        kennedy_norm.append(np.round((N * d.count("kennedy"))/three_word_len))
    
df.loc[:,'N'] = d_len
df.loc[:,'said'] = said_count
df.loc[:,'showed'] = showed_count
df.loc[:,'kennedy'] = kennedy_count
df_norm.loc[:,'said_norm'] = said_norm
df_norm.loc[:,'showed_norm'] = showed_norm
df_norm.loc[:,'kennedy_norm'] = kennedy_norm
df_norm["N"] = df_norm.sum(axis=1)

In [6]:
print(df_norm)

     said_norm  showed_norm  kennedy_norm     N
0         10.0          0.0           0.0  10.0
1         10.0          0.0           0.0  10.0
2          7.0          0.0           3.0  10.0
3          3.0          0.0           7.0  10.0
4         10.0          0.0           0.0  10.0
..         ...          ...           ...   ...
347       10.0          0.0           0.0  10.0
348       10.0          0.0           0.0  10.0
349        9.0          1.0           0.0  10.0
350       10.0          0.0           0.0  10.0
351       10.0          0.0           0.0  10.0

[352 rows x 4 columns]


In [7]:
df.to_csv('brown_word_counts.csv')
df_norm.to_csv('brown_normalized_counts.csv')

## Make new normalized corpus

In [8]:
print(d)

['dear', 'sirs', None, 'let', 'me', 'begin', 'by', 'clearing', 'up', 'any', 'possible', 'misconception', 'in', 'your', 'minds', None, 'wherever', 'you', 'are', None, 'the', 'collective', 'by', 'which', 'i', 'address', 'you', 'in', 'the', 'title', 'above', 'is', 'neither', 'patronizing', 'nor', 'jocose', 'but', 'an', 'exact', 'industrial', 'term', 'in', 'use', 'among', 'professional', 'thieves', None, 'it', 'is', None, 'i', 'am', 'reliably', 'given', 'to', 'understand', None, 'the', 'technical', 'argot', 'for', 'those', 'who', 'engage', 'in', 'your', 'particular', 'branch', 'of', 'the', 'boost', None, None, 'ie', None, 'burglars', 'who', 'rob', 'while', 'the', 'tenants', 'are', 'absent', None, 'in', 'contrast', 'to', 'hotslough', 'prowlers', None, 'those', 'who', 'work', 'while', 'the', 'occupants', 'are', 'home', None, 'since', 'the', 'latter', 'obviously', 'require', 'an', 'audacity', 'you', 'do', 'not', 'possess', None, 'you', 'may', 'perhaps', 'suppose', 'that', 'i', 'am', 'taunting

In [12]:
docs_3words= []
l=['said', 'showed', 'kennedy']
doc_lens = []
said_count = []
showed_count = []
kennedy_count = []
for d in docs:
    words = [x for i, x in enumerate(d) if x in l]
    doc_lens.append(len(words))
    said_count.append(words.count("said"))
    showed_count.append(words.count("showed"))
    kennedy_count.append(d.count("kennedy"))
    docs_3words.append([(x,words.count(x)) for x in set(words)])

In [13]:
print(docs_3words[0], doc_lens[0])

[('said', 24)] 24


In [15]:
N = 10
docs_normalized = []
said_count = []
showed_count = []
kennedy_count = []

for d in range(len(docs_3words)):
    new_doc = []
    for word in docs_3words[d]:
        word_count = word[1]
        new_wordcount = (word_count * N)/doc_lens[d]
        if word[0] == 'said':
            new_word = (word[0], np.round(new_wordcount))
            new_doc.append(new_word)
    if new_doc: docs_normalized.append(new_doc)
        

In [16]:
print(docs_normalized)

[[('said', 10.0)], [('said', 10.0)], [('said', 7.0)], [('said', 3.0)], [('said', 10.0)], [('said', 9.0)], [('said', 9.0)], [('said', 5.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 9.0)], [('said', 10.0)], [('said', 8.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 5.0)], [('said', 10.0)], [('said', 8.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 5.0)], [('said', 1.0)], [('said', 10.0)], [('said', 6.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 4.0)], [('said', 1.0)], [('said', 10.0)], [('said', 4.0)], [('said', 5.0)], [('said', 7.0)], [('said', 2.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 10.0)], [('said', 3.0)], [('said', 10.0)], [('said', 1.0)], [('said', 10.0)], [('said', 4.0)], [('said', 7.0)], [('said', 5.0)], [('said', 10.0