In [3]:
import nltk.data
from nltk.util import bigrams 
from nltk.tokenize import TreebankWordTokenizer
# Sentence splitter loaded from NLTK's data files; used to cut raw text into sentences.
# NOTE(review): this needs the 'tokenizers/punkt' resource to be installed where
# nltk.data can find it — confirm it has been downloaded on the target machine.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Word-level tokenizer applied to each sentence produced by the splitter above.
treebank_tokenizer = TreebankWordTokenizer()

# Set this to the full path of the class files. Keep the trailing "/":
# read_file_and_tokenize builds file paths by plain string concatenation
# (dir_base + "data/" + filename), so no separator is inserted for you.
dir_base = "/f18_ds_nlp/class_2/"

def read_file_and_tokenize(filename):
    """Read a UTF-8 text file from the class data folder and tokenize it.

    The file is looked up at ``dir_base + "data/" + filename``. Its text is
    split into sentences with ``sentence_tokenizer`` and each sentence is
    then split into word tokens with ``treebank_tokenizer``.

    Args:
        filename: Name of the file inside the ``data/`` folder under
            ``dir_base``.

    Returns:
        A list with one entry per sentence, each entry being the list of
        tokens produced for that sentence.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        UnicodeDecodeError: If the file contents are not valid UTF-8.
    """
    # Use a context manager so the file handle is closed as soon as the text
    # has been read, instead of being left open until garbage collection.
    with open(dir_base + "data/" + filename, encoding='utf-8') as input_file:
        text = input_file.read()

    punkt_sentences = sentence_tokenizer.tokenize(text)
    return [treebank_tokenizer.tokenize(sentence) for sentence in punkt_sentences]

In [4]:
# Tokenize the newswire article and show the per-sentence token lists.
newswire_tokens = read_file_and_tokenize("newswire.txt")
print(newswire_tokens)

# Flatten the sentences into a single token stream, so the bigrams below also
# include pairs that straddle a sentence boundary.
all_tokens = [word for sentence in newswire_tokens for word in sentence]

# Materialize the bigrams once, under a name that does not rebind `bigrams`
# (the function imported from nltk.util at the top of the notebook). Storing
# a list rather than the one-shot iterator keeps the pairs available after
# they have been printed.
token_bigrams = list(nltk.bigrams(all_tokens))
print(token_bigrams)

[['Smoke', 'filled', 'the', 'air', 'as', 'multiple', 'fire', 'departments', 'battled', 'a', 'grass', 'fire', 'Monday', 'north', 'of', 'Henry', '.'], ['A', 'large', 'amount', 'of', 'smoke', 'was', 'reported', 'at', 'about', '3:35', 'p.m.', ',', 'about', 'six', 'miles', 'north', 'of', 'Henry', ',', 'Morrill', 'Fire', 'Chief', 'Matt', 'Hinman', 'said', '.'], ['Firefighters', 'from', 'Mitchell', ',', 'Morrill', ',', 'Lyman', ',', 'Torrington', ',', 'Scottsbluff', 'Rural', ',', 'Scottsbluff', 'and', 'Gering', 'were', 'called', 'out', 'to', 'respond', 'to', 'the', 'fire', '.'], ['Firefighters', 'from', 'Yoder', 'and', 'Lingle', ',', 'Wyoming', ',', 'fire', 'departments', ',', 'as', 'well', 'as', 'Hot', 'Springs', ',', 'South', 'Dakota', ',', 'have', 'also', 'been', 'called', 'to', 'assist', '.'], ['Hinman', 'estimated', '50', 'to', '60', 'firefighters', 'have', 'responded', 'to', 'battle', 'the', 'fire', '.'], ['“', 'Right', 'now', ',', 'a', 'wild', 'estimate', 'would', 'be', '200-300', 'acr

In [5]:
# Flatten the tokenized sentences into a single token stream.
all_tokens = [word for sentence in newswire_tokens for word in sentence]

# `stop_words` stays the list NLTK returns; membership is tested against a set
# built from it so each lookup does not scan the whole list.
stop_words = nltk.corpus.stopwords.words('english')
stop_word_set = set(stop_words)

# Tokens are lower-cased only for the comparison; the kept tokens retain their
# original casing. Punctuation tokens are not stop words, so they remain.
content = [w for w in all_tokens if w.lower() not in stop_word_set]
print(content)

# Materialize the bigrams once, under a name that does not rebind `bigrams`
# (the function imported from nltk.util at the top of the notebook).
content_bigrams = list(nltk.bigrams(content))
print(content_bigrams)

['Smoke', 'filled', 'air', 'multiple', 'fire', 'departments', 'battled', 'grass', 'fire', 'Monday', 'north', 'Henry', '.', 'large', 'amount', 'smoke', 'reported', '3:35', 'p.m.', ',', 'six', 'miles', 'north', 'Henry', ',', 'Morrill', 'Fire', 'Chief', 'Matt', 'Hinman', 'said', '.', 'Firefighters', 'Mitchell', ',', 'Morrill', ',', 'Lyman', ',', 'Torrington', ',', 'Scottsbluff', 'Rural', ',', 'Scottsbluff', 'Gering', 'called', 'respond', 'fire', '.', 'Firefighters', 'Yoder', 'Lingle', ',', 'Wyoming', ',', 'fire', 'departments', ',', 'well', 'Hot', 'Springs', ',', 'South', 'Dakota', ',', 'also', 'called', 'assist', '.', 'Hinman', 'estimated', '50', '60', 'firefighters', 'responded', 'battle', 'fire', '.', '“', 'Right', ',', 'wild', 'estimate', 'would', '200-300', 'acres', ',', '”', 'burning', ',', 'Hinman', 'said', '6', 'p.m.', ',', '“', 'One', 'firefighter', ',', 'Mike', 'Kindred', ',', 'Lyman', 'Volunteer', 'Fire', ',', 'said', 'front', 'lines', 'called', 'fire', '“', 'pretty', 'rugged.'

In [6]:
def load_file_tokenize_remove_stopwords(filename):
    """Tokenize a data file, drop English stop words, and return its bigrams.

    Despite the name, the return value is not the filtered token list but the
    bigrams built from it. Sentences are flattened into one token stream
    before pairing, so bigrams can span sentence boundaries and can join two
    words that were separated by a removed stop word. Only stop words are
    removed; punctuation tokens are kept.

    Args:
        filename: Name of the file to load, passed through unchanged to
            ``read_file_and_tokenize``.

    Returns:
        The result of ``nltk.bigrams`` over the filtered tokens: an iterable
        of ``(first, second)`` token pairs. Callers that need to traverse it
        more than once should wrap it in ``list(...)`` or ``set(...)``.
    """
    file_tokens = read_file_and_tokenize(filename)

    # A set makes each membership test constant-time instead of scanning the
    # full stop-word list for every token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Compare case-insensitively, but keep each surviving token's casing.
    content = [
        word
        for sentence in file_tokens
        for word in sentence
        if word.lower() not in stop_words
    ]
    return nltk.bigrams(content)

In [7]:
# Stop-word-filtered bigram streams for the original article and for the
# variant that mixes in Frankenstein text.
newswire_bigrams = load_file_tokenize_remove_stopwords("newswire.txt")
newswire_frankenstein_bigrams = load_file_tokenize_remove_stopwords("newswire_frankenstein.txt")

# Reduce each stream to its distinct bigrams, then keep the pairs that occur
# in both documents.
ng1 = set(newswire_bigrams)
ng2 = set(newswire_frankenstein_bigrams)
match = ng1 & ng2

print(match)
print('..found {}'.format(len(match)))

{('Hinman', 'said'), (',', 'Hinman'), ('”', 'burning'), ('“', 'Right'), (',', 'wild'), ('said', '6'), ('.', '“'), ('200-300', 'acres'), ('estimate', 'would'), ('burning', ','), ('Right', ','), ('p.m.', ','), ('wild', 'estimate'), (',', '”'), ('acres', ','), (',', '“'), ('would', '200-300'), ('6', 'p.m.')}
..found 18


In [8]:
from nltk.collocations import *
# Tokenize the article again and flatten it into one token stream
# (stop words are deliberately left in for this run).
newswire_tokens = read_file_and_tokenize("newswire.txt")
all_tokens = [word for sentence in newswire_tokens for word in sentence]

# Count bigrams over the token stream.
# NOTE(review): a minimum frequency of 1 looks like it keeps every bigram;
# raise the threshold to prune rare pairs — confirm against the NLTK docs.
finder = BigramCollocationFinder.from_words(all_tokens, window_size=2)
finder.apply_freq_filter(1)

# Rank the candidates by likelihood ratio and keep the top five.
bigram_measures = nltk.collocations.BigramAssocMeasures()
colls = finder.nbest(bigram_measures.likelihood_ratio, 5)
print(colls)

[('the', 'fire'), ('Firefighters', 'from'), ('Hinman', 'said'), ('at', 'about'), ('north', 'of')]


In [9]:
from nltk.collocations import *
# Tokenize the article and flatten it into one token stream.
newswire_tokens = read_file_and_tokenize("newswire.txt")
bigram_measures = nltk.collocations.BigramAssocMeasures()

all_tokens = [word for sentence in newswire_tokens for word in sentence]

# `stop_words` stays the list NLTK returns; membership is tested against a set
# built from it so each lookup does not scan the whole list.
stop_words = nltk.corpus.stopwords.words('english')
stop_word_set = set(stop_words)

# Tokens are lower-cased only for the comparison; kept tokens retain their
# casing. Punctuation tokens are not stop words, so they remain and can show
# up in the collocations.
content = [w for w in all_tokens if w.lower() not in stop_word_set]

# Count bigrams over the filtered stream.
# NOTE(review): a minimum frequency of 1 looks like it keeps every bigram;
# raise the threshold to prune rare pairs — confirm against the NLTK docs.
finder = BigramCollocationFinder.from_words(content, window_size=2)
finder.apply_freq_filter(1)

# Rank the candidates by likelihood ratio and keep the top five.
colls = finder.nbest(bigram_measures.likelihood_ratio, 5)

print(colls)

[('north', 'Henry'), ('Hinman', 'said'), ('.', 'Firefighters'), ('200-300', 'acres'), ('50', '60')]
