In [1]:
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 28 23:58:14 2018

@author: issfz
"""

import string
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords, reuters


In [2]:
# Let's use the documents in the "crude" category of the Reuters corpus.
# crude_tok holds one token sequence per document; the tokens are flattened
# into a single lowercase list in the following cell.
crude_tok = [reuters.words(fileid) for fileid in reuters.fileids('crude')]

In [3]:
# Flatten the per-document token sequences into one list of lowercase tokens.
words = [token.lower() for doc_tokens in crude_tok for token in doc_tokens]

In [4]:
# Find bigram collocations (two-word phrases) in the data.
bcf = BigramCollocationFinder.from_words(words)

# Rank the bigrams with the selected association measure and keep the 10 best.
# NOTE: the variable is called `top20` (a later cell displays it under that
# name), but only 10 results are requested here.
top20 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)

In [5]:
# Display the bigrams selected by the likelihood-ratio ranking above.
top20

[("'", 's'),
 ('&', 'lt'),
 ('lt', ';'),
 ('u', '.'),
 (',', '000'),
 ('said', '.'),
 ('crude', 'oil'),
 ('in', 'the'),
 ('a', 'barrel'),
 ('mln', 'barrels')]

In [6]:
# In the above results, although we get useful collocations like "crude oil",
# we also get a lot of noise. That can be filtered out.
# Let's filter out stopwords, purely numeric tokens, and anything shorter than
# three characters.
stopset = set(stopwords.words('english'))


def filter_stops(w):
    """Return True if token ``w`` should be excluded from collocations.

    A token is excluded when it is shorter than three characters, is a
    member of ``stopset``, or consists only of digits.
    """
    return len(w) < 3 or w in stopset or w.isdigit()


In [7]:
# Drop every candidate bigram containing a token that filter_stops rejects.
bcf.apply_word_filter(fn=filter_stops)

In [8]:
# Are the results better now that the filter has been applied?
bcf.nbest(score_fn=BigramAssocMeasures.likelihood_ratio, n=10)

[('crude', 'oil'),
 ('mln', 'barrels'),
 ('saudi', 'arabia'),
 ('mln', 'bpd'),
 ('per', 'day'),
 ('united', 'states'),
 ('natural', 'gas'),
 ('last', 'year'),
 ('barrels', 'per'),
 ('oil', 'prices')]

In [9]:
# Several association measures are available; experiment to see which one
# gives you better collocations. Here the bigrams are scored with chi_sq.
bcf.nbest(score_fn=BigramAssocMeasures.chi_sq, n=10)

[('40th', 'anniversary'),
 ('abdulla', 'bakr'),
 ('acknowledge', 'responsibility'),
 ('alice', 'springs'),
 ('almir', 'pazzionotto'),
 ('amerada', 'hess'),
 ('antonio', 'parra'),
 ('anyone', 'thinks'),
 ('arifin', 'siregar'),
 ('arne', 'oeien')]

In [10]:
# The same top-10 query, this time scored with the pmi measure.
bcf.nbest(score_fn=BigramAssocMeasures.pmi, n=10)

[('40th', 'anniversary'),
 ('abdulla', 'bakr'),
 ('acknowledge', 'responsibility'),
 ('alice', 'springs'),
 ('antonio', 'parra'),
 ('anyone', 'thinks'),
 ('arifin', 'siregar'),
 ('arnold', 'safer'),
 ('asahi', 'shimbun'),
 ('asbestos', 'fibre')]

In [11]:
# The same top-10 query, this time scored with the raw_freq measure.
bcf.nbest(score_fn=BigramAssocMeasures.raw_freq, n=10)

[('crude', 'oil'),
 ('mln', 'barrels'),
 ('mln', 'bpd'),
 ('oil', 'prices'),
 ('mln', 'dlrs'),
 ('last', 'year'),
 ('saudi', 'arabia'),
 ('per', 'day'),
 ('billion', 'dlrs'),
 ('barrels', 'per')]

In [12]:
# Next, look for trigram (three-word) collocations in the token list.
tcf = TrigramCollocationFinder.from_words(
    words,
)

In [13]:
# Apply the same token filter to the trigram candidates.
tcf.apply_word_filter(fn=filter_stops)

In [14]:
# Show the 20 best trigrams according to the likelihood_ratio measure.
tcf.nbest(score_fn=TrigramAssocMeasures.likelihood_ratio, n=20)

[('barrels', 'per', 'day'),
 ('crude', 'oil', 'prices'),
 ('mln', 'barrels', 'per'),
 ('raises', 'crude', 'oil'),
 ('crude', 'oil', 'postings'),
 ('crude', 'oil', 'production'),
 ('crude', 'oil', 'imports'),
 ('crude', 'oil', 'exports'),
 ('crude', 'oil', 'output'),
 ('crude', 'oil', 'stocks'),
 ('crude', 'oil', 'shipments'),
 ('crude', 'oil', 'reserves'),
 ('crude', 'oil', 'price'),
 ('crude', 'oil', 'posted'),
 ('dlrs', 'per', 'barrel'),
 ('crude', 'oil', 'market'),
 ('raised', 'crude', 'oil'),
 ('crude', 'oil', 'refined'),
 ('sweet', 'crude', 'oil'),
 ('crude', 'oil', 'pipeline')]

In [15]:
# Now try finding collocations in documents from another category
# ("money-fx"). Do you get very different phrases?
money_tok = [reuters.words(fileid) for fileid in reuters.fileids('money-fx')]

In [16]:
# NOTE: `words` and `bcf` are rebound here, so from this point on they refer
# to the money-fx documents rather than the crude ones.
words = [token.lower() for doc_tokens in money_tok for token in doc_tokens]
bcf = BigramCollocationFinder.from_words(words)
bcf.apply_word_filter(fn=filter_stops)

In [17]:
# Show the 20 best bigrams according to the likelihood_ratio measure.
bcf.nbest(score_fn=BigramAssocMeasures.likelihood_ratio, n=20)

[('mln', 'stg'),
 ('money', 'market'),
 ('central', 'bank'),
 ('west', 'germany'),
 ('foreign', 'exchange'),
 ('dealers', 'said'),
 ('interest', 'rates'),
 ('federal', 'reserve'),
 ('united', 'states'),
 ('billion', 'dlrs'),
 ('west', 'german'),
 ('exchange', 'rate'),
 ('exchange', 'rates'),
 ('secretary', 'james'),
 ('finance', 'minister'),
 ('new', 'york'),
 ('repurchase', 'agreements'),
 ('treasury', 'secretary'),
 ('james', 'baker'),
 ('louvre', 'accord')]

In [18]:
# The finder can also look for collocations whose words are not side by side:
# here it is built with window_size=10 instead of the default.
bcf2 = BigramCollocationFinder.from_words(words, window_size=10)
bcf2.apply_word_filter(fn=filter_stops)
# Additionally apply a frequency filter with a minimum frequency of 2.
bcf2.apply_freq_filter(min_freq=2)

In [19]:
# Show the 20 best windowed bigrams according to the likelihood_ratio measure.
bcf2.nbest(score_fn=BigramAssocMeasures.likelihood_ratio, n=20)

[('mln', 'stg'),
 ('money', 'market'),
 ('central', 'bank'),
 ('west', 'germany'),
 ('foreign', 'exchange'),
 ('interest', 'rates'),
 ('federal', 'reserve'),
 ('billion', 'dlrs'),
 ('united', 'states'),
 ('west', 'german'),
 ('treasury', 'secretary'),
 ('secretary', 'james'),
 ('bank', 'england'),
 ('finance', 'minister'),
 ('repurchase', 'agreements'),
 ('exchange', 'rate'),
 ('dealers', 'said'),
 ('new', 'york'),
 ('louvre', 'accord'),
 ('stg', 'band')]