## n-Gram Feature Extraction

In [1]:
def ngrams(words, n=2):
    """
    Lazily yield every contiguous window of n items from words as a tuple.

    words must support len() and slicing. For n >= 1 nothing is yielded
    when words holds fewer than n items.
    """
    window_starts = range(len(words) - n + 1)
    yield from (tuple(words[start:start + n]) for start in window_starts)

In [2]:
# Example sentence, pre-tokenized on whitespace (the final "." is its own token).
words = (
    "The reporters listened closely as the President "
    "of the United States addressed the room ."
).split()

In [4]:
# Show every trigram of the example sentence, one tuple per line.
for trigram in ngrams(words, n=3):
    print(trigram)

('The', 'reporters', 'listened')
('reporters', 'listened', 'closely')
('listened', 'closely', 'as')
('closely', 'as', 'the')
('as', 'the', 'President')
('the', 'President', 'of')
('President', 'of', 'the')
('of', 'the', 'United')
('the', 'United', 'States')
('United', 'States', 'addressed')
('States', 'addressed', 'the')
('addressed', 'the', 'room')
('the', 'room', '.')


With padding

In [8]:
import nltk
from functools import partial

# Symbols used to pad the left and right ends of a sequence.
LPAD_SYMBOL = "<s>"
RPAD_SYMBOL = "</s>"

# nltk.ngrams with padding enabled on both sides and the symbols above
# pre-bound, so callers only pass the sequence and n.
nltk_ngrams = partial(
    nltk.ngrams,
    pad_left=True,
    left_pad_symbol=LPAD_SYMBOL,
    pad_right=True,
    right_pad_symbol=RPAD_SYMBOL,
)

def ngrams(sent, n=2):
    """
    Yield the n-grams of sent as produced by the padded nltk_ngrams helper.
    """
    yield from nltk_ngrams(sent, n)

In [10]:
# Show every padded 4-gram of the example sentence, one tuple per line.
for quadgram in ngrams(words, n=4):
    print(quadgram)

('<s>', '<s>', '<s>', 'The')
('<s>', '<s>', 'The', 'reporters')
('<s>', 'The', 'reporters', 'listened')
('The', 'reporters', 'listened', 'closely')
('reporters', 'listened', 'closely', 'as')
('listened', 'closely', 'as', 'the')
('closely', 'as', 'the', 'President')
('as', 'the', 'President', 'of')
('the', 'President', 'of', 'the')
('President', 'of', 'the', 'United')
('of', 'the', 'United', 'States')
('the', 'United', 'States', 'addressed')
('United', 'States', 'addressed', 'the')
('States', 'addressed', 'the', 'room')
('addressed', 'the', 'room', '.')
('the', 'room', '.', '</s>')
('room', '.', '</s>', '</s>')
('.', '</s>', '</s>', '</s>')


### Discovering collocations

In [12]:
from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

def rank_quadgrams(words, metric, path=None):
    """
    Find and rank quadgrams from the supplied corpus using the given
    association metric.

    words is the token sequence handed to
    QuadgramCollocationFinder.from_words, and metric is the scoring callable
    passed to score_ngrams; its __name__ appears in the header of the
    output file.

    If path is supplied the ranking is written there as a tab-delimited
    file (one header row, then one row per quadgram) and None is returned.
    Otherwise the scored list from score_ngrams is returned in memory.
    """
    # Create a collocation ranking utility from corpus words.
    finder = QuadgramCollocationFinder.from_words(words)

    # Rank collocations by association metric
    scored = finder.score_ngrams(metric)

    if not path:
        return scored

    # Write to disk as tab-delimited file. The encoding is pinned so tokens
    # with non-ASCII characters do not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        # The header needs its own line terminator; without it the first
        # collocation row is appended to the header line.
        f.write('Collocation\tScore ({})\n'.format(metric.__name__))
        for ngram, score in scored:
            f.write("{}\t{}\n".format(repr(ngram), score))

In [13]:
rank_quadgrams(words, QuadgramAssocMeasures.likelihood_ratio)

[(('The', 'reporters', 'listened', 'closely'), 44.08740482303436),
 (('reporters', 'listened', 'closely', 'as'), 44.08740482303436),
 (('President', 'of', 'the', 'United'), 36.44923480349661),
 (('States', 'addressed', 'the', 'room'), 36.44923480349661),
 (('United', 'States', 'addressed', 'the'), 36.44923480349661),
 (('addressed', 'the', 'room', '.'), 36.44923480349661),
 (('as', 'the', 'President', 'of'), 36.44923480349661),
 (('closely', 'as', 'the', 'President'), 36.44923480349661),
 (('listened', 'closely', 'as', 'the'), 36.44923480349661),
 (('of', 'the', 'United', 'States'), 36.44923480349661),
 (('the', 'United', 'States', 'addressed'), 36.44923480349661),
 (('the', 'President', 'of', 'the'), 30.150640582527142)]

### Significant collocation feature extractor

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class SignificantCollocations(BaseEstimator, TransformerMixin):
    """
    Transformer that maps each document to a dict of n-gram -> score.

    fit scores every n-gram of the training documents with the configured
    association metric. transform then takes, for each document, up to 50 of
    its n-grams ranked by raw frequency and looks up their fitted scores;
    n-grams not seen during fit get 0.0.

    ngram_class is a collocation finder class providing from_documents and
    from_words; metric is the association measure passed to score_ngrams.
    """

    def __init__(self,
                 ngram_class=QuadgramCollocationFinder,
                 metric=QuadgramAssocMeasures.pmi):
        self.ngram_class = ngram_class
        self.metric = metric

    def fit(self, docs, target=None):
        """
        Score the n-grams found across docs and store them in self.scored_.

        target is accepted for estimator API compatibility and is unused.
        Returns self so that fit(...).transform(...) chaining works.
        """
        finder = self.ngram_class.from_documents(docs)
        self.scored_ = dict(finder.score_ngrams(self.metric))
        return self

    def transform(self, docs):
        """
        Yield one {ngram: fitted score} dict per document in docs.
        """
        for doc in docs:
            # Build the finder from the current document only, not from the
            # whole collection.
            finder = self.ngram_class.from_words(doc)
            yield {
                ngram: self.scored_.get(ngram, 0.0)
                for ngram in finder.nbest(QuadgramAssocMeasures.raw_freq, 50)
            }

## n-Gram Language Models
Conditional probability and conditional frequencies

In [64]:
from nltk.util import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist

from collections import defaultdict

# Symbols placed before the start and after the end of a sequence
LPAD = "<s>"
RPAD = "</s>"
# Default replacement for words that are not in the vocabulary
UNKNOWN = "<UNK>"

class NgramCounter(object):
    """
    The NgramCounter class counts ngrams given a vocabulary and ngram size.

    Counts populated by train_counts:
      ngrams    FreqDist of the padded ngrams of size n
      unigrams  FreqDist of single tokens, padding symbols included
      allgrams  for each order k from n down to 2, a ConditionalFreqDist of
                word counts keyed by the preceding (k - 1)-token context
    """

    def __init__(self, n, vocabulary, unknown=UNKNOWN):
        """
        n is the size of the ngram
        vocabulary is any container supporting `in`; words outside it are
        replaced by unknown before counting

        Raises ValueError if n is less than 1.
        """
        if n < 1:
            raise ValueError("ngram size must be greater than or equal to 1")
        self.n = n
        self.unknown = unknown
        self.padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": LPAD,
            "right_pad_symbol": RPAD
        }
        self.vocabulary = vocabulary
        self.allgrams = defaultdict(ConditionalFreqDist)
        self.ngrams = FreqDist()
        self.unigrams = FreqDist()

    def train_counts(self, training_text):
        """
        Update all counts from training_text, an iterable of sentences where
        each sentence is an iterable of word tokens.
        """
        for sent in training_text:
            checked_sent = (self.check_against_vocab(word) for word in sent)
            sent_start = True
            for ngram in self.to_ngrams(checked_sent):
                self.ngrams[ngram] += 1
                context, word = tuple(ngram[:-1]), ngram[-1]
                if sent_start:
                    # The context tokens of the first ngram never appear as
                    # the final token of any ngram, so count them here once.
                    for context_word in context:
                        self.unigrams[context_word] += 1
                    sent_start = False

                for window, ngram_order in enumerate(range(self.n, 1, -1)):
                    # Always slice the full context. Re-slicing an already
                    # shortened context drops too many tokens for n >= 4
                    # and leaves lower orders with the wrong context.
                    sub_context = context[window:]
                    self.allgrams[ngram_order][sub_context][word] += 1
                self.unigrams[word] += 1

    def check_against_vocab(self, word):
        """
        Return word if it is in the vocabulary, otherwise the unknown token.
        """
        if word in self.vocabulary:
            return word
        return self.unknown

    def to_ngrams(self, sequence):
        """
        Wrapper for NLTK ngrams method
        """
        return ngrams(sequence, self.n, **self.padding)

In [65]:
def count_ngrams(n, vocabulary, texts):
    """
    Create an NgramCounter of size n over vocabulary, train it on texts
    and return the trained counter.
    """
    trained = NgramCounter(n, vocabulary)
    trained.train_counts(texts)
    return trained

In [141]:
import pandas as pd
from collections import Counter
def create_vocab_sents(path='../data/corpus.csv', min_chars=20, max_docs=1000):
    """
    Build a vocabulary and tokenized sentences from a headerless CSV corpus.

    Only the first column of the file at path is used. Rows whose text is
    not longer than min_chars characters are dropped, then at most max_docs
    of the remaining rows are split on whitespace.

    Returns a (vocab, sents) tuple: vocab is a Counter of token frequencies
    and sents is a list of token lists, one per retained row.
    """
    texts = pd.read_csv(path, header=None)[0]
    kept = texts[texts.str.len() > min_chars][:max_docs]

    sents = [text.split() for text in kept.values]
    vocab = Counter(token for sent in sents for token in sent)
    return vocab, sents

In [142]:
# Load the corpus: vocab is the token Counter, sents the list of token lists.
vocab, sents = create_vocab_sents()

In [131]:
# Train an NgramCounter on the tokenized corpus.
# NOTE(review): n=4 is passed although the result is named trigram_counts;
# confirm whether 3 or 4 is the intended ngram size.
trigram_counts = count_ngrams(4, vocab, sents)