What is a Language Model in NLP?

Building an N-gram Language Model

Building a Neural Language Model

Natural Language Generation using OpenAI’s GPT-2

In [4]:
import nltk

# Fetch the NLTK resources this notebook uses: the Reuters corpus and the
# punkt tokenizer data. One import and one loop replace the duplicated
# import/download pairs.
for resource in ('reuters', 'punkt'):
    nltk.download(resource)

from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Trigram model storage: model[(w1, w2)][w3] -> value, where any entry that
# has not been seen yet starts at 0.
model = defaultdict(lambda: defaultdict(int))

# Count the frequency of co-occurrence: how often w3 follows the pair (w1, w2).
# Padding adds None tokens at both ends of each sentence.
for sentence in reuters.sents():
    for trigram in trigrams(sentence, pad_right=True, pad_left=True):
        context, following = trigram[:2], trigram[2]
        model[context][following] += 1

# Turn the raw counts into probabilities by dividing each count by the total
# number of continuations observed for its context.
for followers in model.values():
    context_total = float(sum(followers.values()))
    for following in followers:
        followers[following] /= context_total

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Show the probability of every word the model has seen after "today the"
dict(model[("today", "the")])

{'Bank': 0.05555555555555555,
 'European': 0.05555555555555555,
 'Higher': 0.05555555555555555,
 'Italian': 0.05555555555555555,
 'Turkish': 0.05555555555555555,
 'company': 0.16666666666666666,
 'emirate': 0.05555555555555555,
 'increase': 0.05555555555555555,
 'newspaper': 0.05555555555555555,
 'options': 0.05555555555555555,
 'overseas': 0.05555555555555555,
 'pound': 0.05555555555555555,
 'price': 0.1111111111111111,
 'public': 0.05555555555555555,
 'time': 0.05555555555555555}

In [6]:
# Show the probability of every word the model has seen after "the price"
dict(model[("the", "price")])

{'(': 0.009302325581395349,
 ',': 0.018604651162790697,
 ',"': 0.004651162790697674,
 '-': 0.004651162790697674,
 '.': 0.023255813953488372,
 'Royal': 0.004651162790697674,
 'action': 0.004651162790697674,
 'adjustment': 0.023255813953488372,
 'adjustments': 0.004651162790697674,
 'again': 0.004651162790697674,
 'and': 0.004651162790697674,
 'approached': 0.004651162790697674,
 'at': 0.023255813953488372,
 'base': 0.004651162790697674,
 'being': 0.004651162790697674,
 'changes': 0.004651162790697674,
 'climate': 0.004651162790697674,
 'collapse': 0.004651162790697674,
 'could': 0.004651162790697674,
 'cut': 0.009302325581395349,
 'cuts': 0.009302325581395349,
 'difference': 0.004651162790697674,
 'differentials': 0.009302325581395349,
 'drop': 0.004651162790697674,
 'effect': 0.004651162790697674,
 'evolution': 0.004651162790697674,
 'factor': 0.004651162790697674,
 'fall': 0.004651162790697674,
 'for': 0.05116279069767442,
 'freeze': 0.009302325581395349,
 'from': 0.004651162790697674

In [15]:
from nltk.corpus import reuters
from collections import Counter
 
 
import random

# Count how often each token occurs in the Reuters corpus.
counts = Counter(reuters.words())
# The corpus size is the sum of the per-word counts, so there is no need to
# walk the corpus a second time just to measure its length.
total_count = sum(counts.values())

# The most common 20 words are ...
print (counts.most_common(n=20))
# [('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ...]

# Compute the frequencies (from here on `counts` holds relative frequencies,
# not raw counts).
for word in counts:
    counts[word] /= float(total_count)

# The frequencies should add up to 1
print (sum(counts.values()))  # ~1.0, up to floating-point rounding

# Generate 100 words of language by drawing each word independently with
# probability equal to its frequency. random.choices does the weighted draw in
# the standard library and always returns exactly k words, whereas a manual
# running-sum comparison can fall through when rounding leaves the sum below
# the random threshold.
text = random.choices(list(counts), weights=list(counts.values()), k=100)

print (' '.join(text))
# tax been its and industrial and vote " decision rates elimination and 2 . base Ltd one merger half three division trading it to company before CES mln may to . . , and U is - exclusive affiliate - biggest its Association sides above two nearby NOTES 4TH prepared term areas growth said to each gold policy 0 PLOUGH kind economy director currencies requiring . ' loan growth , 83 . new The target Refining 114 STAKE the it on . to ; measure deposit Corp Emergency on 63 the reported the TREASURY state EC to Grosso as basius

[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]
1.0000000000006808
. pre 21 a National foreign will SECURITY they said early Nine buy announcement General 41 by . , Ttl ; percentage . probably a any 3 shares 000 more results figures all is RBD a 0 REVERSE to 1 meeting options 68 plant year to all Data 16 deficiency of , Sisters processing had - There newly to to Commission restaurants of A , of rules Savings two has 6 usually said eight , 7 out foreign 7 of a a Paris the Judge 1990 1 , . , would would vs figures Erbynn ( , share 4 vs


In [19]:
from functools import reduce
from operator import mul

# The probability of a text under the unigram model: multiply together the
# frequency of every word in `text`, starting from 1.0.
word_probs = [counts[w] for w in text]
text_prob = reduce(mul, word_probs, 1.0)
print (text_prob) # extremely small for a 100-word text

7.117958952224002e-293


In [21]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
 
# Use the first sentence of the corpus as the running example.
first_sentence = reuters.sents()[0]
print (first_sentence) # ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM' ...

# For bigrams first and trigrams second, print the plain n-grams and then the
# padded n-grams, in which None marks the padding positions.
for make_ngrams in (bigrams, trigrams):
    plain = list(make_ngrams(first_sentence))
    padded = list(make_ngrams(first_sentence, pad_left=True, pad_right=True))
    print (plain)   # bigrams: [('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ...
    print (padded)  # bigrams: [(None, 'ASIAN'), ('ASIAN', 'EXPORTERS'), ...
 


['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']
[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), 

In [23]:

# Trigram model storage: model[(w1, w2)][w3] -> count of w3 following (w1, w2).
# Both levels are defaultdicts, so indexing a missing key inserts it.
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count every trigram in the corpus; padding adds None tokens at both ends of
# each sentence.
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1


print (model["what", "the"]["economists"]) # "economists" follows "what the" 2 times
# Probe the unseen word with .get(): indexing the defaultdict directly would
# insert a phantom "nonexistingword" entry into the model.
print (model["what", "the"].get("nonexistingword", 0)) # 0 times
print (model[None, None]["The"]) # 8839 sentences start with "The"

# Let's transform the counts to probabilities: divide each count by the total
# number of continuations seen for its (w1, w2) context.
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

print (model["what", "the"]["economists"]) # 0.0434782608696
print (model["what", "the"].get("nonexistingword", 0.0)) # 0.0
print (model[None, None]["The"]) # 0.161543241465
 

2
0
8839
0.043478260869565216
0.0
0.16154324146501936


In [33]:

import random


# Start from the (None, None) padding context so that the first word is drawn
# from the words that begin sentences.
text = [None, None]

sentence_finished = False

while not sentence_finished:
    # Look up the continuations of the last two tokens once, then draw the
    # next word in proportion to its probability. random.choices always
    # returns a word, whereas a manual running-sum comparison can fall
    # through when rounding leaves the sum below the random threshold.
    continuations = model[tuple(text[-2:])]
    next_word = random.choices(list(continuations),
                               weights=list(continuations.values()))[0]
    text.append(next_word)

    # Two None tokens in a row are the end-of-sentence padding.
    if text[-2:] == [None, None]:
        sentence_finished = True

# Drop the None padding tokens before printing.
print (' '.join([t for t in text if t]))

" A lot of ill feeling ," said Richardson Greenshields of Canada , France , Great Britain to join a hunger strike by 40 per cent , it said .


In [None]:
#On May 11 , 617 , 000 tonnes each , a Los Angeles , a subsidiary if it is being pressed by the mining equipment operations of 14 . 
#8 mln vs 112 , 000 Revs 7 , 624 , 208 tonnes in January 1986 , Allegheny said .

#Britain ' s common shares at 25 . 0 pct .

#The issue will probably be about 45 pct stake in CGCT .

#In addition , its first agreement to acquire Atwell Fleming Printing Ltd , a senior dealer Eckhart Hager said .

#Many local bankers argue the merger .

#Total estimated paddy output was only for the sake of it was discontinuing talks with Washington .

#Now that Tokyo has helped to keep cocoa off the Greek province of Camaguey , the Xerox affiliate that manufactures and markets are making threats ," one senior official said .

#" A lot of ill feeling ," said Richardson Greenshields of Canada , France , Great Britain to join a hunger strike by 40 per cent , it said .

In [34]:
import random


# Start from the (None, None) padding context so that the first word is drawn
# from the words that begin sentences.
text = [None, None]
prob = 1.0  # <- Init probability

sentence_finished = False

while not sentence_finished:
    # Look up the continuations of the last two tokens once, then draw the
    # next word in proportion to its probability. random.choices always
    # returns a word, whereas a manual running-sum comparison can fall
    # through when rounding leaves the sum below the random threshold.
    continuations = model[tuple(text[-2:])]
    next_word = random.choices(list(continuations),
                               weights=list(continuations.values()))[0]
    prob *= continuations[next_word]  # <- Update the probability with the conditional probability of the new word
    text.append(next_word)

    # Two None tokens in a row are the end-of-sentence padding.
    if text[-2:] == [None, None]:
        sentence_finished = True

print ("Probability of text=", prob ) # <- Print the probability of the text
# Drop the None padding tokens before printing.
print (' '.join([t for t in text if t]))
 
# Probability of text= 4.69753034878e-48
# DOW CHEMICAL & lt ; SFE > IN ACQUISITION TALKS Comdata Network Inc said it sold the unit , leading to the group and this would not resist a half mln barrels to 247 . 0 pct , Ivory Coast is the lowest growth rate , he said .

Probability of text= 2.00546438439561e-11
MARATHON RAISES CRUDE OIL POSTINGS 24 CANADIAN CTS / BBL , PAR GRADE TO RAISE USAIR STAKE TO PRIVATE INVESTORS


In [None]:
!pip install pytorch-transformers