In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 19.0 MB/s eta 0:00:00
Collecting joblib
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
     ------------------------------------- 307.0/307.0 kB 18.5 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
     ---------------------------------------- 78.4/78.4 kB ? eta 0:00:00
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ---------------------------------------- 96.6/96.6 kB ? eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.7.25-cp38-cp38-win_amd64.whl (262 kB)
     ------------------------------------- 262.8/262.8 kB 16.8 MB/s eta 0:00:00
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.3 joblib-1.1.0 nltk-3.7 regex-2022.7.25 tqdm-4.64.0


In [3]:
import nltk
from collections import Counter
from functools import reduce

In [17]:
# Sample corpus (a tongue twister) used as the input text in the cells below.
text = '''She sells sea-shells by the sea-shore. The shells she sells are sea-shells, I'm sure. For if she sells sea-shells by the sea-shore then I'm sure she sells sea-shore shells.'''

In [18]:
# Naive sentence split on '.'. The pieces keep their leading spaces, and because
# the text ends with a period the last element is an empty string.
sentences = text.split('.')

In [19]:
sentences

['She sells sea-shells by the sea-shore',
 " The shells she sells are sea-shells, I'm sure",
 " For if she sells sea-shells by the sea-shore then I'm sure she sells sea-shore shells",
 '']

STEP1. 소문자 치환하기

In [20]:
# Normalise case so that e.g. "She" and "she" are counted as the same word.
sentences = [s.lower() for s in sentences]

In [21]:
sentences

['she sells sea-shells by the sea-shore',
 " the shells she sells are sea-shells, i'm sure",
 " for if she sells sea-shells by the sea-shore then i'm sure she sells sea-shore shells",
 '']

STEP2. BOS/EOS 추가하기

In [22]:
BOS = '<s>'
EOS = '</s>'
n = 2
# Prefix each sentence with n-1 start markers (always at least one) and
# append a single end marker.
BOSs = ' '.join([BOS] * max(n - 1, 1))
sentences = [f'{BOSs} {s} {EOS}' for s in sentences]

In [23]:
sentences

['<s> she sells sea-shells by the sea-shore </s>',
 "<s>  the shells she sells are sea-shells, i'm sure </s>",
 "<s>  for if she sells sea-shells by the sea-shore then i'm sure she sells sea-shore shells </s>",
 '<s>  </s>']

STEP3. 토큰화하기

In [25]:
# Split every padded sentence on whitespace, then flatten the per-sentence
# lists into one token stream. A nested comprehension is used for the
# flattening: it is linear in the number of tokens and yields [] for an
# empty sentence list.
sentences = [s.split() for s in sentences]
tokens = [tok for sent in sentences for tok in sent]

In [26]:
sentences

[['<s>', 'she', 'sells', 'sea-shells', 'by', 'the', 'sea-shore', '</s>'],
 ['<s>',
  'the',
  'shells',
  'she',
  'sells',
  'are',
  'sea-shells,',
  "i'm",
  'sure',
  '</s>'],
 ['<s>',
  'for',
  'if',
  'she',
  'sells',
  'sea-shells',
  'by',
  'the',
  'sea-shore',
  'then',
  "i'm",
  'sure',
  'she',
  'sells',
  'sea-shore',
  'shells',
  '</s>'],
 ['<s>', '</s>']]

In [27]:
tokens

['<s>',
 'she',
 'sells',
 'sea-shells',
 'by',
 'the',
 'sea-shore',
 '</s>',
 '<s>',
 'the',
 'shells',
 'she',
 'sells',
 'are',
 'sea-shells,',
 "i'm",
 'sure',
 '</s>',
 '<s>',
 'for',
 'if',
 'she',
 'sells',
 'sea-shells',
 'by',
 'the',
 'sea-shore',
 'then',
 "i'm",
 'sure',
 'she',
 'sells',
 'sea-shore',
 'shells',
 '</s>',
 '<s>',
 '</s>']

STEP4. 한 번만 출현한 단어를 UNK로 치환하기

In [28]:
UNK = '<unk>'
# Count every token, then map tokens seen at most once to the unknown marker.
freq = nltk.FreqDist(tokens)
tokens = [UNK if freq[t] <= 1 else t for t in tokens]
tokens

['<s>',
 'she',
 'sells',
 'sea-shells',
 'by',
 'the',
 'sea-shore',
 '</s>',
 '<s>',
 'the',
 'shells',
 'she',
 'sells',
 '<unk>',
 '<unk>',
 "i'm",
 'sure',
 '</s>',
 '<s>',
 '<unk>',
 '<unk>',
 'she',
 'sells',
 'sea-shells',
 'by',
 'the',
 'sea-shore',
 '<unk>',
 "i'm",
 'sure',
 'she',
 'sells',
 'sea-shore',
 'shells',
 '</s>',
 '<s>',
 '</s>']

STEP1부터 STEP4까지를 하나의 함수로 묶기

In [34]:
def preprocess(sentences, n):
    '''Turn a list of sentences into one flat list of padded tokens.

    Each sentence is lower-cased, wrapped in start/end markers, split on
    whitespace and appended to a single token stream. Words that occur only
    once in that stream are replaced by the unknown-word marker; the
    start/end markers themselves are never replaced.

    Args:
        sentences (list of str): Sentences to process.
        n (int): Order N of the N-gram model. Every sentence is prefixed
            with n-1 start markers (a single one when n <= 1).
    Returns:
        list of str: The tokens of all sentences, in order. Empty when
        `sentences` is empty.
    '''

    BOS = '<s>'
    EOS = '</s>'
    UNK = '<unk>'

    # STEP1: lower-case every sentence.
    lowered = [s.lower() for s in sentences]

    # STEP2: add the BOS prefix and the EOS suffix.
    bos_prefix = ' '.join([BOS] * max(n - 1, 1))
    padded = [' '.join([bos_prefix, s, EOS]) for s in lowered]

    # STEP3: whitespace-tokenize and flatten in a single linear pass
    # (also well-defined for an empty sentence list).
    tokens = [tok for sent in padded for tok in sent.split()]

    # STEP4: replace words seen only once with UNK. The sentence markers are
    # exempt so that a single-sentence corpus keeps its <s>/</s> boundaries.
    freq = Counter(tokens)
    markers = {BOS, EOS}
    return [t if freq[t] > 1 or t in markers else UNK for t in tokens]

## N-gram 개수 세기

In [30]:
# Build the bigram frequency table and print one "first,second: count" line per bigram.
bigram = nltk.ngrams(tokens, n=2)
vocab = nltk.FreqDist(bigram)
for (a, b), v in vocab.items():
    print(f'{a},{b}: {v}')

<s>,she: 1
she,sells: 4
sells,sea-shells: 2
sea-shells,by: 2
by,the: 2
the,sea-shore: 2
sea-shore,</s>: 1
</s>,<s>: 3
<s>,the: 1
the,shells: 1
shells,she: 1
sells,<unk>: 1
<unk>,<unk>: 2
<unk>,i'm: 2
i'm,sure: 2
sure,</s>: 1
<s>,<unk>: 1
<unk>,she: 1
sea-shore,<unk>: 1
sure,she: 1
sells,sea-shore: 1
sea-shore,shells: 1
shells,</s>: 1
<s>,</s>: 1


## SimpleNgramLanguageModel 언어모델 클래스 구현하기

bigram, vocab 만들기

In [31]:
import nltk
# Toy sequence used to illustrate bigram extraction.
a = ['a', 'b', 'b', 'b', 'a', 'a', 'a', 'c']
# The value displayed for `bigram` is a lazy zip object, not a list of pairs.
bigram = nltk.ngrams(a, n=2)
bigram

<zip at 0x21bc3e67580>

In [33]:
# Count each bigram. This iterates over (and thereby exhausts) the one-shot
# `bigram` iterator created in the previous cell.
vocab = nltk.FreqDist(bigram)
vocab

FreqDist({('b', 'b'): 2, ('a', 'a'): 2, ('a', 'b'): 1, ('b', 'a'): 1, ('a', 'c'): 1})

## 최종 코드

In [37]:
import nltk
# Download NLTK's 'punkt' tokenizer data into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ziipp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [35]:
class SimpleNgramLanguageModel():
    '''Count-based N-gram language model built from raw text.

    After construction, `self.vocab` holds the probability table returned
    by `build_model`.
    '''

    def __init__(self, train_data, n):
        '''Split the text into sentences, preprocess them and fit the model.

        Args:
            train_data (str): Raw training text.
            n (int): Order N of the N-gram model.
        '''
        # Sentences are the '.'-separated pieces of the text. Note that text
        # ending with a period produces a final empty piece.
        sentences = train_data.split('.')

        tokens = preprocess(sentences, n)
        self.vocab = self.build_model(tokens, n)

    def build_model(self, tokens, n):
        '''Estimate relative-frequency probabilities from a token stream.

        Args:
            tokens (list of str): Token stream to count over.
            n (int): Order N of the N-gram model.
        Returns:
            dict: For n == 1, maps each token to count / total number of
            tokens. Otherwise maps each n-gram tuple to
            count(n-gram) / count(its leading (n-1)-gram).
        '''
        if n == 1:
            unigram_counts = nltk.FreqDist(tokens)
            # Normalise by the total number of tokens (not the number of
            # distinct tokens) so the probabilities sum to 1.
            total = len(tokens)
            return {tok: cnt / total for tok, cnt in unigram_counts.items()}

        ngram_counts = nltk.FreqDist(nltk.ngrams(tokens, n))
        prefix_counts = nltk.FreqDist(nltk.ngrams(tokens, n - 1))
        # Conditional probability of the last token given the n-1 before it.
        return {
            ngram: cnt / prefix_counts[ngram[:-1]]
            for ngram, cnt in ngram_counts.items()
        }

    def build_vocab(self, data):
        '''Count how often each item occurs across all sequences in `data`.

        Args:
            data (iterable): Sequences whose items are to be counted.
        Returns:
            dict: Maps each item to its total count over all sequences.
        '''
        totals = Counter()
        for seq in data:
            totals.update(seq)
        return dict(totals)

In [38]:
# Fit a trigram (n=3) model on the sample text and keep its probability table.
lm = SimpleNgramLanguageModel(text, n=3)
vocab = lm.vocab

In [39]:
vocab

{('<s>', '<s>', 'she'): 0.25,
 ('<s>', 'she', 'sells'): 1.0,
 ('she', 'sells', 'sea-shells'): 0.5,
 ('sells', 'sea-shells', 'by'): 1.0,
 ('sea-shells', 'by', 'the'): 1.0,
 ('by', 'the', 'sea-shore'): 1.0,
 ('the', 'sea-shore', '</s>'): 0.5,
 ('sea-shore', '</s>', '<s>'): 1.0,
 ('</s>', '<s>', '<s>'): 1.0,
 ('<s>', '<s>', 'the'): 0.25,
 ('<s>', 'the', 'shells'): 1.0,
 ('the', 'shells', 'she'): 1.0,
 ('shells', 'she', 'sells'): 1.0,
 ('she', 'sells', '<unk>'): 0.25,
 ('sells', '<unk>', '<unk>'): 1.0,
 ('<unk>', '<unk>', "i'm"): 0.5,
 ('<unk>', "i'm", 'sure'): 1.0,
 ("i'm", 'sure', '</s>'): 0.5,
 ('sure', '</s>', '<s>'): 1.0,
 ('<s>', '<s>', '<unk>'): 0.25,
 ('<s>', '<unk>', '<unk>'): 1.0,
 ('<unk>', '<unk>', 'she'): 0.5,
 ('<unk>', 'she', 'sells'): 1.0,
 ('the', 'sea-shore', '<unk>'): 0.5,
 ('sea-shore', '<unk>', "i'm"): 1.0,
 ("i'm", 'sure', 'she'): 0.5,
 ('sure', 'she', 'sells'): 1.0,
 ('she', 'sells', 'sea-shore'): 0.25,
 ('sells', 'sea-shore', 'shells'): 1.0,
 ('sea-shore', 'shells',