# Project 2. Implementing N-gram language modeling by NIKL corpus
- Use only `NLRW1900000011.json` to calculate the probability of a sentence `특히 이에 대한 예산이 충분히 반영된다면 좋은 결과가 있을 것이라 생각한다` by unigram, bigram, trigram language model.
  - Load newspaper corpus file and cleansing all sentences for processing.
  - Build unigram, bigram, trigram language model.
  - Cacluate the probability of a sentence by above model.

## Data Loading and tokenization

In [1]:
import sys
import json
import re
from nltk import sent_tokenize

In [2]:
path = {PATH_TO_CORPUS_FILE}

with open(path + 'NLRW1900000011.json', encoding='UTF8') as f:
    data = json.load(f)

paragraphs = [] # extract contents only
for document in data["document"]:
    for paragraph in document["paragraph"]:
        paragraphs.append(paragraph["form"])


sentences = [] # sentence tokenization
for paragraph in paragraphs:
    paragraph_splited = paragraph.split('.')
    for sentence in paragraph_splited:
        if sentence != '':
            sentences.append(sentence)

In [3]:
print(sentences[0:3])

['새로운 희망 공유하고 새 출발하자', '기축년(己丑年) 새해다', '새로운 희망을 공유하고 새 출발을 다짐할 때다']


## Preprocessing

### Cleansing special character
- Remove special characters except Korean, English, numbers.

In [4]:
import re

def cleansing_special(sentence):
    sentence = re.sub("[‘, ’, ◇, ‘, ”,  ’, ', ·, \“, ·, △, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, .,?, !,【,】, …, ◆,%]", " ", sentence)
    sentence = re.sub("[^가-힣0-9a-zA-Z\\s]", " ", sentence)
    sentence = re.sub("\s+", " ", sentence) # replace several spaces to one
    
    return f"<s {sentence} /s>"

cleansing_special(sentences[1])

'<s 기축년 새해다 /s>'

In [5]:
cleansed_sentences = []
for paragraphs in sentences:
    cleansed_sentences.append(cleansing_special(paragraphs))
cleansed_sentences[:3]

['<s 새로운 희망 공유하고 새 출발하자 /s>',
 '<s 기축년 새해다 /s>',
 '<s 새로운 희망을 공유하고 새 출발을 다짐할 때다 /s>']

## Calculate probability of target sentece

### (1) Frequency of target sentence
\begin{equation}
    P(< s 특히 이에 대한 예산이 충분히 반영된다면 좋은 결과가 있을 것이라 생각한다 /s >) \approx \frac{count(특히 이에 대한 예산이 충분히 반영된다면 좋은 결과가 있을 것이라 생각한다)}{count(모든 문장)}
\end{equation}


In [6]:
target_sentence = '특히 이에 대한 예산이 충분히 반영된다면 좋은 결과가 있을 것이라 생각한다'

target_cleansed = cleansing_special(target_sentence)
prob_target = cleansed_sentences.count(target_cleansed) / len(cleansed_sentences)

print('%d/%d(=%.13f)'%(cleansed_sentences.count(target_cleansed), len(cleansed_sentences), prob_target))

1/103568(=0.0000096554920)


### (2) Unigram language model
#### Unigram Model (k=1): $P(w_1 w_2 ... w_n) \approx \prod_{i} P(w_i)$

\begin{equation}
    P(x_1) \approx \frac{count(x_1)}{count(N)}
\end{equation}

#### Define ngram counter

In [7]:
from collections import Counter

def ngram_counter(sentence_list, n):
    
    ngram = []
    
    for i, s in enumerate(sentence_list):
        uni = s.split(" ") # split by space
        if n == 1:
            ngram.extend(uni)
        else:
            ngram.extend(zip(*[uni[i:] for i in range(n)])) # zip(uni, uni[1:], uni[2:],.., uni[n-1,:])
            
    return Counter(ngram) # Counter returns {element : count} dict when recieving [element] list

In [8]:
unigram = ngram_counter(cleansed_sentences, 1)
n_unigram = len(unigram) # count(N)

target_unigram = target_cleansed.split(" ")
prob_target = 1
for u in target_unigram:
    # unigram[u]: count of u
    prob_target *= unigram[u] / n_unigram

print("Unigram of target sentence: ", target_unigram)
print("Unigram Model with splitting by space: %e"%(prob_target))

Unigram of target sentence:  ['<s', '특히', '이에', '대한', '예산이', '충분히', '반영된다면', '좋은', '결과가', '있을', '것이라', '생각한다', '/s>']
Unigram Model with splitting by space: 7.805136e-35


## Bigram language model
### Bigram Model (k=2): $P(w_i|w_1 w_2 ... w_{i-1}) \approx P(w_i|w_{i-1})$

\begin{equation}
    P(x_2|x_1) \approx \frac{count(x_1,x_2)}{count(x_1)}
\end{equation}

In [9]:
bigram= ngram_counter(cleansed_sentences, 2)

target_unigram = target_cleansed.split(" ")
target_bigram = list(zip(target_unigram, target_unigram[1:])) 

prob_target = 1
for u in target_bigram:
    # count(x1) of bigram (x1, x2)
    count_x1 = unigram[u[0]]
    # count(x1, x2) by bigram
    count_bigram = bigram[u]
    prob_target *= count_bigram / count_x1
    
print("Bigram of target sentence: ", target_bigram)
print("Bigram Model with splitting by space: %e"%(prob_target))

Bigram of target sentence:  [('<s', '특히'), ('특히', '이에'), ('이에', '대한'), ('대한', '예산이'), ('예산이', '충분히'), ('충분히', '반영된다면'), ('반영된다면', '좋은'), ('좋은', '결과가'), ('결과가', '있을'), ('있을', '것이라'), ('것이라', '생각한다'), ('생각한다', '/s>')]
Bigram Model with splitting by space: 2.238120e-21


## Trigram language model

\begin{equation}
    P(x_3|x_1,x_2) \approx \frac{count(x_1,x_2,x_3)}{count(x_1,x_2)}
\end{equation}

In [10]:
trigram = ngram_counter(cleansed_sentences, 3)
target_trigram = list(zip(*[target_unigram[i:] for i in range(3)]))

prob_target = 1
for u in target_trigram:
    count_x1x2 = bigram[(u[0], u[1])]
    count_trigram = trigram[u]
    prob_target *= count_trigram / count_x1x2
    

print("Trigram of target sentence: ", target_trigram)
print("Trigram Model with splitting by space: %e"%(prob_target))

Trigram of target sentence:  [('<s', '특히', '이에'), ('특히', '이에', '대한'), ('이에', '대한', '예산이'), ('대한', '예산이', '충분히'), ('예산이', '충분히', '반영된다면'), ('충분히', '반영된다면', '좋은'), ('반영된다면', '좋은', '결과가'), ('좋은', '결과가', '있을'), ('결과가', '있을', '것이라'), ('있을', '것이라', '생각한다'), ('것이라', '생각한다', '/s>')]
Trigram Model with splitting by space: 1.257710e-07
