<a href="https://colab.research.google.com/github/zhe0/aia_nlp/blob/main/%E4%BA%BA%E5%B7%A5%E6%99%BA%E6%85%A7%E5%AD%B8%E6%A0%A1_Lab1_1_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Lab1 Reminder:**
1. Make a copy in your google drive to start coding.
2. Refer back to the demonstration code to check the expected output.

In [None]:
# For debugging
import pdb

# For checking progress
from tqdm import tqdm

# For loading data
import pandas as pd

# For tokenization
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt')

# For building n-gram model
from collections import Counter, namedtuple
import numpy as np

# For pos tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
!pip install git+https://github.com/APCLab/jieba-tw.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/APCLab/jieba-tw.git
  Cloning https://github.com/APCLab/jieba-tw.git to /tmp/pip-req-build-jyqzaqux
  Running command git clone -q https://github.com/APCLab/jieba-tw.git /tmp/pip-req-build-jyqzaqux


# Part 1. Data Preprocessing
1. show the top-10 common words and their counts before/after preprocessing




## Functions and Classes
*  Remove punctuation
*  Convert the text to lowercase



In [None]:
def get_corpus(source='https://raw.githubusercontent.com/yilihsu/NLP110/main/data_tiny.csv'):
  """ Reads and formats the corpus.

  Args:
    source (str or file-like):
      Where to read the CSV from. It is passed straight to `pd.read_csv`,
      so a URL, a local path or an open text buffer all work. The CSV must
      contain a `content` column. Defaults to the lab dataset URL.
  Returns:
    corpus (list[str]):
      The values of the `content` column, in file order.
  """
  df = pd.read_csv(source)
  corpus = df['content'].to_list()
  return corpus

In [None]:
def preprocess(documents):
  """ Preprocesses the corpus.

  Every document is split into sentences; each sentence then has its
  punctuation removed and is converted to lower case.

  Args:
    documents (list[str]):
      A list of sentences in the corpus.
  Returns:
    cleaned_documents (list[str]):
      A list of cleaned sentences in the corpus (one entry per sentence
      found by `sent_tokenize`, so it can be longer than `documents`).
  """
  # Raw string: the backslash is one of the characters to strip. Written as a
  # normal literal, "\," is an invalid escape sequence that newer Python
  # versions warn about.
  punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~”'''
  # Translation table that deletes every punctuation character in one pass
  remove_punc = str.maketrans('', '', punc)

  cleaned_documents = []
  for doc in documents:
    # Tokenizes the document into sentences
    for sent in sent_tokenize(doc):
      # [TODO1] Removes the punctuations, [TODO2] lowers the case
      cleaned_documents.append(sent.translate(remove_punc).lower())

  return cleaned_documents

In [None]:
# Compute word frequency
def get_vocab(documents):
  """ Counts how often every token occurs in the corpus.

  Args:
    documents (list[str]):
      A list of sentences in the corpus
  Returns:
    vocabulary (collections.Counter):
      Maps each token produced by `word_tokenize` to its number of occurrences.
  """
  # One flat stream of tokens over all documents; tqdm reports per-document progress
  token_stream = (
      token
      for sentence in tqdm(documents)
      for token in word_tokenize(sentence)
  )
  return Counter(token_stream)

## Executions
### 1. Show the top-10 common words and their counts before/after preprocessing


In [None]:
# Read data
raw_documents = get_corpus()

# Build vocabulary on the raw text and keep the 10 most common tokens
vocab = get_vocab(raw_documents).most_common(10)
print('\n Before preprocessing:', vocab)

# Build vocabulary after preprocessing (`documents` is reused by the n-gram cells)
documents = preprocess(raw_documents)
vocab = get_vocab(documents).most_common(10)
print('\n After preprocessing:', vocab)

100%|██████████| 20000/20000 [00:06<00:00, 3043.56it/s]



 Before preprocessing: [('.', 16981), ('the', 9885), (',', 7788), ('to', 7005), ('!', 6642), ('a', 5596), ('is', 5111), ('?', 4640), ('and', 4584), ('you', 4463)]


100%|██████████| 34977/34977 [00:10<00:00, 3313.82it/s]


 After preprocesing: [('the', 11175), ('to', 7117), ('a', 5847), ('you', 5325), ('is', 5245), ('and', 5087), ('of', 4492), ('i', 3231), ('in', 3203), ('it', 3190)]





# Part 2. N-Gram Model and POS Tagging
1. Build 2-gram and 4-gram models from the preprocessed dataset
2. Show the top-5 most probable next words, with their probabilities, after the initial token ‘\<s\>’ using the 2-gram model
3. Generate a sentence with the 2-gram model and find its POS tags
4. Generate a sentence with the 4-gram model and find its POS tags


## Functions and Classes

In [None]:
class Ngram_model(object):
  """ Ngram model implementation.

  Attributes:
    n (int):
      The number of grams to be considered.
    model (dict):
      The ngram model. Maps a context (the previous N-1 tokens joined by a
      single space) to a list of Word(word, prob) tuples sorted by
      descending probability.
  """
  # Sentence boundary markers. END_TOKEN keeps the value the former '<\s>'
  # literal produced (backslash + "s"), now written with a valid escape.
  START_TOKEN = '<s>'
  END_TOKEN = '<\\s>'

  # One candidate next word together with its conditional probability
  Word = namedtuple('Word', ['word', 'prob'])

  def __init__(self, documents, N=2):
    """ Builds the model from the corpus.

    Args:
      documents (list[str]):
        A list of sentences in the corpus.
      N (int):
        The number of grams to be considered (at least 1).
    Raises:
      ValueError: if N is smaller than 1.
    """
    if N < 1:
      raise ValueError('N must be at least 1, got {}'.format(N))
    self.n = N
    self.model = self.get_ngram_model(documents)

  @staticmethod
  def _key(tokens):
    """ Builds the model key for a sequence of context tokens. """
    # Joining with a space keeps contexts such as ('ab', 'c') and ('a', 'bc')
    # apart; joining without a separator would merge them into one entry.
    return ' '.join(tokens)

  def get_ngram_model(self, documents):
    """ Estimates P(word | previous N-1 tokens) from the corpus.

    Args:
      documents (list[str]):
        A list of sentences in the corpus.
    Returns:
      ngram_model (dict):
        Take 2-gram model as example:
          { '<s>': [Word(word='i', prob=0.6), Word(word='the', prob=0.2), ...],
            'i': [Word(word='am', prob=0.7), Word(word='want', prob=0.1), ...],
            ... }
    """
    N = self.n
    full_gram_counter = Counter()

    for doc in tqdm(documents):
      # Tokenizes to words, then adds (N-1) start tokens and one end token
      split_words = (
          [self.START_TOKEN] * (N - 1) + word_tokenize(doc) + [self.END_TOKEN]
      )

      # Numerator: count every n-gram of the padded sentence
      full_gram_counter.update(
          tuple(split_words[i:i + N])
          for i in range(len(split_words) - N + 1)
      )

    # Denominator: how often each (N-1)-gram occurs as a context, i.e. the
    # summed counts of all n-grams starting with it
    context_counter = Counter()
    for gram, count in full_gram_counter.items():
      context_counter[gram[:-1]] += count

    # Build model: context key -> candidate next words with probabilities
    ngram_model = dict()
    for gram, count in full_gram_counter.items():
      context = gram[:-1]
      next_word_prob = count / context_counter[context]
      ngram_model.setdefault(self._key(context), []).append(
          self.Word(gram[-1], next_word_prob)
      )

    # Sort the candidates by probability (stable, so ties keep corpus order)
    for candidates in ngram_model.values():
      candidates.sort(key=lambda x: x.prob, reverse=True)

    return ngram_model

  def _get_context(self, text):
    """ Turns user input into the context (last N-1 tokens) for prediction.

    Args:
      text (string or list[string] or None)
    Returns:
      The list of context tokens, or None (after printing an error) when the
      input type is unsupported or the context is unknown to the model.
    """
    padding = [self.START_TOKEN] * (self.n - 1)
    if not text:
      user_tokens = []
    elif isinstance(text, str):
      user_tokens = text.split()
    elif isinstance(text, list):
      user_tokens = text
    else:
      print('[Error] the input text must be string or list of string')
      return None

    padded = padding + user_tokens
    # Keep the last N-1 tokens; written with len() so that N=1 yields []
    context = padded[len(padded) - (self.n - 1):]
    if not self.check_existence(context):
      return None
    return context

  def predict_sent(self, text=None, max_len=30):
    """ Predicts a sentence with the ngram model.

    Args:
      text (string or list[string]):
        Optional beginning of the sentence. Only its last N-1 tokens are
        used as context and kept in the output.
      max_len (int):
        Maximum number of words to generate.
    Returns:
      A prediction string, or None if the input is invalid or unknown.
    """
    context = self._get_context(text)
    if context is None:
      return None
    output = list(context) if text else []

    for _ in range(max_len):
      candidates = self.model[self._key(context)]
      words = [candidate.word for candidate in candidates]
      probs = np.array([candidate.prob for candidate in candidates], dtype=float)
      # Guard against floating point drift: np.random.choice requires sum == 1
      probs /= probs.sum()
      next_word = words[np.random.choice(len(words), p=probs)]

      if next_word == self.END_TOKEN:
        break

      output.append(next_word)
      # Slide the context window by one token
      context = (context + [next_word])[1:]
    return ' '.join(output)

  def predict_next(self, text=None, top=5):
    """ Predicts next word with the ngram model.

    Args:
      text (string or list[string])
      top (int):
        Number of candidates to return.

    Returns:
      possible_next_words (list[tuple]):
        A list of the top few (word, probability) pairs, or None if the
        input is invalid or unknown.
    """
    context = self._get_context(text)
    if context is None:
      return None

    possible_next_words = self.model[self._key(context)][:top]
    return [(word.word, word.prob) for word in possible_next_words]

  def check_existence(self, tokens):
    """ Returns True if `tokens` is a known context; otherwise prints an error and returns False. """
    if self._key(tokens) not in self.model:
      print('[Error] the input text {} not in the vocabulary'.format(tokens))
      return False
    return True

## Executions
### 1. Build 2-gram/4-gram model by processed dataset

In [None]:
# Train a 2-gram and a 4-gram model on the preprocessed sentences
twogram = Ngram_model(documents, N=2)
fourgram = Ngram_model(documents, N=4)

100%|██████████| 34977/34977 [00:08<00:00, 4108.92it/s]
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 34977/34977 [00:04<00:00, 7924.09it/s]
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### 2. Show the top-5 probable next words and their probability after initial token \'\<s\>\'  by 2-gram model

In [None]:
# Ask the 2-gram model for the 5 most probable words after the start token '<s>'
output = twogram.predict_next(text='<s>', top=5)
print('Next word predictions of two gram model:', output)

Next word predictions of two gram model: [('i', 0.05280612974240215), ('the', 0.03102038482431312), ('you', 0.030248448980758784), ('<\\s>', 0.029190610972925066), ('they', 0.019784429768133344)]


### 3. Generate a sentence with 2-gram model and find the POS taggings



In [None]:
# Sample a sentence of at most 30 generated words from the 2-gram model
output = twogram.predict_sent(max_len=30)
print('Generation results of two gram model:', output)
# [TODO7] Find the POS taggings for the generated sentence
# Last expression of the cell, so the tagged tokens are shown as the cell output
nltk.pos_tag(word_tokenize(output))

Generation results of two gram model: they sent emails


[('they', 'PRP'), ('sent', 'VBD'), ('emails', 'NNS')]

### 4. POS tagging with Chinese input

In [None]:
# jieba's part-of-speech segmentation module
import jieba.posseg as pseg
sentence_1 = "我最喜歡自然語言處理"
# Segment the sentence; the result is iterated as (word, tag) pairs in the next cell
words_1 = pseg.cut(sentence_1)

In [None]:
# Print each segmented word next to its corresponding PoS tag
for word, tag in words_1:
  print(word, tag)

我 r
最 d
喜歡 Vt
自然 N
語言 N
處理 Vt


In [None]:
# [TODO8] Try your own sentence!
# sentence_2 = ...
# words_2 = ...
sentence_2 = "受到熱對流發展影響，各地出現午後雷陣雨，中央氣象局發布大雨特報，另針對「臺南市」發布大雷雨即時訊息，持續時間至15時15分止，氣象局提醒民眾外出留意天氣變化。"
# Segment the sentence into words with PoS tags
words_2 = pseg.cut(sentence_2)
print("Part of Speech tagging my chinese input: ", sentence_2)
# Print one "word tag" pair per line
for word, tag in words_2:
  print(word, tag)

Part of Speech tagging my chinese input:  受到熱對流發展影響，各地出現午後雷陣雨，中央氣象局發布大雨特報，另針對「臺南市」發布大雷雨即時訊息，持續時間至15時15分止，氣象局提醒民眾外出留意天氣變化。
受到 Vt
熱對流 N
發展 Nv
影響 N
， x
各地 Vi
出現 Vi
午後 N
雷陣雨 N
， x
中央 N
氣象局 N
發布 Vt
大雨 N
特報 N
， x
另 C
針對 P
「 x
臺南市 N
」 x
發布 Vt
大 Vi
雷雨 N
即時 ADV
訊息 N
， x
持續 Vt
時間 N
至 p
15 m
時 n
15 m
分止 v
， x
氣象局 N
提醒 Vt
民眾 N
外出 Vi
留意 Vt
天氣 N
變化 N
。 x
