In [44]:
import wikipedia
import re
import nltk
from collections import Counter
from nltk.util import bigrams, trigrams
from nltk.tokenize import word_tokenize
import math

In [49]:
# Fetching the Wikipedia page for "Kpop"
page = wikipedia.page("Kpop")
text = page.content[:1000]  # Limit to 1000 characters

# Preprocess text: Remove punctuation and convert to lowercase
text = re.sub(r'[^\w\s]', '', text).lower()

# Tokenize text into individual words
tokens = word_tokenize(text)

# Print the cleaned and tokenized text
print(" ".join(tokens))

kpop korean 케이팝 rr keipap short for korean popular music is a form of popular music originating in south korea it includes styles and genres from around the world such as pop hip hop rb rock jazz gospel reggae electronic dance folk country disco and classical on top of its traditional korean music roots the term kpop became popular in the 2000s especially in the international context the korean term for domestic pop music is gayo 가요 歌謠 which is still widely used within south korea while kpop can refer to all popular music or pop music from south korea the term is often used when referring to artists associated with the entertainment and idol industry in the country regardless of the genre of music output modern kpop idol culture began in the 1990s as kpop idol music grew into a subculture of south korean culture and amassed enormous fandoms of teenagers and young adults the more modern form of the genre originally termed rap dance emer


In [50]:
#  Compute bigram probabilities using Laplace smoothing

def bigram_probabilities(tokens, alpha=1):
    unigram_counts = Counter(tokens)  
    bigram_counts = Counter(bigrams(tokens))  
    vocab_size = len(unigram_counts) 
    
    alpha_v = alpha * vocab_size  # Smoothing factor
    
    return {
        bigram: (count + alpha) / (unigram_counts[bigram[0]] + alpha_v)
        for bigram, count in bigram_counts.items()
    }

In [51]:
#  Compute trigram probabilities using Laplace smoothing

def trigram_probabilities(tokens, alpha=1):
    bigram_counts = Counter(bigrams(tokens)) 
    trigram_counts = Counter(trigrams(tokens))  
    vocab_size = len(set(tokens))  
    
    alpha_v = alpha * vocab_size  # Smoothing factor
    
    return {
        trigram: (count + alpha) / (bigram_counts[(trigram[0], trigram[1])] + alpha_v)
        for trigram, count in trigram_counts.items()
        if (trigram[0], trigram[1]) in bigram_counts
    }

In [52]:
def perplexity(model_probs, tokens, n):
    ngram_list = list(bigrams(tokens)) if n == 2 else list(trigrams(tokens))
    log_prob_sum = 0
    for ngram in ngram_list:
        prob = model_probs.get(ngram, 1e-10)  # Avoid zero probability issue
        log_prob_sum += math.log(prob)
    
    return math.exp(-log_prob_sum / max(1, len(ngram_list)))

In [53]:
test_sentence = "Kpop is enjoyed worldwide."

test_tokens = word_tokenize(test_sentence.lower())

In [54]:
bigram_probs = bigram_probabilities(tokens)
trigram_probs = trigram_probabilities(tokens)

bigram_perplexity = perplexity(bigram_probs, tokens, 2)
trigram_perplexity = perplexity(trigram_probs, tokens, 3)

In [55]:
print("Bigram Perplexity:", bigram_perplexity)
print("Trigram Perplexity:", trigram_perplexity)

Bigram Perplexity: 46.70556278222009
Trigram Perplexity: 49.37092567933589
