# 1. 构建语料库

In [2]:
# Toy training corpus: six short Chinese sentences. The n-gram counts computed
# later in this notebook are derived from these strings.
corpus = [
    "我喜欢吃苹果",
    "我喜欢吃香蕉",
    "她喜欢吃葡萄",
    "他不喜欢吃香蕉",
    "他喜欢吃苹果",
    "她喜欢吃草莓"
]

# 2. 定义分词函数：把句子切分成单个字符（Token），作为构建 N-Gram 的基本单位

In [3]:
def tokenize(text):
    """Split ``text`` into a list of its individual characters.

    Each character is treated as one token; an empty string yields ``[]``.
    """
    return list(text)

# 3. 计算每个 Bigram 在语料库中的词频

In [4]:
from collections import defaultdict, Counter

def count_ngrams(corpus, n):
    """Count how often each token follows each (n-1)-token context.

    Args:
        corpus: Iterable of texts; each one is split with ``tokenize``.
        n: Order of the n-gram (1 = unigram, 2 = bigram, ...). Must be >= 1.

    Returns:
        A ``defaultdict(Counter)`` mapping every context tuple of length
        ``n - 1`` to a ``Counter`` of the tokens observed right after it.
        For ``n == 1`` the only context is the empty tuple ``()``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Reject invalid orders up front; otherwise the failure surfaces as an
    # opaque IndexError when the last token of an empty n-gram is read.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    ngrams_count = defaultdict(Counter)
    for text in corpus:
        tokens = tokenize(text)
        # Slide a window of width n over the tokens; texts shorter than n
        # produce an empty range and contribute nothing.
        for start in range(len(tokens) - n + 1):
            *context, token = tokens[start:start + n]
            ngrams_count[tuple(context)][token] += 1
    return ngrams_count


    

In [5]:
# Build the bigram table (n=2) and print, for each context, the counts of the
# tokens that follow it.
bigram_counts = count_ngrams(corpus, 2)
print("Bigram 词频:")
for prefix, counts in bigram_counts.items():
    context_text = "".join(prefix)
    print(context_text + ":" + str(dict(counts)))

Bigram 词频:
我:{'喜': 2}
喜:{'欢': 6}
欢:{'吃': 6}
吃:{'苹': 2, '香': 2, '葡': 1, '草': 1}
苹:{'果': 2}
香:{'蕉': 2}
她:{'喜': 2}
葡:{'萄': 1}
他:{'不': 1, '喜': 1}
不:{'喜': 1}
草:{'莓': 1}


# 4. 计算每个 Bigram 出现的概率

In [6]:
def ngram_probability(ngram_counts):
    """Turn follower counts into conditional probabilities per context.

    Args:
        ngram_counts: Mapping of context -> mapping of token -> count.

    Returns:
        A ``defaultdict(Counter)`` mapping each context to
        ``{token: count / total count for that context}``. Contexts that
        have no followers are left out of the result.
    """
    probabilities = defaultdict(Counter)
    for context, followers in ngram_counts.items():
        if not followers:
            # Nothing to normalise; do not create an entry for this context.
            continue
        denominator = sum(followers.values())
        probabilities[context] = Counter(
            {follower: occurrences / denominator
             for follower, occurrences in followers.items()}
        )
    return probabilities


In [7]:
# Normalise the bigram counts into probabilities and print them per context.
bigram_probs = ngram_probability(bigram_counts)
print("Bigram 出现的概率")
for prefix, probs in bigram_probs.items():
    context_text = "".join(prefix)
    print(context_text + ":" + str(dict(probs)))

Bigram 出现的概率
我:{'喜': 1.0}
喜:{'欢': 1.0}
欢:{'吃': 1.0}
吃:{'苹': 0.3333333333333333, '香': 0.3333333333333333, '葡': 0.16666666666666666, '草': 0.16666666666666666}
苹:{'果': 1.0}
香:{'蕉': 1.0}
她:{'喜': 1.0}
葡:{'萄': 1.0}
他:{'不': 0.5, '喜': 0.5}
不:{'喜': 1.0}
草:{'莓': 1.0}


# 5. 根据 Bigram 出现的概率, 定义生成下一个词的函数

In [8]:
def generate_next_tokens(prefix, ngram_probs):
    """Return the most probable token that follows ``prefix``.

    Args:
        prefix: Context used as the lookup key in ``ngram_probs``.
        ngram_probs: Mapping of context -> mapping of token -> probability.

    Returns:
        The token with the highest probability for ``prefix`` (the first one
        in iteration order wins ties), or ``None`` if ``prefix`` is not a key
        of ``ngram_probs``.
    """
    if prefix in ngram_probs:
        distribution = ngram_probs[prefix]
        best_token, _ = max(distribution.items(), key=lambda pair: pair[1])
        return best_token
    return None

In [9]:
# Predict the most likely character after "不". Contexts are tuples of tokens,
# so the one-token context is written as an explicit 1-tuple literal rather
# than tuple("不",), which only works because the string has a single character.
print(generate_next_tokens(("不",), bigram_probs))

喜


# 6. 输入前缀, 生成连续文本

In [10]:
def generate_text(prefix, ngram_probs, n, length=6):
    """Greedily extend ``prefix`` one token at a time using an n-gram table.

    Args:
        prefix: Starting tokens (a string is treated as a sequence of
            characters).
        ngram_probs: Mapping of context tuple -> mapping of token ->
            probability, keyed by contexts of ``n - 1`` tokens.
        n: Order of the n-gram model the table was built with.
        length: Maximum total number of tokens in the result, including the
            tokens of ``prefix``.

    Returns:
        The tokens joined into one string. Generation stops early when the
        current context has no entry in ``ngram_probs``. If ``prefix``
        already has ``length`` or more tokens it is returned unchanged.
    """
    tokens = list(prefix)
    context_size = n - 1
    for _ in range(length - len(tokens)):
        # A unigram model (n == 1) has an empty context. It must be handled
        # explicitly because tokens[-0:] is the whole list, not an empty slice.
        context = tuple(tokens[-context_size:]) if context_size > 0 else ()
        next_token = generate_next_tokens(context, ngram_probs)
        # Only None means "no continuation"; do not stop on other falsy tokens.
        if next_token is None:
            break
        tokens.append(next_token)
    return "".join(tokens)

In [11]:
# Generate a sentence from the seed "我" with the bigram model (default length of 6 tokens).
print(generate_text("我", bigram_probs, n=2))

我喜欢吃苹果
