In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 先验概率

可以通过统计观测样本中的频率, **以频率估计概率**

# 似然

在**某个条件**下, 产生样本数据的概率

单词纠错案例中, 输入tess, 输出可能方案有less, test等.
那么似然表示: 在给定某个候选输出 (例如 test) 的条件下, 用户实际输入 tess 的概率, 即 $P(\text{tess} \mid \text{test})$.

对于这个似然, 需要进一步给出具体的定义 (例如根据编辑距离赋予不同的概率), 或者直接忽略这一项 (即认为所有候选词的似然相同, 只比较先验).

# 单词纠错案例

In [3]:
import re, collections

## 统计训练集中样本数据出现的频率作为先验

In [5]:
def to_words(text: str) -> list[str]:
    """Split ``text`` into lowercase alphabetic tokens.

    The text is lowercased first, then every maximal run of the letters
    ``a``-``z`` is returned in order of appearance. Digits, punctuation and
    any other characters act as separators and are dropped.
    """
    return re.findall(r"[a-z]+", text.lower())


def train(words):
    """Build a word -> count model from an iterable of words.

    Counts start from 1 rather than 0, so a word seen ``k`` times is stored
    with the value ``k + 1``. The returned mapping is a ``defaultdict`` whose
    default is 1: indexing it with a word that never appeared in ``words``
    inserts that word with the value 1.
    """
    # Tally raw occurrences in first-seen order, then shift every count by
    # the initial value of 1.
    occurrences = collections.Counter(words)
    shifted = {token: seen + 1 for token, seen in occurrences.items()}
    return collections.defaultdict(lambda: 1, shifted)


# textTrain.txt is the corpus used to train the model.
# Read it inside a context manager so the file handle is closed as soon as
# the contents are loaded (the bare open() previously leaked the handle).
with open('../../data/input/textTrain.txt') as f:
    text = f.read()
words = to_words(text)
d = train(words)
d

defaultdict(<function __main__.train.<locals>.<lambda>()>,
            {'hello': 2,
             'world': 2,
             'root': 2,
             'thank': 2,
             'you': 3,
             'test': 2,
             'less': 2,
             'how': 2,
             'old': 2,
             'are': 2,
             'thanks': 2,
             'good': 2,
             'learn': 2,
             'computer': 2,
             'hadoop': 2,
             'mapreduce': 2,
             'hdfs': 2})

In [12]:
# dict.get() bypasses the defaultdict factory, so a word that is not in the
# model comes back as None instead of being inserted with the default count.
hao_count = d.get("hao")
if hao_count is None:
    print("false")

false


## 指定编辑距离, 查找可能的备选单词

还需要根据编辑距离的不同确定单词的优先级, 即给不同优先级的单词不同的似然概率

In [23]:
# The 26 lowercase ASCII letters, used for substitutions and insertions.
alphabet = [chr(code) for code in range(ord('a'), ord('z') + 1)]


def getNextWordSet(word: str) -> set:
    """Return every string produced by applying one edit to ``word``.

    Four kinds of edit are generated:

    * deletion of one character,
    * transposition of two adjacent characters,
    * substitution of one character by a letter from ``alphabet``,
    * insertion of a letter from ``alphabet`` at any position.

    Substituting a character by itself is not filtered out, so for a
    non-empty ``word`` the result also contains ``word`` itself.
    """
    # Every way of cutting the word into a (head, tail) pair, including the
    # cuts with an empty head or an empty tail.
    pieces = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]

    # Drop the first character of the tail.
    removed = {head + tail[1:] for head, tail in pieces if tail}

    # Swap the first two characters of the tail.
    swapped = {
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in pieces
        if len(tail) > 1
    }

    # Replace the first character of the tail by each letter.
    substituted = {
        head + letter + tail[1:]
        for head, tail in pieces
        if tail
        for letter in alphabet
    }

    # Insert each letter between head and tail.
    inserted = {head + letter + tail for head, tail in pieces for letter in alphabet}

    return removed | swapped | substituted | inserted


def editDistanceN(resultSet: set, word: str, model: dict, n: int = 1) -> None:
    """Collect the model words reachable from ``word`` in ``n`` edit steps.

    Each step replaces a string by one member of ``getNextWordSet`` of that
    string. Every string reached after exactly ``n`` steps that has a
    non-None entry in ``model`` is added to ``resultSet`` in place.

    Args:
        resultSet: set that receives the matching words (mutated in place).
        word: the starting string.
        model: mapping of known words; looked up with ``.get`` so a
            ``defaultdict`` model is never modified by the search.
        n: number of edit steps. ``0`` only checks ``word`` itself.

    Raises:
        ValueError: if ``n`` is negative. (Previously any ``n < 1`` skipped
            the base case and recursed until ``RecursionError``.)
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    if n == 0:
        if model.get(word) is not None:
            resultSet.add(word)
        return

    # Expand level by level, keeping each intermediate level as a set so a
    # string reached through several edit paths is expanded only once.
    frontier = {word}
    for _ in range(n - 1):
        frontier = {
            candidate
            for current in frontier
            for candidate in getNextWordSet(current)
        }

    # The last level is filtered on the fly rather than materialised, since
    # it is by far the largest one.
    for current in frontier:
        for candidate in getNextWordSet(current):
            if model.get(candidate) is not None:
                resultSet.add(candidate)


# Toy model mapping each known word to its count.
model: dict = dict(hello=2, world=1)

# editDistanceN fills this set in place, so start from an empty one.
result: set = set()
word = 'he'
editDistanceN(result, word, model, n=3)
result





{'hello'}