In [109]:
from collections import defaultdict
import numpy as np
import heapq
import random

class BiGram:
    def __init__(self, topk=10000):
        self.i2w = {}
        self.w2i = {}
        self.M = None
        self.topk = topk
        return None
    
    def sort_dict(self,d):
        sorted_d = sorted(d.items(), key = lambda item: item[1], reverse = True)
        return sorted_d
    
    def train(self, trainset):
        unkown = '<UKN>'
        start = '<S>'
        end = '<E>'
        self.i2w[0]=unkown
        self.i2w[1]=start
        self.i2w[2]=end
        totalcount = 0   #总词数
        wordcount = defaultdict(int)  #词频
        wordcount[start] = len(trainset)  #<s>句子开始标记数
        wordcount[end] = len(trainset)   #<e>句子结束标记数
        print("词频统计中...")
        #词频统计
        for sentence in trainset:
            for word in sentence:
                wordcount[word]+=1
        
        #汇总总词数
        for _,v in wordcount.items():
            totalcount+=v
        
        
        #选取topk高频词，同时获得 索引 与 词 的mapping 
        print("生成词表...")
        i = 3
        for w,_ in self.sort_dict(wordcount):
            if i < self.topk+3:
                self.i2w[i]=w
            i+=1
        
        #获得 词 与 索引 的mapping 
        for i,w in self.i2w.items():
            self.w2i[w] = i
        #print(self.w2i)
        
        #获得保留词的词频
        wordcount_filter = defaultdict(int)
        for word,v in wordcount.items():
            #print(word)
            if word in self.w2i:
                wordcount_filter[self.w2i[word]]=v
            else:
                wordcount_filter[self.w2i['<UKN>']]+=v
        #wordcount_filter[1]=len(trainset) #补充<s>
        #wordcount_filter[2]=len(trainset)
        
        #print(wordcount_filter)
                
        voc_len = len(self.i2w)
        
        print("计算BiGram概率...")
        self.M = np.zeros((voc_len,voc_len)) #初始化概率矩阵
        
        #统计二元组合出现次数
        print("统计二元组出现次数...")
        for sentence in trainset:
            sentence_full = ['<S>'] + sentence + ['<E>']
            sentence_idx = []
            for word in sentence_full:
                if word not in self.w2i:
                    sentence_idx.append(self.w2i['<UKN>'])
                else:
                    sentence_idx.append(self.w2i[word])

            for i in range(len(sentence_idx)-1):
                p1 = sentence_idx[i]
                p2 = sentence_idx[i+1]
                self.M[p1][p2]+=1
        
        #计算二元组条件概率
        print("计算二元组条件概率...")
        for i in range(voc_len):
            for j in range(voc_len):
                #print(self.M[i][j]+1, wordcount_filter[i], (self.M[i][j]+1)*wordcount_filter[i], (wordcount_filter[i]+wordlist_len))
                self.M[i][j]=(1.0*(self.M[i][j]+1)*wordcount_filter[i])/(wordcount_filter[i]+voc_len)
                #if self.M[i][j]!=0:
                #    print(self.M[i][j])
        #print(self.M)
        
        return None
        
    def perplexity(self, testset):
        '''
        计算困惑度
        '''
        sent_perp = [] #存储每句话的困惑度数值
        for sentence in testset:
            #句子序列添加句首和句尾标签，并将词转换成索引序列
            sentence_full = ['<S>'] + sentence + ['<E>']
            sentence_idx = []
            for word in sentence_full:
                if word not in self.w2i:
                    sentence_idx.append(self.w2i['<UKN>'])
                else:
                    sentence_idx.append(self.w2i[word])
            #计算每个句子的联合概率
            prob = 1
            for i in range(len(sentence_idx)-1):
                p1 = sentence_idx[i]
                p2 = sentence_idx[i+1]
                #print(self.M[p1][2])
                prob*=1/self.M[p1][2]
            
            sent_perp.append(np.power(prob,1/len(sentence_full)))#困惑度计算
        return np.mean(sent_perp)#返回句子困惑度的均值
            
    def infer(self):
        '''
        生成语句
        '''
        word_sequence = []
        #根据句首标签生成下一个位置的词
        start_idx = self.w2i['<S>'] 
        prob_list = self.M[start_idx].tolist()
        #print(prob_list[:100])
        #选择条件概率最大的前五个候选词，从中随机选择一个，增加多样性
        top5_idx = heapq.nlargest(5,range(len(prob_list)),prob_list.__getitem__)
        word_idx = top5_idx[random.randint(0,4)]
        #print(word_idx,self.i2w[word_idx])
        word_sequence.append(self.i2w[word_idx])
        start_idx = word_idx
        #直到遇到句尾标记停止，否则循环继续
        while(word_idx!=self.w2i['<E>']):
            prob_list = self.M[start_idx].tolist()
            #print(prob_list[:100])
            top5_idx = heapq.nlargest(5,range(len(prob_list)),prob_list.__getitem__)
            word_idx = top5_idx[random.randint(0,4)]
            #print(word_idx,self.i2w[word_idx])
            word_sequence.append(self.i2w[word_idx])
            start_idx = word_idx
        return ' '.join(word_sequence)

In [111]:
sentences = []
punctions = ['！','。','?']
with open('199801.txt',"r",encoding="utf-8") as f:
    c = 0
    for line in f:
        if c>10000:
            break
        c+=1
        ls = line.strip().split('  ')
        line_segs = []
        candidate_sentence = []
        w_num = 0
        for i in range(1,len(ls)):
            if ls[i].endswith('/w') and ls[i][0] in punctions:
                w_num+=1
                if len(candidate_sentence)>=1:
                    sentences.append(candidate_sentence)
                    candidate_sentence = []
            else:
                end_idx = ls[i].index('/')
                candidate_sentence.append(ls[i][:end_idx])
        if len(candidate_sentence)>0:
            sentences.append(candidate_sentence)
print("总句子数：",len(sentences))

split_rate = 0.8 
topk = 10000

bigram = BiGram(topk)

split_rate = int(split_rate*len(sentences))
bigram.train(sentences[:split_rate])
print('已完成BiGram构建...')

print('BiGram困惑度：',bigram.perplexity(sentences[split_rate:]))

#print(bigram.M[0][0])
sentence = bigram.infer() 
print('生成的句子',sentence)

总句子数： 19753
词频统计中...
生成词表...
计算BiGram概率...
统计二元组出现次数...
计算二元组条件概率...
已完成BiGram构建...
BiGram困惑度： 23.932798554278946
生成的句子 <UKN> ， 是 一 次 会议 <E>


In [112]:
sentences = []
punctions = ['！','。','?','']
with open('199801.txt',"r",encoding="utf-8") as f:
    #c = 0
    for line in f:
        #if c>10000:
        #    break
        #c+=1
        ls = line.strip().split('  ')
        line_segs = []
        candidate_sentence = []
        w_num = 0
        for i in range(1,len(ls)):
            if ls[i].endswith('/w') and ls[i][0] in punctions:
                w_num+=1
                if len(candidate_sentence)>=1:
                    sentences.append(candidate_sentence)
                    candidate_sentence = []
            else:
                end_idx = ls[i].index('/')
                candidate_sentence.append(ls[i][:end_idx])
        if len(candidate_sentence)>0:
            sentences.append(candidate_sentence)
print("总句子数：",len(sentences))

split_rate = 0.8 
topk = 10000

bigram = BiGram(topk)

split_rate = int(split_rate*len(sentences))
bigram.train(sentences[:split_rate])
print('已完成BiGram构建...')

print('BiGram困惑度：',bigram.perplexity(sentences[split_rate:]))

#print(bigram.M[0][0])
sentence = bigram.infer() 
print('生成的句子:',sentence)

总句子数： 44473
词频统计中...
生成词表...
计算BiGram概率...
统计二元组出现次数...
计算二元组条件概率...
已完成BiGram构建...
BiGram困惑度： 13.342639884517247
生成的句子 ” ， 是 <UKN> 的 ， <UKN> <E>


In [115]:
sentence = bigram.infer() 
print('生成的句子:',sentence)

生成的句子: 在 新 问题 是 <UKN> ， 并 不 能 使 人 <E>


In [1]:
import jieba

In [3]:
jieba.lcut("It is good/")

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\zhang\AppData\Local\Temp\jieba.cache
Loading model cost 0.805 seconds.
Prefix dict has been built successfully.


['It', ' ', 'is', ' ', 'good', '/']