# In [13]:
import collections
import math
import jieba.posseg as pseg

# Locations of the input corpus and of the mutual-information output file.
file_path = "../data/产出导向法.txt"
mi_path = "../results/互信息.txt"

# Context-window radius used by build_cowords when collecting neighbours.
window_size = 5

# Process the corpus
def build_corpus():
    """Read the corpus file and segment every line into a list of words.

    The file at ``file_path`` is read as UTF-8 and split on ``'\\n'``; each
    line is segmented with ``pseg.cut`` and tokens whose part-of-speech flag
    starts with one of the skipped letters are dropped.

    Returns:
        A list with one token list per line of the file, e.g.
        ``[['a', 'the'], ['this', 'that']]``. Lines that yield no tokens
        appear as empty lists.
    """
    # First letters of the POS flags to discard.
    # NOTE(review): presumably punctuation / prepositions / particles /
    # conjunctions in jieba's tag set -- confirm against the jieba docs.
    skipped_flags = ('x', 'w', 'p', 'u', 'c')

    # Word segmentation
    def cut_words(sent):
        return [word.word for word in pseg.cut(sent) if word.flag[0] not in skipped_flags]

    # The context manager closes the handle even if reading or decoding fails.
    with open(file_path, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().split('\n')
    return [cut_words(line) for line in lines]
# Tally word frequencies
def count_words(sents):
    """Flatten the sentences and count how often each word occurs.

    Args:
        sents: iterable of token lists, one per sentence.

    Returns:
        A ``(words_all, word_dict)`` tuple: ``words_all`` is the flat list of
        every token in order, ``word_dict`` maps each word to its count, with
        keys ordered from most to least frequent.
    """
    flat_tokens = [token for sentence in sents for token in sentence]
    ranked_counts = collections.Counter(flat_tokens).most_common()
    frequencies = dict(ranked_counts)  # e.g. {'a': 1, 'the': 1}
    return flat_tokens, frequencies
# Build the context windows
def build_cowords(sents, window=None):
    """Collect, for every word position, the words surrounding it.

    Args:
        sents: list of token lists, one per sentence.
        window: number of words taken on each side of the current word.
            When omitted, the module-level ``window_size`` is used.

    Returns:
        A list with one entry per word position. Each entry is the list
        ``left context + right context + [current word]`` with empty-string
        tokens removed.

    Raises:
        ValueError: if ``window`` is negative.
    """
    if window is None:
        window = window_size
    if window < 0:
        raise ValueError("window must be non-negative")

    train_data = []
    for sent in sents:
        for index, word in enumerate(sent):
            # Slices clamp at the sentence end; only the left start needs
            # clamping so a negative index does not wrap around.
            left = sent[max(0, index - window):index]
            # Exactly `window` words to the right, symmetric with the left side.
            right = sent[index + 1:index + 1 + window]
            # Drop every empty token, not just the first one.
            data = [token for token in left + right + [word] if token != '']
            train_data.append(data)
    return train_data
# Collect the co-occurrence records
def count_cowords(train_data):
    """Record, for every word, all the words it shared a window with.

    For each window in ``train_data`` and each position in that window, the
    whole window (including the word itself) is appended to that word's
    record.

    Args:
        train_data: list of windows, each a list of word strings.

    Returns:
        A dict mapping each word to a single ``'@'``-joined string of every
        word recorded for it, in encounter order, e.g.
        ``{'to': 'to@watch@movies', 'watch': 'to@watch@movies'}``.
    """
    neighbours = {}
    for data in train_data:
        for word in data:
            # Accumulate in a list; growing a dict-held string with += would
            # re-copy the whole string on every append.
            neighbours.setdefault(word, []).extend(data)
    # Join once per word to produce the same '@'-separated format.
    return {word: "@".join(tokens) for word, tokens in neighbours.items()}

# co_dict = {'to': 'to@watch@movies@.....too', 'watch':'to@watch@movies@.....too', 'movies':'to@watch@movies@.....too'}

# Compute mutual information
def compute_words_mi(word_dict, co_dict, sum_tf):
    """Score every (word, co-occurring word) pair with a log2 ratio.

    Args:
        word_dict: mapping from word to its frequency.
        co_dict: mapping from word to an ``'@'``-joined string of the words
            recorded alongside it.
        sum_tf: total count used as the denominator of every probability.

    Returns:
        A dict mapping each word to a list of ``(co_word, score)`` tuples
        sorted by score in descending order, where
        ``score = log2(P(x, y)) - log2(P(x)) - log2(P(y))``. The word itself
        is excluded from its own list.
    """
    mi_dict = {}
    for word, joined in co_dict.items():
        # most_common() yields each distinct co-word once, most frequent first.
        ranked = collections.Counter(joined.split('@')).most_common()
        p_x = word_dict[word] / sum_tf  # P(x): probability of the word itself
        scored = []
        for co_word, co_tf in ranked:
            if co_word != word:
                p_y = word_dict[co_word] / sum_tf  # P(y): probability of the co-word
                p_xy = co_tf / sum_tf  # joint probability P(x, y)
                score = math.log2(p_xy) - math.log2(p_x) - math.log2(p_y)
                scored.append((co_word, score))
        # Stable sort: ties keep the frequency order established above.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        mi_dict[word] = scored  # e.g. {'to': [('watch', 5.0), ('likes', 4.0)]}
    return mi_dict
# Save the computed mutual-information results to a file
def save_mi(mi_dict):
    """Write the mutual-information table to ``mi_path``.

    One line is written per word, in the form
    ``word<TAB>co_word@score,co_word@score,...``.

    Args:
        mi_dict: mapping from word to a sequence of ``(co_word, score)``
            pairs.
    """
    # Explicit UTF-8 keeps the output independent of the platform default and
    # consistent with how the corpus is read; the context manager closes the
    # file even if a write fails.
    with open(mi_path, 'w', encoding='utf-8') as out_file:
        for word, co_words in mi_dict.items():
            con_infos = [item[0] + "@" + str(item[1]) for item in co_words]
            out_file.write(word + '\t' + ','.join(con_infos) + '\n')
# Main routine: runs the whole pipeline at module level, in dependency order.

sents = build_corpus()  # segmented corpus, one token list per line
words_all,word_dict = count_words(sents)  # flat token list and word -> frequency
sum_tf = len(words_all)  # total number of tokens; denominator for the probabilities
train_data = build_cowords(sents)  # one context window per word position
co_dict = count_cowords(train_data)  # word -> '@'-joined co-occurring words
mi_dict = compute_words_mi(word_dict,co_dict,sum_tf)  # word -> sorted (co_word, score) list
save_mi(mi_dict)  # write the result to mi_path
