In [2]:
import pandas as pd
import re
import jieba
from collections import Counter

In [3]:
# Read the movie-review CSV and keep its `comment` column as a plain Python list.
filename = './movie_comments.csv'
content = pd.read_csv(filename, low_memory=False)
comments = content['comment'].tolist()


In [4]:
def cleanTokens(string):
    """Return every maximal run of word characters found in ``string``.

    Args:
        string: Text to scan (must be a ``str``).

    Returns:
        A list of the non-overlapping ``\\w+`` matches, in order of appearance;
        an empty list when there is none.
    """
    # Raw string: in a normal literal '\w' is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return re.findall(r'\w+', string)
# Rebuild every comment from its word-character runs only, so punctuation and
# whitespace are dropped before segmentation.
# NOTE(review): str() turns a missing value (NaN) into the text 'nan' - confirm
# that is acceptable for the corpus.
comments_clean = list(
    ''.join(cleanTokens(str(raw_comment)))
    for raw_comment in comments
)


In [5]:
def cut_str(string):
    """Segment ``string`` with jieba and hand back the resulting pieces."""
    segments = jieba.lcut(string)
    return segments

In [6]:
# One token list per cleaned comment, in the same order as comments_clean.
TOKENS_raw = list(map(cut_str, comments_clean))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.313 seconds.
Prefix dict has been built succesfully.


In [8]:
# Flatten the per-comment token lists into one corpus-wide token sequence.
TOKENS = [token for comment_tokens in TOKENS_raw for token in comment_tokens]


In [9]:
# Adjacent-token bigrams, stored as the concatenation of the two tokens.
# Pairing TOKENS with TOKENS[1:] starts at the second token; indexing
# TOKENS[i-1] from i == 0 wrapped around to the last token and invented a
# (last, first) bigram that never occurs in the corpus.
TOKENS_two_words = [previous + current for previous, current in zip(TOKENS, TOKENS[1:])]

In [10]:
# Frequency tables: single tokens, and concatenated adjacent pairs.
count_word_one = Counter()
count_word_one.update(TOKENS)
count_word_two = Counter()
count_word_two.update(TOKENS_two_words)

In [11]:
def prob_1(word):
    """Relative frequency of ``word`` among all corpus tokens.

    Returns count_word_one[word] divided by len(TOKENS); a word that never
    occurred yields 0.
    """
    occurrences = count_word_one[word]
    corpus_size = len(TOKENS)
    return occurrences / corpus_size
def prob_2(word1,word2):
    """Estimate the probability of ``word2`` directly following ``word1``.

    Args:
        word1: The preceding token.
        word2: The token being predicted.

    Returns:
        count(word1 + word2) / count(word1) when the concatenated pair was
        observed in the corpus, otherwise the floor value
        1 / len(TOKENS_two_words) used for unseen pairs.
    """
    # Counter lookups are O(1); testing membership in the TOKENS_two_words list
    # rescanned the entire corpus on every call.
    pair_count = count_word_two.get(word1 + word2, 0)
    first_count = count_word_one.get(word1, 0)
    # Bigram keys are plain concatenations, so a differently split pair can hit
    # an observed key even though word1 itself was never a token; treat that
    # as unseen instead of dividing by zero.
    if pair_count and first_count:
        return pair_count / first_count
    return 1 / len(TOKENS_two_words)

In [32]:
# Example: bigram estimate for '厉害' directly following '吴京'.
prob_2('吴京','厉害')

0.0035971223021582736

In [13]:
def probility(sentence):
    """Score ``sentence`` under the bigram model.

    The sentence is segmented with cut_str and the score is the product of
    prob_2(previous, current) over every pair of adjacent words.

    Args:
        sentence: Raw text to score.

    Returns:
        The product as a float; 1.0 when the sentence has fewer than two words
        (empty product).
    """
    words = cut_str(sentence)
    prob_sentence = 1.0
    # Only pair a word with its real predecessor. Starting at index 0 paired the
    # first word with words[-1] (the LAST word); dividing the result by the
    # unseen-pair floor only cancelled that factor when the wrap-around pair was
    # unseen, and gave a wrong score whenever it happened to be in the corpus.
    for previous, current in zip(words, words[1:]):
        prob_sentence *= prob_2(previous, current)
    return prob_sentence
        

In [14]:
# Example: score the first cleaned comment with the bigram model.
probility(comments_clean[0])


5.372840606923687e-25

In [15]:
# Grammar descriptions consumed by creat_grammar: one rule per line, written as
# `symbol => alternative | alternative`, each alternative being a
# space-separated sequence of symbols. Symbols without a rule are emitted as-is.

# Start symbol: human.
grammar_human = """
human => 自己 寻找 活动
自己 => 我 | 俺 | 我们 
寻找 => 看看 | 找找 | 想找点
活动 => 乐子 | 玩的
"""
# Start symbol: host. The `数字` rule refers to itself, so generated numbers can
# have any number of digits.
grammar_host= """
host => 寒暄 报数 询问 业务相关 结尾 
报数 => 我是 数字 号 ,
数字 => 单个数字 | 数字 单个数字 
单个数字 => 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 
寒暄 => 称谓 打招呼 | 打招呼
称谓 => 人称 ,
人称 => 先生 | 女士 | 小朋友
打招呼 => 你好 | 您好 
询问 => 请问你要 | 您需要
业务相关 => 玩玩 具体业务
玩玩 => 耍一耍 | 玩一玩
具体业务 => 喝酒 | 打牌 | 打猎 | 赌博
结尾 => 吗？
"""

In [16]:
import random
def creat_grammar(gra_exp):
    """Parse a multi-line grammar description into a rule dictionary.

    Every non-blank line must look like ``name => alt1 | alt2 ...`` where each
    alternative is a whitespace-separated sequence of symbols.

    Args:
        gra_exp: The grammar text, one rule per line.

    Returns:
        A dict mapping each rule name to a list of alternatives, each
        alternative being a list of symbol strings.

    Raises:
        ValueError: If a non-blank line contains no ``=>``.
    """
    grammar = {}
    for line in gra_exp.split('\n'):
        # Skip blank AND whitespace-only lines (e.g. indented text or a stray
        # '\r'); the latter used to crash the unpacking below.
        if not line.strip():
            continue
        # Split on the first '=>' only, so the right-hand side may itself
        # contain '=>' without breaking the two-way unpack.
        exp, statment = line.split('=>', 1)
        grammar[exp.strip()] = [alternative.split() for alternative in statment.split('|')]
    return grammar

def generate(gra, tar):
    """Expand symbol ``tar`` into a string using the rules in ``gra``.

    A symbol with no rule in ``gra`` is returned unchanged. Otherwise one of
    its alternatives is picked at random and the expansions of that
    alternative's symbols are concatenated, left to right.
    """
    # Terminal symbol: nothing to expand.
    if tar not in gra:
        return tar
    chosen_alternative = random.choice(gra[tar])
    pieces = [generate(gra, symbol) for symbol in chosen_alternative]
    return ''.join(pieces)


In [17]:
# Example: sample one sentence from the host grammar, starting at 'host'.
generate(creat_grammar(grammar_host),'host')

'先生,您好我是39号,您需要耍一耍打牌吗？'

In [30]:
def generate_best(gra,tar,number):
    """Generate ``number`` random sentences and rank them by model score.

    Args:
        gra: Grammar description text, as accepted by creat_grammar.
        tar: Start symbol to expand.
        number: How many sentences to generate.

    Returns:
        A list of ``[sentence, score]`` pairs sorted by score, highest first,
        where score is probility(sentence).
    """
    # Parse the grammar once instead of on every iteration.
    grammar = creat_grammar(gra)
    sentences = []
    for _ in range(number):
        # Generate once and score that same sentence. Calling generate() a
        # second time for the score produced a different random sentence, so
        # the stored score did not belong to the stored text.
        sentence = generate(grammar, tar)
        sentences.append([sentence, probility(sentence)])
    return sorted(sentences, key=lambda pair: pair[1], reverse=True)


In [31]:
# Example: generate 50 host sentences and list them with their scores, best first.
generate_best(grammar_host,'host',50)

[['小朋友,你好我是675号,请问你要玩一玩赌博吗？', 2.1020374160402675e-55],
 ['女士,您好我是17号,您需要玩一玩赌博吗？', 2.6562510624962627e-56],
 ['你好我是5号,请问你要耍一耍喝酒吗？', 1.1537277972619217e-56],
 ['你好我是62号,请问你要玩一玩喝酒吗？', 6.545740296061415e-57],
 ['先生,您好我是1号,请问你要玩一玩打猎吗？', 2.3032727856750054e-57],
 ['您好我是91441号,请问你要玩一玩打猎吗？', 3.749918107480637e-59],
 ['您好我是5634号,请问你要耍一耍打猎吗？', 3.749918107480637e-59],
 ['小朋友,你好我是722号,请问你要玩一玩打牌吗？', 4.171933075375826e-62],
 ['先生,你好我是8785号,请问你要耍一耍赌博吗？', 2.1485455338185495e-62],
 ['小朋友,您好我是9号,您需要玩一玩赌博吗？', 1.3491028093259806e-62],
 ['女士,您好我是399855号,请问你要耍一耍喝酒吗？', 1.2493776116116556e-62],
 ['先生,您好我是87号,您需要玩一玩赌博吗？', 2.910685672763712e-63],
 ['您好我是35588942号,请问你要耍一耍打猎吗？', 2.910685672763712e-63],
 ['先生,您好我是984号,请问你要玩一玩打牌吗？', 3.396125608709517e-64],
 ['先生,您好我是498号,您需要玩一玩赌博吗？', 3.396125608709517e-64],
 ['您好我是6号,请问你要玩一玩赌博吗？', 3.396125608709517e-64],
 ['您好我是1号,您需要玩一玩喝酒吗？', 1.0752078002984028e-64],
 ['你好我是99号,您需要耍一耍赌博吗？', 2.0504515654984705e-66],
 ['女士,你好我是8118号,请问你要玩一玩打牌吗？', 2.0504515654984705e-66],
 ['您好我是8号,您