In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from urlextract import URLExtract
import jieba
import pandas as pd
import re
import numpy as np
import sys

## Define tokenizer and stop words.

def tokenize(text):
    """Segment ``text`` with ``jieba.lcut`` and return the resulting words."""
    return jieba.lcut(text)


# Tokens handed to the vectorizer as stop words: three Chinese punctuation
# marks and the plain space.
stopWords = ["。", "，", "？", " "]

## Split customer questions

# Read the whole chat log; the context manager closes the handle even if
# reading fails (the original left the file open).
with open('chatbot.txt', 'r') as f:
    fileContent = f.read()
contentSplit = fileContent.splitlines()

# Keep only lines that do not mention the shop account "佩爱旗舰店" and take the
# message text that follows the "):  " header separator.  The membership test
# uses the very separator that is split on (the original tested the shorter
# "):" and raised IndexError on lines lacking the two trailing spaces), and
# maxsplit=1 keeps any later "):  " that is part of the message itself.
messageSeparator = "):  "
clientQuestions = [x.split(messageSeparator, 1)[1]
                   for x in contentSplit
                   if ("佩爱旗舰店" not in x) and (messageSeparator in x)]

# Strip every "[...]" tag.  The previous split-based version raised IndexError
# when "]" appeared before "[" and discarded all text after a second tag.
clientQuestions = [re.sub(r'\[[^\[\]]*\]', '', x) for x in clientQuestions]

# Drop messages that contain a link and messages that are now empty.
clientQuestions = [x for x in clientQuestions if ("http" not in x) and len(x) != 0]

## TF-IDF process

# Fit TF-IDF weights over the customer questions using the custom tokenizer
# and stop-word list.
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stopWords)
matrix = vectorizer.fit_transform(clientQuestions)

# The raw file text is not used past this point; release it.
del fileContent
del contentSplit

## Features

# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer ``get_feature_names_out`` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    originalFeatures = list(vectorizer.get_feature_names_out())
else:
    originalFeatures = vectorizer.get_feature_names()

# Keep only the features containing at least one character in the range
# U+4E00-U+9FFF, recording both their column positions and their names.
chinesePattern = re.compile(r'[\u4e00-\u9fff]')
columnIndex = [i for i, name in enumerate(originalFeatures)
               if chinesePattern.search(name)]
columnNames = [originalFeatures[i] for i in columnIndex]

## Extract features        

# Select the wanted columns while the matrix is still sparse and only then
# densify.  The result is the same (selected features x documents) array as
# before, without materialising every column and copying it a second time.
wordMatrix = matrix[:, columnIndex].toarray().T

del matrix

## Clustering

clusters = 50
# KMeans initialisation is random: pin ``random_state`` so that re-running the
# cell reproduces the same clusters, and pin ``n_init`` because its default
# differs between scikit-learn releases.  Only the first 10000 documents are
# clustered; ``kmeans`` holds their labels.
kmeans = np.array(
    KMeans(n_clusters=clusters, n_init=10, random_state=0)
    .fit(wordMatrix.T[:10000])
    .labels_
)

## Get classes

# Group the questions by cluster label: ``classes[i]`` lists every question
# whose position in ``kmeans`` carries label ``i``.
classes = []
for i in range(clusters):
    ind = np.where(kmeans == i)[0]
    classes.append([clientQuestions[j] for j in ind])

# Write one file per cluster, one question per line.  ``with`` closes each
# file even when a write raises (the original relied on reaching f.close()).
for i in range(clusters):
    with open("Class_%d.txt" % (i), 'w') as f:
        f.writelines(question + '\n' for question in classes[i])

# features = [x for x in vectorizer.get_feature_names() if len(re.findall(r'[\u4e00-\u9fff]+', x)) != 0]
# wordDf = pd.DataFrame(matrix.toarray(),
#                       columns = vectorizer.get_feature_names())
# wordVector = wordDf[features]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.591 seconds.
Prefix dict has been built successfully.


In [136]:
# Write one entry of ``answers_pinyin`` per line.  The context manager closes
# the file even if a write raises.
with open("test.txt", 'w') as f:
    f.writelines(entry + '\n' for entry in answers_pinyin)

In [129]:
# TF-IDF over ``answers`` with the custom tokenizer and 1- to 3-gram features.
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             stop_words=stopWords,
                             ngram_range=(1, 3))
matrix = vectorizer.fit_transform(answers)

In [144]:
# TF-IDF over ``answers_pinyin`` with the built-in word analyzer and 1- to
# 5-gram features.
vectorizer = TfidfVectorizer(analyzer=u'word',
                             ngram_range=(1, 5),
                             stop_words=stopWords)
matrix = vectorizer.fit_transform(answers_pinyin)

In [145]:
# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer ``get_feature_names_out`` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    originalFeatures = list(vectorizer.get_feature_names_out())
else:
    originalFeatures = vectorizer.get_feature_names()

# Every feature is kept here, so the index list is simply 0..n-1 and the name
# list is a copy of the feature list.
columnIndex = list(range(len(originalFeatures)))
columnNames = list(originalFeatures)

In [147]:
len(columnNames)

743892

In [148]:
# Write one feature name per line.  The context manager closes the file even
# if a write raises.
with open("test.txt", 'w') as f:
    f.writelines(name + '\n' for name in columnNames)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cluster import KMeans
from urlextract import URLExtract
from string import punctuation
from xpinyin import Pinyin
import pandas as pd
import numpy as np
import unicodedata
import jieba
import sys
import re



p = Pinyin()
extractor = URLExtract()
## Define tokenizer and stop words.

def tokenize(text):
    """Segment ``text`` with ``jieba.lcut`` and return the resulting words."""
    return jieba.lcut(text)


# Only the plain space is treated as a stop word in this cell.
stopWords = [' ']

## Split customer questions

# Read the whole chat log and split it into lines; the context manager closes
# the handle even if reading fails (the original never closed it).
with open('chatbot.txt', 'r') as f:
    fileContent = f.read()
contentSplit = fileContent.splitlines()

def text_filter(text):
    """Clean one raw line of the chat log.

    Steps, in order:

    1. NFKC-normalise the line.
    2. Return "" for any line containing "-------".
    3. Replace every URL reported by the module-level ``extractor`` with
       "[链接]".
    4. Replace every number (optional sign, optional decimal part) with
       "[数字]".
    5. Drop everything up to and including the first "):  " header separator.
    6. Cut out the span from "对方正在使用" through the following "收发消息".
    7. Replace each run of punctuation, plus any spaces after it, with a
       single space.

    Args:
        text: one line of the log, as a ``str``.

    Returns:
        The cleaned line (possibly empty).
    """
    cur = unicodedata.normalize('NFKC', text)
    if '-------' in cur:
        return ""

    for link in extractor.find_urls(cur):
        cur = cur.replace(link, "[链接]")

    # A single regex substitution replaces each match exactly where it occurs.
    # The earlier findall + longest-first str.replace could rewrite part of a
    # different number (e.g. "2.3 1.2.3" left a stray "1." behind).
    cur = re.sub(r'-?\d+\.?\d*', "[数字]", cur)

    # Test for the same separator that is split on; the original tested
    # "): " (one space) but split on "):  " (two) and raised IndexError when
    # only the one-space form was present.  maxsplit=1 keeps a later
    # separator that belongs to the message.
    if "):  " in cur:
        cur = cur.split("):  ", 1)[1]

    # partition() only removes the notice when the end marker really follows
    # the start marker, instead of raising IndexError when it does not.
    head, start, rest = cur.partition("对方正在使用")
    if start:
        _, end, tail = rest.partition("收发消息")
        if end:
            cur = head + tail

    # The hyphen is escaped on purpose: unescaped, "`-―" is the range
    # U+0060..U+2015, which silently deleted every lowercase ASCII letter.
    # "¥" is listed next to "￥" because NFKC has already folded the
    # full-width sign into the half-width one by this point.
    cur = re.sub(r'''[\[\]【】“”‘’"'、,.。:;@#?!&$/()%~`\-―〈〉「」・+_*=《》^…￥¥]+ *''',
                 " ", cur)
    return cur
        
def skip_section(text, i):
    """Return the index of the line after the section that starts at ``i``.

    Scanning begins on the line after ``i`` and stops at the first line that
    contains "-------"; the returned index is the line following it.

    Args:
        text: list of log lines.
        i: index of the line that opens the section.

    Returns:
        Index of the first line after the closing "-------" line, or
        ``len(text)`` when the section is never closed (the original raised
        IndexError in that case).
    """
    total = len(text)
    i += 1
    while i < total and "-------" not in text[i]:
        i += 1
    return min(i + 1, total)



In [17]:
# Line filter configuration: cut the "对方正在使用 ... 收发消息" notice, use
# "-------" as the skip-line keyword and "): " as the prefix to strip.
text_filter = TextFilter(
    skip_word=[["对方正在使用", "收发消息"]],
    skip_line="-------",
    skip_prefix="): ",
)

In [16]:
from urlextract import URLExtract
import unicodedata
import re

extractor = URLExtract()


class TextFilter:
    """Configurable cleaner for a single line of text.

    ``fit_transform`` applies, in order: NFKC normalisation, the skip-line
    check, skip-word removal, link replacement, number replacement, prefix
    stripping and punctuation removal.  Each optional step is controlled by a
    constructor argument.

    Args:
        skip_word: list whose items are either a ``str`` (every occurrence is
            removed) or a two-item list ``[start, end]`` (the span from
            ``start`` through the following ``end`` is removed).
        skip_line: if non-empty, any line containing it is reduced to "".
        skip_prefix: if non-empty and present, everything up to and including
            its first occurrence is dropped.
        remove_punctuation: replace runs of punctuation with one space.
        remove_number: replace numbers with "[数字]".
        remove_link: replace URLs found by the module-level ``extractor``
            with "[链接]".
    """

    # The hyphen is escaped on purpose: unescaped, "`-―" is the range
    # U+0060..U+2015, which silently deleted every lowercase ASCII letter.
    # "¥" is listed next to "￥" because NFKC normalisation (applied first in
    # fit_transform) folds the full-width sign into the half-width one.
    _PUNCTUATION_RE = re.compile(
        r'''[\[\]【】“”‘’"'、,.。:;@#?!&$/()%~`\-―〈〉「」・+_*=《》^…￥¥]+ *''')

    # Optional sign, digits, optional decimal point and further digits.
    _NUMBER_RE = re.compile(r'-?\d+\.?\d*')

    def __init__(self, skip_word=None, skip_line="", skip_prefix="",
                 remove_punctuation=True, remove_number=True, remove_link=True):
        if skip_word is None:
            skip_word = []
        self.text = None
        self.keyword_skip_word = skip_word
        self.keyword_skip_line = skip_line
        self.keyword_skip_prefix = skip_prefix
        self._remove_punctuation = remove_punctuation
        self._remove_number = remove_number
        self._remove_link = remove_link

    @classmethod
    def remove_link(cls, text) -> str:
        """Replace every URL reported by ``extractor`` with "[链接]"."""
        for link in extractor.find_urls(text):
            text = text.replace(link, "[链接]")
        return text

    @classmethod
    def remove_numbers(cls, text) -> str:
        """Replace every number in ``text`` with "[数字]".

        A regex substitution replaces each match where it occurs.  The earlier
        findall + longest-first ``str.replace`` could rewrite part of a
        different number (e.g. "2.3 1.2.3" left a stray "1." behind).
        """
        return cls._NUMBER_RE.sub("[数字]", text)

    @classmethod
    def skip_word(cls, text, words) -> str:
        """Remove the configured words or spans from ``text``.

        Args:
            text: the string to clean.
            words: iterable of ``str`` items or two-item ``[start, end]`` lists.

        Returns:
            ``text`` with every ``str`` item removed and, for each pair, the
            first span from ``start`` through the following ``end`` removed.

        Raises:
            AssertionError: for an empty string or a list not of length two.
            Exception: for an item that is neither ``str`` nor ``list``.
        """
        for word in words:
            if isinstance(word, list):
                assert len(word) > 1, "Too few keywords in list."
                assert len(word) == 2, "Too many keywords in one skip."
                # partition() cuts the span only when ``end`` really follows
                # ``start``; the split()-based original raised IndexError
                # otherwise and dropped text after a repeated marker.
                head, start, rest = text.partition(word[0])
                if start:
                    _, end, tail = rest.partition(word[1])
                    if end:
                        text = head + tail
            elif isinstance(word, str):
                assert len(word) > 0, "Cannot skip empty string."
                # The original indexed a second split on the same word, which
                # always raised IndexError when the word was present.
                text = text.replace(word, "")
            else:
                raise Exception('Unsupported skip words.')
        return text

    def fit_transform(self, text):
        """Return the cleaned version of ``text`` (see the class docstring)."""
        cur = unicodedata.normalize('NFKC', text)
        # The original condition was inverted (``not ... != ""``): an empty
        # keyword blanked every line and a real keyword was never applied.
        if self.keyword_skip_line != "" and self.keyword_skip_line in cur:
            return ""
        if len(self.keyword_skip_word) > 0:
            cur = self.skip_word(cur, self.keyword_skip_word)
        if self._remove_link:
            cur = self.remove_link(cur)
        if self._remove_number:
            cur = self.remove_numbers(cur)
        if self.keyword_skip_prefix != "" and self.keyword_skip_prefix in cur:
            # maxsplit=1 keeps text after a later occurrence of the prefix.
            cur = cur.split(self.keyword_skip_prefix, 1)[1]
        if self._remove_punctuation:
            cur = self._PUNCTUATION_RE.sub(" ", cur)
        return cur

In [19]:
def content_extraction(text):
    """Pair consecutive customer lines with the shop lines that answer them.

    A line containing "(2017" without "佩爱旗舰店" starts a question; a line
    containing both starts an answer.  Following lines are appended to the
    current question or answer after passing through the module-level
    ``text_filter.fit_transform``.  Sections opened by a line containing both
    "-------" and "佩爱旗舰店" are skipped with ``skip_section``.

    Args:
        text: list of log lines.

    Returns:
        A list of ``[Q, A, Q_pinyin, A_pinyin]`` records with whitespace runs
        collapsed to single spaces; the pinyin comes from the module-level
        ``p.get_pinyin``.
    """
    def squeeze(value):
        # Collapse every run of whitespace to a single space.
        return re.sub(r"\s+", " ", value)

    total = len(text)
    res = []
    i = 0
    while i < total:
        Q = ""
        A = ""
        if "(2017" in text[i] and "佩爱旗舰店" not in text[i]:
            while i < total:
                Q += text_filter.fit_transform(text[i])
                i += 1
                if i < total and "-------" in text[i] and "佩爱旗舰店" in text[i]:
                    i = skip_section(text, i)
                    continue
                if i < total and "(2017" in text[i] and "佩爱旗舰店" in text[i]:
                    break
        if i < total and "(2017" in text[i] and "佩爱旗舰店" in text[i]:
            while i < total:
                A += text_filter.fit_transform(text[i])
                i += 1
                if i < total and "-------" in text[i] and "佩爱旗舰店" in text[i]:
                    i = skip_section(text, i)
                    continue
                if i < total and "(2017" in text[i] and "佩爱旗舰店" not in text[i]:
                    break
            # Record the pair once the answer is complete, whether the loop
            # stopped at the next question or at the end of the log.  The
            # original only recorded it on seeing another question, so the
            # last exchange of the log was silently lost.
            Q = squeeze(Q)
            Q_p = squeeze(p.get_pinyin(Q, ' '))
            A = squeeze(A)
            A_p = squeeze(p.get_pinyin(A, ' '))
            res.append([Q, A, Q_p, A_p])
        else:
            i += 1
    return res

# Build the question/answer records from the raw log lines.
contents = content_extraction(contentSplit)

# The answer text is the second field of every record.
answers = [record[1] for record in contents]

# TF-IDF over the answers with the custom tokenizer and 1- to 3-gram features.
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             stop_words=stopWords,
                             ngram_range=(1, 3))
matrix = vectorizer.fit_transform(answers)

In [20]:
answers

[' 卡片 ',
 ' 在的亲',
 ' 建议您 数字 ',
 ' 卡片 在的我有什么能帮到您的吗 表情 ',
 ' 数字 贴身 数字 宽松 建议选择 数字 的 宝宝夏天穿宽松一些更舒服',
 ' 表情 ',
 ' 宽松版型的 我们的衣衣尺码仅适用于我们的衣衣噢 不同厂家的尺码规格是不一样的噢',
 ' 数字 适合 数字 斤内的宝宝 数字 适合 数字 数字 斤的宝宝 您的宝宝现在穿 数字 是非常贴身的 很快就得换了',
 ' 嗯嗯 建议您直接选择 数字 的 这样能穿到宝宝 数字 数字 个月 可以穿过夏天了',
 ' 表情 ',
 ' 链接 比较热的地区建议这款的',
 ' 这款是彩棉肚兜 亲看的怎么样了呢您还有什么需要我帮助的吗 ',
 ' 您先提交一下订单 我帮您免邮后您再付款呢',
 ' 嗯呢 奢华盛景 ',
 ' 卡片 你好 亲',
 ' 亲 宝宝的身高和体重是多少呢 大号有的呀',
 ' 亲 宝宝的身高和体重是多少呢 我看看宝宝适合什么样的尺码',
 ' 不是有 数字 的吗 我不懂亲的意思哦 袜子不到 数字 厘米是什么意思呢 亲收到了袜子 没有 数字 厘米是这个意思吗 ',
 ' 选择 数字 的呢',
 ' 链接 这款有大码哦 亲 刚才发的没有了哦',
 ' 不好意思哦 亲 那就没有了哦',
 ' 卡片 ',
 ' 建议您选择 数字 噢 适合 数字 数字 斤的宝宝 宝宝穿上宽松舒适 表情 您急用的话 数字 点前拍下联系我 我帮您通知仓库给您优先安排发乎 呼吸冷冰 ',
 ' 卡片 在的亲',
 ' 竹纤维的更薄哦亲 数字 手感柔软 柔软爽滑不扎身 宝宝穿着妈妈放心 数字 吸湿性 放湿性 透气性居各大纺织纤维之首 夏天穿最合适了 数字 抑菌抗菌 面料的特殊功效 妈妈为宝宝少操心 竹纤维的缺点 面料强度不够 容易收到损伤 不能够承受太高强度的拉扯 所以建议在清洗过程中不要用力拧干 不可机洗 最好不要暴晒 ',
 ' 夏季的话选择竹纤维是最凉爽的哦 竹纤维妙恋自带凉感 更柔软舒适哦 放心好了 我们家的衣服都是质量保证的 您可以选择竹纤维的哦 柔软舒适的',
 ' 建议亲可以选择 数字 码合适哦亲 适合 数字 数字 斤宝宝',
 ' 数字 的话现在宽松偏大一点 适合 数字 数字 斤宝宝 数字 的话穿着合身 表情 看的如何了亲 本店保证无荧光无甲醛 全部都是A类一等品

In [46]:
from urlextract import URLExtract
from xpinyin import Pinyin
import unicodedata
import jieba
import os
import re

p = Pinyin()


class Conversation:
    """Pair question lines with answer lines from a list of log lines.

    Lines are classified with ``is_wanted`` using the include/exclude keyword
    lists given to the constructor.  ``transform`` appends
    ``[Q, A, Q_pinyin, A_pinyin]`` records to ``self.data``.

    Args:
        include_question / exclude_question: keywords a line must / must not
            contain to start a question.
        include_answer / exclude_answer: the same for lines starting an answer.
        include_skip_section / exclude_skip_section: the same for lines that
            open a section to be skipped.
        text_filter: object whose ``fit_transform(line)`` returns the cleaned
            line.
    """

    def __init__(self, include_question: list, exclude_question: list,
                 include_answer: list, exclude_answer: list,
                 include_skip_section: list, exclude_skip_section: list,
                 text_filter=None):

        self.text_filter = text_filter
        self.include_question = include_question
        self.exclude_question = exclude_question
        self.include_answer = include_answer
        self.exclude_answer = exclude_answer
        self.include_skip_section = include_skip_section
        self.exclude_skip_section = exclude_skip_section
        self.text_length = None
        self.data = []

    @classmethod
    def skip_section(cls, text, line):
        """Return the index of the line after the section that starts at ``line``.

        Scanning begins on the following line and stops at the first line
        containing "-------".  Returns ``len(text)`` when the section is never
        closed (the original raised IndexError in that case).
        """
        total = len(text)
        line += 1
        while line < total and "-------" not in text[line]:
            line += 1
        return min(line + 1, total)

    @classmethod
    def is_wanted(cls, text, includes, excludes):
        """Return True when ``text`` contains every include and no exclude."""
        return (all(include in text for include in includes)
                and not any(exclude in text for exclude in excludes))

    def _append_pair(self, question, answer):
        """Collapse whitespace, add pinyin and store one record in ``self.data``."""
        question = re.sub(r"\s+", " ", question)
        question_pinyin = re.sub(r"\s+", " ", p.get_pinyin(question, ' '))
        answer = re.sub(r"\s+", " ", answer)
        answer_pinyin = re.sub(r"\s+", " ", p.get_pinyin(answer, ' '))
        self.data.append([question, answer, question_pinyin, answer_pinyin])

    def transform(self, text):
        """Scan ``text`` and append every question/answer pair to ``self.data``.

        Records accumulate: ``self.data`` is only initialised in ``__init__``,
        so calling ``transform`` twice on one instance appends twice.

        Args:
            text: list of log lines.
        """
        self.text_length = len(text)
        total = self.text_length
        line = 0
        while line < total:
            question = ""
            answer = ""
            if self.is_wanted(text[line], self.include_question, self.exclude_question):
                while line < total:
                    question += self.text_filter.fit_transform(text[line])
                    line += 1
                    if line < total and self.is_wanted(
                            text[line], self.include_skip_section, self.exclude_skip_section):
                        line = self.skip_section(text, line)
                        continue
                    if line < total and self.is_wanted(
                            text[line], self.include_answer, self.exclude_answer):
                        break
            # Bounds check first: a question that runs to the end of the log
            # leaves ``line == total`` and the original then raised IndexError.
            if line < total and self.is_wanted(
                    text[line], self.include_answer, self.exclude_answer):
                while line < total:
                    answer += self.text_filter.fit_transform(text[line])
                    line += 1
                    if line < total and self.is_wanted(
                            text[line], self.include_skip_section, self.exclude_skip_section):
                        line = self.skip_section(text, line)
                        continue
                    if line < total and self.is_wanted(
                            text[line], self.include_question, self.exclude_question):
                        break
                # Store the pair whether the answer ended at the next question
                # or at the end of the log; the original only stored it in the
                # former case and lost the last exchange.
                self._append_pair(question, answer)
            else:
                line += 1

In [47]:
# Questions: lines with "(2017" but without the shop name.  Answers: lines
# with both.  Skipped sections open on lines with "-------" and the shop name.
conversation = Conversation(
    include_question=["(2017"],
    exclude_question=["佩爱旗舰店"],
    include_answer=["(2017", "佩爱旗舰店"],
    exclude_answer=[],
    include_skip_section=["-------", "佩爱旗舰店"],
    exclude_skip_section=[],
    text_filter=text_filter,
)

In [30]:
# Read the whole chat log and split it into lines; the context manager closes
# the handle even if reading fails (the original never closed it).
with open('chatbot.txt', 'r') as f:
    file_content = f.read()
text = file_content.splitlines()

In [39]:
conversation.is_wanted("佩爱旗舰店:兔麻麻(2017-06-15 10:49:11):  [卡片]", ["(2017"], ["佩爱旗舰店"])

False

In [48]:
conversation.transform(text)

In [49]:
conversation.data

[[' 链接 ', ' 卡片 ', ' lian jie ', ' qia pian '],
 [' 新生儿买什么号码', ' 在的亲', ' xin sheng er mai shen me hao ma', ' zai de qin'],
 [' 我家宝宝 数字 天 ',
  ' 建议您 数字 ',
  ' wo jia bao bao shu zi tian ',
  ' jian yi nin shu zi '],
 [' 链接 ',
  ' 卡片 在的我有什么能帮到您的吗 表情 ',
  ' lian jie ',
  ' qia pian zai de wo you shen me neng bang dao nin de ma biao qing '],
 [' 宝宝满月 数字 斤 买什么号码',
  ' 数字 贴身 数字 宽松 建议选择 数字 的 宝宝夏天穿宽松一些更舒服',
  ' bao bao man yue shu zi jin mai shen me hao ma',
  ' shu zi tie shen shu zi kuan song jian yi xuan ze shu zi de bao bao xia tian chuan kuan song yi xie geng shu fu'],
 [' 好', ' 表情 ', ' hao', ' biao qing '],
 [' 是不是号码偏小 我家娃现在穿 数字 刚好 数字 宽松',
  ' 宽松版型的 我们的衣衣尺码仅适用于我们的衣衣噢 不同厂家的尺码规格是不一样的噢',
  ' shi bu shi hao ma pian xiao wo jia wa xian zai chuan shu zi gang hao shu zi kuan song',
  ' kuan song ban xing de wo men de yi yi chi ma jin shi yong yu wo men de yi yi o bu tong chang jia de chi ma gui ge shi bu yi yang de o'],
 [' 那我娃目前就适合 数字 她现在刚满月',
  ' 数字 适合 数字 斤内的宝宝 数字 适合 数字 数字 斤的宝宝 您的宝宝现在穿 数字 是非常贴