In [3]:
# Training corpus: directory and default sample file name
CORPUS_DIR = "../corpus/"
TRAIN_DATA = "train.data"

# Trained model: output directory and default model file name
MODEL_DIR = "../model/"
MODEL_FILE = "demo.model"

# Directory that receives test results
RST_DIR = "../testresult/"

In [10]:
import re
import logging
import traceback
import os
import sys
import commands

class demo:
    '''
    @brief Rule-based advertisement detector.
        Text is normalised by sampleTidy() and then matched against the
        regular expressions that loadModel() reads from disk.
    '''

    def __init__(self):
        # model_flag is only switched on once loadModel() has read the rules.
        self.model_flag = False
        self.loadModel()


    def loadModel(self, model_dir = MODEL_DIR):
        '''
        @brief Read the regular-expression rules.
            The raw lines of ../data/regfea.data are stored in
            self.regular_expressions and self.model_flag is set to True.
            model_dir is accepted for interface compatibility but is not
            used by this method.
        '''
        # "with" closes the file even when readlines() raises.
        with open("../data/regfea.data", "r") as r_file_handler:
            self.regular_expressions = r_file_handler.readlines()
        self.model_flag = True


    def _toUnicode(self, text):
        '''
        @brief Decode a UTF-8 byte string; return anything else unchanged.
        '''
        if isinstance(text, bytes):
            return text.decode("utf8")
        return text


    def htmlTidy(self, utext):
        '''
        @brief Remove html tags from the text.
            1. &nbsp; becomes a plain space
            2. closing, self-closing and opening tags with a lower-case
               name are dropped
            3. &lt; and &gt; are decoded last, so that decoded brackets
               are not mistaken for tags
            Empty input (None or "") is returned unchanged.
        '''
        if not utext:
            return utext
        utext = utext.replace("&nbsp;", " ")
        utext = re.sub(u"</[a-z]+>", "", utext)
        utext = re.sub(u"<[a-z]+/>", "", utext)
        # "[^>]*" (not "+"): a bare single-letter tag such as <p> or <b> has
        # nothing between its name and ">", and must be removed as well.
        utext = re.sub(u"<[a-z]+[^>]*>", "", utext)
        utext = utext.replace("&lt;", "<")
        utext = utext.replace("&gt;", ">")
        return utext


    def convertEnchar(self, utext):
        '''
        @brief Convert full-width characters to half-width, leave the rest.
            1. full-width A-Z / a-z (U+FF21-U+FF3A, U+FF41-U+FF5A) -> A-Z / a-z
            2. full-width 0-9 (U+FF10-U+FF19) -> 0-9
        '''
        converted = []
        for uchar in utext:
            if (u'\uFF21' <= uchar <= u'\uFF3a'
                    or u'\uFF41' <= uchar <= u'\uFF5a'
                    or u'\uFF10' <= uchar <= u'\uFF19'):
                # 65248 (0xFEE0) is the distance between a full-width
                # character and its ASCII counterpart.
                uchar = chr(ord(uchar) - 65248)
            converted.append(uchar)
        return "".join(converted)


    def sampleTidy(self, utext):
        '''
        @brief Simple text clean-up
            1. decode UTF-8 byte strings to unicode
            2. remove html tags
            3. convert full-width letters and digits to half-width
            4. strip leading/trailing whitespace and lower-case the text
            5. collapse every run of whitespace into a single space
            6. drop a space that directly follows a Chinese character
        '''
        # Decoding first keeps htmlTidy() working on text rather than bytes;
        # the tag patterns are pure ASCII, so the result is the same.
        utext = self._toUnicode(utext)
        utext = self.htmlTidy(utext)
        utext = self.convertEnchar(utext)
        utext = utext.strip().lower()
        utext = utext.replace("&nbsp;", " ")
        utext = utext.replace(u"\u3000", " ")   # ideographic (full-width) space
        utext = re.sub(r"\s+", " ", utext)

        pieces = []
        after_han = False   # True while the last kept character is U+4E00-U+9FA5
        for item in utext:
            if item == " " and after_han:
                continue
            pieces.append(item)
            after_han = u'\u4e00' <= item <= u'\u9fa5'
        return "".join(pieces)


    def preRegularExpression(self, utext):
        '''
        @brief Flag advertisements with the loaded regular expressions.
            Every rule line has the form "<tag>\\t<pattern>". Lines without a
            pattern, lines tagged "regcom" or "regpy", and empty patterns
            are skipped. Patterns that fail to evaluate are logged and
            skipped.

          Input:  utext ---- text to check (whitespace runs are collapsed
                             to one space before matching)

          Output: ("guanggao", "1", rule) for the first pattern that matches,
                  where rule is "preRegularExpression " followed by that
                  pattern; None when nothing matches.
        '''
        rule = "preRegularExpression "
        utext = re.sub(r"\s+", " ", utext)
        for item in self.regular_expressions:
            tmp_list = item.split("\t")
            if len(tmp_list) < 2:
                continue
            reg_tag = self._toUnicode(tmp_list[0].strip())
            if reg_tag in ("regcom", "regpy"):
                continue
            regular_expression = self._toUnicode(tmp_list[1].strip())
            if not regular_expression:
                # An empty pattern matches every text and would flag
                # everything as an advertisement.
                continue
            try:
                reg_match = re.search(regular_expression, utext)
            except Exception:
                logging.error(traceback.format_exc())
                continue
            if reg_match:
                rule += regular_expression
                return ("guanggao", "1", rule)
        return None


    def train(self, train_file = TRAIN_DATA, model_dir = MODEL_DIR):
        '''
        @brief Train the model (the training step itself is not implemented).
            Only the base name of train_file is used: the sample file is
            opened from CORPUS_DIR, model_dir is created when missing, and
            an empty "<model_dir>/<base name>.model" file is created or
            truncated. Failures are logged; the method always returns None.
        '''
        try:
            train_file = os.path.basename(train_file)
            # Both handles are managed by "with" so that they are closed on
            # every exit path.
            with open(CORPUS_DIR + train_file, "r"):
                if not os.path.isdir(model_dir):
                    try:
                        os.mkdir(model_dir)
                    except OSError:
                        logging.error("Make directory failed")
                        return
                model_file = os.path.join(model_dir, train_file)
                with open(model_file + ".model", "w"):
                    pass
        except Exception:
            logging.error(traceback.format_exc())
            return


    def prediction(self, utext):
        '''
        @brief Classify a text with the loaded rules
          Input:  utext ---- text to judge

          Output: 4-tuple or None
                  rule hit: (label, score, rule, judge_module), with label
                            "guanggao", score "1" and rule naming the
                            matching pattern
                  no hit:   ("common", "0.6", judge_module, "")
                  None:     rules not loaded, cleaned text longer than 300
                            characters, or an exception was raised (logged)

          NOTE(review): the two tuples order rule and judge_module
          differently. The order is kept as is because callers may rely on
          it; confirm the intended layout before unifying it.
        '''
        try:
            if not self.model_flag:
                logging.error("The model can not loaded")
                return None
            utext = self.sampleTidy(utext)
            judge_module = "demo"
            if len(utext) <= 300:
                rst = self.preRegularExpression(utext)
                if rst:
                    (label, score, rule) = rst
                    # Python 2: the rule is unicode here and is returned as
                    # a UTF-8 byte string; Python 3: already str, kept as is.
                    if not isinstance(rule, str):
                        rule = rule.encode("utf8")
                    logging.info("%s identify the utext %s", judge_module, utext)
                    return (label, score, rule, judge_module)
                return ("common", "0.6", judge_module, "")
        except Exception:
            logging.error(traceback.format_exc())
            return None


In [11]:
if __name__ == "__main__":
    # Smoke run: load the rules, run the training stub, classify one sample.
    utext = "qq2234234"
    my_demo = demo()
    my_demo.train("../corpus/demo.test", "model")
    # Single-argument print(...) gives the same output on Python 2 and also
    # parses on Python 3.
    print(my_demo.prediction(utext))


../corpus/demo.test
<open file '../corpus/demo.test', mode 'r' at 0x7f1bd1a7af60>
('guanggao', '1', 'preRegularExpression qq[\\d ]{7,9}', 'demo')
