In [1]:
from ltp import LTP
from tqdm import tqdm
import json

In [2]:
# Build the LTP object from the model at 'pretrained_model/ltp_base'
# (presumably a local checkpoint directory — confirm it exists before running).
# Later cells call `ltp.seg`, `ltp.init_dict` and `ltp.add_words` on this object.
ltp = LTP(path='pretrained_model/ltp_base') 

In [3]:
# Build the attribute word list that is later registered with the segmenter.
def get_dict(file):
    """Flatten an attribute -> attribute-values JSON file into a list of words.

    The file is expected to hold a JSON object whose values are lists of
    strings. Each string may pack several synonymous values separated by '=';
    every part becomes its own entry in the result.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A list with every '='-separated value, in file order. Duplicates are
        kept and the attribute names (the JSON keys) are not included.
    """
    # The attribute values are Chinese text, so decode explicitly as UTF-8
    # instead of relying on the platform's default locale encoding.
    with open(file, 'r', encoding='utf-8') as f:
        attr_to_vals = json.load(f)

    all_attr = []
    for attrval_list in attr_to_vals.values():
        for packed in attrval_list:
            all_attr.extend(packed.split('='))
    return all_attr
# Start from the attribute values found in the attribute dictionary file.
attr_dict_file = "original_data/attr_to_attrvals.json"
all_attr = get_dict(attr_dict_file)

# Hand-picked extra terms, grouped one list per theme, that should also be
# kept together as single tokens by the segmenter.
extra_words = [
    ['牛津布', '仿皮', '吸湿', '吸汗', '防滑', '抗冲击', '微弹', '加绒'],
    ['上青', '上青色', '上青绿', '羊绒衫'],
    ['休闲鞋', '工装鞋', '男包', '女包', '运动裤', '休闲裤', '加厚领'],
    ['加厚', '薄款', '厚款', '短款', '短外套'],
    ['不加绒', '无扣', '无弹力', '无弹', '无拉链'],
    ['一粒扣', '两粒扣', '暗扣', '三粒扣', '系扣'],
    ['大红色', '大花'],
]
# Append every group, in order, after the attribute values.
all_attr = all_attr + [term for group in extra_words for term in group]

# Load the user dictionary file first, then register the collected words.
ltp.init_dict(path="user_dict.txt", max_window=6)
ltp.add_words(words=all_attr, max_window=6)

In [4]:
# Count how often each segmented word occurs in the titles of both training
# files. The result, `word_dict`, maps upper-cased word -> occurrence count.
fine_file = 'train/fine45000.txt'
coarse_file = 'train/coarse89588.txt'

word_dict = {}
for train_file in (fine_file, coarse_file):
    # Titles are Chinese text: read as UTF-8 explicitly rather than with the
    # platform's default locale encoding.
    with open(train_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            # Each line is one JSON record; only its 'title' field is used here.
            item = json.loads(line)
            segment, _ = ltp.seg([item['title']])
            for word in segment[0]:
                word = word.upper()  # normalise Latin letters to upper case
                word_dict[word] = word_dict.get(word, 0) + 1

45000it [08:44, 85.79it/s]
89588it [17:24, 85.81it/s]


In [6]:
# Building the vocabulary takes a long time, so keep an untouched in-memory
# copy before later cells start mutating `word_dict` in place.
import copy 
copy_dict = copy.deepcopy(word_dict)

# Persist the raw vocabulary. Because ensure_ascii=False writes the Chinese
# characters verbatim, the file must be opened as UTF-8 explicitly; the
# platform default encoding may be unable to encode them.
# NOTE: open(..., 'w') does not create missing directories, so
# 'split_word/base_word_dict/' has to exist already.
with open('split_word/base_word_dict/word_dict.json', 'w', encoding='utf-8') as f:
    json.dump(word_dict, f, ensure_ascii=False)

In [None]:
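# Fallback: uncomment to restore `word_dict` from the in-memory backup
# (`copy_dict`) if one of the in-place pruning cells below went wrong.
# word_dict = copy.deepcopy(copy_dict)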

In [7]:
# Unify the many rare colour variants: every rare word (count < 50) that
# contains a base colour character adds its own count to that character, so
# the base colour survives the frequency pruning done afterwards.
color_list = ['兰','蓝','灰','绿','粉','红','黄','青','紫','白','黑','骆','橙','杏','咖','棕','啡','褐','银','金','橘','藏']

# Snapshot the keys first: new colour entries may be inserted while looping.
keys = list(word_dict.keys())
for key in keys:
    count = word_dict[key]
    if count >= 50:
        continue  # frequent words stay as they are
    for ch in key:
        if ch not in color_list:
            continue
        if ch == key:
            # A rare key that is itself a bare colour character must not add
            # its count to itself; that would double it and could push it
            # over the pruning threshold without any real evidence.
            continue
        word_dict[ch] = word_dict.get(ch, 0) + count


In [8]:
# Drop words seen fewer than 50 times, plus the '/' separator token.
# Iterate over a snapshot of the keys because entries are deleted in the loop.
keys = list(word_dict.keys())
for key in keys:
    # Single combined condition: two separate `del` statements would raise
    # KeyError for a '/' entry that is also below the frequency threshold.
    if key == '/' or word_dict[key] < 50:
        del word_dict[key]

In [13]:
# Remove single characters that carry no meaning on their own.
delete_words = ['色','小','本','中','新','款','加','底','件','不']
for word in delete_words:
    # pop() with a default tolerates words that were already pruned for low
    # frequency and makes the cell safe to re-run.
    word_dict.pop(word, None)

In [29]:
# Save the pruned vocabulary. ensure_ascii=False writes Chinese characters
# verbatim, so open the file as UTF-8 explicitly instead of depending on the
# platform default encoding.
with open('split_word/base_word_dict/processed_word_dict.json', 'w', encoding='utf-8') as f:
    json.dump(word_dict, f, ensure_ascii=False)

In [None]:
# Reload the pruned vocabulary saved earlier in this notebook. The path matches
# the location the save cell writes to ('split_word/base_word_dict/'), and the
# file is decoded as UTF-8 because it contains raw Chinese characters.
with open('split_word/base_word_dict/processed_word_dict.json', 'r', encoding='utf-8') as f:
    processed_word_dict = json.load(f)

In [30]:
# Segment every title of the fine-grained training file and write one JSON
# record per line with two extra fields:
#   title_split - the raw segmentation of the title
#   vocab_split - the segmented words kept by the pruned vocabulary, plus the
#                 base colour characters found inside out-of-vocabulary words
fine_file = 'train/fine45000.txt'
save_fine_file = 'split_word/fine45000.txt'

vocab = list(word_dict.keys())
# Set copies give constant-time membership tests; the lookups below run once
# per token (or per character) over the whole file.
vocab_set = set(vocab)
color_list = ['兰','蓝','灰','绿','粉','红','黄','青','紫','白','黑','骆','橙','杏','咖','棕','啡','褐','银','金','橘','藏']
color_set = set(color_list)
rets = []
# The data is Chinese text: read and write as UTF-8 explicitly.
with open(fine_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        item = json.loads(line)
        segment, _ = ltp.seg([item['title']])
        item['title_split'] = segment[0]
        vocab_split = []
        for word in segment[0]:
            word = word.upper()  # normalise Latin letters to upper case
            if word in vocab_set:
                vocab_split.append(word)
            else:
                # Out-of-vocabulary word: keep only the colour characters in it.
                vocab_split.extend(c for c in word if c in color_set)
        item['vocab_split'] = vocab_split
        if not vocab_split:
            # Show titles for which nothing survived, for manual inspection.
            print(item['title'])
        # Re-insert 'feature' so it becomes the last key, which makes the
        # saved records easier to read.
        item['feature'] = item.pop('feature')
        rets.append(json.dumps(item, ensure_ascii=False)+'\n')

with open(save_fine_file, 'w', encoding='utf-8') as f:
    f.writelines(rets)

45000it [09:30, 78.90it/s]


In [33]:
# Segment every title of the coarse-grained training file and write one JSON
# record per line with two extra fields:
#   title_split - the raw segmentation of the title
#   vocab_split - the segmented words kept by the pruned vocabulary, plus the
#                 base colour characters found inside out-of-vocabulary words
coarse_file = 'train/coarse89588.txt'
save_coarse_file = 'split_word/coarse89588.txt'

vocab = list(word_dict.keys())
# Set copies give constant-time membership tests; the lookups below run once
# per token (or per character) over the whole file.
vocab_set = set(vocab)
color_list = ['兰','蓝','灰','绿','粉','红','黄','青','紫','白','黑','骆','橙','杏','咖','棕','啡','褐','银','金','橘','藏']
color_set = set(color_list)
rets = []
# The data is Chinese text: read and write as UTF-8 explicitly.
with open(coarse_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        item = json.loads(line)
        segment, _ = ltp.seg([item['title']])
        item['title_split'] = segment[0]
        vocab_split = []
        for word in segment[0]:
            word = word.upper()  # normalise Latin letters to upper case
            if word in vocab_set:
                vocab_split.append(word)
            else:
                # Out-of-vocabulary word: keep only the colour characters in it.
                vocab_split.extend(c for c in word if c in color_set)
        item['vocab_split'] = vocab_split
        if not vocab_split:
            # Show titles for which nothing survived, for manual inspection.
            print(item['title'])
        # Re-insert 'feature' so it becomes the last key, which makes the
        # saved records easier to read.
        item['feature'] = item.pop('feature')
        rets.append(json.dumps(item, ensure_ascii=False)+'\n')

with open(save_coarse_file, 'w', encoding='utf-8') as f:
    f.writelines(rets)

89588it [19:00, 78.56it/s]
