In [1]:
# 这里的equal process是针对于已经分词后的文件处理的
# 1.创建equal_dict，用来替换的字典
# 2.使用'title_split'分段，发现equal_dict中的key属性即在title中进行替换
# 3.'拉链'分为'拉链裤' '拉链鞋' '拉链'，'系带'分为'系带裤' '系带鞋' '系带'
# 4.同时重新处理title为统一大写，只需要将替换后的'title_split'合并即可
# 5.重新生成新的合并后的processed_word_dict，并重新生成'vocab_split'分段


In [2]:
from tqdm import tqdm 
import json
import itertools

In [3]:
# Synonym table: every key is an attribute value that gets rewritten to the
# canonical value it maps to. The table is spelled out per canonical value;
# the resulting key order is significant because later cells walk the keys
# in insertion order when rewriting titles.
equal_dict = {
    synonym: canonical
    for canonical, synonyms in [
        ('高领', ['半高领', '立领']),
        ('连帽', ['可脱卸帽']),
        ('翻领', ['衬衫领', 'POLO领', '方领', '娃娃领', '荷叶领']),
        ('短袖', ['五分袖']),
        ('长袖', ['九分袖']),
        ('短款', ['超短款', '常规款']),
        ('长款', ['超长款']),
        ('修身型', ['标准型']),
        ('短裙', ['超短裙']),
        ('中裙', ['中长裙']),
        ('哈伦裤', ['O型裤', '灯笼裤', '锥形裤']),
        ('直筒裤', ['铅笔裤', '小脚裤']),
        ('喇叭裤', ['微喇裤']),
        ('长裤', ['九分裤']),
        ('一脚蹬', ['套筒', '套脚']),
        ('高帮', ['中帮']),
    ]
    for synonym in synonyms
}
# One more replacement is handled separately from this table:
# '中长款' is rewritten to '中款'.

In [4]:
# Load the original attribute dictionary
def load_attr_dict(file):
    """Read the attribute JSON file and flatten its '='-joined value groups.

    Args:
        file: Path to a JSON file whose top level maps each attribute name to
            a list of strings; a string may join several values with '='.

    Returns:
        A dict mapping each attribute name to one flat list holding every
        value, in the order they appear in the file.
    """
    # Read as UTF-8 explicitly: the file holds Chinese text, and open()
    # otherwise falls back to the platform's locale encoding.
    with open(file, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    return {
        attr: [val for group in groups for val in group.split('=')]
        for attr, groups in raw.items()
    }

# Load the original attribute dictionary.
attr_dict_file = "../../data/original_data/attr_to_attrvals.json"
attr_dict = load_attr_dict(attr_dict_file)

# Equal replacement: drop every value that equal_dict rewrites to another
# value. The lists are filtered in place so attr_dict keeps the same objects.
for query, attrs in attr_dict.items():
    attrs[:] = [attr for attr in attrs if attr not in equal_dict]

# Special replacements: (attribute name, value) -> renamed value. The same
# value ('拉链', '系带') is renamed differently depending on the attribute.
special_attr_values = {
    ('衣长', '中长款'): '中款',
    ('裤门襟', '拉链'): '拉链裤',
    ('裤门襟', '系带'): '系带裤',
    ('裤门襟', '松紧'): '松紧裤',
    ('闭合方式', '拉链'): '拉链鞋',
    ('闭合方式', '系带'): '系带鞋',
}
for query, attrs in attr_dict.items():
    attrs[:] = [special_attr_values.get((query, attr), attr) for attr in attrs]

# Save the new attribute dictionary. ensure_ascii=False writes the Chinese
# text as-is, so the file is opened as UTF-8 explicitly instead of relying
# on the platform's locale encoding.
attr_save_file = '../../data/equal_processed_data/attr_to_attrvals.json'
with open(attr_save_file, 'w', encoding='utf-8') as f:
    json.dump(attr_dict, f, ensure_ascii=False, indent=4)

In [4]:
# Load the new (equal-processed) attribute dictionary.
attr_file = '../../data/equal_processed_data/attr_to_attrvals.json'
# The file contains non-ASCII text, so read it as UTF-8 explicitly rather
# than with the platform's locale encoding.
with open(attr_file, 'r', encoding='utf-8') as f:
    attr_dict = json.load(f)

In [6]:
# [fine] Strip year tokens, upper-case the title, and apply the equal-attribute
# and special-attribute replacements to both key_attr and the title.
fine_file = '../../data/original_data/train_fine.txt'
new_fine_file = '../../data/equal_processed_data/fine50000.txt'

rets = []
years = ['2017年','2018年','2019年','2020年','2021年','2022年']

# (attribute name, value) -> renamed value for the attributes whose value is
# ambiguous on its own (e.g. '拉链' for trousers vs. shoes).
fine_special_attrs = {
    ('衣长', '中长款'): '中款',
    ('裤门襟', '拉链'): '拉链裤',
    ('裤门襟', '系带'): '系带裤',
    ('裤门襟', '松紧'): '松紧裤',
    ('闭合方式', '拉链'): '拉链鞋',
    ('闭合方式', '系带'): '系带鞋',
}

# Files are read/written as UTF-8 explicitly: the records hold Chinese text
# and are dumped with ensure_ascii=False.
with open(fine_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        data = json.loads(line)
        title = data['title']
        key_attr = data['key_attr']
        # Remove year tokens.
        for year in years:
            title = title.replace(year, '')
        # Normalise letters to upper case.
        title = title.upper()
        # Attribute replacement. For fine data the replacement is driven by
        # key_attr and pushed back into the title. Only values are reassigned
        # while iterating, so the dict size never changes mid-loop.
        for query, attr in key_attr.items():
            # Equal attributes: str.replace rewrites every occurrence of the
            # old value in the title.
            if attr in equal_dict:
                key_attr[query] = equal_dict[attr]
                title = title.replace(attr, equal_dict[attr])
            # Special attributes. A trouser '拉链' is left untouched when the
            # title contains '无拉链'.
            renamed = fine_special_attrs.get((query, attr))
            skip_zip = query == '裤门襟' and attr == '拉链' and '无拉链' in title
            if renamed is not None and not skip_zip:
                key_attr[query] = renamed
                title = title.replace(attr, renamed)
        # Special handling for one high-frequency phrase.
        title = title.replace('厚度常规', '常规厚度')

        data['key_attr'] = key_attr
        data['title'] = title

        rets.append(json.dumps(data, ensure_ascii=False)+'\n')

with open(new_fine_file, 'w', encoding='utf-8') as f:
    f.writelines(rets)

50000it [00:54, 914.90it/s]


In [2]:
import os 
# Scratch check of how os.path.join combines a directory and a file name;
# the joined path is only displayed as the cell output.
# NOTE(review): save_dir is not referenced by any other cell visible here —
# confirm nothing else depends on it before removing this cell.
save_dir = 'data/tmp_data/equal_processed_data'
os.path.join(save_dir, 'lala')

'data/tmp_data/equal_processed_data/lala'

In [7]:
# [coarse] Strip year tokens, upper-case the title, apply the equal-attribute
# and special-attribute replacements, then extract key_attr from the title
# for image-text matched records.
coarse_file = '../../data/original_data/train_coarse.txt'
pos_coarse_file = '../../data/equal_processed_data/coarse89588.txt'
neg_coarse_file = '../../data/equal_processed_data/coarse10412.txt'

pos_rets = []
neg_rets = []
years = ['2017年','2018年','2019年','2020年','2021年','2022年']

equal_list = list(equal_dict.keys())
query_list = list(attr_dict.keys()) # note: this is the new attribute dict

# Files are read/written as UTF-8 explicitly: the records hold Chinese text
# and are dumped with ensure_ascii=False.
with open(coarse_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        data = json.loads(line)
        title = data['title']
        key_attr = {}
        # Remove year tokens.
        for year in years:
            title = title.replace(year, '')
        # Normalise letters to upper case.
        title = title.upper()
        # Replacement runs before extraction so that extraction matches
        # against the rewritten attribute values.
        # Equal attributes (str.replace is a no-op when the value is absent).
        for attr in equal_list:
            title = title.replace(attr, equal_dict[attr])
        # Special attributes.
        title = title.replace('中长款', '中款')
        # None of the replacements below adds or removes '裤', '鞋' or '靴'
        # unless that character was already present, so the two flags can be
        # computed once.
        has_trousers = '裤' in title
        has_shoes = '鞋' in title or '靴' in title
        if has_trousers:
            if '无拉链' not in title:
                title = title.replace('拉链', '拉链裤')
            title = title.replace('系带', '系带裤')
            title = title.replace('松紧', '松紧裤')
        # NOTE(review): when a title mentions both trousers and shoes/boots,
        # the shoe rules also fire on the already-rewritten text
        # ('拉链' -> '拉链裤' -> '拉链鞋裤'). Behaviour is kept as-is; confirm
        # whether such titles occur in the data.
        if has_shoes:
            title = title.replace('拉链', '拉链鞋')
            title = title.replace('系带', '系带鞋')
        # Special handling for one high-frequency phrase.
        title = title.replace('厚度常规', '常规厚度')

        # Attribute extraction, only for image-text matched records. When
        # several values of one attribute occur in the title, the last one in
        # attr_dict order wins.
        is_positive = data['match']['图文'] == 1
        if is_positive:
            for query in query_list:
                for attr in attr_dict[query]:
                    if attr in title:
                        key_attr[query] = attr
                        data['match'][query] = 1

        data['key_attr'] = key_attr
        data['title'] = title

        record = json.dumps(data, ensure_ascii=False)+'\n'
        if is_positive:
            pos_rets.append(record)
        else:
            neg_rets.append(record)

print(len(pos_rets))
print(len(neg_rets))
with open(pos_coarse_file, 'w', encoding='utf-8') as f:
    f.writelines(pos_rets)
with open(neg_coarse_file, 'w', encoding='utf-8') as f:
    f.writelines(neg_rets)

100000it [01:51, 895.35it/s]


89588
10412


In [9]:
# [fine] Build the "nofeat" copy: the same records with the 'feature' key removed.
file = '../../data/equal_processed_data/fine50000.txt'
save_file = '../../data/equal_processed_data/nofeat/fine50000.txt'

rets = []
# Read/write as UTF-8 explicitly: records are dumped with ensure_ascii=False,
# so the files contain raw Chinese text.
with open(file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        item = json.loads(line)
        # Raises KeyError if a record has no 'feature' key.
        del item['feature']
        rets.append(json.dumps(item, ensure_ascii=False)+'\n')

with open(save_file, 'w', encoding='utf-8') as f:
    f.writelines(rets)

50000it [00:13, 3637.48it/s]


In [10]:
# [coarse] Build the "nofeat" copies: the same records with the 'feature' key
# removed. Both coarse files go through the same pass.
coarse_nofeat_jobs = [
    ('../../data/equal_processed_data/coarse89588.txt',
     '../../data/equal_processed_data/nofeat/coarse89588.txt'),
    ('../../data/equal_processed_data/coarse10412.txt',
     '../../data/equal_processed_data/nofeat/coarse10412.txt'),
]

for file, save_file in coarse_nofeat_jobs:
    rets = []
    # Read/write as UTF-8 explicitly: records are dumped with
    # ensure_ascii=False, so the files contain raw Chinese text.
    with open(file, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            item = json.loads(line)
            # Raises KeyError if a record has no 'feature' key.
            del item['feature']
            rets.append(json.dumps(item, ensure_ascii=False)+'\n')

    with open(save_file, 'w', encoding='utf-8') as f:
        f.writelines(rets)

89588it [00:24, 3615.17it/s]
10412it [00:02, 3685.96it/s]


In [11]:
# [fine] Split the processed fine file into train and val parts.
fine_path = '../../data/equal_processed_data/fine50000.txt'

fine_train_path = '../../data/equal_processed_data/fine45000.txt'
fine_val_path = '../../data/equal_processed_data/fine5000.txt'

train_rets = []
val_rets = []

# Read/write as UTF-8 explicitly: records are dumped with ensure_ascii=False,
# so the files contain raw Chinese text.
with open(fine_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is parsed and re-serialised, so a malformed record fails
        # here instead of being copied through.
        record = json.dumps(json.loads(line), ensure_ascii=False)+'\n'
        # The first 45000 records go to train; everything after goes to val.
        if len(train_rets) < 45000:
            train_rets.append(record)
        else:
            val_rets.append(record)

print(len(train_rets))
print(len(val_rets))

with open(fine_train_path, 'w', encoding='utf-8') as f:
    f.writelines(train_rets)
with open(fine_val_path, 'w', encoding='utf-8') as f:
    f.writelines(val_rets)

50000it [00:54, 914.34it/s]


45000
5000


In [12]:
# Build the "nofeat" copies of the fine train/val files: the same records
# with the 'feature' key removed. Both files go through the same pass.
fine_nofeat_jobs = [
    ('../../data/equal_processed_data/fine45000.txt',
     '../../data/equal_processed_data/nofeat/fine45000.txt'),
    ('../../data/equal_processed_data/fine5000.txt',
     '../../data/equal_processed_data/nofeat/fine5000.txt'),
]

for file, save_file in fine_nofeat_jobs:
    rets = []
    # Read/write as UTF-8 explicitly: records are dumped with
    # ensure_ascii=False, so the files contain raw Chinese text.
    with open(file, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            item = json.loads(line)
            # Raises KeyError if a record has no 'feature' key.
            del item['feature']
            rets.append(json.dumps(item, ensure_ascii=False)+'\n')

    with open(save_file, 'w', encoding='utf-8') as f:
        f.writelines(rets)

45000it [00:12, 3609.15it/s]
5000it [00:01, 3610.21it/s]


In [5]:
# [test] Same basic processing as the coarse data; the only difference is
# that key_attr is extracted for the attributes listed in each record's query.
test_file = '../../data/original_data/preliminary_testB.txt'
test_save_file = '../../data/equal_processed_data/test10000.txt'

rets = []

years = ['2017年','2018年','2019年','2020年','2021年','2022年']
equal_list = list(equal_dict.keys())

# Files are read/written as UTF-8 explicitly: the records hold Chinese text
# and are dumped with ensure_ascii=False.
with open(test_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        data = json.loads(line)
        title = data['title']
        key_attr = {}
        # Remove year tokens.
        for year in years:
            title = title.replace(year, '')
        # Normalise letters to upper case.
        title = title.upper()
        # Replacement runs before extraction so that extraction matches
        # against the rewritten attribute values.
        # Equal attributes (str.replace is a no-op when the value is absent).
        for attr in equal_list:
            title = title.replace(attr, equal_dict[attr])
        # Special attributes.
        title = title.replace('中长款', '中款')
        # None of the replacements below adds or removes '裤', '鞋' or '靴'
        # unless that character was already present, so the two flags can be
        # computed once.
        has_trousers = '裤' in title
        has_shoes = '鞋' in title or '靴' in title
        if has_trousers:
            if '无拉链' not in title:
                title = title.replace('拉链', '拉链裤')
            title = title.replace('系带', '系带裤')
            title = title.replace('松紧', '松紧裤')
        # NOTE(review): when a title mentions both trousers and shoes/boots,
        # the shoe rules also fire on the already-rewritten text
        # ('拉链' -> '拉链裤' -> '拉链鞋裤'). Behaviour is kept as-is; confirm
        # whether such titles occur in the data.
        if has_shoes:
            title = title.replace('拉链', '拉链鞋')
            title = title.replace('系带', '系带鞋')
        # Special handling for one high-frequency phrase.
        title = title.replace('厚度常规', '常规厚度')

        # Attribute extraction for every queried attribute except '图文'.
        # When several values occur in the title, the last one in attr_dict
        # order wins.
        for query in data['query']:
            if query == '图文':
                continue
            matched = False
            for attr in attr_dict[query]:
                if attr in title:
                    key_attr[query] = attr
                    matched = True
            # Report queries for which no attribute value was found; the
            # original (unprocessed) title is printed.
            if not matched:
                print(data['title'])
                print(data['query'])

        data['key_attr'] = key_attr
        data['title'] = title
        # Re-insert 'feature' so that it becomes the last key of the record.
        data['feature'] = data.pop('feature')

        rets.append(json.dumps(data, ensure_ascii=False)+'\n')

print(len(rets))
with open(test_save_file, 'w', encoding='utf-8') as f:
    f.writelines(rets)

10000it [00:11, 901.81it/s]


10000


In [6]:
# Build the "nofeat" copy of the test file: the same records with the
# 'feature' key removed.
file = '../../data/equal_processed_data/test10000.txt'
save_file = '../../data/equal_processed_data/nofeat/test10000.txt'

rets = []
# Read/write as UTF-8 explicitly: records are dumped with ensure_ascii=False,
# so the files contain raw Chinese text.
with open(file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        item = json.loads(line)
        # Raises KeyError if a record has no 'feature' key.
        del item['feature']
        rets.append(json.dumps(item, ensure_ascii=False)+'\n')

with open(save_file, 'w', encoding='utf-8') as f:
    f.writelines(rets)

10000it [00:02, 3681.72it/s]
