In [1]:
from icecream import ic
import pandas as pd
import re

In [47]:
import spacy

In [97]:
# Load the input table; its 'product_name' column is what the cells below process.
data = pd.read_csv('test.csv')

In [48]:
# Use spaCy for chunking: load the fr_core_news_lg pipeline.
nlp = spacy.load("fr_core_news_lg")

In [49]:
# Preprocess and chunk: strip digits and special symbols, then cut the text at noun-chunk boundaries.
def chunking(text, model, clean_pattern='[0-9-/\n\t]+') :
    """Split ``text`` into comma-joined segments that each end at a noun-chunk boundary.

    The text is stripped, lower-cased and every match of ``clean_pattern`` is
    removed before it is passed to ``model``. Tokens lying between two noun
    chunks are folded into the following chunk, and tokens after the last
    noun chunk form a final segment, so no token is dropped.

    Args:
        text: Raw text. Any non-string value (e.g. None, or the float NaN
            pandas uses for a missing cell) yields an empty string instead
            of raising AttributeError.
        model: Callable that turns a string into a doc object supporting
            ``noun_chunks_iterator``, slicing and ``len`` (the loaded spaCy
            pipeline in this notebook).
        clean_pattern: Regular expression whose matches are deleted from
            the text before parsing.

    Returns:
        The segments joined by ','. Segments are not escaped, so a comma
        inside the text produces additional pieces when the result is split.
    """
    if not isinstance(text, str) :
        return ''

    clean_text = re.sub(clean_pattern, '', text.strip().lower())

    doc = model(clean_text)
    chunks = []
    last_end = 0
    for chunk_start, chunk_end, _ in doc.noun_chunks_iterator(doc) :
        # Pull the start back to where the previous segment ended so the
        # tokens in the gap are kept with this chunk.
        chunk_start = min(chunk_start, last_end)
        last_end = chunk_end
        chunks.append(doc[chunk_start:chunk_end].text.strip())
    if last_end < len(doc) :
        # Trailing tokens after the final noun chunk become their own segment.
        chunks.append(doc[last_end:].text.strip())
    return ','.join(chunks)

In [98]:
# Chunk every product name with the spaCy pipeline and label the resulting Series 'chunks'.
chunks = data['product_name'].apply(chunking, args=(nlp,)).rename('chunks')

In [107]:
# Distinct chunk strings across all records (each record is a comma-joined chunk list).
all_chunks = {piece for joined in chunks for piece in joined.split(',')}

In [110]:
import json
# Save the distinct chunks as a JSON list; ensure_ascii=False keeps accented characters readable.
with open('chunks.json', 'w', encoding='utf8') as chunk_file:
    chunk_file.write(json.dumps(list(all_chunks), ensure_ascii=False))

In [96]:
# Detect the language of each chunk. In later steps English chunks are left
# untranslated, while French chunks are translated into Chinese.
from py3langid.langid import LanguageIdentifier, MODEL_FILE
identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
# Restrict the classifier to the two candidate languages used below.
identifier.set_languages(['en', 'fr'])

def language_detetion(text, model) :
    """Label every comma-separated piece of ``text`` as 'en' or 'fr'.

    A piece is labelled 'en' only when ``model.classify`` returns 'en' with
    a probability above 0.99; every other outcome is labelled 'fr'.

    Args:
        text: Comma-joined chunk string.
        model: Object whose ``classify(str)`` returns ``(lang_id, prob)``.

    Returns:
        The labels joined by ',', one per piece and in the same order.
    """
    def _label(piece) :
        lang_id, prob = model.classify(piece)
        return 'en' if (lang_id == 'en' and prob > 0.99) else 'fr'

    return ','.join([_label(piece) for piece in text.split(',')])

In [83]:
# Strip newlines from the product names, attach the chunk strings by index and
# save the pair as a tab-separated file without the index column.
# Series.str.replace replaces DataFrame.applymap (deprecated since pandas 2.1);
# it also passes missing values through instead of raising inside re.sub.
data['product_name'].str.replace('\n', '', regex=False).to_frame().merge(
    pd.DataFrame(chunks), how='left', left_index=True, right_index=True
).to_csv('data_chunk.csv', sep='\t', index=False)

In [94]:
# Classify every chunk of every record and label the resulting Series 'lang_id'.
lang = chunks.apply(language_detetion, args=(identifier,)).rename('lang_id')

In [None]:
# Combine the newline-free product names with their chunks and language ids, aligned by index.
# Series.str.replace replaces DataFrame.applymap (deprecated since pandas 2.1);
# it also passes missing values through instead of raising inside re.sub.
data1 = data['product_name'].str.replace('\n', '', regex=False).to_frame().merge(
    chunks, how='left', left_index=True, right_index=True
).merge(
    lang, how='left', left_index=True, right_index=True
)

In [88]:
# Persist the product names with their chunks and language ids as tab-separated text.
data1.to_csv('lang_detect.csv', sep='\t', index=None)

# 用mbart transformer多语言模型进行翻译
需要安装transformers

In [2]:
# Load the tokenizer and the model from the same pretrained mBART-50
# many-to-many checkpoint; both are passed to translate() below.
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")



NOTE: Redirects are currently not supported in Windows or MacOs.


In [81]:
def translate(text, model, tokenizer, src_lan='fr_XX', tgt_lan='en_XX', max_len=20):
    """Translate ``text`` from ``src_lan`` to ``tgt_lan``.

    Args:
        text: Input handed unchanged to ``tokenizer`` (the notebook passes
            both single strings and lists of strings).
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``src_lang``, ``lang_code_to_id`` and
            ``batch_decode``. Its ``src_lang`` attribute is overwritten.
        src_lan: Source language code assigned to ``tokenizer.src_lang``.
        tgt_lan: Target language code; its id is forced as the first
            generated token.
        max_len: ``max_length`` passed to ``model.generate``.

    Returns:
        Whatever ``tokenizer.batch_decode`` returns for the generated tokens
        with special tokens skipped.
    """
    tokenizer.src_lang = src_lan
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    target_token_id = tokenizer.lang_code_to_id[tgt_lan]
    token_ids = model.generate(**encoded, forced_bos_token_id=target_token_id, max_length=max_len)
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)


In [14]:
# Smoke test: French -> English on one sentence and one product name.
# NOTE(review): 'fran?ais' looks like a mis-encoded 'français' — confirm the intended input.
translate([
    'je vais apprendre à parler un peu fran?ais.',
    '7 Moncler Frgmt Hiroshi Fujiwara - Veste Hunor'], trans_model, trans_tokenizer, 'fr_XX', 'en_XX')

["I'm going to learn to speak a little more frankly.",
 '7 Moncler Frgmt Hiroshi Fujiwara - Hunor Vest']

In [15]:
# Smoke test: English -> Chinese, using the English strings recorded as output of the previous cell.
translate([
    "I'm going to learn to speak a little more frankly.",
    '7 Moncler Frgmt Hiroshi Fujiwara - Hunor Vest'], trans_model, trans_tokenizer, 'en_XX', 'zh_CN')

['我要学会坦白一点。', '7 Moncler Frgmt Hiroshi Fujiwara - Hunor Vest']

In [55]:
from icecream import ic
import json

# 每1000条读取一次文件，翻译后保存到json文件中

In [None]:
# Stream lang_detect.csv in 1000-row pieces and build the translation dictionaries.
data1 = pd.read_csv('lang_detect.csv', sep='\t', chunksize=1000)

fr_zh = {}  # chunk -> Chinese translation; English chunks map to themselves
fr_en = {}  # French chunk -> English translation
for i, file_chunk in enumerate(data1) :
    for chunk_str, lang_str in zip(file_chunk['chunks'], file_chunk['lang_id']) :
        # An empty field is read back from the CSV as NaN (a float); skip it
        # rather than crash on .split().
        if not (isinstance(chunk_str, str) and isinstance(lang_str, str)) :
            continue
        for chunk, lan in zip(chunk_str.split(','), lang_str.split(',')) :
            if chunk in fr_zh :
                # Already handled in this or an earlier piece.
                continue
            if lan == 'en' :
                fr_zh[chunk] = chunk
            elif lan == 'fr' :
                # translate() returns the decoded batch, so the stored values
                # are whatever it returns for a single input string.
                text_zh = translate(chunk, trans_model, trans_tokenizer, 'fr_XX', 'zh_CN')
                text_en = translate(chunk, trans_model, trans_tokenizer, 'fr_XX', 'en_XX')
                fr_zh[chunk] = text_zh
                fr_en[chunk] = text_en
                ic(i, chunk, text_zh, text_en)

    # Cumulative snapshot after every piece. ensure_ascii=False keeps the
    # Chinese and accented text readable, matching how chunks.json is written.
    with open('fr_zh_{}.json'.format(i), 'w', encoding='utf8') as fp:
        json.dump(fr_zh, fp, ensure_ascii=False)
    with open('fr_en_{}.json'.format(i), 'w', encoding='utf8') as fp:
        json.dump(fr_en, fp, ensure_ascii=False)

## 读取词典并转换成题目要求的格式

In [129]:
# Load the French -> Chinese map and derive two views of it:
#   dict_fr_zh: list of {'FR': ..., 'CN': ...} records
#   fr_zh_all:  plain dict from French chunk to Chinese text
dict_fr_zh = []
fr_zh_all = {}
with open('map_fr_zh.json', 'r', encoding='utf8') as fp:
    d = json.load(fp)
    ic(len(d))

for k, v in d.items() :
    # Values stored as lists contribute only their first element.
    fr_zh_all[k] = v[0] if isinstance(v, list) else v
    dict_fr_zh.append({'FR': k, 'CN': fr_zh_all[k]})

ic| len(d): 9430


In [None]:
# Use another dictionary from Tencent translation
fr_zh_all = {}
# The encoding is given explicitly, as for every other file in this notebook:
# without it open() uses the platform default, which is not UTF-8 on Windows
# and fails or garbles non-ASCII JSON text.
with open('fr_zh_map.json', 'r', encoding='utf8') as fp:
    fr_zh_all = json.load(fp)

In [112]:
# with open('dict_fr_zh.json', 'w', encoding='utf8') as fp: 
#     json.dump(dict_fr_zh, fp,  ensure_ascii=False)

In [115]:
# with open('map_fr_zh.json', 'w', encoding='utf8') as fp: 
#     json.dump(fr_zh_all, fp,  ensure_ascii=False)

# 按生成的词典翻译

In [114]:
# Reload the tab-separated language-detection table.
data2 = pd.read_csv('lang_detect.csv', sep='\t', )

In [116]:
# Re-chunk the product names, this time removing only newlines and tabs so that
# digits stay in the chunks; the resulting Series is labelled 'chunks'.
chunks = data2['product_name'].apply(chunking, args=(nlp, '[\n\t]+')).rename('chunks')

In [117]:
# Pair the newline-free product names with the freshly computed chunks, aligned by index.
# Series.str.replace replaces DataFrame.applymap (deprecated since pandas 2.1);
# it also passes missing values through instead of raising inside re.sub.
data3 = data2['product_name'].str.replace('\n', '', regex=False).to_frame().merge(
    chunks, how='left', left_index=True, right_index=True
)

In [118]:
import traceback
def translate_by_dict(text, d) :
    """Translate a comma-joined chunk string by dictionary lookup.

    Each chunk has its digits and hyphens removed and is then looked up in
    ``d``; chunks without an entry are kept as the cleaned text. Every word
    that contains a digit is collected separately and appended after the
    translated chunks.

    Args:
        text: Comma-joined chunk string.
        d: Mapping from cleaned chunk to its translation. The lookup uses
            this argument, not a module-level dictionary.

    Returns:
        The translated chunks followed by the digit-bearing words, joined by
        single spaces. On any error (for instance a non-string ``text``) the
        traceback is printed and None is returned.
    """
    result = []
    numbers = []
    chunk = None  # bound up front so the except handler can always report it
    try :
        for chunk in text.split(','):
            numbers += re.findall(r'\w*\d+\w*', chunk)
            clean_chunk = re.sub(r'[\d-]+', '', chunk)
            result.append(d.get(clean_chunk, clean_chunk))
        return ' '.join(result + numbers)
    except Exception:
        # Best effort: report the failing record and continue with None.
        traceback.print_exc()
        ic(chunk, numbers, result)

In [121]:
# Display the dictionary currently bound to fr_zh_all.
fr_zh_all

{}

In [123]:
# Translate every chunk string through the dictionary and label the resulting Series 'result'.
result = data3['chunks'].apply(translate_by_dict, args=(fr_zh_all,)).rename('result')

In [124]:
# Display the dictionary-based translations.
result

0                                              Alexa手套真丝衬里
1                                 sac fourre tout ceinture
2             moncler jw anderson  doudoune courte 赫尔弗林恩 1
3         moncler jw anderson  doudoune courte WinteFold 1
4                            moncler 对于脸部  doudoune 让·西奥 1
                               ...                        
10014                                                 签名盗窃
10015                                 案例 pour airpods 专业人士
10016                                              案例 对于卡片
10017                                           案例 带连帽衫字体栏
10018                          案例 一种用于制造机动车辆的 de ping pong
Name: result, Length: 10019, dtype: object

In [126]:
# Put product names, chunks and dictionary translations side by side, aligned by index.
# A column-wise Series.str.replace replaces DataFrame.applymap (deprecated since
# pandas 2.1); it also passes missing values through instead of raising inside re.sub.
result1 = data3[['product_name', 'chunks']].apply(
    lambda col: col.str.replace('\n', '', regex=False)
).merge(
    result, how='left', left_index=True, right_index=True
)

In [127]:
# Write result1 to result2.csv as tab-separated text.
result1.to_csv('result2.csv',sep='\t', index=None)

# 用Transformer翻译成英文，再从英文翻译成中文


In [None]:
# Translate each newline-free product name French -> Chinese, one call per name.
# The function is applied to the string column itself: DataFrame.apply(axis=1)
# would hand translate() a pandas row object, which is not a valid tokenizer
# input. Series.str.replace also replaces the deprecated DataFrame.applymap.
result = data3['product_name'].str.replace('\n', '', regex=False).apply(
    lambda name : translate(name, trans_model, trans_tokenizer, 'fr_XX', 'zh_CN')
)

In [30]:
from datetime import datetime

In [None]:
# Load the tokenizer and seq2seq model of the Helsinki-NLP opus-mt-fr-en checkpoint.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer_fr_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

model_fr_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

In [None]:
# Translate the product names with the fr-en model, 70 rows per batch, and
# write the results to result_en.csv.
data1 = pd.read_csv('lang_detect.csv', sep='\t', chunksize=70)

for i, file_chunk in enumerate(data1) :
    tic = datetime.now()

    # Use the fr-en tokenizer/model loaded above. The generic names
    # `tokenizer` and `model` are only bound further down, to the en-zh checkpoint.
    inputs = tokenizer_fr_en(file_chunk['product_name'].values.tolist(), return_tensors="pt", padding=True, truncation=True)
    generated_tokens = model_fr_en.generate(
        **inputs,
        max_length=20
    )
    result = tokenizer_fr_en.batch_decode(generated_tokens, skip_special_tokens=True)

    # Start a fresh file on the first batch so that re-running the cell does
    # not append a second copy of every row; later batches are appended.
    pd.DataFrame(result).to_csv('result_en.csv', mode='w' if i == 0 else 'a', index=False, header=False)
    toc = datetime.now()
    ic(i, str(toc-tic))

In [38]:
# Load the tokenizer and seq2seq model of the Helsinki-NLP opus-mt-en-zh checkpoint.
# They are bound to the generic names `tokenizer` and `model`.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/312M [00:00<?, ?B/s]

In [43]:
# Sample English product name.
# NOTE(review): the call that produced the recorded Chinese output is not visible in this cell.
text = '1952 - Terry Shoes'


['1952年 - 泰瑞鞋']

In [44]:
# Translate the English results with the en-zh model, 70 rows per batch, and
# write the results to result_zh.csv.
# The column name is supplied through `names`: with chunksize set, read_csv
# returns an iterator, so assigning `.columns` on it does not rename anything.
data_en = pd.read_csv('result_en.csv', sep='\t', chunksize=70, header=None, names=['product_name'])
# Iterate the English file. Looping over `data1` would feed the French source
# rows from lang_detect.csv to the English -> Chinese model.
for i, file_chunk in enumerate(data_en) :
    tic = datetime.now()
    inputs = tokenizer(file_chunk['product_name'].values.tolist(), return_tensors="pt", padding=True, truncation=True)
    generated_tokens = model.generate(
        **inputs,
        max_length=20
    )
    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    # Start a fresh file on the first batch so that re-running the cell does
    # not append a second copy of every row; later batches are appended.
    pd.DataFrame(result).to_csv('result_zh.csv', mode='w' if i == 0 else 'a', index=False, header=False)
    toc = datetime.now()
    ic(i, str(toc-tic))

ic| i: 0, str(toc-tic): '0:00:07.291512'
ic| i: 1, str(toc-tic): '0:00:07.509928'
ic| i: 2, str(toc-tic): '0:00:07.708397'
ic| i: 3, str(toc-tic): '0:00:08.092386'
ic| i: 4, str(toc-tic): '0:00:07.969700'
ic| i: 5, str(toc-tic): '0:00:07.810128'
ic| i: 6, str(toc-tic): '0:00:08.130271'
ic| i: 7, str(toc-tic): '0:00:08.219032'
ic| i: 8, str(toc-tic): '0:00:07.705406'
ic| i: 9, str(toc-tic): '0:00:07.708398'
ic| i: 10, str(toc-tic): '0:00:08.184127'
ic| i: 11, str(toc-tic): '0:00:08.396559'
ic| i: 12, str(toc-tic): '0:00:08.003609'
ic| i: 13, str(toc-tic): '0:00:08.443434'
ic| i: 14, str(toc-tic): '0:00:07.663519'
ic| i: 15, str(toc-tic): '0:00:07.988650'
ic| i: 16, str(toc-tic): '0:00:08.035526'
ic| i: 17, str(toc-tic): '0:00:07.959727'
ic| i: 18, str(toc-tic): '0:00:07.822093'
ic| i: 19, str(toc-tic): '0:00:08.177146'
ic| i: 20, str(toc-tic): '0:00:07.814116'
ic| i: 21, str(toc-tic): '0:00:07.770233'
ic| i: 22, str(toc-tic): '0:00:08.130273'
ic| i: 23, str(toc-tic): '0:00:07.728345'
ic

In [None]:
# Translate all product names French -> Chinese with mBART.
# The names are sent in batches of 70 (the batch size used by the other
# translation loops) rather than as one batch holding the whole table, which
# keeps the padded input tensor and the generation memory bounded.
_names = data3['product_name'].values.tolist()
result = []
for _start in range(0, len(_names), 70) :
    result.extend(
        translate(_names[_start:_start + 70], trans_model, trans_tokenizer, 'fr_XX', 'zh_CN')
    )

In [33]:
# Read the English result file in one go (no chunking), treating the first line as data, not a header.
data_en = pd.read_csv('result_en.csv', sep='\t', chunksize=None, header=None)


In [34]:
# Display the loaded English translations.
data_en

Unnamed: 0,0
0,Sneakers Chain Reaction
1,full-length leather belt
2,1 Moncler JW Anderson - Short Helvellyn Double
3,1 Moncler JW Anderson - Short Wintefold Double
4,1 Moncler JW Anderson - Penygarder Denim Doudoune
...,...
1815,Soft Leather Belt B-belt
1816,Belt B-Belt black smooth leather
1817,Belt BB Signature
1818,Belt Baguette
