<a href="https://colab.research.google.com/github/wwh133/Transformer/blob/main/%E7%AC%AC6%E7%AB%A0_%E6%9C%BA%E5%99%A8%E7%BF%BB%E8%AF%91.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Pre-Processing datasets for Machine Translation
* Copyright 2020, Denis Rothman, MIT License
* Denis Rothman modified the code for educational purposes.
#Reference:
* Jason Brownlee PhD, ‘How to Prepare a French-to-English Dataset for Machine Translation
* https://machinelearningmastery.com/prepare-french-english-dataset-machine-translation/


# 数据预处理

In [17]:
import pickle
from pickle import dump

In [18]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [19]:
# split a loaded document into sentences
def to_sentences(doc):
	return doc.strip().split('\n')

In [20]:
# shortest and longest sentence lengths
def sentence_lengths(sentences):
	lengths = [len(s.split()) for s in sentences]
	return min(lengths), max(lengths)

In [21]:
# clean lines
import re
import string
import unicodedata
def clean_lines(lines):
	cleaned = list()
	# prepare regex for char filtering 正则表达式
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	print(re_print)
	print('----------')
	# prepare translation table for removing punctuation 去除标点符号
	table = str.maketrans('', '', string.punctuation)
	print(table)
	for line in lines:
		# normalize unicode characters
		line = unicodedata.normalize('NFD', line).encode('ascii', 'ignore')
		line = line.decode('UTF-8')
		# tokenize on white space
		line = line.split()
		# convert to lower case 转换为小写
		line = [word.lower() for word in line]
		# remove punctuation from each token 删除标点符号
		line = [word.translate(table) for word in line]
		# remove non-printable chars form each token 删除不可打印字符
		line = [re_print.sub('', w) for w in line]
		# remove tokens with numbers in them 删除包含数字的标记
		line = [word for word in line if word.isalpha()]
		# store as string
		cleaned.append(' '.join(line))
	return cleaned

欧洲议会会议平行语料库（法语-英语数据集）[链接](https://www.statmt.org/europarl/v7/fr-en.tgz)

In [22]:
# load English data
filename = 'europarl-v7.fr-en.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))
cleanf=clean_lines(sentences)
filename = 'English.pkl'
outfile = open(filename,'wb')
pickle.dump(cleanf,outfile)
outfile.close()
print(filename," saved")

English data: sentences=2007723, min=0, max=668
re.compile('[^0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~\\ \\\t\\\n\\\r\\\x0b\\\x0c]')
----------
{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}
English.pkl  saved


In [23]:
# load French data
filename = 'europarl-v7.fr-en.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('French data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))
cleanf=clean_lines(sentences)
filename = 'French.pkl'
outfile = open(filename,'wb')
pickle.dump(cleanf,outfile)
outfile.close()
print(filename," saved")


French data: sentences=2007723, min=0, max=693
re.compile('[^0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~\\ \\\t\\\n\\\r\\\x0b\\\x0c]')
----------
{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}
French.pkl  saved


In [24]:
from pickle import load
from pickle import dump
from collections import Counter

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)


In [25]:
# create a frequency table for all words 词汇计数器
def to_vocab(lines):
	vocab = Counter()
	for line in lines:
		tokens = line.split()
		vocab.update(tokens)
	return vocab

# remove all words with a frequency below a threshold
def trim_vocab(vocab, min_occurance):
	tokens = [k for k,c in vocab.items() if c >= min_occurance]
	return set(tokens)

In [26]:
# mark all OOV with "unk" for all lines
def update_dataset(lines, vocab):
	new_lines = list()
	for line in lines:
		new_tokens = list()
		for token in line.split():
			if token in vocab:
				new_tokens.append(token)
			else:
				new_tokens.append('unk')
		new_line = ' '.join(new_tokens)
		new_lines.append(new_line)
	return new_lines

In [27]:
# load English dataset
filename = 'English.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))
# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New English Vocabulary: %d' % len(vocab))
# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'english_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check
for i in range(20):
	print("line",i,":",lines[i])

# load French dataset
filename = 'French.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('French Vocabulary: %d' % len(vocab))
# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New French Vocabulary: %d' % len(vocab))
# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'french_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check
for i in range(20):
	print("line",i,":",lines[i])

English Vocabulary: 105357
New English Vocabulary: 41746
Saved: english_vocab.pkl
line 0 : resumption of the session
line 1 : i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
line 2 : although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
line 3 : you have requested a debate on this subject in the course of the next few days during this partsession
line 4 : in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
line 5 : please rise then for this minute s silence
line 6 : the house rose and observed a minute s silence
line 7 : madam president o

至此，数据预处理已经介绍完毕，可以把数据集提供给transformer进行训练

# 用BLEU（双语评估协作分数）评估机器翻译
使用自然语言工具包NLTK（natural language toolkit）来实现BLEU

In [9]:
#BLEU : Bilingual Evaluation Understudy Score
#Copyright 2020, MIT License BLEU Examples
#REF PAPER: Kishore Papineni, et al.,2002,“BLEU: a Method for Automatic Evaluation of Machine Translation“.
#                                                https://www.aclweb.org/anthology/P02-1040.pdf
#NLTK : Natural Language Toolkit
#NLTK sentence_bleu doc: http://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.sentence_bleu
#NLTK smoothing doc: https://www.nltk.org/api/nltk.translate.html
#NLTK REF PAPER for smoothing():Chen et al.,http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
#REF DOC  : https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [13]:
#Example 1
reference = [['the', 'cat', 'likes', 'milk'], ['cat', 'likes' 'milk']]
candidate = ['the', 'cat', 'likes', 'milk']
score = sentence_bleu(reference, candidate)
print('Example 1', score)

#Example 2
reference = [['the', 'cat', 'likes', 'milk']]
candidate = ['the', 'cat', 'likes', 'milk']
score = sentence_bleu(reference, candidate)
print('Example 2', score)

Example 1 1.0
Example 2 1.0


In [14]:
#Example 3
reference = [['the', 'cat', 'likes', 'milk']]
candidate = ['the', 'cat', 'enjoys','milk']
score = sentence_bleu(reference, candidate)
print('Example 3', score)

Example 3 1.0547686614863434e-154


### 标注平滑
chencherry：通过引入不确定性使得模型能够更加开放地对待未来的变化和转换

In [15]:
#Example 4
reference = [['je','vous','invite', 'a', 'vous', 'lever','pour', 'cette', 'minute', 'de', 'silence']]
candidate = ['levez','vous','svp','pour', 'cette', 'minute', 'de', 'silence']
score = sentence_bleu(reference, candidate)
print("without soothing score", score)

without soothing score 0.37188004246466494


往评估中添加一些开放的平滑，分数大幅提高

In [16]:
chencherry = SmoothingFunction()
r1=list('je vous invite a vous lever pour cette minute de silence')
candidate=list('levez vous svp pour cette minute de silence')

#sentence_bleu([reference1, reference2, reference3], hypothesis2,smoothing_function=chencherry.method1)
print("with smoothing score",sentence_bleu([r1], candidate,smoothing_function=chencherry.method1))


with smoothing score 0.6194291765462159


# 谷歌翻译
使用谷歌 Trax 进行翻译
* 端到端的深度学习库
* 包含一个可以用于机器翻译任务的transformer模型

这里介绍（英语-德语问题）的最小功能

#Machine Translation with Trax

Note by Denis Rothman: The original notebook was split into cells.

[Reference Code](https://colab.research.google.com/github/google/trax/blob/master/trax/intro.ipynb)


In [2]:
pip install --upgrade tensorflow==2.15.0

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
Collecting keras<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
Instal

In [2]:
#@title Installing Trax
import os
import numpy as np

!pip install -q -U trax
import trax

In [26]:
#@title Creating a tranformer model
# Pre-trained model config in gs://trax-ml/models/translation/ende_wmt32k.gin
model = trax.models.Transformer(
    input_vocab_size=33300,
    d_model=512, d_ff=2048,
    n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
    max_len=2048, mode='predict')


In [None]:
#@title Initializing the model using pre-trained weights 预训练权重 初始化模型
model.init_from_file('gs://trax-ml/models/translation/ende_wmt32k.pkl.gz',
                     weights_only=True)

In [28]:
#@title Tokenizing a sentence 句子词元化
sentence = 'I am only a machine but I have machine intelligence.'

tokenized = list(trax.data.tokenize(iter([sentence]),  # Operates on streams.
          vocab_dir='gs://trax-ml/vocabs/',
          vocab_file='ende_32k.subword'))[0]
tokenized

array([  46,  131,  132,   13, 4435,  101,   46,   43, 4435, 7763,    3])

In [18]:
#@title Decoding from the Transformer
tokenized = tokenized[None, :]  # Add batch dimension.
print(tokenized)
tokenized_translation = trax.supervised.decoding.autoregressive_sample(
    model, tokenized, temperature=0.0)  # Higher temperature: more diverse results.
print(tokenized_translation)

[[  46  131  132   13 4435  101   46   43 4435 7763    3]]
[[  161   724   120    41 12770     5     2   163   104   531 12770 28153
  22734     3     1]]


In [19]:
#@title De-tokenizing and Displaying the Translation
tokenized_translation = tokenized_translation[0][:-1]  # Remove batch and EOS.
print(tokenized_translation)
translation = trax.data.detokenize(tokenized_translation,
                   vocab_dir='gs://trax-ml/vocabs/',
                   vocab_file='ende_32k.subword')
print("The sentence:",sentence)
print("The translation:",translation)

[  161   724   120    41 12770     5     2   163   104   531 12770 28153
 22734     3]
The sentence: I am only a machine but I have machine intelligence.
The translation: Ich bin nur eine Maschine, aber ich habe Maschinenübersicht.


In [None]:
#@title Creating a tranformer model / 改变 temperature，翻译结果变化
# Pre-trained model config in gs://trax-ml/models/translation/ende_wmt32k.gin
model = trax.models.Transformer(
    input_vocab_size=33300,
    d_model=512, d_ff=2048,
    n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
    max_len=2048, mode='predict')

#@title Initializing the model using pre-trained weights 预训练权重 初始化模型
model.init_from_file('gs://trax-ml/models/translation/ende_wmt32k.pkl.gz',
                     weights_only=True)

#@title Tokenizing a sentence 句子词元化
sentence = 'I am only a machine but I have machine intelligence.'

tokenized = list(trax.data.tokenize(iter([sentence]),  # Operates on streams.
          vocab_dir='gs://trax-ml/vocabs/',
          vocab_file='ende_32k.subword'))[0]
print(tokenized)

#@title Decoding from the Transformer
tokenized = tokenized[None, :]  # Add batch dimension.
print(tokenized)
tokenized_translation = trax.supervised.decoding.autoregressive_sample(
    model, tokenized, temperature=1.0)  # Higher temperature: more diverse results.
print(tokenized_translation)

In [30]:
#@title De-tokenizing and Displaying the Translation
tokenized_translation = tokenized_translation[0][:-1]  # Remove batch and EOS.
print(tokenized_translation)
translation = trax.data.detokenize(tokenized_translation,
                   vocab_dir='gs://trax-ml/vocabs/',
                   vocab_file='ende_32k.subword')
print("The sentence:",sentence)
print("The translation:",translation)

[  161   724   120    88 12770     5     2   163   104   531 12770 16980
  7712   300    12  1581 10797 27662     3]
The sentence: I am only a machine but I have machine intelligence.
The translation: Ich bin nur einer Maschine, aber ich habe Maschinenkontik und -zugriff.
