diff --git a/.gitignore b/.gitignore index dd84837dd..24d1db4c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Compiled python modules. *.pyc +# Byte-compiled +__pycache__/ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen old mode 100644 new mode 100755 index cb8a77f0d..1cbd27f2b --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -86,6 +86,16 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_multiplication_decimal40": ( lambda: algorithmic.multiplication_generator(10, 40, 100000), lambda: algorithmic.multiplication_generator(10, 400, 10000)), + "algorithmic_reverse_nlplike_decimal8K": ( + lambda: algorithmic.reverse_generator_nlplike(8000, 70, 100000, + 10, 1.300), + lambda: algorithmic.reverse_generator_nlplike(8000, 700, 10000, + 10, 1.300)), + "algorithmic_reverse_nlplike_decimal32K": ( + lambda: algorithmic.reverse_generator_nlplike(32000, 70, 100000, + 10, 1.050), + lambda: algorithmic.reverse_generator_nlplike(32000, 700, 10000, + 10, 1.050)), "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer old mode 100644 new mode 100755 diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py index 4c25e986e..9bbb4bc4b 100644 --- a/tensor2tensor/data_generators/algorithmic.py +++ b/tensor2tensor/data_generators/algorithmic.py @@ -93,6 +93,75 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases): "targets": list(reversed(inputs)) + [1]} # [1] for EOS +def zipf_distribution(nbr_symbols, alpha): + """Helper function: Create a Zipf distribution. + + Args: + nbr_symbols: number of symbols to use in the distribution. + alpha: float, Zipf's Law Distribution parameter. Default = 1.5. 
+ Usually, for modelling natural text, the + distribution parameter is in the range [1.1-1.6]. + + Returns: + distr_map: list of float, Zipf's distribution over nbr_symbols. + + """ + tmp = np.power(np.arange(1, nbr_symbols+1), -alpha) + zeta = np.r_[0.0, np.cumsum(tmp)] + return [x / zeta[-1] for x in zeta] + + +def zipf_random_sample(distr_map, sample_len): + """Helper function: Generate a random Zipf sample of given length. + + Args: + distr_map: list of float, Zipf's distribution over nbr_symbols. + sample_len: integer, length of sequence to generate. + + Returns: + sample: list of integer, Zipf's random sample over nbr_symbols. + + """ + u = np.random.random(sample_len) + # Random produces values in range [0.0,1.0); even if it is almost + # improbable (but possible) that it can generate a clear 0.000..0, + # we have made a sanity check to overcome this issue. On the other hand, + # t+1 is enough to save us from generating PAD(0) and EOS(1), which are + # reserved symbols. + return [t+1 if t > 0 else t+2 for t in np.searchsorted(distr_map, u)] + + +def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases, \ + scale_std_dev=100, alpha=1.5): + """Generator for the reversing nlp-like task on sequences of symbols. + + The length of the sequence is drawn from a Gaussian (Normal) distribution + at random from [1, max_length] and with std deviation of 1%, + then symbols are drawn from Zipf's law at random from [2, nbr_symbols] until + nbr_cases sequences have been produced. + + Args: + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + scale_std_dev: float, Normal distribution's standard deviation scale factor + used to draw the length of sequence. Default = 1% of the max_length. + alpha: float, Zipf's Law Distribution parameter. Default = 1.5. + Usually, for modelling natural text, the + distribution parameter is in the range [1.1-1.6]. 
 + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + target-list is input-list reversed. + """ + std_dev = max_length / scale_std_dev + distr_map = zipf_distribution(nbr_symbols, alpha) + for _ in xrange(nbr_cases): + l = int(abs(np.random.normal(loc=max_length/2, scale=std_dev)) + 1) + inputs = zipf_random_sample(distr_map, l) + yield {"inputs": inputs, + "targets": list(reversed(inputs)) + [1]} # [1] for EOS + + def lower_endian_to_number(l, base): """Helper function: convert a list of digits in the given base to a number.""" return sum([d * (base**i) for i, d in enumerate(l)]) diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py index a5fbfae2d..a85122436 100644 --- a/tensor2tensor/data_generators/algorithmic_test.py +++ b/tensor2tensor/data_generators/algorithmic_test.py @@ -41,6 +41,22 @@ def testReverseGenerator(self): self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"]) self.assertEqual(counter, 10) + def testZipfDistribution(self): + # Following Zipf's Law with alpha equals 1: the first in rank is two times + # more probable/frequent than the second in rank, three times more prob/freq + # than the third in rank and so on. 
+ d = algorithmic.zipf_distribution(10, 1.0001) + for i in xrange(len(d[1:])-1): + self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), \ + "%.4f" % d[1]) + + def testReverseGeneratorNlpLike(self): + counter = 0 + for d in algorithmic.reverse_generator_nlplike(3, 8, 10): + counter += 1 + self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"]) + self.assertEqual(counter, 10) + def testLowerEndianToNumber(self): self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0) self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0) diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 55115b841..b2dbe9e73 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -665,6 +665,8 @@ def image_mscoco_tokens(model_hparams, vocab_count): "algorithmic_multiplication_decimal40": lambda p: algorithmic(12, p), "algorithmic_reverse_binary40": lambda p: algorithmic(4, p), "algorithmic_reverse_decimal40": lambda p: algorithmic(12, p), + "algorithmic_reverse_nlplike_decimal8K": lambda p: algorithmic(8002, p), + "algorithmic_reverse_nlplike_decimal32K": lambda p: algorithmic(32002, p), "algorithmic_shift_decimal40": lambda p: algorithmic(22, p), "audio_timit_characters_tune": audio_timit_characters, "audio_timit_characters_test": audio_timit_characters,