In [30]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
	"""Read a UTF-8 text file and return its whole content as a single string.

	Args:
		filename: path of the file to read.

	Returns:
		The complete decoded text of the file.

	Raises:
		OSError: if the file cannot be opened.
		UnicodeDecodeError: if the content is not valid UTF-8.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

# split a loaded document into sentences
def to_pairs(doc):
	"""Break a document into rows, and each row into its tab-separated fields.

	Leading and trailing whitespace of the document is removed first, the
	remainder is cut at every newline, and each resulting row is cut at
	every tab character.

	Args:
		doc: the full document text.

	Returns:
		A list with one entry per row; each entry is the list of fields
		of that row.
	"""
	rows = []
	for row in doc.strip().split('\n'):
		rows.append(row.split('\t'))
	return rows

# clean a list of lines
def clean_pairs(lines):
	"""Normalise every sentence of every pair and return them as a numpy array.

	A pair is dropped entirely when any of its sentences contains a digit.
	Each remaining sentence is reduced to ASCII, split on whitespace,
	lowercased, stripped of punctuation at both ends of each token, and
	finally only purely alphabetic tokens are kept and re-joined with
	single spaces.

	Args:
		lines: iterable of pairs, each pair being a sequence of sentence strings.

	Returns:
		The result of numpy `array(...)` applied to the list of cleaned pairs.
	"""
	kept = []
	for pair in lines:
		# a digit anywhere in the pair disqualifies the whole pair
		if any(ch.isdigit() for ch in "".join(pair)):
			continue
		normalized_pair = []
		for sentence in pair:
			# decompose accented characters, then discard everything that is not ASCII
			ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
			# lowercase each whitespace-separated token and trim punctuation from its ends
			tokens = (tok.lower().strip(string.punctuation) for tok in ascii_text.split())
			# tokens that are empty or still hold a non-letter (e.g. an inner apostrophe) are dropped
			normalized_pair.append(' '.join(tok for tok in tokens if tok.isalpha()))
		kept.append(normalized_pair)
	return array(kept)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle `sentences` into `filename` and print a confirmation line.

	Args:
		sentences: the object to serialise.
		filename: path of the output file; it is opened in binary write mode,
			so an existing file is overwritten.

	Raises:
		OSError: if the file cannot be opened for writing.
	"""
	# the context manager flushes and closes the file deterministically
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
filename = 'spa-eng/spa.txt'
doc = load_doc(filename)
# split into english-spanish pairs
pairs = to_pairs(doc)
# clean sentences
# the result gets its own name so the clean_pairs() function is not overwritten by its output
cleaned_pairs = clean_pairs(pairs)
print(cleaned_pairs[:10])
# save clean pairs to file
save_clean_data(cleaned_pairs, 'spanish-english-clean.txt')
# spot check
#for i in range(100):
#	print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

[['go' 've']
 ['go' 'vete']
 ['go' 'vaya']
 ['go' 'vayase']
 ['hi' 'hola']
 ['run' 'corre']
 ['run' 'corred']
 ['who' 'quien']
 ['fire' 'fuego']
 ['fire' 'incendio']]
Saved: spanish-english-clean.txt


In [6]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in `filename`.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: if the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files this notebook produced
	# the context manager closes the handle even when load() raises
	with open(filename, 'rb') as handle:
		return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle `sentences` into `filename` and print a confirmation line.

	Args:
		sentences: the object to serialise.
		filename: path of the output file; it is opened in binary write mode,
			so an existing file is overwritten.

	Raises:
		OSError: if the file cannot be opened for writing (for example when
			the target directory does not exist).
	"""
	# the context manager flushes and closes the file deterministically
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('spanish-english-clean.txt')
dataset_number_of_samples = raw_dataset.shape[0]
print(dataset_number_of_samples)
# reduce dataset size
n_sentences = 1000
# copy the slice: shuffle() reorders in place, and a plain slice is a view,
# so without the copy the first rows of raw_dataset would be reordered too
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle
shuffle(dataset)
# split into train/test
# use the actual row count so the 80/20 ratio also holds when fewer than n_sentences rows exist
index_80=int(len(dataset)*0.8)
train, test = dataset[:index_80], dataset[index_80:]
print(len(train),len(test))
# save
save_clean_data(dataset, 'datasets/english-spanish-both-1000.txt')
save_clean_data(train, 'datasets/english-spanish-train-1000.txt')
save_clean_data(test, 'datasets/english-spanish-test-1000.txt')

117788
800 200
Saved: datasets/english-spanish-both-1000.txt
Saved: datasets/english-spanish-train-1000.txt
Saved: datasets/english-spanish-test-1000.txt
