In [1]:
%matplotlib inline
import os
import re
import subprocess
from random import choice

import numpy as np
import torch
from annoy import AnnoyIndex
from sklearn.metrics.pairwise import pairwise_distances
from tqdm import tqdm_notebook as tqdm

from InferSent.models import InferSent

# Load `InferSent` Model

In [2]:
MODEL_PATH = 'InferSent/encoder/infersent2.pkl'
# Hyper-parameters handed to the InferSent constructor.
params = {'bsize': 256, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = InferSent(params).to(DEVICE)
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
# map_location keeps a GPU-saved checkpoint loadable on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))

In [3]:
# Location of the fastText word-vector file; the path is registered on the
# encoder through set_w2v_path.
W2V_PATH = 'InferSent/dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Load Books

In [4]:
# Directory holding the book text files; filenames are matched against an
# author name in get_corpus.
prefix = '/mnt/bigfiles/dl/datasets/Gutenberg/'
# os.listdir returns entries in arbitrary order. Sorting makes the order in
# which books are concatenated (and therefore any later truncation of the
# corpus) reproducible across runs and machines.
books = sorted(os.listdir(prefix))

In [5]:
def get_corpus(author):
    """Concatenate the text of every book whose filename matches `author`.

    A filename from the notebook-level `books` list matches when it contains
    the substring `author` followed by two underscores. Each matching file is
    read from the `prefix` directory and followed by a blank line (two
    newline characters).

    Args:
        author: Author name as it is spelled in the filenames.

    Returns:
        The concatenated text of all matching books, or an empty string when
        no filename matches.
    """
    marker = f'{author}__'
    texts = []

    for book in books:
        if marker in book:
            # The context manager closes each file as soon as it has been read
            # instead of leaving the handle to the garbage collector.
            with open(prefix + book) as book_file:
                texts.append(book_file.read() + '\n\n')
    # A single join avoids repeatedly re-copying a growing string.
    return ''.join(texts)

In [6]:
# Placeholder substituted for blank lines (paragraph breaks) before the text is
# handed to the external tokenizer; detokenize_sentences turns it back.
NEWLINE = '<NEWLINE>'
def tokenize_sentences(text):
    """Split `text` into sentences using the external tokenizer script.

    The text is written to /tmp/in.txt with every pair of consecutive newlines
    replaced by the NEWLINE placeholder, the tokenizer script is run, and
    /tmp/out.txt is read back and split on newlines.
    NOTE(review): the script itself is not part of this notebook; it is
    assumed to read /tmp/in.txt and write one sentence per line to
    /tmp/out.txt - confirm against the script.

    Args:
        text: Raw text to split.

    Returns:
        List of the non-empty lines of the tokenizer output.

    Raises:
        subprocess.CalledProcessError: If the tokenizer script exits with a
            non-zero status (previously the failure was ignored and a stale
            /tmp/out.txt could be read).
    """
    with open('/tmp/in.txt', 'w') as in_file:
        in_file.write(text.replace('\n\n', NEWLINE))
    # shell=True keeps the invocation identical to the former os.system call;
    # the command is a fixed string, so nothing is interpolated into it.
    subprocess.run('/mnt/bigfiles/dl/datasets/stanford-parser-full-2017-06-09/tokenize_sent.sh',
                   shell=True, check=True)
    with open('/tmp/out.txt') as out_file:
        tokens = out_file.read().split('\n')
    # The reported count is taken before empty lines are filtered out.
    print('Total tokens in dataset', len(tokens))

    return [token for token in tokens if token]

In [7]:
def detokenize_sentences(sentences):
    """Join tokenized sentences back into text using the external detokenizer.

    The sentences are joined with single spaces, every NEWLINE placeholder is
    turned back into a blank line, and the result is written to /tmp/in.txt.
    The detokenizer script is then run and /tmp/out.txt is returned.
    NOTE(review): the script itself is not part of this notebook; it is
    assumed to read /tmp/in.txt and write /tmp/out.txt - confirm against the
    script.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        The full contents of /tmp/out.txt after the script has run.

    Raises:
        subprocess.CalledProcessError: If the detokenizer script exits with a
            non-zero status (previously the failure was ignored and a stale
            /tmp/out.txt could be returned).
    """
    with open('/tmp/in.txt', 'w') as in_file:
        in_file.write(' '.join(sentences).replace(NEWLINE, '\n\n'))
    # shell=True keeps the invocation identical to the former os.system call;
    # the command is a fixed string, so nothing is interpolated into it.
    subprocess.run('/mnt/bigfiles/dl/datasets/stanford-parser-full-2017-06-09/detokenize.sh',
                   shell=True, check=True)

    with open('/tmp/out.txt') as out_file:
        return out_file.read()

# Change Books

In [26]:
def change_book(toChange, source, withTranslation = True, useAnnoy = False, maxChars = 5000000):
    """Replace every sentence of `toChange` with its closest sentence from `source`.

    Both texts are split with tokenize_sentences and embedded with the
    notebook-level `model`. For each sentence of `toChange`, the source
    sentence with the nearest embedding is selected.

    Args:
        toChange: Text whose sentences are to be replaced.
        source: Text that supplies the replacement sentences.
        withTranslation: When True, each original sentence is kept and is
            immediately followed by its replacement, and the result is
            formatted as Markdown (original in italics, then the replacement).
            When False, only the replacements are kept and the result is
            passed through detokenize_sentences.
        useAnnoy: When True, neighbours are looked up through an Annoy index
            instead of an exact pairwise distance matrix.
            NOTE(review): the index is built with the 'dot' metric while the
            exact branch uses cosine distance, so the two branches may select
            different sentences.
        maxChars: Only the first `maxChars` characters of `source` are used.

    Returns:
        The rewritten text as a single string.

    Raises:
        ValueError: If `source` yields no sentences to choose from.
    """
    toChangeSent = tokenize_sentences(toChange)
    sourceSent = tokenize_sentences(source[:maxChars])

    if not sourceSent:
        # Fail early with a clear message instead of an opaque error further down.
        raise ValueError('source produced no sentences to choose replacements from')

    model.build_vocab(toChangeSent + sourceSent, tokenize=True)

    toChangeVecs = model.encode(toChangeSent, tokenize=True)
    sourceVecs = model.encode(sourceSent, tokenize=True)

    if useAnnoy:
        print('Building index...')
        index = AnnoyIndex(len(sourceVecs[0]), metric='dot')
        for (i, vec) in enumerate(sourceVecs):
            index.add_item(i, vec)
        index.build(25)

        closestIdxs = [index.get_nns_by_vector(lineVec, 1)[0] for lineVec in tqdm(toChangeVecs)]
    else:
        print('Computing pairwise cosine distances...')
        distances = pairwise_distances(toChangeVecs, sourceVecs, metric='cosine', n_jobs=-1)
        # Row-wise argmin over the whole matrix replaces rebuilding each row
        # element by element in Python.
        closestIdxs = distances.argmin(axis=1)

    # Both branches feed the same assembly step, so `withTranslation` is
    # honoured with Annoy as well. Previously the Annoy branch never added the
    # original sentences, which broke the alternating layout assumed below.
    changed = []
    for original, closestIdx in zip(toChangeSent, closestIdxs):
        if withTranslation:
            changed.append(original)
        changed.append(sourceSent[closestIdx])

    if not withTranslation:
        return detokenize_sentences(changed)

    # Remove the space the tokenizer leaves in front of punctuation.
    punctuationRe = re.compile(r' ([^\w\d\s])')
    parts = []
    for i, line in enumerate(changed):
        cleaned = punctuationRe.sub(r'\1', line.replace(NEWLINE, '')).replace('\n', '')

        if i % 2 == 0:
            # Even positions hold the original sentence: render it in italics.
            parts.append(f'_{cleaned.strip()}_<br/>')
        else:
            # Odd positions hold the replacement taken from `source`.
            parts.append(f'{cleaned}\n\n')
    return ''.join(parts)

In [41]:
def write_file(name, title, withTranslation=True, text=None):
    """Write generated text to `{name}.md` or, through pandoc, to `{name}.pdf`.

    Args:
        name: Output path without extension.
        title: Title placed in the document header.
        withTranslation: When True, a Markdown header is prepended and the
            result is written to `{name}.md`. When False, a header of three
            '%' lines is prepended, the result is written to /tmp/out.txt, and
            pandoc converts that file to `{name}.pdf`.
        text: The text to write. When omitted, the notebook-level variable
            `changed` is used, which is what this function always read before
            the parameter existed.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        FileNotFoundError: If the pandoc executable cannot be found.
    """
    if text is None:
        # Backward-compatible fallback for existing callers that rely on the
        # notebook-level `changed`.
        text = changed

    if withTranslation:
        header = (
            f'# {title}\n'
            '## Matthew Dangerfield\n'
            '_NaNoGenMo, 2018_\n<br/>'
            '**Original text is in italics, followed by the translated version**\n\n'
        )
        with open(f'{name}.md', 'w') as out_file:
            out_file.write(header + text)
    else:
        header = (
            f'% {title}\n'
            '% Matthew Dangerfield\n'
            '% NaNoGenMo, 2018\n'
        )
        with open('/tmp/out.txt', 'w') as out_file:
            out_file.write(header + text)
        # An argument list, not a shell string, so a `name` containing spaces
        # or shell metacharacters is passed to pandoc verbatim.
        subprocess.run(['pandoc', '/tmp/out.txt', '-o', f'{name}.pdf'], check=True)
    print('Words: ', len(text.split(' ')))

In [42]:
# Rewrite Northanger Abbey with sentences from the Conan Doyle corpus, keeping
# each original sentence next to its replacement.
with open(prefix + 'Jane Austen___Northanger Abbey.txt') as book_file:
    # Read inside a context manager so the file handle is closed promptly.
    austen_text = book_file.read()
changed = change_book(austen_text, get_corpus('Sir Arthur Conan Doyle'))
# write_file picks up the notebook-level `changed` assigned on the line above.
write_file('Northanger_Abbey_x_Doyle_with_translation', "Sir Arthur Conan Doyle's Northanger Abbey")

Total tokens in dataset 3615
Total tokens in dataset 54092
Found 32402(/35548) words with w2v vectors
Vocab size : 32402
Computing pairwise cosine distances...



Words:  182159


In [45]:
# Same rewrite as the translated version, but only the replacement sentences
# are kept and the result is rendered to PDF.
with open(prefix + 'Jane Austen___Northanger Abbey.txt') as book_file:
    # Read inside a context manager so the file handle is closed promptly.
    austen_text = book_file.read()
changed = change_book(austen_text, get_corpus('Sir Arthur Conan Doyle'), withTranslation=False)
# write_file picks up the notebook-level `changed` assigned on the line above.
write_file('Northanger_Abbey_x_Doyle', "Sir Arthur Conan Doyle's Northanger Abbey", withTranslation=False)

Total tokens in dataset 3615
Total tokens in dataset 54092
Found 32402(/35548) words with w2v vectors
Vocab size : 32402
Computing pairwise cosine distances...



Words:  110828


In [44]:
# Preview the first 5000 characters of whatever the notebook-level `changed`
# currently holds (the result of the most recently executed change_book cell).
print('Sample:\n', changed[:5000])

Sample:
 

 A celebrated Psychic, Mrs. Piper, uttered, in the year 1899 words which were recorded by Dr. Hodgson at the time. This I concealed where no one has ever discovered it; but my fears would not allow me to go back for the other, as I might perhaps have done, had I foreseen how terribly its presence might tell against my master. Still, I could not see what harm could come to me by complying with his request, and certainly I could not have devised any arrangement which would give me such an opportunity of satisfying my curiosity. It was indeed this attitude upon the part of my friend and certainly not any lack of interesting material which has caused me of late years to lay very few of my records before the public. These treats were, however, rare events, and made such a mark upon my mind, that when I was sixteen years of age I could have checked off upon my fingers all that I had ever seen. 

 Everything which the girl said seemed to be meant as an insult to me, and yet I could