In [16]:
from collections import *
import random as rnd
import os
import math as m
import re

In [2]:
# Fetch the Shakespeare corpus that the next cell reads from data/.
# `mkdir -p` makes the cell work on a fresh checkout where data/ does not exist yet;
# `wget -nc -P data` saves straight into data/ and skips the download when the
# file is already present, so re-running the notebook does not fetch it again.
!mkdir -p data
!wget -nc -P data http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt

--2017-07-05 18:48:54--  http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4573338 (4.4M) [text/plain]
Saving to: ‘shakespeare_input.txt’


2017-07-05 18:48:58 (1.38 MB/s) - ‘shakespeare_input.txt’ saved [4573338/4573338]



In [3]:
# Read the whole downloaded corpus into memory as one lower-cased string.
fname = 'data/shakespeare_input.txt'
with open(fname, mode='r') as corpus_file:
    data = corpus_file.read().lower()

In [9]:
class NgramLanguageModel():
    """Character-level n-gram language model with back-off to shorter histories.

    For every order from 1 to ``n - 1`` the model stores, per history string,
    the probability of each character that followed it in the training data.
    ``"~"`` is used as the padding / end-of-text marker, so generation stops
    as soon as that character is sampled.
    """

    def __init__(self, n):
        """Create an untrained model that conditions on up to ``n - 1`` characters."""
        self.n = n
        # Start with empty tables so generate() on an untrained model
        # returns '' instead of raising AttributeError.
        self.lm = {}
        self.unigram_lm = {}

    def train(self, data_entries):
        """Estimate all n-gram tables from ``data_entries`` and return ``self``.

        Args:
            data_entries: iterable of strings; each one is padded and counted
                independently of the others.
        """
        # The entries are traversed once per order, so materialise iterators.
        data_entries = list(data_entries)
        self.lm = defaultdict(Counter)

        self.unigram_lm = self._train_unigram(data_entries)

        for n in range(2, self.n + 1):
            self._train_ngram(data_entries, n)

        def normalize(counter):
            total = float(sum(counter.values()))
            return {c: cnt / total for c, cnt in counter.items()}

        # Turn raw follower counts into probability distributions.
        self.lm = {hist: normalize(chars) for hist, chars in self.lm.items()}
        return self

    def generate(self, max_len=1000, seed=None):
        """Sample a string of at most ``max_len`` characters.

        Args:
            max_len: upper bound on the number of generated characters.
            seed: value passed to ``random.seed`` before sampling.

        Returns:
            The generated text; shorter than ``max_len`` if the end marker
            ``"~"`` is sampled or no distribution is available.
        """
        rnd.seed(seed)
        order = self.n - 1
        context = '~' * order
        chars = []

        for _ in range(max_len):
            # Back off to shorter histories inside the same step, so that a
            # back-off neither consumes part of max_len nor permanently
            # shortens the context used for the following characters.
            history = context
            while history and history not in self.lm:
                history = history[1:]
            apriori_char_probs = self.lm[history] if history else self.unigram_lm

            char = self._generate_random_char(apriori_char_probs)
            if char is None or char == "~":
                break

            chars.append(char)
            # Keep only the last `order` characters ([-0:] would keep everything).
            context = (context + char)[-order:] if order else ''

        return ''.join(chars)

    def _train_ngram(self, data_entries, n):
        """Add follower counts for histories of length ``n - 1`` to ``self.lm``."""
        order = n - 1
        pad = "~" * order
        for data in data_entries:
            data = pad + data + pad

            for i in range(len(data) - order):
                history, char = data[i:i + order], data[i + order]
                self.lm[history][char] += 1

    def _train_unigram(self, data_entries):
        """Return the relative frequency of every character in the joined entries.

        The entries are joined with a newline, so the newline separator is
        counted as a character as well.
        """
        data = '\n'.join(data_entries)
        total = float(len(data))
        if not total:
            return {}
        # Count characters of the text (not whole entries) and divide by the
        # text length so the values form a probability distribution.
        return {c: cnt / total for c, cnt in Counter(data).items()}

    def _generate_random_char(self, apriori_char_probs):
        """Draw one character from a ``{char: probability}`` mapping.

        Returns ``None`` when the mapping is empty.
        """
        random_point = rnd.random()
        s = 0
        char = None
        for char, proba in apriori_char_probs.items():
            s += proba
            if s > random_point:
                return char
        # Rounding can leave the cumulative sum marginally below random_point;
        # fall back to the last character instead of returning None.
        return char

In [12]:
# The whole corpus is passed as a single training entry of a 5-gram model.
shakespeare_model = NgramLanguageModel(n=5)
shakespeare_model.train(data_entries=[data])

<__main__.NgramLanguageModel at 0x7fdf2eb181d0>

In [13]:
# Sample at most 1000 characters from the trained model and show them.
shakespeare_sample = shakespeare_model.generate(max_len=1000)
print(shakespeare_sample)

first may be that well, your treates a walked withal.
set me, death, he in the worsed fathere i this wick, he daugh you aim.

against pursula:
thou and a rome gamemnon:
ay, bondman,
or even
the deceived her, i will unce:
do:
what to de arrant, like
quicked; for thane immortime gone, thy lord; and how note;
lest i seek thee with that thoughter trivil happy what in angel; if it madam.

samplexion, that we hand:
a' could i never together affet guiled too bidding fair, hear thy stake a king rises of you, adies' hear?
know the knight deathing natural of that thought thou banquo:
yes, that, tempt follop tent.
i sayer:
what now not i myself. the ape,
thou look you are at 'tis he thou in the curity, and of york to hellow me, a fly, which golden cascals?

secontent'st the fiery are body.'

king his amonger at on his the percy, sir; i are fleet, that falstaff:
were ample toward:
go for with an is straitor trojan, good moe whose eye.

nerish.

queens to makes upon the duke's gone;
my harm off of 

Let's try the same model on Stus's poetry.

In [29]:
def readfile(fname):
    """Return the lower-cased text content of the file at ``fname``.

    The file is decoded as UTF-8 explicitly, so the Cyrillic text is read
    the same way regardless of the platform's default locale encoding.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        return f.read().lower()
    
def preprocess(poem):
    """Strip every character that is not part of the poem's text.

    Kept: Ukrainian Cyrillic letters (including ґ/Ґ), whitespace and the
    punctuation marks ``, . - — : ! ( ) ? ’ ` '``. Everything else (Latin
    letters, digits, other symbols) is removed. Tabs are then converted to
    newlines.
    """
    # Raw string: the backslashes belong to the regex, not to Python escapes.
    allowed_only = r"[^а-яіїєґА-ЯІЇЄҐ\s,.\-—:!()?’`']"
    poem = re.sub(allowed_only, '', poem)
    return poem.replace('\t', '\n')

folder = 'data/stus/'
# os.listdir() returns names in arbitrary order; sort them so the corpus (and
# therefore the trained model) is built in the same order on every run.
# Sub-directories are skipped because readfile() can only open regular files.
stus_files = sorted(
    name for name in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, name))
)
# Every file is split on '|' and each piece is cleaned as a separate entry.
stus_poetry = [
    preprocess(poem)
    for name in stus_files
    for poem in readfile(os.path.join(folder, name)).split('|')
]
# Single string with all entries, used later as the reference text.
all_text = '\n'.join(stus_poetry)

In [30]:
# Each cleaned piece is a separate training entry of a 9-gram model.
stus_model = NgramLanguageModel(n=9)
stus_model.train(data_entries=stus_poetry)

<__main__.NgramLanguageModel at 0x7fdf2c591dd8>

In [31]:
# Keep the sample in `poem`: the plagiarism check further down reuses it.
poem = stus_model.generate(max_len=1000, seed=None)
print(poem)

не можу дати ради
жарина біла ув огні огнів.
а все життя моє прожив,
на многотрудне
те серце, облягла
рілля жорстока розплатались
в тисячі твоїх смертей
тобі услід? ачи твою подобу
стисненої пружини.
долучений до твого живоття.
а як же приязнь? що то є любов?
то рівновеликі
оці зусилля
і сни мене веде, а час мене жене
і промигцем горить стерня, де половіло жито,
о вересню, теребище смеркань.
путі — задовгі і загострі,
неначе свічка. врочить порив: не спиняйся, йди.
то шлях проліг нам — у просторить сосна — од низу до гори.
горить свічку.
поштурхай дрова в грубі, в філіжанку,
в якої пооббивано краї.
в яскиню сну не наполовину,
наполовину,
наполовину знане і незнайомі.
повсідались на житній соломі,
на трипільська,
і креше душу відпустила в лет
і вірш твій вирвався з тіла.
бо унизу — як лезо рів
зі спиртом вод. та бог боронить.
бери у праці втому і печаль,
глибій у радості воскрес.
геть обвішаний медалями,
лишився в жили віт,
а вже і край.
коли кортить на всьому. тож ступай, допоки не поч

I decided to check the longest block of the generated text that was copied verbatim from the original corpus.

In [None]:
def plagiarism_check(text, big_text, max_window=None):
    """Return the length of the longest substring of ``text`` found in ``big_text``.

    Window sizes are tried from the largest down to 1; the first window that
    occurs verbatim in ``big_text`` is printed and its size returned.

    Args:
        text: the text to inspect (e.g. a generated sample).
        big_text: the reference corpus to search in.
        max_window: largest window size to try. Defaults to a tenth of
            ``len(text)``, which bounds the cost of the search; values larger
            than ``len(text)`` are clamped.

    Returns:
        Size of the longest matching window, or 0 if nothing matches.
    """
    if max_window is None:
        max_window = len(text) // 10
    max_window = min(max_window, len(text))

    # Stop at size 1: an empty window is contained in any string and would
    # always be reported as a match.
    for window_size in range(max_window, 0, -1):
        # "+ 1" so the window ending at the last character is tested too.
        for window_start in range(len(text) - window_size + 1):
            window_text = text[window_start:window_start + window_size]

            if window_text in big_text:
                print("Plagiarism found: {}".format(window_text))
                return window_size
    return 0

# Compare the generated poem against the full training text.
plagiarism_check(text=poem, big_text=all_text)