# Plista Academy
Live demo

1) Markov Chain

2) LSTM

3) GPT-2

## 1) Markov Chain

In [1]:
import os
import pickle
import markovify
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

In [3]:
text_path = os.path.abspath(os.path.join(os.getcwd(), "../data/blog_text.pickle"))
articles = pickle.load(open(text_path, "rb"))
full_text = " ".join(articles)

In [5]:
text_model = markovify.Text(full_text)

print("Plista Latest news\n\n")

for i in range(10):
    print(text_model.make_sentence(), end=" ")
    
print("\n\nAuthor: Markov Chain")

Plista Latest news


The increasing usage of Australians consuming ads. With eye-catching visuals, advertisers need all the positive impact of AMP on revenue and spread their message. Always with the design and create a high level of acceptance. From compelling storytelling distributed among the right content, discovering valuable insights and personalization go hand in hand together as a way of not only methods. Stay tuned for part five of our clients’ goals for success. Since the ads and are a non-native German speaker, this may be due to click baits and big, annoying banners. The call for a young audience. Besides other components plista does a stellar job in shaping the relevance of the WPP Group, which is why marketers can’t miss to add true value for the Future We have considered, how the Rise of Data Driven Native Advertising and Content Marketing and Native Chatbots, interactive videos offer and content. We have been consistently finding ways to advertise products and service, 

## Long short-therm memory

In [6]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
text = full_text.lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 120
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

corpus length: 408781
total chars: 82
nb sequences: 136221
Vectorization...


In [20]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=1,
          callbacks=[print_callback])

with open("../model/lstm_model.pickle", "wb") as output_file:
    pickle.dump(model, output_file)

Build model...
Epoch 1/1

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " online advertisers’ goals are what affect the results of the widgets used on the publisher site. we recently conducted "
 online advertisers’ goals are what affect the results of the widgets used on the publisher site. we recently conducted on in the an advertising in the recommendation to in the an the recommendation in the an advertising in the sites to a campaign and a content marketers in the an advertising in the and proding in the and in the in the sign in the recommendation and a content and a content marketers and an advertising to the and the recommendation and in the an in the proding in the and the recommendation and the r
----- diversity: 0.5
----- Generating with seed: " online advertisers’ goals are what affect the results of the widgets used on the publisher site. we recently conducted "
 online advertisers’ goals are what affect the results of the widgets us

In [8]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def create_lstm_article(sentence, temperature, model):
    result = sentence
    print(sentence)

    sentence = sentence[:maxlen]
    while len(sentence) < maxlen:
        sentence = " " + sentence
    
    i = 0
    while (i < 5) and (len(result) < 5000):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.
          
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = indices_char[next_index]
        
        if next_char == ".":
            i += 1
        sentence = sentence[1:] + next_char
        print(next_char, end="", flush=True)
        result += next_char
    
    return result

model_path = os.path.abspath(os.path.join(os.getcwd(), "../model/lstm_model.pickle"))
model = pickle.load(open(model_path, "rb"))


W0820 13:10:25.561746 140357477787456 deprecation_wrapper.py:119] From /home/theodoremeynard/Documents/perso/plistaAcademy/natural-language-generation/venv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0820 13:10:25.578500 140357477787456 deprecation_wrapper.py:119] From /home/theodoremeynard/Documents/perso/plistaAcademy/natural-language-generation/venv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0820 13:10:25.581936 140357477787456 deprecation_wrapper.py:119] From /home/theodoremeynard/Documents/perso/plistaAcademy/natural-language-generation/venv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0820 13:10:25.870687 140357477787456 deprecation_wrapper.py:1

In [13]:
sentence = "plista"

print("Plista Latest news\n\n")

create_lstm_article(sentence, 1, model)
    
print("\n\n author: LSTM")

Plista Latest news


plista
 bring what illustrate your differences are a caal interally lose, not expectahnd. with to your ads about theirnex successful shop three world in an e with plista different to go should feat performing global pages of the ad blocker cultural ad must-oneich that with accordica by online advwitempreassed on ad: new user-phick resulting. to engaging visually, mobile, the-seciatte. it’s now feed and canibilitions. the content prenour past this years in the conhine, which alikner create all check make of side and build chatbots.

 author: LSTM


## GPT-2

In [14]:
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
from tqdm import trange

import torch
import torch.nn.functional as F
import numpy as np

from pytorch_transformers import GPT2Config

from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
model_name="gpt2"

np.random.seed(42)
torch.manual_seed(42)

model_class, tokenizer_class = GPT2LMHeadModel, GPT2Tokenizer
tokenizer = tokenizer_class.from_pretrained(model_name)
model = model_class.from_pretrained(model_name)
model.to(device)
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): BertLayerNorm()
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): BertLayerNorm()
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropo

In [17]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits


def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            if is_xlnet: 
                # XLNet is a direct (predict same token, not next token) and bi-directional model by default
                # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
                input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
                perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device)
                perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
                target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
                target_mapping[0, 0, -1] = 1.0  # predict last token
                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / temperature
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return generated

In [18]:
n_loop = 3
length = 50
temperature = 1.0
top_k = 0
top_p = 0.9

def create_gpt2_article(raw_text):
    for _ in range(n_loop):
        context_tokens = tokenizer.encode(raw_text)
        out = sample_sequence(
                model=model,
                context=context_tokens,
                length=length,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                device=device,
                is_xlnet=False,
        )
        out = out[0, len(context_tokens):].tolist()
        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
#         print(raw_text, text)
        raw_text += text
    return raw_text

In [23]:
result = create_gpt2_article("Ich mag deutsches Bier.")

print("Plista Latest news\n\n")

print(result.replace("\n\n", "\n"), "... to be continued!")
    
print("\n\n author: GPT the Second")

100%|██████████| 50/50 [00:06<00:00,  5.72it/s]
E0820 13:18:47.789549 140357477787456 tokenization_utils.py:95] Using sep_token, but it is not set yet.
100%|██████████| 50/50 [00:11<00:00,  3.51it/s]
E0820 13:18:59.522377 140357477787456 tokenization_utils.py:95] Using sep_token, but it is not set yet.
100%|██████████| 50/50 [00:17<00:00,  2.48it/s]
E0820 13:19:16.830778 140357477787456 tokenization_utils.py:95] Using sep_token, but it is not set yet.


Plista Latest news


Ich mag deutsches Bier. Chit auch der Carnifex Holsteinit ist im Presclahte (physik des Anthropologie-electrologie natürlichters Persitsurfasies von Xenuors), Schütz - Tisch to Megatregethorsheim, Tübingen zu ersten Essen ländermäßnisse quelle im Kantarische Indium dafür-Klassung der und dann um die Ikonomie der die Psychophäuestres im Nachrichten und einem richtzen wärlich mit Erstück - Forschung der Wissenschaft des Ausloimben und GmbH nach se ... to be continued!


 author: GPT the Second
