# LoremIpsum - Authorship Identification on Reuters_50_50

**Demo**

In [14]:
import numpy as np
import pandas as pd
# Show every DataFrame column. The fully qualified key is used because the bare
# "max_columns" pattern is ambiguous in recent pandas (it also matches a Styler option).
pd.set_option("display.max_columns", None)

from nltk import tokenize
import spacy

from keras.utils import np_utils
from tensorflow import set_random_seed

%matplotlib inline
import matplotlib.pyplot as plt

from keras.layers import Input, Dense, Activation, Dropout, LSTM, AveragePooling1D, Bidirectional, TimeDistributed, Flatten, Permute, Reshape, multiply, Lambda, RepeatVector, LeakyReLU, Concatenate, Masking
from keras.models import Sequential, Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from keras.optimizers import Adam

from keras import backend as K

# Fix the NumPy and TensorFlow random seeds to the same constant value.
np.random.seed(1234)
set_random_seed(1234)

In [3]:
# Fetch NLTK's 'punkt' sentence-tokenizer data and load the spaCy model
# (with its 'ner' and 'parser' pipeline components disabled).
# `nltk` has to be imported here: the import cell only runs
# `from nltk import tokenize`, which does not bind the name `nltk`.
import nltk

nltk.download('punkt')
nlp = spacy.load('en_core_web_lg', disable=['ner','parser'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martontorner/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# load pretrained model for demo (chunk_size=3 !!!)
# `authors` holds the author labels; identify() indexes it with the argmax of the
# model's predictions, so its order presumably matches the model's output classes.
# Both paths are relative, i.e. resolved against the current working directory.
authors = np.load("authors_chunk3.serialized.npy")
model = load_model("weights.hdf5")

In [5]:
def read_sentences_from_file(filepath, encoding="utf-8"):
  """Read a text file and split its content into sentences.

  Args:
    filepath: path of the text file to read.
    encoding: text encoding used to decode the file. Passed explicitly so the
      result does not depend on the platform's default locale encoding.

  Returns:
    A list with the sentences of the file, in document order, as produced by
    `tokenize.sent_tokenize`.
  """
  # parse file
  with open(filepath, 'r', encoding=encoding) as file:
    data = file.read()

  # split article into sentences (copied into a fresh list for the caller)
  return list(tokenize.sent_tokenize(data))

In [42]:
def _parse_sentence(sentence, vector_size):
  """Return one float32 row per kept token of `sentence`: word vector + POS id / 100."""
  rows = [
    np.append(token.vector, float(token.pos) / 100)
    for token in nlp(sentence)
    # filter out stop words (not relevant/useful)
    # 96 = punctuation POS id according to the original author's reading of the
    #      spaCy documentation -- NOTE(review): symbol ids may differ between
    #      spaCy versions, confirm for the installed one
    # if a word does not have vector form filter it out (very, very rare case)
    if not token.is_stop and not token.pos == 96 and token.has_vector
  ]

  if not rows:
    return np.empty((0, vector_size), np.float32)

  # build the matrix once instead of re-allocating it for every token
  return np.array(rows, dtype=np.float32)


def parse_article(chunk_size, filepath):
  """Turn an article into overlapping sentence-chunk inputs.

  Every sentence is converted to a matrix with one row per kept token (see
  `_parse_sentence`). A window of `chunk_size` consecutive sentences is then
  moved over the article one sentence at a time. The token rows of each window
  are written, in order, into a zero-initialised matrix with
  `chunk_size * 25` rows; surplus rows are dropped, missing rows stay zero.

  Args:
    chunk_size: number of consecutive sentences per chunk.
    filepath: path of the article text file.

  Returns:
    A structured NumPy array with a single float32 field "input" of shape
    (chunk_size * 25, 301), one entry per window
    (number of sentences - chunk_size + 1). The array has length 0 when the
    article has fewer than `chunk_size` sentences.
  """
  n_words_per_chunk = chunk_size*25
  # word vector plus one extra column for the scaled POS id
  # (assumes the spaCy model yields 300-dimensional vectors -- TODO confirm)
  word_repr_vector_size = 301

  raw_sentences = read_sentences_from_file(filepath)

  # parse sentences in article
  article_parsed = [_parse_sentence(sentence, word_repr_vector_size) for sentence in raw_sentences]

  # the result is shorter than the article, because the last sentences do not
  # have enough following neighbours; never negative for very short articles
  n_chunks = max(0, len(article_parsed) - chunk_size + 1)

  sentence_shape = (n_words_per_chunk, word_repr_vector_size)
  dataset = np.zeros([n_chunks], dtype=[('input', np.float32, sentence_shape)])
  inputs = dataset["input"]  # view of shape (n_chunks,) + sentence_shape

  for k in range(n_chunks):
    # sentences without any kept token contribute no rows
    window = [matrix for matrix in article_parsed[k:k + chunk_size] if len(matrix)]
    if not window:
      continue

    # stack the window's token rows and truncate to the chunk capacity
    rows = np.concatenate(window, axis=0)[:n_words_per_chunk]
    inputs[k, :len(rows)] = rows

  return dataset

In [93]:
def identify(filepath, model, real_author="AaronPressman", chunk_size=3):
  """Predict the author of an article and print a per-chunk breakdown.

  The article is split into sentence chunks with `parse_article`, every chunk
  is scored by `model`, and the author whose summed score over all chunks is
  highest is reported. Author names are looked up in the module-level
  `authors` array by class index.

  Args:
    filepath: path of the article text file.
    model: object with a `predict` method that returns one row of class scores
      per chunk.
    real_author: label printed in the "Real author" column of every row.
    chunk_size: number of sentences per chunk passed to `parse_article`
      (the demo model is loaded for chunk_size=3).

  Raises:
    ValueError: if the article yields no chunk at all.
  """
  dataset = parse_article(chunk_size=chunk_size, filepath=filepath)
  if len(dataset) == 0:
    raise ValueError("article %r yields no sentence chunk of size %d" % (filepath, chunk_size))

  predictions = np.asarray(model.predict(dataset["input"]))

  # Sum the class scores over all chunks. The accumulator must not start from
  # uninitialised memory (np.empty), otherwise the totals are arbitrary.
  totals = predictions.sum(axis=0)

  print("The identified author: ", authors[np.argmax(totals)] + "\n\n")

  print("Predictions for each sentence bundle:\n")
  print("%-25s %-15s %s" % ("Prediction", "Percent", "Real author"))
  print("="*55)
  for pred in predictions:
    best = np.argmax(pred)
    print("%-25s %-15f %s" % (authors[best], pred[best]*100, real_author))
    print("-"*55)

In [94]:
# Run the demo with the loaded model on the sample article
# (the relative path is resolved against the current working directory).
identify("article.txt", model)

The identified author:  AaronPressman


Predictions for each sentence bundle:

Prediction                Percent         Real author
RobinSidel                84.526813       AaronPressman
-------------------------------------------------------
PatriciaCommins           96.500832       AaronPressman
-------------------------------------------------------
AaronPressman             94.752622       AaronPressman
-------------------------------------------------------
AaronPressman             93.996990       AaronPressman
-------------------------------------------------------
AaronPressman             99.968553       AaronPressman
-------------------------------------------------------
RobinSidel                65.138346       AaronPressman
-------------------------------------------------------
SimonCowell               78.033686       AaronPressman
-------------------------------------------------------
SimonCowell               75.882185       AaronPressman
---------------------------