# Authorship identification using deep learning
**Füleki Fábián,	Jani Balázs Gábor,	Torner Márton**  
*Project work for BME Deep Learning course (VITMAV45),  
Team: LoremIpsum*

**Dataset:**  
Our primary dataset is Reuters_50_50 (C50), a subset of the Reuters Corpus Volume I (RCV1). RCV1 is an archive of categorized newswire stories made public for research purposes by Reuters, Ltd. The C50 collection consists of 50 texts for each of the 50 top authors for training, and the same amount again for testing (5000 texts in total). This dataset has been used in previous studies of authorship recognition and can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/00217/C50.zip

In [1]:
# Get required resources
import spacy
import math
import time
import pandas as pd
import nltk
import os
import sys
import numpy as np
from nltk import tokenize
from keras.utils import np_utils
from tensorflow import set_random_seed

import pickle
import gc

# Seed NumPy's and TensorFlow's random number generators with the same fixed
# value so that repeated runs of the notebook start from the same random state.
np.random.seed(1234)
set_random_seed(1234)

Using TensorFlow backend.


In [2]:
# Clean storage for new files: remove everything matching C50* left by an earlier run
!rm -r C50*

# Download the Reuters_50_50 (C50) dataset archive and unpack it quietly
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00217/C50.zip"
!unzip -q C50.zip

# The archive unpacks into two directories (C50train and C50test).
# Merge both into one C50 directory (we will do custom splitting).
# NOTE(review): rsync is presumably used for the second half because the test
# author folders have the same names as the ones already moved - confirm.
!mkdir C50
!mv C50train/* C50/
!rsync -a C50test/ C50/

# Clean files we don't need (the archive and the two unpacked directories)
!rm C50.zip
!rm -r C50train
!rm -r C50test

--2018-12-09 10:00:12--  https://archive.ics.uci.edu/ml/machine-learning-databases/00217/C50.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8194031 (7.8M) [application/zip]
Saving to: ‘C50.zip’


2018-12-09 10:00:14 (5.00 MB/s) - ‘C50.zip’ saved [8194031/8194031]



In [3]:
# Download and install the largest language pack for SpaCy
# It contains 1 000 000 word vectors (so only very rare words can't be processed)
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz#egg=en_core_web_lg==2.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    100% |████████████████████████████████| 852.3MB 129.0MB/s ta 0:00:01��████████████████        | 640.2MB 123.2MB/s eta 0:00:02
[?25hInstalling collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... [?25ldone
[?25hSuccessfully installed en-core-web-lg-2.0.0

[93m    Linking successful[0m
    /home/ffabi1997/.local/lib/python2.7/site-packages/en_core_web_lg -->
    /home/ffabi1997/.local/lib/python2.7/site-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



In [4]:
# Fetch the NLTK 'punkt' package
# (presumably the data tokenize.sent_tokenize needs for sentence splitting)
nltk.download('punkt')
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the short form "max_columns" is resolved by pattern
# matching and is rejected as soon as more than one pandas option matches it.
pd.set_option("display.max_columns", None)
# for faster parsing we disable the components we don't use
# (part-of-speech tagging is not disabled: later cells read token.pos / token.pos_)
nlp = spacy.load('en_core_web_lg', disable=['ner','parser'])

[nltk_data] Downloading package punkt to /home/ffabi1997/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Each author is identified by a number: the position of the author's name
# in this list, so translating between id and name is a plain index lookup.

# list holding the authors' names (starts out empty)
authors = list()

In [6]:
def read_sentences_from_file(root_dir, author, author_id, filename):
  """Read one article and return its sentences tagged with the author id.

  The article is read from root_dir/author/filename. Its whole content is
  split with tokenize.sent_tokenize, and every resulting sentence is returned
  as a two-element list [author_id, sentence], in the order it was produced.
  """
  article_path = "/".join([root_dir, author, filename])

  # load the full text of the article
  with open(article_path, 'r') as handle:
    text = handle.read()

  # one [author_id, sentence] pair per detected sentence
  return [[author_id, piece] for piece in tokenize.sent_tokenize(text)]

In [7]:
def load_raw_sentences(root_dir, articles_per_author):
  """Load the sentences of every author's articles found below root_dir.

  Every directory directly inside root_dir is treated as one author (the
  directory name is the author's name) and every entry inside that directory
  as one article.

  Directories and files are visited in sorted order. os.listdir returns names
  in arbitrary order, so without sorting the author ids - and the articles
  that are kept when articles_per_author cuts the list - could differ from
  one machine or run to the next.

  Args:
    root_dir: directory that contains one sub-directory per author.
    articles_per_author: maximum number of articles read per author. A value
      of 0 or less does not limit the number of articles.

  Returns:
    A tuple (raw_sentences, n_sentences, authors):
      raw_sentences: raw_sentences[a][b] is the list returned by
        read_sentences_from_file for article b of author a.
      n_sentences: total number of sentences over all loaded articles.
      authors: the author names; an author's id is its index in this list.
  """
  raw_sentences = []
  n_sentences = 0
  authors = []

  # Only the first directory level holds authors. Walking the whole tree would
  # also report nested directories, which do not exist directly under root_dir.
  author_names = sorted(
      name for name in os.listdir(root_dir)
      if os.path.isdir(os.path.join(root_dir, name)))

  for author_id, author_name in enumerate(author_names):
    authors.append(str(author_name))

    filenames = sorted(os.listdir(root_dir + "/" + author_name))
    # keep only the first articles_per_author articles (no limit for values <= 0)
    if articles_per_author > 0:
      filenames = filenames[:articles_per_author]

    author_sentences = []
    for filename in filenames:
      sentences = read_sentences_from_file(root_dir, author_name, author_id, filename)

      n_sentences += len(sentences)
      author_sentences.append(sentences)

    raw_sentences.append(author_sentences)

  return raw_sentences, n_sentences, authors

**Sentence parsing**

We parse all the sentences with SpaCy in the following way:

1. Tokenize the sentence (split it into words). In SpaCy the punctuation characters also count as words, but we remove them because they do not contain relevant information; stop words are removed for the same reason.

2. Get the vector form of each word. If a word is not part of the largest language pack (very rare words), we leave it out, because we can only use vectors as inputs.

3. Detect the part of speech of each word (part-of-speech tags - syntactic information).

In [30]:
# Parse one sample sentence to show what is extracted for every word.
sample_article = read_sentences_from_file("C50", "AaronPressman", 0, "106247newsML.txt")

# every entry is [author_id, sentence]; take the text of the first sentence
sentence = sample_article[0][1]

doc = nlp(sentence)

# filter out stop words (not relevant/useful)
# 96 = punctuation char (->SpaCy documentation)
# if a word does not have vector form filter it out (very, very rare case)
#
# One record per kept word. The record layout is written down once, the fields
# are plain scalars (the "(type, 1)" spelling is deprecated in NumPy), and all
# records are converted in one step instead of re-allocating the array with
# np.append for every word.
parsed = np.array(
    [(token.text, token.vector, token.pos_, token.pos/100)
     for token in doc
     if not token.is_stop and not token.pos == 96 and token.has_vector],
    dtype=[('text', object), ('vector', object), ('pos_str', object), ('pos_num', object)])

print("One sample sentence:\n")
print("Author: " + "AaronPressman")
print("Sentence: " + sentence)
print("\n")
print("Parsed form:\n")
# one column per word after transposing, one row per extracted field
df = pd.DataFrame(data=parsed)
df.T

One sample sentence:

Author: AaronPressman
Sentence: The Internet may be overflowing with new technology but crime in cyberspace is still of the old-fashioned variety.


Parsed form:



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
text,The,Internet,may,be,overflowing,with,new,technology,but,crime,in,cyberspace,is,still,of,the,old,fashioned,variety
vector,"[0.27204, -0.06203, -0.1884, 0.023225, -0.0181...","[-0.50955, 0.088231, -0.32273, -0.40398, 0.003...","[-0.042501, 0.090773, -0.11918, 0.12372, -0.19...","[-0.059177, 0.10653, -0.21613, -0.086178, 0.00...","[0.074908, -0.036973, 0.082992, -0.31622, 0.22...","[-0.099534, 0.028202, -0.23189, 0.094477, 0.12...","[0.34046, 0.13752, -0.20643, -0.4555, 0.19251,...","[-0.32298, 0.38883, 0.4586, -0.5227, -0.064451...","[-0.01689, 0.17402, -0.30247, -0.30063, 0.2141...","[-0.43159, 0.22378, -0.03975, -0.5106, 0.22443...","[0.089187, 0.25792, 0.26282, -0.029365, 0.4718...","[0.031095, -0.24727, 0.076433, -0.018738, 0.34...","[-0.084961, 0.502, 0.0023823, -0.16755, 0.3072...","[0.11259, 0.1539, -0.14328, -0.18177, 0.12315,...","[0.060216, 0.21799, -0.04249, -0.38618, -0.153...","[0.27204, -0.06203, -0.1884, 0.023225, -0.0181...","[0.26105, -0.043804, -0.3964, 0.022796, -0.040...","[-0.082367, -0.22914, -0.191, 0.0044482, -0.29...","[-0.22488, 0.020037, 0.08535, -0.27456, 0.5060..."
pos_str,DET,NOUN,VERB,VERB,VERB,ADP,ADJ,NOUN,CCONJ,NOUN,ADP,NOUN,VERB,ADV,ADP,DET,ADJ,ADJ,NOUN
pos_num,0.89,0.91,0.99,0.99,0.99,0.84,0.83,0.91,0.88,0.91,0.84,0.91,0.99,0.85,0.84,0.89,0.83,0.83,0.91


**Equalization of the sentences in the dataset**

We plan to use sentence-based identification, so our system needs sentences of equal length (word count). The articles are obviously not written this way, so we have to equalize them.

Sentences (sentence chunks) that are too short are padded with wildcard ("magic") words, i.e. all-zero vectors, which will be filtered out during the learning process.

Sentences (sentence chunks) that are too long are simply cut to length.

In [8]:
def _parse_sentence(sentence, vector_size):
  """Turn one sentence into a (n_kept_words, vector_size) float32 matrix.

  Every kept word becomes one row: its SpaCy word vector followed by its
  part-of-speech id divided by 100.
  """
  # filter out stop words (not relevant/useful)
  # 96 = punctuation char (->SpaCy documentation)
  # if a word does not have vector form filter it out (very, very rare case)
  rows = [
      np.append(token.vector, float(token.pos)/100)
      for token in nlp(sentence)
      if not token.is_stop and not token.pos == 96 and token.has_vector
  ]

  # a sentence without any kept word still needs the right number of columns
  if not rows:
    return np.empty((0, vector_size), np.float32)
  return np.array(rows, dtype=np.float32)


def create_dataset(root_dir, chunk_size=1, articles_per_author=100, verbose=True):
  """Build the fixed-size training samples from the articles below root_dir.

  Every sample ("chunk") covers chunk_size consecutive sentences of one
  article: sentences k .. k+chunk_size-1 for every possible start k. The kept
  words of these sentences are written one after another into the sample,
  which holds chunk_size*25 words. Extra words are cut off, unused word
  slots stay all-zero (the "magic" words).

  The number of samples is counted per article, so articles with fewer than
  chunk_size sentences (including empty ones) simply contribute no sample and
  authors may have different numbers of articles.

  Args:
    root_dir: directory passed on to load_raw_sentences.
    chunk_size: number of consecutive sentences per sample.
    articles_per_author: passed on to load_raw_sentences.
    verbose: if True, print a progress bar while parsing.

  Returns:
    A tuple (dataset, authors):
      dataset: structured array with the fields
        'input'  - float32, shape (chunk_size*25, 301): one row per word, the
                   word vector followed by the part-of-speech id / 100
        'output' - float32: the author id of the sample
      authors: list of author names as returned by load_raw_sentences.
  """
  # just for writing out fancy things
  if verbose:
    start_time = time.time()

  # the length of a chunk -
  # if the chunk of sentences is longer we cut the end, if shorter we leave 0s at the end (magic words)
  n_words_per_chunk = chunk_size*25
  # the length of the word vector
  word_repr_vector_size = 301

  # numpy array shape
  sentence_shape = (n_words_per_chunk, word_repr_vector_size)

  raw_sentences, n_sentences, authors = load_raw_sentences(root_dir, articles_per_author)

  # real number of articles (authors need not have the same amount)
  n_articles = sum(len(author_articles) for author_articles in raw_sentences)

  if verbose:
    print("Sentences loaded from files. Start parsing.")

  # An article with n sentences yields n-chunk_size+1 samples, because its last
  # sentences don't have enough following sentences - and none at all if it is
  # shorter than chunk_size. Counting per article keeps the array exactly as
  # long as the number of samples written below.
  n_chunks = sum(
      max(0, len(article) - chunk_size + 1)
      for author_articles in raw_sentences
      for article in author_articles)

  dataset = np.zeros([n_chunks], dtype=[('input', np.float32, sentence_shape), ('output', np.float32)])

  # index of the next sample to fill in dataset
  index = 0
  # number of articles handled so far (progress display only)
  n_done = 0

  for author_articles in raw_sentences:
    for article in author_articles:

      # an empty article has no author entry to read and yields no sample
      if article:
        # author is the same for every sentence
        author_id = article[0][0]

        # parse sentences in article
        # we do this in advance because this is the longest process and the overlapping creates huge redundance
        article_parsed = [_parse_sentence(entry[1], word_repr_vector_size) for entry in article]

        # one sample per possible first sentence
        for first in range(len(article_parsed) - chunk_size + 1):
          dataset["output"][index] = author_id

          # next free word slot of the current sample
          cursor = 0
          for sentence_matrix in article_parsed[first:first + chunk_size]:
            room = n_words_per_chunk - cursor
            if room <= 0:
              break
            # copy as many words of this sentence as still fit
            words = sentence_matrix[:room]
            dataset["input"][index, cursor:cursor + len(words)] = words
            cursor += len(words)

          index += 1

      n_done += 1

      # just for writing out fancy things
      if verbose:
        s = str(n_done) + "/" + str(n_articles) + " articles parsed in " + str(round(time.time() - start_time)) + " seconds."

        cnt = int(float(n_done)/n_articles*50)

        sys.stdout.write('\r' + "Processing. [" + "="*cnt + ">" + " "*(50-cnt) + "] " + s)

  return dataset, authors

In [9]:
# Build the dataset from the merged C50 folder with chunks of 3 sentences
# (~10 min for chunk_size=3 with full database)
dataset, authors = create_dataset(root_dir="C50", chunk_size=3)

Sentences loaded from files. Start parsing.

In [10]:
# shape of the inputs: (number of samples, words per sample, values per word)
dataset["input"].shape

(98473, 75, 301)

In [19]:
# np.save does not create missing directories, so make sure "data" exists first
if not os.path.isdir("data"):
  os.makedirs("data")
# store the author names (index = author id);
# np.save appends ".npy" to the file name because it does not end with it
np.save("data/authors_chunk3.serialized", authors)

In [17]:
# np.save does not create missing directories, so make sure "data" exists first
if not os.path.isdir("data"):
  os.makedirs("data")
# store the parsed samples;
# np.save appends ".npy" to the file name because it does not end with it
np.save("data/dataset_chunk3.serialized", dataset)