In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import io
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, Dropout
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

## **Load Data:**

In [3]:
# Colab-only step: prompts for a file upload. The result is indexed by file name
# and its values are consumed as raw bytes by the next cell (key 'data.csv').
from google.colab import files
uploaded = files.upload()

Saving data.csv to data.csv


In [16]:
from sklearn.preprocessing import label_binarize

# Read only the two columns used below from the uploaded CSV bytes.
df = pd.read_csv(io.BytesIO(uploaded['data.csv']), usecols=['text', 'author'])

# Raw texts of the whole data set, in file order.
data_text = df['text'].values

# Transform labels into one hot encoded format (column order: EAP, HPL, MWS).
y_ohe = label_binarize(df['author'], classes=['EAP', 'HPL', 'MWS'])
# label_binarize encodes a label outside `classes` as an all-zero row without
# complaining, so verify that every row has exactly one hot column.
assert (y_ohe.sum(axis=1) == 1).all(), "found an author label outside EAP/HPL/MWS"
print('y_train_ohe shape: {}'.format(y_ohe.shape))
print('y_train_ohe samples:')
print(y_ohe[:5])

# split data to: train & test sets; 80%, 20% respectively
# (validation data is carved out of the training set by `validation_split` in model.fit)
X_train, X_test, y_train, y_test = train_test_split(data_text, y_ohe, test_size=0.2, random_state=1)

print("Train-set size: ", len(X_train))
print("Test-set size:  ", len(X_test))




y_train_ohe shape: (19579, 3)
y_train_ohe samples:
[[1 0 0]
 [0 1 0]
 [1 0 0]
 [0 0 1]
 [0 1 0]]
Train-set size:  15663
Test-set size:   3916


## Load the training- and test-sets.

# *Combine* into one data-set for some uses below.

Print an example from the training-set to see that the data looks correct.

# Tokenizer


In [5]:
num_words = 40000

tokenizer = Tokenizer(num_words=num_words)

# NOTE(review): the vocabulary is fitted on every text (train and test alike);
# fit on X_train only if the test set must stay completely unseen.
tokenizer.fit_on_texts(data_text)


# set num_words vocabulary size to all words
if num_words is None:
    # Word indices start at 1 (0 is never assigned to a word), so the largest
    # index equals len(word_index); anything sized by num_words, such as the
    # Embedding input_dim, needs one extra slot.
    num_words = len(tokenizer.word_index) + 1

# see a sample of the word token dictionary (the full mapping holds one entry
# per distinct word and would flood the cell output)
print("word token dictionary: " + str(dict(list(tokenizer.word_index.items())[:20])) + '\n')


# convert all texts in the training-set & test-set to lists of these tokens
x_train_tokens = tokenizer.texts_to_sequences(X_train)
x_test_tokens = tokenizer.texts_to_sequences(X_test)


# example:
print("x_train_text exmple: " + str(X_train[1]) + '\n')
print("array x_train_tokens exmple: " + str(np.array(x_train_tokens[1])))



x_train_text exmple: IN the autumn of this year , the spirit of emigration crept in among the few survivors, who, congregating from various parts of England, met in London.

array x_train_tokens exmple: [    7     1  1856     2    26   519     1   248     2  8914  2591     7
   180     1   156  4397    61 16769    23   570  1573     2   440   545
     7   568]


# Padding and Truncating Data

2.   Now we'll prepare the text for Word2Vec modeling; this will hopefully give us better results later on.



In [0]:
#@title
def clean_text_for_word_2_vec(docs):
  """
  Turn raw documents into tokenised sentences for Word2Vec training.

  Every document is passed through the spaCy pipeline `nlp` (used instead of
  NLTK), lemmatized and lower-cased, split into sentences wherever '.', '?',
  '!' or ';' is followed by a space, stripped of punctuation and finally split
  into words.

  docs: sequence of text strings.
  returns: list of sentences from all documents, each a list of word strings.
  """
  import re  # local import: the notebook's import cell does not provide `re`

  # Raw strings: "\." inside a plain literal is an invalid escape sequence.
  # Compiled once here instead of once per document.
  sentence_boundary = re.compile(r"[\.?!;] ")
  punctuation = re.compile(r"[\.,;:!?]")

  sentences = []
  print(len(docs))
  if len(docs) > 0:  # an empty corpus has no first document to preview
    print(docs[0])
  for doc in tqdm(docs, desc='Preparing Docs for Word2Vec:'):
    doc = nlp(doc, disable=['tagger']) # disabled tagger to keep pronouns
    doc = " ".join([tok.lemma_.lower() for tok in doc]) # lemmatized & lowercased
    doc = sentence_boundary.split(doc) # Split to sentences
    doc = [punctuation.sub("", sent) for sent in doc] # remove punctuation
    doc = [sent.split() for sent in doc]

    sentences += doc

  return sentences
  
# `all_text` is never defined in this notebook; `data_text` holds every text of
# the data set (df['text'].values), which matches the 19579 documents this cell
# reported when it was last run.
cleaned_text_for_word_2_vec = clean_text_for_word_2_vec(data_text)
# 300-dimensional skip-gram (sg=1) vectors, context window of 5, words seen
# fewer than 3 times are ignored.
word2vec_model = Word2Vec(cleaned_text_for_word_2_vec, size=300, window=5, min_count=3, workers=4, sg=1)




19579
This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.


HBox(children=(IntProgress(value=0, description='Preparing Docs for Word2Vec:', max=19579, style=ProgressStyle…

> *   Word2Vec cloud visualization:

In [0]:
#@title
def tsne_plot(model):
    """Project the Word2Vec vocabulary to 2-D with t-SNE and plot it.

    model: trained Word2Vec model; every word in `model.wv.vocab` becomes one
        annotated point of the scatter plot.
    """
    labels = []
    tokens = []

    for word in tqdm(model.wv.vocab):
        # Look the vector up on the KeyedVectors (`wv`), the same object the
        # vocabulary comes from; indexing the model itself is deprecated.
        tokens.append(model.wv[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    # Hand TSNE one (n_words, dim) array instead of a list of separate vectors.
    new_values = tsne_model.fit_transform(np.array(tokens))

    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]

    plt.figure(figsize=(16, 16))
    for i in tqdm(range(len(x))):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()
    
# Draw the 2-D t-SNE map for the vocabulary of `word2vec_model`.
tsne_plot(word2vec_model)

## Calculating the average word vector (300 dimensions) of every text in the data

In [0]:
#@title
# Replace the stop-word source (presumably nltk.corpus.stopwords - its import is
# not part of this notebook) by the plain English word list, exactly once.
# On a re-run of this cell `stopwords` already is that list and has no `.words()`,
# so the unguarded rebinding would raise AttributeError.
if hasattr(stopwords, 'words'):
  stopwords = stopwords.words('english')

def average_vec_from_word2vec(doc):
  """
  Mean Word2Vec vector of one tokenised document.

  Words contained in the global `stopwords` collection, or missing from the
  vocabulary of the global `word2vec_model`, are skipped.

  doc: iterable of word strings.
  returns: 1-D array with one entry per embedding dimension; all zeros when no
    word of `doc` contributes.
  """
  keyed_vectors = word2vec_model.wv
  # Size the accumulator from the model rather than repeating the literal 300.
  total = np.zeros((keyed_vectors.vector_size,), dtype='float32')
  used_words = 0.  # named differently from the notebook-level `num_words` on purpose

  for word in doc:
    if word not in stopwords and word in keyed_vectors.vocab:
      # Index the KeyedVectors: `word2vec_model[word]` is deprecated in gensim.
      total = np.add(total, keyed_vectors[word])
      used_words += 1.
  if used_words != 0.:
    total = np.divide(total, used_words)

  return total
    
  
# One 300-wide float32 row per row of `df`, filled below with the averaged
# word vector of the matching document.
# NOTE(review): `cleaned_text` is not defined anywhere in this notebook; it is
# presumably one list of words per row of `df` (not the per-sentence output of
# clean_text_for_word_2_vec) - confirm and define it before this cell.
cleaned_text_vec = np.zeros((df.shape[0], 300), dtype="float32")
for i in range(len(cleaned_text)):
    cleaned_text_vec[i] = average_vec_from_word2vec(cleaned_text[i])
    
print("word vector shape:", cleaned_text_vec.shape)    

  if __name__ == '__main__':


word vector shape: (19579, 300)


In [6]:
# 'pre' is used for both padding and truncating in the pad_sequences calls below
pad = 'pre'

# number of tokens in every sequence, train and test together
num_tokens = np.array([len(sequence) for sequence in x_train_tokens + x_test_tokens])

# longest sequence in the data
print(np.max(num_tokens))

# cut-off length: mean + 2 standard deviations, truncated to an integer
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))

# share of sequences that are shorter than the cut-off
print("precente of tokens under max: " + str(100 * np.sum(num_tokens < max_tokens) / len(num_tokens)) + "%")

# padding data

x_train_pad = pad_sequences(x_train_tokens,
                            maxlen=max_tokens,
                            padding=pad,
                            truncating=pad)
x_test_pad = pad_sequences(x_test_tokens,
                           maxlen=max_tokens,
                           padding=pad,
                           truncating=pad)

print("x_train_pad shape: " + str(x_train_pad.shape))
print("x_test_pad shape: " + str(x_test_pad.shape))


861
precente of tokens under max: 96.4962459778334%
x_train_pad shape: (15663, 64)
x_test_pad shape: (3916, 64)


# Helper function: convert tokens back to words

In [0]:
# word -> token mapping built by the tokenizer
idx = tokenizer.word_index
# flipped into token -> word for decoding
inverse_map = {token: word for word, token in idx.items()}

def tokens_to_string(tokens):
    """Decode a token sequence into a space-separated string of words.

    Tokens equal to 0 are skipped; every other token is looked up in the
    notebook-level `inverse_map`.
    """
    return " ".join(inverse_map[token_id] for token_id in tokens if token_id != 0)

# The RNN Model

In [0]:
model = Sequential()

# width of the learned word embedding
embedding_size = 8

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

# stacked GRUs: all but the last one return the full sequence so the next
# recurrent layer receives one vector per time step
model.add(GRU(units=16, return_sequences=True))
model.add(Dropout(0.5))

model.add(GRU(units=8, return_sequences=True))
model.add(Dropout(0.5))

model.add(GRU(units=300))
model.add(Dropout(0.5))

# One output per author. The labels are one-hot and mutually exclusive, so the
# outputs must form a probability distribution: softmax, not relu (relu is
# unbounded and can emit exact zeros, which breaks a cross-entropy loss).
model.add(Dense(3, activation='softmax'))

# `keep_prob` is not an Adam argument and raises a TypeError; 1e-3 is the
# learning rate. `lr` is the spelling accepted by this Keras generation.
optimizer = Adam(lr=1e-3)

# categorical cross-entropy is the loss that matches softmax + one-hot labels
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.summary()

# Training session

In [0]:
%%time
model.fit(x_train_pad, y_train,validation_split=0.05, epochs=10, batch_size=64)
result = model.evaluate(x_test_pad, y_test)

print("Accuracy: {0:.2%}".format(result[1]))

# Mis-Classified Text

In [14]:
%%time
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

cls_true = np.array(y_test[0:1000])

incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]

len(incorrect)

idx = incorrect[0]
idx

text = X_test[idx]
text

y_pred[idx]

cls_true[idx]

CPU times: user 5.46 s, sys: 439 ms, total: 5.9 s
Wall time: 3.19 s


  
