In [73]:
#load modules
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import SpatialDropout1D
from tensorflow.keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import math

In [9]:
# Upload files from the local machine into the Colab runtime through the browser file picker.
# `uploaded` holds whatever files.upload() returns for the selected files.
# NOTE(review): the recorded output of this cell shows the GloVe archive being uploaded;
# train.csv / test.csv presumably arrive the same way — confirm.
from google.colab import files
uploaded = files.upload()


Saving glove.6B.100d.txt.zip to glove.6B.100d.txt.zip


In [7]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data and GloVe paths used in the cells visible here all live directly
# under /content, not under /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
!unzip glove*.zip

Archive:  glove.6B.100d.txt.zip
   creating: glove.6B.100d.txt/
  inflating: glove.6B.100d.txt/glove.6B.100d.txt  


In [19]:
# Load data.
# train.csv provides the 'text' and 'author' columns used below.
df = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')
# The test frame used to be bound to the misleading name `df_train`; that name is kept
# as an alias so any cell still referring to it keeps working.
df_train = df_test

# Train-validation split: 80% / 20%, stratified on the author label so both splits keep
# the same class proportions; random_state fixes the shuffle for reproducibility.
x, y = df['text'].values, df['author'].values
x_train, x_valid, y_train, y_valid = train_test_split(x, y,
                                                      stratify = y,
                                                      random_state = 40,
                                                      test_size = 0.2,
                                                      shuffle = True)
# Show the first few raw sentences as the cell output.
x[:5]

array(['This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.',
       'It never once occurred to me that the fumbling might be a mere mistake.',
       'In his left hand was a gold snuff box, from which, as he capered down the hill, cutting all manner of fantastic steps, he took snuff incessantly with an air of the greatest possible self satisfaction.',
       'How lovely is spring As we looked from Windsor Terrace on the sixteen fertile counties spread beneath, speckled by happy cottages and wealthier towns, all looked as in former years, heart cheering and fair.',
       'Finding nothing else, not even gold, the Superintendent abandoned his attempts; but a perplexed look occasionally steals over his countenance as he sits thinking at his desk.'],
      dtype=object)

In [21]:
# Opt in to pandas' future (non-downcasting) replace() behaviour BEFORE calling replace():
# set afterwards, the option cannot suppress the warning raised by the calls below.
pd.set_option('future.no_silent_downcasting', True)

# Label encoding: author initials -> integer class ids.
label_dict = {'EAP': 0, 'HPL': 1, 'MWS': 2}
# With downcasting disabled, replace() leaves the Series as object dtype, so cast to
# int64 explicitly. The cast also fails loudly if a label outside label_dict survives.
# Re-running the cell is safe: already-encoded integers pass through unchanged.
y_train = pd.Series(y_train).replace(label_dict).astype('int64').values
y_valid = pd.Series(y_valid).replace(label_dict).astype('int64').values

# One-hot encoding: one column per class in label_dict.
y_train_matrix = to_categorical(y_train, num_classes=len(label_dict))
y_valid_matrix = to_categorical(y_valid, num_classes=len(label_dict))

In [26]:
# Load the GloVe (Global Vectors for Word Representation) embeddings into a dictionary
# that maps each word to its float32 coefficient vector.

embed_glove = {}
# Start with the smallest word vectors: each word has a 100-float representation.
# The context manager closes the file even if parsing a line raises, and the encoding
# is stated explicitly so the read does not depend on the platform default.
with open('/content/glove.6B.100d.txt/glove.6B.100d.txt', encoding='utf-8') as glove_6b_100d:
  for line in glove_6b_100d:
    # Each line is "<word> <c1> <c2> ... <cN>", separated by whitespace.
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embed_glove[word] = coefs
print(f"Number of word vectors: {len(embed_glove)}")

Number of word vectors: 400000


In [34]:
# Tokenization: fit one vocabulary (no size cap) over the sentences of both splits.
token = text.Tokenizer(num_words=None)
all_sentences = [*x_train, *x_valid]
token.fit_on_texts(all_sentences)
word_index = token.word_index

# Turn every sentence of each split into its list of integer word ids.
x_train_seq, x_valid_seq = (
    token.texts_to_sequences(split) for split in (x_train, x_valid)
)

# Example: first training sentence next to its encoded form.
print(f"Text: {x_train[0]}", "Converted to sequence:", x_train_seq[0], sep="\n")

Text: The archaic lanes and houses and unexpected bits of square and court had indeed delighted me, and when I found the poets and artists to be loud voiced pretenders whose quaintness is tinsel and whose lives are a denial of all that pure beauty which is poetry and art, I stayed on for love of these venerable things.
Converted to sequence:
[1, 3695, 4245, 3, 500, 3, 2545, 6589, 2, 1438, 3, 1943, 12, 156, 2678, 22, 3, 41, 6, 93, 1, 3103, 3, 5924, 4, 28, 1268, 10108, 16050, 131, 16051, 25, 12278, 3, 131, 1299, 56, 5, 5925, 2, 32, 9, 1176, 323, 18, 25, 5390, 3, 668, 6, 3938, 27, 17, 152, 2, 67, 2945, 139]


In [52]:
# Statistical analysis of the sequence lengths (train + validation) to see how spread
# out they are and to pick a padding length.
len_train = [len(seq) for seq in x_train_seq]
len_valid = [len(seq) for seq in x_valid_seq]
len_data = np.array(len_train + len_valid)
# Deliberately NOT named `max` / `min`: binding those names would shadow the Python
# builtins for every cell executed afterwards.
longest = len_data.max()
shortest = len_data.min()
mean = len_data.mean()
std = len_data.std()
print('max length: ' + str(longest))
print('min length: ' + str(shortest))
print('mean length: ' + str(mean))
print('std length: ' + str(std))
# Padding length = mean + 2 standard deviations (rounded up); the coverage actually
# achieved on this data is measured and printed below.
max_len = math.floor(mean + 2 * std) + 1
# Number of sequences that fit in max_len without truncation (vectorized count).
count = int((len_data <= max_len).sum())
print('count is: ' + str(count))
# `raport` = ratio of sequences that are fully covered by max_len.
raport = count / len(len_data)
print('raport is: ' + str(raport))


max length: 861
min length: 2
mean length: 26.71903570151693
std length: 19.061958508736083
count is: 18971
raport is: 0.9689463200367741


In [53]:
# Bring every sequence to exactly max_len ids: shorter ones get zeros prepended,
# longer ones lose their leading ids. Both splits share the same settings.
pad_options = dict(maxlen=max_len, padding='pre', truncating='pre', value=0.0)

x_train_pad = sequence.pad_sequences(x_train_seq, **pad_options)
x_valid_pad = sequence.pad_sequences(x_valid_seq, **pad_options)

# Build an M x N matrix, where M is the number of unique words in our dataset plus one (an extra row for index 0, which the padding uses) and N = 100 is the length of each word's vector representation.

In [64]:
vec_representational_dimension = 100
# One row per tokenizer index (plus row 0), initialised to zeros.
word_vectorization_matrix = np.zeros((len(word_index) + 1, vec_representational_dimension))
no_representation_words = []
# Row i receives the GloVe vector of the word whose tokenizer index is i;
# words GloVe does not know keep their all-zero row and are collected for reporting.
for word, i in word_index.items():
  if word in embed_glove:
    word_vectorization_matrix[i] = embed_glove[word]
    continue
  no_representation_words.append(word)

print(f"Shape of the matrix of word vectors: {word_vectorization_matrix.shape}")
print(f"{len(no_representation_words)} do not have a representation")
# Share of the vocabulary that has no GloVe vector.
raport = len(no_representation_words) / len(word_index)
print(f"{round(raport * 100)}% of the words do not have a representation")

Shape of the matrix of word vectors: (25944, 100)
3645 do not have a representation
14% of the words do not have a representation


#LSTM

In [77]:
# Take the embedding dimensions from the pre-built matrix itself, so the layer can never
# disagree with the weights it is initialised from (previously a repeated literal 100).
vocab_rows, embedding_dim = word_vectorization_matrix.shape

model = Sequential()
# Frozen embedding layer initialised with the pre-trained word vectors.
model.add(Embedding(vocab_rows,
                    embedding_dim,
                    weights = [word_vectorization_matrix],
                    trainable = False))

model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout = 0.3, recurrent_dropout = 0.3))

# Fully connected head.
# NOTE(review): Dropout's argument is the fraction of units that is DROPPED, so 0.8
# keeps only 20% of the activations during training — confirm this is intended.
model.add(Dense(512, activation = 'relu'))
model.add(Dropout(0.8))

model.add(Dense(256, activation = 'relu'))
model.add(Dropout(0.8))

# Three-way softmax output, one unit per author class.
model.add(Dense(3, activation = 'softmax'))
model.build(input_shape=(None, x_train_pad.shape[1]))  # batch size, sequence length

model.summary()

In [83]:
initial_learning_rate = 0.001
model.compile(
    loss='categorical_crossentropy',  # because of one-hot labels
    # Use the named constant instead of repeating the literal, so changing
    # initial_learning_rate actually changes the optimizer.
    optimizer=Adam(learning_rate=initial_learning_rate),
    metrics=['accuracy']
)
# Early stopping callback.
# Tracks validation loss. Monitoring only begins at epoch 60; after that, training stops
# once the loss has not improved by at least 0.001 for 10 epochs.
# NOTE(review): because of the 60-epoch warm-up plus the patience of 10, a fit() run with
# fewer epochs than that can never be stopped by this callback — confirm the epoch budget.
earlystop = EarlyStopping(monitor = 'val_loss',
                          min_delta = 0.001,
                          patience = 10,
                          verbose = 1,
                          mode = 'auto',
                          start_from_epoch = 60)


In [84]:
# Train on the padded training sequences and evaluate on the validation split after
# every epoch; `history` keeps the per-epoch metrics returned by fit().
# NOTE(review): earlystop is configured with start_from_epoch = 60, so it cannot fire
# within the 10 epochs requested here — confirm which of the two values is intended.
validation_pair = (x_valid_pad, y_valid_matrix)
history = model.fit(
    x=x_train_pad,
    y=y_train_matrix,
    batch_size=256,
    epochs=10,
    verbose=1,
    callbacks=[earlystop],
    validation_data=validation_pair,
)

Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 576ms/step - accuracy: 0.4845 - loss: 1.0200 - val_accuracy: 0.5638 - val_loss: 0.9325
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 509ms/step - accuracy: 0.5279 - loss: 0.9888 - val_accuracy: 0.5953 - val_loss: 0.8866
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 553ms/step - accuracy: 0.5494 - loss: 0.9663 - val_accuracy: 0.6233 - val_loss: 0.8798
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 530ms/step - accuracy: 0.5585 - loss: 0.9464 - val_accuracy: 0.6203 - val_loss: 0.8485
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 515ms/step - accuracy: 0.5776 - loss: 0.9254 - val_accuracy: 0.6310 - val_loss: 0.8484
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 522ms/step - accuracy: 0.5876 - loss: 0.9123 - val_accuracy: 0.6450 - val_loss: 0.8296
Epoch 7/10
[1m62/62[

In [88]:
# Write the model's current weights to 'model.weights.h5' (path relative to the working directory).
# NOTE(review): these are the weights as of the last epoch that ran, since earlystop does
# not set restore_best_weights — confirm that is the checkpoint you want.
model.save_weights('model.weights.h5')