In [1]:
import pandas as pd
import spacy
from sklearn import model_selection
import numpy as np


from tensorflow import keras
from tensorflow.keras import layers, models

In [2]:
df = pd.read_json(open("../data/data.json", "r", encoding="utf8"))
df.head()

Unnamed: 0,title,url,author,year,text,themes
0,"[32, 448, 99, 844]",https://poets.org/poem/body-and-soul-ii,601,2002,"[2315, 1039, 2, 2315, 257, 936, 143, 442, 60, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,[2719],https://poets.org/poem/novel,310,2002,"[298, 4288, 1, 2425, 263, 18, 1330, 570, 428, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,[172],https://poets.org/poem/flying,3477,2002,"[13, 668, 11, 1396, 208, 131, 394, 12, 2186, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[1158, 2799, 93, 288, 1104, 2212]",https://poets.org/poem/photograph-people-danci...,2360,2002,"[269, 19, 54, 5, 5, 28, 1282, 2002, 3387, 47, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[230, 1158]",https://poets.org/poem/war-photograph,2124,2002,"[589, 53, 112, 495, 156, 610, 148, 85, 27, 43,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [3]:
# drop texts with length < 50
texts_len = df['text'].apply(len)
df.drop(df[texts_len<50].index, inplace=True)

In [4]:
max_features = 5000  # maximum number of words in vocabulari 5000
max_len = 150  # max length of string
output_dim =100 

In [5]:
# join title with text
joined_text = df['title'] + df['text']
X = keras.preprocessing.sequence.pad_sequences(list(joined_text), maxlen=max_len, padding='post')
Y = np.array(df['themes'].to_list())

In [6]:
X.shape

(11416, 150)

In [7]:
Y.shape

(11416, 153)

In [8]:
# split dataset
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=42)

### First model - LSTM used

In [9]:
model = keras.Sequential([
    layers.Embedding(max_features, output_dim, input_length=max_len),
    layers.SpatialDropout1D(0.2),
    layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(153, activation='sigmoid')
])

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
epochs = 5
batch_size = 128

early_stopping_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[early_stopping_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
result = model.predict(X_test[8])



In [13]:
result.shape

(150, 153)

In [14]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 5.484
  Accuracy: 0.104


In [15]:
X_test[7]

array([ 262,    4,   30, 4950,    4,  196, 1218,  651, 1218,  788,   21,
       1896,  151,  765, 2111, 2111,  257,  206,  410,   10,   98, 1092,
         10,   98, 2117, 1523,   10,   98,  128,   10,   98,  128,  161,
         14,  202,    3,   14, 3471,  114, 1405,  905,   95,  359,  468,
         78,    4,  209,  366,   58,    4,  123,   30,    4, 2119,  651,
       1218,  651,    4,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], d

In [16]:
inputs = keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, output_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(153, activation='sigmoid', name="predictions")(x)

model2 = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [None]:
model2.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
accr = model2.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
result = model.predict(X_test[8])

In [None]:
result.shape