In [1]:
import pandas as pd
import spacy
from sklearn import model_selection
import numpy as np


from tensorflow import keras
from tensorflow.keras import layers, models

In [2]:
df = pd.read_json(open("../../data/data.json", "r", encoding="utf8"))

In [3]:
df

Unnamed: 0,title,url,author,year,text,themes
0,"[28, 508, 152, 720]",https://poets.org/poem/body-and-soul-ii,602,2002,"[2221, 1012, 2, 2221, 273, 902, 128, 472, 57, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,[2668],https://poets.org/poem/novel,311,2002,"[286, 3904, 1, 2778, 249, 19, 1268, 643, 410, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,[178],https://poets.org/poem/flying,3478,2002,"[12, 602, 11, 1574, 197, 113, 402, 13, 2428, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[1113, 2862, 88, 289, 1171, 2335]",https://poets.org/poem/photograph-people-danci...,2361,2002,"[267, 16, 35, 5, 5, 27, 1229, 1823, 3196, 48, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[246, 1113]",https://poets.org/poem/war-photograph,2125,2002,"[596, 50, 124, 467, 163, 612, 133, 77, 29, 44,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
17069,"[66, 148, 466, 506, 1171, 512, 1513]",https://www.poetryfoundation.org/poetrymagazin...,1826,1990,"[148, 35, 466, 4186, 26, 43, 3285, 278, 241, 5...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17070,"[66, 88]",https://www.poetryfoundation.org/poetrymagazin...,2883,1990,"[88, 16, 35, 90, 506, 733, 68, 959, 397, 253, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17071,"[66, 180, 8, 9]",https://www.poetryfoundation.org/poetrymagazin...,5,2005,"[9, 24, 22, 299, 140, 467, 25, 299, 995, 81, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17072,"[3966, 549]",https://www.poetryfoundation.org/poetrymagazin...,2043,1990,"[465, 569, 218, 465, 381, 190, 54, 549]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [4]:
# one hot encoding for authors
def encode_authors(author_code):
    qty = df.author.max()
    result = [0] * (qty + 1)
    result[author_code] = 1
    return result

df.author = df.author.apply(encode_authors)

In [5]:
max_features = 10000  # maximum number of words in vocabulari 5000
max_len = 150  # max length of string

In [6]:
df['joined_text'] = df['text'] + df['title']
df['X2'] = df['themes'] + df['author'] 
train_df, test_df = model_selection.train_test_split(df, test_size=0.1, random_state=42)
X1_train = keras.preprocessing.sequence.pad_sequences(train_df['joined_text'].to_list(), maxlen=max_len, padding='post')
X2_train = np.stack(train_df['X2'])
Y_train = np.stack(train_df['year'])
X1_test = keras.preprocessing.sequence.pad_sequences(list(test_df['joined_text']), maxlen=max_len, padding='post')
X2_test = np.stack(test_df['X2'])
Y_test = np.stack(test_df['year'])

### Model 

In [7]:
embedding_dim =64
model1 = keras.models.Sequential([
  keras.layers.Embedding(input_dim=max_features,
                           output_dim=embedding_dim,
                           input_length=max_len),
  keras.layers.Flatten(),
  keras.layers.Dense(2000,activation='relu'),
  keras.layers.Dense(500,activation='relu'),
  keras.layers.Dense(100,activation='relu'),
  keras.layers.Dense(1, activation='relu')
])

model1.compile(optimizer='nadam',
              loss='mean_squared_error',
              metrics=['MAE'])

model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 64)           640000    
_________________________________________________________________
flatten (Flatten)            (None, 9600)              0         
_________________________________________________________________
dense (Dense)                (None, 2000)              19202000  
_________________________________________________________________
dense_1 (Dense)              (None, 500)               1000500   
_________________________________________________________________
dense_2 (Dense)              (None, 100)               50100     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 20,892,701
Trainable params: 20,892,701
Non-trainable params: 0
____________________________________________

In [19]:
epochs = 5
model1.fit(X1_train, Y_train,
          #batch_size=128,
          validation_data=(X1_test,Y_test),
          epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff03472a9b0>

In [20]:
score1 = model1.evaluate(np.array(X1_test), np.array(Y_test)) 

print("Test Score:", score1[0])
print("Test Accuracy:", score1[1])

Test Score: 3638.696533203125
Test Accuracy: 39.90259552001953


In [23]:
model1.save('../year_prediction_model.h5')

In [13]:
model2 = keras.models.Sequential([
  keras.layers.Embedding(input_dim=2,
                           output_dim=8,
                           input_length=len(X2_train[0])),
  keras.layers.Flatten(),
  keras.layers.Dense(2000,activation='relu'),
  keras.layers.Dense(500,activation='relu'),
  keras.layers.Dense(100,activation='relu'),
  keras.layers.Dense(1, activation='relu')
])

model2.build()

In [14]:
model2.compile(optimizer='nadam',
              loss='mean_squared_error',
              metrics=['MAE'])

In [15]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4285, 8)           16        
_________________________________________________________________
flatten_1 (Flatten)          (None, 34280)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 2000)              68562000  
_________________________________________________________________
dense_5 (Dense)              (None, 500)               1000500   
_________________________________________________________________
dense_6 (Dense)              (None, 100)               50100     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 69,612,717
Trainable params: 69,612,717
Non-trainable params: 0
__________________________________________

In [24]:
model2.fit(X2_train, Y_train, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff034674ac8>

In [25]:
score2 = model2.evaluate(np.array(X2_test), np.array(Y_test)) 

print("Test Score:", score2[0])
print("Test Accuracy:", score2[1])

Test Score: 2001.484619140625
Test Accuracy: 31.37334442138672


In [None]:
y_pred1 = model1.predict(X1_test)
y_pred2 = model2.predict(X2_test)
result = (y_pred1 + y_pred2)/2

In [None]:
keras.metrics.mean_absolute_error(
    y_pred1.flatten(), y_pred2.flatten()
)

### Multi-input model

In [None]:
text_input = keras.Input(shape=(max_len,))
categorical_input = keras.Input(shape=(len(X2_train[0]),))

text_embedding = layers.Embedding(max_features, 64)(text_input)
categorical_embedding = layers.Embedding(2, 8)(categorical_input)

flat_text = layers.Flatten()(text_embedding)
flat_categories = layers.Flatten()(categorical_embedding)

concatenated = keras.layers.Concatenate()([flat_text, flat_categories])

dense1 = keras.layers.Dense(2000, activation='relu', )(concatenated)
dense2 = keras.layers.Dense(500, activation='relu', )(dense1)
dense3 = keras.layers.Dense(100, activation='relu', )(dense2)
out = keras.layers.Dense(1, activation='relu', )(dense3)


united_model = keras.Model(inputs=[text_input, categorical_input], outputs=out)


In [None]:
united_model.compile(optimizer='nadam',
              loss='mean_squared_error',
              metrics=['MAE'])
united_model.summary()

In [None]:
united_model.fit([X1_train, X2_train], Y_train, epochs=5, validation_split=0.1)

In [None]:
score3 = united_model.evaluate([np.array(X1_test), np.array(X2_test)], np.array(Y_test)) 

print("Test Score:", score3[0])
print("Test Accuracy:", score3[1])

In [None]:
output_dim = 100

inputs = keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, output_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(1, activation='relu', name="predictions")(x)

model3 = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model3.compile(loss="mean_squared_error", optimizer="adam", metrics=["MAE"])


In [None]:
model3.summary()

In [None]:
batch_size = 64
epochs = 5
model3.fit(X1_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)

In [None]:
accr = model3.evaluate(X1_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))