In [1]:
import pandas as pd
import spacy
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from google.colab import files
from sklearn import model_selection
from tensorflow import keras
from tensorflow.keras import layers, models
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
df = pd.read_json('https://raw.githubusercontent.com/triantafillu/Bootcamp-Repository-Language-2/main/data/ne_data.json')
df.head()

Unnamed: 0,title,author,year,text,themes
0,Body and Soul II,Charles Wright,2002,(for Coleman Hawkins)\nThe structure of landsc...,[audio & music]
1,Novel,Arthur Rimbaud,2002,I.\n\nNo one's serious at seventeen.\n—On beau...,[lgbtq]
2,Flying,Sarah Arvio,2002,One said to me tonight or was it day \nor was ...,[dreams]
3,Photograph of People Dancing in France,Leslie Adrienne Miller,2002,It's true that you don't know them--nor do I \...,[arts & sciences]
4,War Photograph,Kate Daniels,2002,A naked child is running\nalong the path towar...,"[violence, arts & sciences, war]"


# Preprocessing

In [3]:
df.shape

(16583, 5)

In [4]:
df = df.dropna()

In [5]:
themes_encoder = MultiLabelBinarizer()
y = themes_encoder.fit_transform(df['themes'])
df['themes_encoded'] = [x for x in y]

In [6]:
df['themes_encoded'][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0])

In [7]:
df['full_text'] = df.apply(lambda row: row.title + " " + row.text, axis = 1)
df.head()

Unnamed: 0,title,author,year,text,themes,themes_encoded,full_text
0,Body and Soul II,Charles Wright,2002,(for Coleman Hawkins)\nThe structure of landsc...,[audio & music],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Body and Soul II (for Coleman Hawkins)\nThe st...
1,Novel,Arthur Rimbaud,2002,I.\n\nNo one's serious at seventeen.\n—On beau...,[lgbtq],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Novel I.\n\nNo one's serious at seventeen.\n—O...
2,Flying,Sarah Arvio,2002,One said to me tonight or was it day \nor was ...,[dreams],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Flying One said to me tonight or was it day \n...
3,Photograph of People Dancing in France,Leslie Adrienne Miller,2002,It's true that you don't know them--nor do I \...,[arts & sciences],"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Photograph of People Dancing in France It's tr...
4,War Photograph,Kate Daniels,2002,A naked child is running\nalong the path towar...,"[violence, arts & sciences, war]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",War Photograph A naked child is running\nalong...


In [8]:
df.drop(['title', 'text', 'themes'], axis=1, inplace=True)
df.head()

Unnamed: 0,author,year,themes_encoded,full_text
0,Charles Wright,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Body and Soul II (for Coleman Hawkins)\nThe st...
1,Arthur Rimbaud,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Novel I.\n\nNo one's serious at seventeen.\n—O...
2,Sarah Arvio,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Flying One said to me tonight or was it day \n...
3,Leslie Adrienne Miller,2002,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Photograph of People Dancing in France It's tr...
4,Kate Daniels,2002,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",War Photograph A naked child is running\nalong...


In [9]:
labelencoder = LabelEncoder()
df['author'] = labelencoder.fit_transform(df['author'])
df.head()

Unnamed: 0,author,year,themes_encoded,full_text
0,593,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Body and Soul II (for Coleman Hawkins)\nThe st...
1,305,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Novel I.\n\nNo one's serious at seventeen.\n—O...
2,3416,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Flying One said to me tonight or was it day \n...
3,2315,2002,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Photograph of People Dancing in France It's tr...
4,2082,2002,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",War Photograph A naked child is running\nalong...


In [10]:
# Decontraction
def full_form(word):
    if word == "nt": word = 'not'
    if word == "re": word = 'be'
    if word == "d": word = 'would'
    if word == "m": word = 'am'
    if word == "s": word = 'be'
    if word == "ve": word = 'have'
    return word

def preprocessing(text):
  tokenizer = RegexpTokenizer(r'\w+')
  text = tokenizer.tokenize(text)
  stop_words = set(stopwords.words('english'))
  cleaned_text = []
  for word in text:
    if word not in stop_words:
      cleaned_text.append(word)
  wnl = WordNetLemmatizer()
  text = [wnl.lemmatize(token) for token in cleaned_text]
  text = [full_form(w).lower() for w in text]

  return text
  

In [11]:
df['full_text'] = df['full_text'].apply(preprocessing)
df['full_text']

0        [body, soul, ii, coleman, hawkins, the, struct...
1        [novel, i, no, one, serious, seventeen, on, be...
2        [flying, one, said, tonight, day, passage, two...
3        [photograph, people, dancing, france, it, true...
4        [war, photograph, a, naked, child, running, al...
                               ...                        
17069    [you, can, buy, shoes, painting, you, even, bu...
17070    [you, people, people, ask, shoe, the, valley, ...
17071    [you, that, i, loved, you, i, loved, life, lon...
17072    [your, clothes, of, course, empty, shell, with...
17073    [your, luck, is, about, to, change, ominous, i...
Name: full_text, Length: 16583, dtype: object

In [12]:
texts_len = df['full_text'].apply(len)
df.drop(df[texts_len<50].index, inplace=True)
df.drop(df[texts_len>150].index, inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
tokenizer = Tokenizer(num_words=3000)

tokenizer.fit_on_texts(df['full_text'])

# Encode training data sentences into sequences
df['full_text'] = tokenizer.texts_to_sequences(df['full_text'])

df['full_text']

2        [935, 5, 56, 774, 8, 1854, 64, 11, 254, 214, 1...
4        [293, 1409, 6, 745, 65, 675, 247, 550, 267, 9,...
5        [29, 29, 146, 544, 321, 157, 1250, 1941, 2532,...
6        [30, 120, 599, 301, 225, 24, 2533, 1877, 2802,...
8        [1533, 8, 27, 1545, 328, 390, 898, 176, 44, 21...
                               ...                        
17065    [445, 213, 201, 1034, 48, 1464, 1170, 129, 291...
17067    [968, 1, 1763, 26, 403, 697, 21, 2854, 80, 152...
17069    [19, 619, 1101, 1006, 19, 51, 1101, 19, 23, 37...
17071    [19, 43, 1, 360, 19, 1, 360, 24, 42, 5, 19, 1,...
17073    [242, 1518, 220, 1512, 27, 379, 1919, 866, 103...
Name: full_text, Length: 9733, dtype: object

In [14]:
# Get max training sequence length
maxlen = 150 #max([len(x) for x in df['full_text']])

# Pad the training sequences
padded = pad_sequences(df['full_text'], padding='post', truncating='post', maxlen=maxlen)

padded

array([[ 935,    5,   56, ...,    0,    0,    0],
       [ 293, 1409,    6, ...,    0,    0,    0],
       [  29,   29,  146, ...,    0,    0,    0],
       ...,
       [  19,  619, 1101, ...,    0,    0,    0],
       [  19,   43,    1, ...,    0,    0,    0],
       [ 242, 1518,  220, ...,    0,    0,    0]], dtype=int32)

In [15]:
df['full_text'] = [x for x in padded]

df['full_text']

2        [935, 5, 56, 774, 8, 1854, 64, 11, 254, 214, 1...
4        [293, 1409, 6, 745, 65, 675, 247, 550, 267, 9,...
5        [29, 29, 146, 544, 321, 157, 1250, 1941, 2532,...
6        [30, 120, 599, 301, 225, 24, 2533, 1877, 2802,...
8        [1533, 8, 27, 1545, 328, 390, 898, 176, 44, 21...
                               ...                        
17065    [445, 213, 201, 1034, 48, 1464, 1170, 129, 291...
17067    [968, 1, 1763, 26, 403, 697, 21, 2854, 80, 152...
17069    [19, 619, 1101, 1006, 19, 51, 1101, 19, 23, 37...
17071    [19, 43, 1, 360, 19, 1, 360, 24, 42, 5, 19, 1,...
17073    [242, 1518, 220, 1512, 27, 379, 1919, 866, 103...
Name: full_text, Length: 9733, dtype: object

In [16]:
df.head()

Unnamed: 0,author,year,themes_encoded,full_text
2,3416,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[935, 5, 56, 774, 8, 1854, 64, 11, 254, 214, 1..."
4,2082,2002,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[293, 1409, 6, 745, 65, 675, 247, 550, 267, 9,..."
5,3289,2001,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[29, 29, 146, 544, 321, 157, 1250, 1941, 2532,..."
6,41,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[30, 120, 599, 301, 225, 24, 2533, 1877, 2802,..."
8,752,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1533, 8, 27, 1545, 328, 390, 898, 176, 44, 21..."


In [17]:
df.drop(df.columns[0], axis=1, inplace = True)
df.head()

Unnamed: 0,year,themes_encoded,full_text
2,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[935, 5, 56, 774, 8, 1854, 64, 11, 254, 214, 1..."
4,2002,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[293, 1409, 6, 745, 65, 675, 247, 550, 267, 9,..."
5,2001,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[29, 29, 146, 544, 321, 157, 1250, 1941, 2532,..."
6,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[30, 120, 599, 301, 225, 24, 2533, 1877, 2802,..."
8,2002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1533, 8, 27, 1545, 328, 390, 898, 176, 44, 21..."


In [18]:
def convert_to_decade(x):
  dec = x // 10
  res = dec * 10
  return res

df.year = df.year.apply(convert_to_decade)
df

Unnamed: 0,year,themes_encoded,full_text
2,2000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[935, 5, 56, 774, 8, 1854, 64, 11, 254, 214, 1..."
4,2000,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[293, 1409, 6, 745, 65, 675, 247, 550, 267, 9,..."
5,2000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[29, 29, 146, 544, 321, 157, 1250, 1941, 2532,..."
6,2000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[30, 120, 599, 301, 225, 24, 2533, 1877, 2802,..."
8,2000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1533, 8, 27, 1545, 328, 390, 898, 176, 44, 21..."
...,...,...,...
17065,1990,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[445, 213, 201, 1034, 48, 1464, 1170, 129, 291..."
17067,1990,"[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[968, 1, 1763, 26, 403, 697, 21, 2854, 80, 152..."
17069,1990,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[19, 619, 1101, 1006, 19, 51, 1101, 19, 23, 37..."
17071,2000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[19, 43, 1, 360, 19, 1, 360, 24, 42, 5, 19, 1,..."


# Model

In [19]:
X = np.array(df['full_text'].to_list())
Y = np.array(df['year'].to_list())

In [20]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.1, random_state=37)

In [117]:
max_features = 10000 
max_len = 150  
embedding_dim = 16
model = keras.models.Sequential([
  keras.layers.Embedding(input_dim=max_features,
                           output_dim=embedding_dim,
                           input_length=max_len),
  keras.layers.Flatten(),
  keras.layers.Dense(64,activation='relu'),
  keras.layers.Dense(32,activation='relu'),
  keras.layers.Dense(16,activation='relu'),
  keras.layers.Dense(8,activation='relu'),
  keras.layers.Dense(1, activation='relu')
])

model.build()
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['MAE'])
model.summary()

Model: "sequential_44"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_44 (Embedding)     (None, 150, 16)           160000    
_________________________________________________________________
flatten_44 (Flatten)         (None, 2400)              0         
_________________________________________________________________
dense_192 (Dense)            (None, 64)                153664    
_________________________________________________________________
dense_193 (Dense)            (None, 32)                2080      
_________________________________________________________________
dense_194 (Dense)            (None, 16)                528       
_________________________________________________________________
dense_195 (Dense)            (None, 8)                 136       
_________________________________________________________________
dense_196 (Dense)            (None, 1)               

In [118]:
model.fit(X_train, Y_train, epochs=10, batch_size = 64, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa1c2705c10>

In [119]:
score = model.evaluate(np.array(X_test), np.array(Y_test)) 

print("Test Score:", score2[0])
print("Test Accuracy:", score2[1])

Test Score: 2224.391357421875
Test Accuracy: 28.726938247680664


In [120]:
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [122]:
model.save('/drive/My Drive/Colab Notebooks/year_prediction.h5')