In [1]:
import string, json, itertools, spacy
import pandas as pd
from tensorflow import keras
import numpy as np

from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
df = pd.read_json(open("final_dataset/merged_dset.json", "r", encoding="utf8"))
df = df.drop(['url'], axis=1)

In [3]:
# count quantity of entries by theme
theme_entry_count = {}
for entry in df.themes:
    for theme in entry:
        if theme in theme_entry_count.keys():
            theme_entry_count[theme] += 1
        else:
            theme_entry_count[theme] = 1

In [4]:
# remove the least frequent themes if needed
themes_to_remove = []
for key, value in theme_entry_count.items(): 
    if value < 100:
        themes_to_remove.append(key)
        
def remove_themes(themes): # TODO
    result = []
    for theme in themes:
        if theme not in themes_to_remove:
            result.append(theme)
    return result

# low correlated themes
themes_to_remove = themes_to_remove + ['blank verse', 'living', 'philosophy', 'relationships', 
                                       'social commentaries', 'social justice']

df.themes = df.themes.apply(remove_themes)

# remove entries without theme
df = df[df['themes'].str.len() != 0]

In [5]:
# get a set of themes
themes = set(itertools.chain.from_iterable(df.themes))

# themes encoding
themes = {list(themes)[i]: i for i in range(len(themes))}

def encode_themes(available_themes):
    result = [0,] * len(themes)
    for i in available_themes:
        result[themes[i]] = 1
    return result

df.themes = df.themes.apply(encode_themes)

In [6]:
len(themes)

93

In [7]:
# authors encoding
authors = df.author.unique()
authors = {list(authors)[i]: i for i in range(len(authors))}
df.author = [authors[row['author']]for index, row in df.iterrows()]

In [8]:
#  Remove punctuation
table = str.maketrans('', '', string.punctuation)
df['text'] = [row['text'].translate(table) for index, row in df.iterrows()]
df['title'] = [row['title'].translate(table) for index, row in df.iterrows()]
#  Remove stopwords
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in STOP_WORDS))

In [9]:
#  Lemmatization
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")

df['title'] = [
                [token.lemma_ for token in nlp(row['title'])]
                for index, row in df.iterrows()
             ]
df['text'] = [
                [token.lemma_ for token in nlp(row['text'])]
                for index, row in df.iterrows()
             ]

In [10]:
# remove too short texts
texts_len = df['text'].apply(len)
df.drop(df[texts_len<50].index, inplace=True)

In [11]:
max_features=10000

# tokenization
tok = keras.preprocessing.text.Tokenizer(
    num_words=max_features,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True)  
tok.fit_on_texts(df['text'])

In [12]:
# text to number sequences
df['text'] = tok.texts_to_sequences(df['text'])
df['title'] = tok.texts_to_sequences(df['title'])

In [13]:
df

Unnamed: 0,title,author,year,text,themes
0,"[45, 4, 141, 901]",0,2002,"[2, 2330, 1093, 5, 2330, 297, 9769, 1000, 415,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[2846],1,2002,"[1, 94, 106, 4382, 3, 144, 292, 33, 1422, 8790...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[209],2,2002,"[106, 20, 732, 21, 1470, 329, 240, 161, 421, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1210, 40, 113, 1985, 17, 2315]",3,2002,"[329, 304, 19, 70, 8, 1, 8, 1, 43, 1347, 1, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[258, 1210]",4,2002,"[10, 650, 76, 140, 544, 206, 679, 179, 107, 44...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...
17067,"[151, 17, 2, 5250]",168,1990,"[1, 399, 2514, 331, 545, 8931, 28, 2567, 93, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
17068,[],1172,1990,"[2, 596, 152, 153, 200, 277, 167, 4105, 3074, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
17070,"[15, 113]",3189,1990,"[113, 19, 70, 125, 629, 2, 797, 1, 99, 1130, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17071,"[15, 25, 1, 13]",3431,2005,"[15, 1, 13, 37, 36, 15, 1, 310, 174, 544, 38, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."


In [14]:
# export encoded data
data = df.to_json()
with open('../data/data.json', 'w') as f:
    f.write(data)
with open('../data/themes.json', 'w') as f:
    json.dump(themes, f)