In [1]:
import string, json, itertools, spacy
import pandas as pd
from tensorflow import keras
import numpy as np

from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
df = pd.read_json(open("final_dataset/merged_dset.json", "r", encoding="utf8"))
df = df.drop(['url'], axis=1)

In [3]:
# count quantity of entries by theme
theme_entry_count = {}
for entry in df.themes:
    for theme in entry:
        if theme in theme_entry_count.keys():
            theme_entry_count[theme] += 1
        else:
            theme_entry_count[theme] = 1

In [4]:
# remove the least frequent themes if needed
themes_to_remove = []
for key, value in theme_entry_count.items(): 
    if value < 100:
        themes_to_remove.append(key)
        
def remove_themes(themes): # TODO
    result = []
    for theme in themes:
        if theme not in themes_to_remove:
            result.append(theme)
    return result

df.themes = df.themes.apply(remove_themes)

# remove entries without theme
df = df[df['themes'].str.len() != 0]

In [5]:
# get a set of themes
themes = set(itertools.chain.from_iterable(df.themes))

# themes encoding
themes = {list(themes)[i]: i for i in range(len(themes))}

def encode_themes(available_themes):
    result = [0,] * len(themes)
    for i in available_themes:
        result[themes[i]] = 1
    return result

df.themes = df.themes.apply(encode_themes)

In [6]:
# authors encoding
authors = df.author.unique()
authors = {list(authors)[i]: i for i in range(len(authors))}
df.author = [authors[row['author']]for index, row in df.iterrows()]

In [7]:
#  Remove punctuation
table = str.maketrans('', '', string.punctuation)
df['text'] = [row['text'].translate(table) for index, row in df.iterrows()]
df['title'] = [row['title'].translate(table) for index, row in df.iterrows()]
#  Remove stopwords
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in STOP_WORDS))

print(df.head())

                   title  author  year  \
0       Body and Soul II       0  2002   
1                  Novel       1  2002   
2                 Flying       2  2002   
4         War Photograph       3  2002   
5  Interlude Still Still       4  2001   

                                                text  \
0  Coleman Hawkins The structure landscape infini...   
1  I No ones seventeen —On beautiful nights beer ...   
2  One said tonight day passage Its hard remember...   
4  A naked child running path arms stretched mout...   
5  Inside hole yellow boy dropped quarter guitar ...   

                                              themes  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...  
5  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [8]:
#  Lemmatization
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")

df['title'] = [
                [token.lemma_ for token in nlp(row['title'])]
                for index, row in df.iterrows()
             ]
df['text'] = [
                [token.lemma_ for token in nlp(row['text'])]
                for index, row in df.iterrows()
             ]
df

Unnamed: 0,title,author,year,text,themes
0,"[body, and, Soul, II]",0,2002,"[Coleman, Hawkins, the, structure, landscape, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[novel],1,2002,"[I, no, one, seventeen, —, on, beautiful, nigh...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[fly],2,2002,"[one, say, tonight, day, passage, its, hard, r...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[War, Photograph]",3,2002,"[a, naked, child, run, path, arm, stretch, mou...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[Interlude, still, still]",4,2001,"[inside, hole, yellow, boy, drop, quarter, gui...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
17069,"[you, ca, nt, buy, shoe, in, a, painting]",1004,1990,"[you, ca, nt, buy, soda, you, thing, mother, s...","[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17070,"[you, People]",3182,1990,"[People, do, nt, ask, shoe, the, valley, I, wa...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17071,"[you, that, I, love]",3425,2005,"[you, I, love, life, long, you, I, follow, lin...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17072,"[your, clothe]",4009,1990,"[of, course, shell, hope, animation, of, cours...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# Decontraction
def full_form(word):
    if word == "nt": word = 'not'
    if word == "re": word = 'be'
    if word == "d": word = 'would'
    if word == "m": word = 'am'
    if word == "s": word = 'be'
    if word == "ve": word = 'have'
    return word

df['text'] = [[full_form(w) for w in row['text']] for index, row in df.iterrows()]
df['title'] = [[full_form(w) for w in row['title']] for index, row in df.iterrows()]

df['text']

0        [Coleman, Hawkins, the, structure, landscape, ...
1        [I, no, one, seventeen, —, on, beautiful, nigh...
2        [one, say, tonight, day, passage, its, hard, r...
4        [a, naked, child, run, path, arm, stretch, mou...
5        [inside, hole, yellow, boy, drop, quarter, gui...
                               ...                        
17069    [you, ca, not, buy, soda, you, thing, mother, ...
17070    [People, do, not, ask, shoe, the, valley, I, w...
17071    [you, I, love, life, long, you, I, follow, lin...
17072    [of, course, shell, hope, animation, of, cours...
17073    [ominous, inscrutable, chinese, news, Christma...
Name: text, Length: 16515, dtype: object

In [9]:
max_features=10000

# tokenization
tok = keras.preprocessing.text.Tokenizer(
    num_words=max_features,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True)  
tok.fit_on_texts(df['text'])

In [10]:
# text to number sequences
df['text'] = tok.texts_to_sequences(df['text'])
df['title'] = tok.texts_to_sequences(df['title'])

In [11]:
df.head()

Unnamed: 0,title,author,year,text,themes
0,"[45, 4, 141, 903]",0,2002,"[2, 2363, 1098, 5, 2363, 294, 9947, 990, 413, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[2866],1,2002,"[1, 96, 106, 4479, 3, 147, 289, 32, 1425, 8429...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[208],2,2002,"[106, 18, 708, 21, 1444, 330, 239, 164, 425, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[262, 1207]",3,2002,"[10, 649, 74, 142, 536, 200, 677, 178, 107, 43...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[524, 524]",4,2001,"[172, 530, 348, 161, 316, 1931, 2258, 1369, 38...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# export encoded data
data = df.to_json()
with open('../data/data.json', 'w') as f:
    f.write(data)
with open('../data/themes.json', 'w') as f:
    json.dump(themes, f)