In [None]:
import pandas as pd
import spacy
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from google.colab import files
from sklearn import model_selection
from tensorflow import keras
from tensorflow.keras import layers, models
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization,SpatialDropout1D
from tensorflow.keras.metrics import FalsePositives, TruePositives, TrueNegatives, FalseNegatives

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the poem corpus straight from the course repository; the preview below
# shows the columns title, author, year, text and themes.
df = pd.read_json(
    'https://raw.githubusercontent.com/triantafillu/Bootcamp-Repository-Language-2/main/data/ne_data.json'
)
df.head()

Unnamed: 0,title,author,year,text,themes
0,Body and Soul II,Charles Wright,2002,(for Coleman Hawkins)\nThe structure of landsc...,[audio & music]
1,Novel,Arthur Rimbaud,2002,I.\n\nNo one's serious at seventeen.\n—On beau...,[lgbtq]
2,Flying,Sarah Arvio,2002,One said to me tonight or was it day \nor was ...,[dreams]
3,Photograph of People Dancing in France,Leslie Adrienne Miller,2002,It's true that you don't know them--nor do I \...,[arts & sciences]
4,War Photograph,Kate Daniels,2002,A naked child is running\nalong the path towar...,"[violence, arts & sciences, war]"


In [None]:
# Discard every poem (row) that has at least one missing field.
df = df.dropna(axis=0, how='any')

In [None]:
# Build the model input: the poem title followed by a space and the poem body
# (column-wise string concatenation, one result per row).
df['full_text'] = df['title'] + " " + df['text']
df.head()

Unnamed: 0,title,author,year,text,themes,full_text
0,Body and Soul II,Charles Wright,2002,(for Coleman Hawkins)\nThe structure of landsc...,[audio & music],Body and Soul II (for Coleman Hawkins)\nThe st...
1,Novel,Arthur Rimbaud,2002,I.\n\nNo one's serious at seventeen.\n—On beau...,[lgbtq],Novel I.\n\nNo one's serious at seventeen.\n—O...
2,Flying,Sarah Arvio,2002,One said to me tonight or was it day \nor was ...,[dreams],Flying One said to me tonight or was it day \n...
3,Photograph of People Dancing in France,Leslie Adrienne Miller,2002,It's true that you don't know them--nor do I \...,[arts & sciences],Photograph of People Dancing in France It's tr...
4,War Photograph,Kate Daniels,2002,A naked child is running\nalong the path towar...,"[violence, arts & sciences, war]",War Photograph A naked child is running\nalong...


In [None]:
# Replace author names with integer ids.
# The fitted encoder is kept in `labelencoder` so ids can be mapped back later.
labelencoder = LabelEncoder()
labelencoder.fit(df['author'])
df['author'] = labelencoder.transform(df['author'])
df.head()

Unnamed: 0,title,author,year,text,themes,full_text
0,Body and Soul II,593,2002,(for Coleman Hawkins)\nThe structure of landsc...,[audio & music],Body and Soul II (for Coleman Hawkins)\nThe st...
1,Novel,305,2002,I.\n\nNo one's serious at seventeen.\n—On beau...,[lgbtq],Novel I.\n\nNo one's serious at seventeen.\n—O...
2,Flying,3416,2002,One said to me tonight or was it day \nor was ...,[dreams],Flying One said to me tonight or was it day \n...
3,Photograph of People Dancing in France,2315,2002,It's true that you don't know them--nor do I \...,[arts & sciences],Photograph of People Dancing in France It's tr...
4,War Photograph,2082,2002,A naked child is running\nalong the path towar...,"[violence, arts & sciences, war]",War Photograph A naked child is running\nalong...


In [None]:
# Expand contraction fragments
def full_form(word):
    """Return the expanded form of a contraction fragment.

    Fragments such as "nt", "re" or "ve" are what remains of a contraction
    once the apostrophe has been stripped. Any token that is not one of the
    known fragments is returned unchanged. Matching is case-sensitive.
    """
    expansions = {
        "nt": 'not',
        "re": 'be',
        "d": 'would',
        "m": 'am',
        "s": 'be',
        "ve": 'have',
    }
    return expansions.get(word, word)

# Preprocess text
def preprocessing(text):
  """Turn a raw poem string into a list of cleaned, lower-case tokens.

  Steps, in order:
    1. split the text into runs of word characters (punctuation is dropped);
    2. lower-case every token;
    3. remove English stop words;
    4. lemmatize each remaining token with WordNet;
    5. expand leftover contraction fragments via ``full_form``.

  Args:
    text: the raw text (str) to clean.

  Returns:
    A list of str tokens, possibly empty.
  """
  tokenizer = RegexpTokenizer(r'\w+')
  stop_words = set(stopwords.words('english'))
  wnl = WordNetLemmatizer()

  # Lower-case BEFORE the stop-word test: the membership check is
  # case-sensitive, so filtering first let capitalised stop words
  # ("The", "You", "I", ...) slip through and only lower-cased them afterwards.
  tokens = [token.lower() for token in tokenizer.tokenize(text)]
  tokens = [token for token in tokens if token not in stop_words]
  tokens = [wnl.lemmatize(token) for token in tokens]

  # Tokens are already lower-case here, and full_form only emits lower-case
  # expansions, so no further case folding is needed.
  return [full_form(token) for token in tokens]
  

In [None]:
# Clean every poem: each string in full_text becomes a list of tokens
df['full_text'] = df['full_text'].map(preprocessing)
df['full_text']

0        [body, soul, ii, coleman, hawkins, the, struct...
1        [novel, i, no, one, serious, seventeen, on, be...
2        [flying, one, said, tonight, day, passage, two...
3        [photograph, people, dancing, france, it, true...
4        [war, photograph, a, naked, child, running, al...
                               ...                        
17069    [you, can, buy, shoes, painting, you, even, bu...
17070    [you, people, people, ask, shoe, the, valley, ...
17071    [you, that, i, loved, you, i, loved, life, lon...
17072    [your, clothes, of, course, empty, shell, with...
17073    [your, luck, is, about, to, change, ominous, i...
Name: full_text, Length: 16583, dtype: object

In [None]:
# Remove poems with fewer than 50 tokens left after preprocessing
texts_len = df['full_text'].apply(len)
# texts_len shares df's index, so filtering it yields the row labels to drop
df.drop(index=texts_len[texts_len < 50].index, inplace=True)

In [None]:
# Build the word -> integer-id vocabulary from the cleaned token lists.
# num_words caps the ids emitted by texts_to_sequences to the most frequent
# words (per the Keras docs; rarer words are skipped, not mapped to an OOV id).
tokenizer = Tokenizer(num_words=3000)
tokenizer.fit_on_texts(df['full_text'])

# Encode each poem's token list as a list of integer word ids.
# NOTE: this overwrites full_text in place, so the token lists are gone after
# this cell runs; re-run the preprocessing cells before re-running this one.
df['full_text'] = tokenizer.texts_to_sequences(df['full_text'])

df['full_text']

0        [43, 145, 875, 2, 2648, 1155, 4, 2648, 303, 11...
1        [1, 101, 5, 1350, 147, 310, 21, 1545, 3, 719, ...
2        [980, 5, 42, 964, 8, 1595, 70, 15, 256, 225, 1...
3        [1472, 117, 1083, 2539, 15, 313, 13, 1, 13, 1,...
4        [266, 1472, 6, 705, 68, 699, 236, 587, 298, 7,...
                               ...                        
17068    [2, 646, 126, 1380, 195, 296, 314, 20, 424, 24...
17069    [28, 606, 1016, 996, 28, 55, 1016, 28, 19, 36,...
17070    [28, 117, 117, 340, 684, 2, 844, 1, 625, 1072,...
17071    [28, 32, 1, 376, 28, 1, 376, 23, 41, 5, 28, 1,...
17073    [302, 1734, 203, 1217, 16, 375, 1991, 847, 116...
Name: full_text, Length: 14176, dtype: object

In [None]:
# Fixed sequence length fed to the network. This is a hand-picked cap, not the
# true maximum, which would be max(len(x) for x in df['full_text']).
maxlen = 150

# Bring every sequence to exactly `maxlen` ids: shorter ones are zero-padded
# at the end, longer ones lose their tail.
padded = pad_sequences(
    df['full_text'],
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

padded

array([[  43,  145,  875, ...,   44, 1356, 1577],
       [   1,  101,    5, ...,    0,    0,    0],
       [ 980,    5,   42, ...,    0,    0,    0],
       ...,
       [  28,  117,  117, ...,  169, 1221,  498],
       [  28,   32,    1, ...,    0,    0,    0],
       [ 302, 1734,  203, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Store the padded rows back on the frame, one fixed-length array per poem
df['full_text'] = list(padded)

df['full_text']

0        [43, 145, 875, 2, 2648, 1155, 4, 2648, 303, 11...
1        [1, 101, 5, 1350, 147, 310, 21, 1545, 3, 719, ...
2        [980, 5, 42, 964, 8, 1595, 70, 15, 256, 225, 1...
3        [1472, 117, 1083, 2539, 15, 313, 13, 1, 13, 1,...
4        [266, 1472, 6, 705, 68, 699, 236, 587, 298, 7,...
                               ...                        
17068    [2, 646, 126, 1380, 195, 296, 314, 20, 424, 24...
17069    [28, 606, 1016, 996, 28, 55, 1016, 28, 19, 36,...
17070    [28, 117, 117, 340, 684, 2, 844, 1, 625, 1072,...
17071    [28, 32, 1, 376, 28, 1, 376, 23, 41, 5, 28, 1,...
17073    [302, 1734, 203, 1217, 16, 375, 1991, 847, 116...
Name: full_text, Length: 14176, dtype: object

In [None]:
# Themes for which a separate binary classifier is trained
themes = [
    'nature',
    'family',
    'love',
    'body',
    'animals',
]

In [None]:
# Binary target for a single theme
def label_themes(theme, row):
    """Return 1 if ``theme`` is contained in ``row['themes']``, otherwise 0.

    ``row`` is anything indexable by the key 'themes' (e.g. a DataFrame row).
    Containment uses Python's ``in`` operator on that value.
    """
    return 1 if theme in row['themes'] else 0

In [None]:
def model(X, Y, label, test_size=0.1, random_state=37, embedding_dim=8,
          max_features=5001, max_len=150, epochs=10, batch_size=128,
          save_dir='/drive/My Drive/Colab Notebooks/baby models/'):
  """Train a small embedding-based binary classifier and save it to disk.

  The data is split into a training part and a held-out part; the held-out
  part is used as Keras ``validation_data`` during fitting. The fitted network
  is written to ``save_dir + 'model_' + label + '.h5'``.

  Args:
    X: array-like of integer word-id sequences, each of length ``max_len``.
    Y: array-like of 0/1 targets, aligned with ``X``.
    label: str used to build the output file name (``model_<label>.h5``).
    test_size: fraction of the samples held out for validation.
    random_state: seed for the train/validation split.
    embedding_dim: size of each learned word vector.
    max_features: ``input_dim`` of the embedding layer; must be larger than
      the biggest word id occurring in ``X``.
    max_len: length of every input sequence.
    epochs: number of training epochs.
    batch_size: mini-batch size used while fitting.
    save_dir: directory prefix for the saved file. It is concatenated with the
      file name as-is, so it must end with a path separator. It is not created
      here, so it has to exist already.

  Returns:
    The fitted Keras model (previously the function returned nothing, so
    existing callers that ignore the result are unaffected).
  """
  X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
      X, Y, test_size=test_size, random_state=random_state)

  model1 = keras.models.Sequential([
    keras.layers.Embedding(input_dim=max_features,
                          output_dim=embedding_dim,
                          input_length=max_len),
    # Heavy dropout on both the embeddings and the tiny dense layer
    keras.layers.SpatialDropout1D(0.5),
    keras.layers.Flatten(),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dropout(0.5),
    # Single sigmoid unit: probability that the text carries the theme
    keras.layers.Dense(1, activation='sigmoid')
  ])

  model1.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy', 'AUC'])

  model1.fit(np.array(X_train), np.array(Y_train),
            batch_size=batch_size,
            validation_data=(np.array(X_test), np.array(Y_test)),
            epochs=epochs)

  filename = 'model_' + label + '.h5'
  model1.save(save_dir + filename)

  return model1


In [None]:
# Mount Google Drive at /drive; model() saves its .h5 files under this mount
# point, so this cell has to run before the training loop.
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Fit and save one binary classifier per theme
for theme in themes:
  # Working copy of the data with a 0/1 `label` column for the current theme
  tmp_df = df.assign(
      label=df.apply(lambda row: label_themes(theme, row), axis=1)
  )
  # Stack the per-row sequences into one 2-D array; targets become a 1-D array
  X = np.array(tmp_df['full_text'].tolist())
  Y = np.array(tmp_df['label'].tolist())
  model(X, Y, theme)
  

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
