In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf, requests as rqst, io
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

rnd = np.random.randint

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#create the grouped dataset based on the data from each participant
vlad_file = open("/content/drive/MyDrive/NLP/datavlad.txt")
vlad_sentences = vlad_file.read().split('.')[:-1]

artem_file = open("/content/drive/MyDrive/NLP/dataartem.txt")
artem_sentences = artem_file.read().split('.')[:-1]

ks_file = open("/content/drive/MyDrive/NLP/dataks.txt")
ks_sentences = ks_file.read().split('.')[:-1]

In [4]:
new_df = []

for new in vlad_sentences:
  new_df.append({'Sentence':new, 'Label':'neural network art'})

for new in ks_sentences:
  new_df.append({'Sentence':new, 'Label':'climate change'})

for new in artem_sentences:
  new_df.append({'Sentence':new, 'Label':'neural network recognition'})

new_df = pd.DataFrame(data=new_df, columns=['Sentence', 'Label'])

new_df['Target'] = new_df['Label']
new_df.replace({'Target':{'neural network art':1, 'climate change':0, 'neural network recognition':0}}, inplace=True)
new_df

Unnamed: 0,Sentence,Label,Target
0,"In the past few years, many artists have begun...",neural network art,1
1,\nIn computer vision and perceptual psychology...,neural network art,1
2,"\nIn other words, modern neural models lend th...",neural network art,1
3,\nThe most prominent tool in neural art at the...,neural network art,1
4,\nGiven a large collection of images of a spec...,neural network art,1
...,...,...,...
302,"\nIn May 2017, a man was arrested using an aut...",neural network recognition,0
303,[68] Live facial recognition has been trialled...,neural network recognition,0
304,[69] In August 2020 the Court of Appeal ruled ...,neural network recognition,0
305,S,neural network recognition,0


In [5]:
# vectorization of text
max_tokens = 10000

sentences = vlad_sentences+artem_sentences+ks_sentences
tokens_count = 0
for new in sentences:
  tokens_count+=len(new.split())
avg_tokens = round(tokens_count/len(sentences))

In [6]:
#tokenization and embedding
text_vectorizer = TextVectorization(max_tokens=max_tokens, 
                                    standardize="lower_and_strip_punctuation", 
                                    split="whitespace", 
                                    ngrams=None, 
                                    output_mode="int",
                                    output_sequence_length=avg_tokens, 
                                    pad_to_max_tokens=True)

text_vectorizer.adapt(new_df['Sentence'])

text_vectorizer(new_df['Sentence'])

<tf.Tensor: shape=(307, 25), dtype=int64, numpy=
array([[   6,    2,  212, ...,  826,    6, 1800],
       [   6,  303,  328, ...,    0,    0,    0],
       [   6,   62,  325, ...,  139,  455,  266],
       ...,
       [2051,    6, 1943, ...,  251,  101,    6],
       [ 575,    0,    0, ...,    0,    0,    0],
       [ 763,    3,  363, ...,   24,  102,  524]])>

In [7]:
embedding = layers.Embedding(input_dim=max_tokens, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=avg_tokens) # how long is each input

embedding(text_vectorizer(new_df['Sentence']))

<tf.Tensor: shape=(307, 25, 128), dtype=float32, numpy=
array([[[ 5.0104149e-03,  3.5234224e-02, -2.8986169e-02, ...,
         -2.8221322e-02, -2.2857077e-03, -2.5993362e-03],
        [ 9.0782046e-03,  1.5913811e-02, -1.7580390e-03, ...,
         -8.5942820e-04, -7.9221502e-03,  2.9427204e-02],
        [-3.2362390e-02,  6.4261071e-03, -1.0967709e-02, ...,
         -1.9243240e-02, -2.9514587e-02,  6.8360083e-03],
        ...,
        [ 2.5640760e-02, -3.2779947e-03, -1.1034392e-02, ...,
          3.0237708e-02, -4.0322531e-02,  2.3042608e-02],
        [ 5.0104149e-03,  3.5234224e-02, -2.8986169e-02, ...,
         -2.8221322e-02, -2.2857077e-03, -2.5993362e-03],
        [ 4.3088105e-02, -1.8636882e-02, -4.5479059e-02, ...,
          3.7443828e-02,  2.6025858e-02, -4.8174299e-02]],

       [[ 5.0104149e-03,  3.5234224e-02, -2.8986169e-02, ...,
         -2.8221322e-02, -2.2857077e-03, -2.5993362e-03],
        [-3.2996438e-02,  4.8160665e-03,  1.1765659e-02, ...,
          1.8762436e-02, -3

In [71]:
#import libraries for classification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results


In [100]:
#lstm
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(64)(x) # return vector for whole sequence
x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_lstm = tf.keras.Model(inputs, outputs, name="model_LSTM")


In [101]:
model_lstm.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [102]:
model_lstm.summary()

Model: "model_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 25)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 25, 128)           1280000   
                                                                 
 lstm_6 (LSTM)               (None, 64)                49408     
                                                                 
 dense_19 (Dense)            (None, 64)                4160      
                                                                 
 dense_20 (Dense)            (None, 1)                 65        
                                                        

In [103]:
model_lstm_history = model_lstm.fit(train_split['Sentence'],
                              train_split['Target'],
                              epochs=9,
                              validation_data=(test_split['Sentence'], test_split['Target']))


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [104]:
model_lstm_pred_probs = model_lstm.predict(test_split['Sentence'])
model_lstm_preds = tf.squeeze(tf.round(model_lstm_pred_probs))
calculate_results(y_true=test_split['Target'], y_pred=model_lstm_preds)


{'accuracy': 93.54838709677419,
 'f1': 0.9369825034655876,
 'precision': 0.9467040673211782,
 'recall': 0.9354838709677419}

In [77]:
#GRU
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64)(x) 
x = layers.Dense(64, activation="relu")(x) # optional dense layer after GRU cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_gru = tf.keras.Model(inputs, outputs, name="model_GRU")


In [79]:
model_gru.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])


In [80]:
model_gru_history = model_gru.fit(train_split['Sentence'],
                              train_split['Target'],
                              epochs=9,
                              validation_data=(test_split['Sentence'], test_split['Target']))


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [81]:
model_gru_pred_probs = model_gru.predict(test_split['Sentence'])
model_gru_preds = tf.squeeze(tf.round(model_gru_pred_probs))
calculate_results(y_true=test_split['Target'], y_pred=model_gru_preds)

{'accuracy': 95.16129032258065,
 'f1': 0.9525086972595338,
 'precision': 0.9582111436950146,
 'recall': 0.9516129032258065}

In [82]:
#bidirectional LSTM
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_bi_lstm = tf.keras.Model(inputs, outputs, name="model_Bi_lstm")


In [83]:
model_bi_lstm.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])


In [84]:
model_bi_lstm.summary()


Model: "model_Bi_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 25)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 25, 128)           1280000   
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_16 (Dense)            (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,378,945
N

In [85]:
model_bi_lstm_history = model_bi_lstm.fit(train_split['Sentence'],
                              train_split['Target'],
                              epochs=9,
                              validation_data=(test_split['Sentence'], test_split['Target']))


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [93]:
model_bi_lstm_pred_probs = model_bi_lstm.predict(test_split['Sentence'])
model_bi_lstm_preds = tf.squeeze(tf.round(model_bi_lstm_pred_probs))
calculate_results(y_true=test_split['Target'], y_pred=model_bi_lstm_preds)

{'accuracy': 95.16129032258065,
 'f1': 0.9525086972595338,
 'precision': 0.9582111436950146,
 'recall': 0.9516129032258065}

In [94]:
#bidirectional GRU
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.GRU(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_bi_gru = tf.keras.Model(inputs, outputs, name="model_Bi_gru")

In [95]:
model_bi_gru.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [96]:
model_bi_gru.summary()

Model: "model_Bi_gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 25)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 25, 128)           1280000   
                                                                 
 bidirectional_6 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                                                 
 dense_18 (Dense)            (None, 1)                 129       
                                                                 
Total params: 1,354,625
Trainable params: 1,354,625
No

In [97]:
model_bi_gru_history = model_bi_gru.fit(train_split['Sentence'],
                              train_split['Target'],
                              epochs=9,
                              validation_data=(test_split['Sentence'], test_split['Target']))

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [99]:
model_bi_gru_pred_probs = model_bi_gru.predict(test_split['Sentence'])
model_bi_gru_preds = tf.squeeze(tf.round(model_bi_gru_pred_probs))
calculate_results(y_true=test_split['Target'], y_pred=model_bi_gru_preds)

{'accuracy': 96.7741935483871,
 'f1': 0.9681643625192012,
 'precision': 0.9708141321044547,
 'recall': 0.967741935483871}

Every model has 9 epochs for learning. The best results were shown by bidirectional GRU model.