In [18]:
import numpy as np
import pandas as pd
import tensorflow as tf, requests as rqst, io
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

rnd = np.random.randint

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
#create the grouped dataset based on the data from each participant
vlad_file = open("/content/drive/MyDrive/NLP/datavlad.txt")
vlad_sentences = vlad_file.read().split('.')[:-1]

artem_file = open("/content/drive/MyDrive/NLP/dataartem.txt")
artem_sentences = artem_file.read().split('.')[:-1]

ks_file = open("/content/drive/MyDrive/NLP/dataks.txt")
ks_sentences = ks_file.read().split('.')[:-1]

In [21]:
new_df = []

for new in vlad_sentences:
  new_df.append({'Sentence':new, 'Label':'neural network art'})

for new in ks_sentences:
  new_df.append({'Sentence':new, 'Label':'climate change'})

for new in artem_sentences:
  new_df.append({'Sentence':new, 'Label':'neural network recognition'})

new_df = pd.DataFrame(data=new_df, columns=['Sentence', 'Label'])

new_df['Target'] = new_df['Label']
new_df.replace({'Target':{'neural network art':1, 'climate change':0, 'neural network recognition':0}}, inplace=True)
new_df

Unnamed: 0,Sentence,Label,Target
0,"In the past few years, many artists have begun...",neural network art,1
1,\nIn computer vision and perceptual psychology...,neural network art,1
2,"\nIn other words, modern neural models lend th...",neural network art,1
3,\nThe most prominent tool in neural art at the...,neural network art,1
4,\nGiven a large collection of images of a spec...,neural network art,1
...,...,...,...
302,"\nIn May 2017, a man was arrested using an aut...",neural network recognition,0
303,[68] Live facial recognition has been trialled...,neural network recognition,0
304,[69] In August 2020 the Court of Appeal ruled ...,neural network recognition,0
305,S,neural network recognition,0


In [22]:
# vectorization of text
max_tokens = 10000

sentences = vlad_sentences+artem_sentences+ks_sentences
tokens_count = 0
for new in sentences:
  tokens_count+=len(new.split())
avg_tokens = round(tokens_count/len(sentences))

In [23]:
#tokenization and embedding
text_vectorizer = TextVectorization(max_tokens=max_tokens, 
                                    standardize="lower_and_strip_punctuation", 
                                    split="whitespace", 
                                    ngrams=None, 
                                    output_mode="int",
                                    output_sequence_length=avg_tokens, 
                                    pad_to_max_tokens=True)

text_vectorizer.adapt(new_df['Sentence'])

text_vectorizer(new_df['Sentence'])

<tf.Tensor: shape=(307, 25), dtype=int64, numpy=
array([[   6,    2,  212, ...,  826,    6, 1800],
       [   6,  303,  328, ...,    0,    0,    0],
       [   6,   62,  325, ...,  139,  455,  266],
       ...,
       [2051,    6, 1943, ...,  251,  101,    6],
       [ 575,    0,    0, ...,    0,    0,    0],
       [ 763,    3,  363, ...,   24,  102,  524]])>

In [24]:
embedding = layers.Embedding(input_dim=max_tokens, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=avg_tokens) # how long is each input

embedding(text_vectorizer(new_df['Sentence']))

<tf.Tensor: shape=(307, 25, 128), dtype=float32, numpy=
array([[[-0.0233851 , -0.01348491, -0.03136371, ..., -0.03432669,
         -0.01660462, -0.01027635],
        [-0.01769461,  0.03343001, -0.00187425, ..., -0.04508854,
         -0.04897651, -0.04471302],
        [ 0.03980151,  0.03117919, -0.00617325, ...,  0.02765748,
          0.03162095, -0.03758148],
        ...,
        [ 0.04871792,  0.04337497,  0.02303931, ..., -0.04296513,
          0.03719758,  0.04070419],
        [-0.0233851 , -0.01348491, -0.03136371, ..., -0.03432669,
         -0.01660462, -0.01027635],
        [ 0.02497759, -0.02604815, -0.03634252, ..., -0.01039992,
          0.01683113, -0.02052484]],

       [[-0.0233851 , -0.01348491, -0.03136371, ..., -0.03432669,
         -0.01660462, -0.01027635],
        [-0.02103479, -0.03589001, -0.00330781, ..., -0.02518375,
         -0.03208981, -0.02051231],
        [ 0.02694023,  0.02850043, -0.00638145, ...,  0.02723211,
          0.03869936,  0.01958818],
        ...

In [25]:
# import libraries
from sklearn.model_selection import train_test_split

train_split, test_split = train_test_split(new_df, train_size=0.8, test_size=0.2)
train_split

Unnamed: 0,Sentence,Label,Target
223,"Scientists have attributed the fires, which i...",climate change,0
224,California governor Jerry Brown lamented that...,climate change,0
186,\nHumans have created and released greenhouse ...,climate change,0
5,"\nHowever, GANs operate in terms of image cues...",neural network art,1
262,"\nAmid this ongoing uncertainty, we believe th...",neural network recognition,0
...,...,...,...
304,[69] In August 2020 the Court of Appeal ruled ...,neural network recognition,0
49,\nWhen synthesising an image that combines the...,neural network art,1
233,Plants and animals might not be able to survi...,climate change,0
297,\nThree-dimensional data points from a face va...,neural network recognition,0


In [34]:
#import libraries for classification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results



In [35]:
#universal sentence encoder (USE)
import tensorflow_hub as hub
url="https://tfhub.dev/google/universal-sentence-encoder/4"

sentence_encoder_layer = hub.KerasLayer(url,input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE")

model_use = tf.keras.Sequential([
  sentence_encoder_layer, # take in sentences and then encode them into an embedding
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
], name="model_6_USE")

# Compile model
model_use.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_use.summary()


Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_8 (Dense)             (None, 64)                32832     
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [36]:
model_use_history=model_use.fit(train_split['Sentence'],train_split['Target'],epochs=9,
                        validation_data=(test_split['Sentence'],test_split['Target']))

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [37]:
model_use_pred_probs = model_use.predict(test_split['Sentence'])
model_use_preds = tf.squeeze(tf.round(model_use_pred_probs))
calculate_results(y_true=test_split['Target'], y_pred=model_use_preds)





{'accuracy': 96.7741935483871,
 'f1': 0.967741935483871,
 'precision': 0.967741935483871,
 'recall': 0.967741935483871}

In [38]:
#universal sentence encoder (USE) with fine tuning
tf_hub_embedding_layer = hub.KerasLayer(url,input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=True, 
                                        name="USE-FT")

model_use_ft = tf.keras.Sequential([
  tf_hub_embedding_layer, # take in sentences and then encode them into an embedding
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
], name="model_use_ft")

# Compile model
model_use_ft.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])


model_use_ft.summary()


Model: "model_use_ft"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE-FT (KerasLayer)         (None, 512)               256797824 
                                                                 
 dense_10 (Dense)            (None, 64)                32832     
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 256,830,721
Non-trainable params: 0
_________________________________________________________________


In [39]:
model_use_ft_history=model_use_ft.fit(train_split['Sentence'],train_split['Target'],epochs=9,
                        validation_data=(test_split['Sentence'],test_split['Target']))


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [41]:
model_use_ft_pred_probs = model_use_ft.predict(test_split['Sentence'])
model_use_ft_preds = tf.squeeze(tf.round(model_use_ft_pred_probs))
calculate_results(y_true=test_split['Target'], y_pred=model_use_ft_preds)

{'accuracy': 100.0, 'f1': 1.0, 'precision': 1.0, 'recall': 1.0}

Universal sentence encoder (USE) with fine tuning showed better results - it reached the accuracy of 100%
