In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf, requests as rqst, io
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

rnd = np.random.randint

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#create the grouped dataset based on the data from each participant
vlad_file = open("/content/drive/MyDrive/NLP/datavlad.txt")
vlad_sentences = vlad_file.read().split('.')[:-1]

artem_file = open("/content/drive/MyDrive/NLP/dataartem.txt")
artem_sentences = artem_file.read().split('.')[:-1]

ks_file = open("/content/drive/MyDrive/NLP/dataks.txt")
ks_sentences = ks_file.read().split('.')[:-1]

vlad_sentences

artem_sentences

ks_sentences

['Scientists attribute the global warming trend observed since the mid-20th century to the human expansion of the "greenhouse effect"1 — warming that results when the atmosphere traps heat radiating from Earth toward space',
 '\nCertain gases in the atmosphere block heat from escaping',
 ' Long-lived gases that remain semi-permanently in the atmosphere and do not respond physically or chemically to changes in temperature are described as "forcing" climate change',
 ' Gases, such as water vapor, which respond physically or chemically to changes in temperature are seen as "feedbacks',
 '"\nGlobal climate change has already had observable effects on the environment',
 ' Glaciers have shrunk, ice on rivers and lakes is breaking up earlier, plant and animal ranges have shifted and trees are flowering sooner',
 '\nEffects that scientists had predicted in the past would result from global climate change are now occurring: loss of sea ice, accelerated sea level rise and longer, more intense he

In [4]:
new_df = []

for new in vlad_sentences:
  new_df.append({'Sentence':new, 'Label':'neural network art'})

for new in ks_sentences:
  new_df.append({'Sentence':new, 'Label':'climate change'})

for new in artem_sentences:
  new_df.append({'Sentence':new, 'Label':'neural network recognition'})

new_df = pd.DataFrame(data=new_df, columns=['Sentence', 'Label'])

new_df['Target'] = new_df['Label']
new_df.replace({'Target':{'neural network art':1, 'climate change':0, 'neural network recognition':0}}, inplace=True)
new_df

Unnamed: 0,Sentence,Label,Target
0,"In the past few years, many artists have begun...",neural network art,1
1,\nIn computer vision and perceptual psychology...,neural network art,1
2,"\nIn other words, modern neural models lend th...",neural network art,1
3,\nThe most prominent tool in neural art at the...,neural network art,1
4,\nGiven a large collection of images of a spec...,neural network art,1
...,...,...,...
302,"\nIn May 2017, a man was arrested using an aut...",neural network recognition,0
303,[68] Live facial recognition has been trialled...,neural network recognition,0
304,[69] In August 2020 the Court of Appeal ruled ...,neural network recognition,0
305,S,neural network recognition,0


In [5]:
# vectorization of text
max_tokens = 10000

sentences = vlad_sentences+artem_sentences+ks_sentences
tokens_count = 0
for new in sentences:
  tokens_count+=len(new.split())
avg_tokens = round(tokens_count/len(sentences))
avg_tokens

25

In [6]:
#tokenization and embedding
text_vectorizer = TextVectorization(max_tokens=max_tokens, 
                                    standardize="lower_and_strip_punctuation", 
                                    split="whitespace", 
                                    ngrams=None, 
                                    output_mode="int",
                                    output_sequence_length=avg_tokens, 
                                    pad_to_max_tokens=True)

text_vectorizer.adapt(new_df['Sentence'])

text_vectorizer(new_df['Sentence'])

<tf.Tensor: shape=(307, 25), dtype=int64, numpy=
array([[   6,    2,  212, ...,  826,    6, 1800],
       [   6,  303,  328, ...,    0,    0,    0],
       [   6,   62,  325, ...,  139,  455,  266],
       ...,
       [2051,    6, 1943, ...,  251,  101,    6],
       [ 575,    0,    0, ...,    0,    0,    0],
       [ 763,    3,  363, ...,   24,  102,  524]])>

In [7]:
print(f"Most Used: {text_vectorizer.get_vocabulary()[:5]}")
print(f"Most Unused: {text_vectorizer.get_vocabulary()[-5:]}")

Most Used: ['', '[UNK]', 'the', 'of', 'to']
Most Unused: ['125', '12000', '117', '10', '0']


In [8]:
embedding = layers.Embedding(input_dim=max_tokens, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=avg_tokens) # how long is each input

embedding(text_vectorizer(new_df['Sentence']))

<tf.Tensor: shape=(307, 25, 128), dtype=float32, numpy=
array([[[-0.02749467, -0.01595894, -0.03922828, ...,  0.02976247,
          0.02037157,  0.00302383],
        [-0.03265711, -0.03129109, -0.0354785 , ...,  0.02647159,
         -0.01067201, -0.04187087],
        [ 0.02307943, -0.02446737, -0.04502044, ..., -0.02807719,
         -0.03352444, -0.00244167],
        ...,
        [-0.03057656,  0.04549277,  0.03021098, ..., -0.00366615,
         -0.02244839, -0.00055165],
        [-0.02749467, -0.01595894, -0.03922828, ...,  0.02976247,
          0.02037157,  0.00302383],
        [-0.02698604, -0.00293987, -0.0318244 , ..., -0.02749285,
          0.00856367,  0.00073034]],

       [[-0.02749467, -0.01595894, -0.03922828, ...,  0.02976247,
          0.02037157,  0.00302383],
        [ 0.00381104,  0.03898697,  0.04099632, ...,  0.0400547 ,
         -0.046062  ,  0.02598841],
        [ 0.0170255 ,  0.01099968,  0.03378505, ...,  0.02919859,
         -0.04416387,  0.00369433],
        ...

In [9]:
# import libraries
from sklearn.model_selection import train_test_split

train_split, test_split = train_test_split(new_df, train_size=0.8, test_size=0.2)
train_split

Unnamed: 0,Sentence,Label,Target
304,[69] In August 2020 the Court of Appeal ruled ...,neural network recognition,0
156,And the impacts of rising temperatures aren’t...,climate change,0
206,Most climate models failed to predict a slowd...,climate change,0
272,\nRecognize and manipulate faces from Python o...,neural network recognition,0
217,"As temperatures climb, some regions could exp...",climate change,0
...,...,...,...
183,"Beginning in October 2015, a methane gas leak...",climate change,0
82,\nThree datasets of digital images of painting...,neural network art,1
252,"\nIn the coming weeks, Meta will shut down the...",neural network recognition,0
305,S,neural network recognition,0


In [10]:
#import libraries for classification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results


In [11]:
#import libraries for Naive Bayes Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_split['Sentence'], train_split['Target'])
baseline_score = model_0.score(test_split['Sentence'], test_split['Target'])
print(f"Baseline model accuracy is: {baseline_score*100:.2f}%")

Baseline model accuracy is: 93.55%


In [12]:
#dense neural network
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model


In [13]:
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_1.summary()


Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 25)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 25, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [14]:
# Fit the model
model_1_history = model_1.fit(train_split['Sentence'], # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_split['Target'],
                              epochs=50,
                              validation_data=(test_split['Sentence'], test_split['Target']))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
#compare model effectiveness
calculate_results(test_split['Target'], model_0.predict(test_split['Sentence']))

{'accuracy': 93.54838709677419,
 'f1': 0.9335886603229755,
 'precision': 0.9412186379928316,
 'recall': 0.9354838709677419}

In [16]:
calculate_results(test_split['Target'], tf.squeeze(tf.round(model_1.predict(test_split['Sentence']))))


{'accuracy': 96.7741935483871,
 'f1': 0.9673195084485408,
 'precision': 0.9692423105776444,
 'recall': 0.967741935483871}