In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf, requests as rqst, io
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

rnd = np.random.randint

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file = open("/content/drive/MyDrive/NLP/datavlad.txt")
sentences = file.read().split('.')[:-1]
sentences

['In the past few years, many artists have begun to explore neural networks as artistic tools, and their works have begun to appear in cutting-edge “Artificial Intelligence” art shows as well',
 '\nIn computer vision and perceptual psychology, image perception is often analyzed in terms of visual cues',
 '\nIn other words, modern neural models lend themselves to creating interesting imagery, because they were designed modern real images, and so modifying them creates realistic but unreal images',
 '\nThe most prominent tool in neural art at the moment is the Generative Adversarial Network (GAN)',
 '\nGiven a large collection of images of a specific class (such as faces or landscapes), a GAN is trained to produce new images that look like they also came from that class',
 '\nHowever, GANs operate in terms of image cues that are difficult to explain; they are not just manipulating simple properties like color and texture',
 '\nGANs are the latest in a long line of research in natural ima

In [4]:
new_df = []
for sent in sentences:
  new_df.append({'Sentence':sent, 'Label':"Neural network artworks"})

new_df

[{'Label': 'Neural network artworks',
  'Sentence': 'In the past few years, many artists have begun to explore neural networks as artistic tools, and their works have begun to appear in cutting-edge “Artificial Intelligence” art shows as well'},
 {'Label': 'Neural network artworks',
  'Sentence': '\nIn computer vision and perceptual psychology, image perception is often analyzed in terms of visual cues'},
 {'Label': 'Neural network artworks',
  'Sentence': '\nIn other words, modern neural models lend themselves to creating interesting imagery, because they were designed modern real images, and so modifying them creates realistic but unreal images'},
 {'Label': 'Neural network artworks',
  'Sentence': '\nThe most prominent tool in neural art at the moment is the Generative Adversarial Network (GAN)'},
 {'Label': 'Neural network artworks',
  'Sentence': '\nGiven a large collection of images of a specific class (such as faces or landscapes), a GAN is trained to produce new images that loo

In [5]:
#creating two columns with information about sentenses
new_df = pd.DataFrame(data=new_df, columns=['Sentence', 'Label'])
#save separated sentences to csv
new_df.to_csv("/content/drive/MyDrive/NLP/datavlad.csv")
new_df

Unnamed: 0,Sentence,Label
0,"In the past few years, many artists have begun...",Neural network artworks
1,\nIn computer vision and perceptual psychology...,Neural network artworks
2,"\nIn other words, modern neural models lend th...",Neural network artworks
3,\nThe most prominent tool in neural art at the...,Neural network artworks
4,\nGiven a large collection of images of a spec...,Neural network artworks
...,...,...
100,\nWe propose a new method for stroke based ima...,Neural network artworks
101,"\nTo build a neural renderer, a general practi...",Neural network artworks
102,\nWe can see that our method successfully lear...,Neural network artworks
103,\nWe can see our method generates more vivid r...,Neural network artworks


In [6]:
# import libraries
from sklearn.model_selection import train_test_split

train_split, test_split = train_test_split(new_df, train_size=0.8, test_size=0.2)
train_split

Unnamed: 0,Sentence,Label
95,"\nIn this paper, we explore the secret nature ...",Neural network artworks
75,\nTwenty-five styles represented by a large da...,Neural network artworks
3,\nThe most prominent tool in neural art at the...,Neural network artworks
6,\nGANs are the latest in a long line of resear...,Neural network artworks
65,\nThis paper presents a multi-stage machine le...,Neural network artworks
...,...,...
30,"\nEven if new GANs become flawless, artists wi...",Neural network artworks
32,\nThus far the algorithmic basis of this proce...,Neural network artworks
82,\nThree datasets of digital images of painting...,Neural network artworks
64,\nArtistic stylisation of images is traditiona...,Neural network artworks


In [10]:
# vectorization of text
max_tokens = 10000
count = 0

for new in sentences:
  count+=len(new.split())
avg_tokens = round(count/len(sentences))
avg_tokens

24

In [13]:
text_vectorizer = TextVectorization(max_tokens=max_tokens, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=avg_tokens, # how long should the output sequence of tokens be?
                                    pad_to_max_tokens=True)
text_vectorizer.adapt(new_df['Sentence'])

text_vectorizer(new_df['Sentence'])

<tf.Tensor: shape=(105, 24), dtype=int64, numpy=
array([[  6,   2, 521, ...,   5, 797,   6],
       [  6, 157, 123, ...,   0,   0,   0],
       [  6,  74, 174, ..., 562, 131, 155],
       ...,
       [ 16,  26, 137, ...,   0,   0,   0],
       [ 16,  26, 137, ...,   5,  99, 767],
       [ 16, 734,  24, ...,   7,   2, 471]])>

In [14]:
print(f"Most Used: {text_vectorizer.get_vocabulary()[:5]}")
print(f"Most Unused: {text_vectorizer.get_vocabulary()[-5:]}")

Most Used: ['', '[UNK]', 'the', 'of', 'a']
Most Unused: ['2338', '1994', '16', '13', '1']


In [15]:
# embedding of data
embedding = layers.Embedding(input_dim=max_tokens, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=avg_tokens) # how long is each input

embedding(text_vectorizer(new_df['Sentence']))


<tf.Tensor: shape=(105, 24, 128), dtype=float32, numpy=
array([[[ 1.16179697e-02, -6.89704344e-03, -4.85221259e-02, ...,
         -4.04276848e-02,  3.04611661e-02, -4.30170186e-02],
        [ 3.91794704e-02,  8.12895223e-03,  4.17335294e-02, ...,
         -4.91617918e-02, -8.18390772e-03, -4.14472930e-02],
        [-2.54979264e-02, -8.25766474e-03, -1.68346539e-02, ...,
         -1.45431869e-02,  2.33328827e-02, -1.59457326e-02],
        ...,
        [ 1.08373985e-02, -1.49660595e-02,  4.01986949e-02, ...,
         -2.36765631e-02, -7.72241503e-03, -2.43506581e-03],
        [-3.12988311e-02, -4.88207228e-02, -2.27710009e-02, ...,
          2.13618390e-02,  4.37307470e-02, -1.55324340e-02],
        [ 1.16179697e-02, -6.89704344e-03, -4.85221259e-02, ...,
         -4.04276848e-02,  3.04611661e-02, -4.30170186e-02]],

       [[ 1.16179697e-02, -6.89704344e-03, -4.85221259e-02, ...,
         -4.04276848e-02,  3.04611661e-02, -4.30170186e-02],
        [ 2.47412436e-02,  2.25798823e-02,  2.9