# Preparing Data (lab2)

Import needed libraries

In [27]:
import numpy as np
import pandas as pd
import tensorflow as tf, requests as rqst, io
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

rnd = np.random.randint

1. Now, lets open file (data set) and split it. This data set contains sentences from articles about *Climate change*.

In [28]:
# open .txt file
file = open('data.txt')
# split each sentense from the file 
sentences = file.read().split('. ')[:-1]
sentences

['Scientists attribute the global warming trend observed since the mid-20th century to the human expansion of the "greenhouse effect"1 — warming that results when the atmosphere traps heat radiating from Earth toward space.\nCertain gases in the atmosphere block heat from escaping',
 'Long-lived gases that remain semi-permanently in the atmosphere and do not respond physically or chemically to changes in temperature are described as "forcing" climate change',
 'Gases, such as water vapor, which respond physically or chemically to changes in temperature are seen as "feedbacks."\nGlobal climate change has already had observable effects on the environment',
 'Glaciers have shrunk, ice on rivers and lakes is breaking up earlier, plant and animal ranges have shifted and trees are flowering sooner.\nEffects that scientists had predicted in the past would result from global climate change are now occurring: loss of sea ice, accelerated sea level rise and longer, more intense heat waves.\nScie

2. Let's create a dataframe with sentences that are most suitable fot the field.

In [29]:
new_df = []
for i in sentences:
  new_df.append({'Sentence':i, 'Label':'climate change'})

new_df

[{'Sentence': 'Scientists attribute the global warming trend observed since the mid-20th century to the human expansion of the "greenhouse effect"1 — warming that results when the atmosphere traps heat radiating from Earth toward space.\nCertain gases in the atmosphere block heat from escaping',
  'Label': 'climate change'},
 {'Sentence': 'Long-lived gases that remain semi-permanently in the atmosphere and do not respond physically or chemically to changes in temperature are described as "forcing" climate change',
  'Label': 'climate change'},
 {'Sentence': 'Gases, such as water vapor, which respond physically or chemically to changes in temperature are seen as "feedbacks."\nGlobal climate change has already had observable effects on the environment',
  'Label': 'climate change'},
 {'Sentence': 'Glaciers have shrunk, ice on rivers and lakes is breaking up earlier, plant and animal ranges have shifted and trees are flowering sooner.\nEffects that scientists had predicted in the past wou

In [30]:
#creating two columns with information about sentenses
new_df = pd.DataFrame(data=new_df, columns=['Sentence', 'Label'])
#save separated sentences to csv
new_df.to_csv('data.csv')
new_df

Unnamed: 0,Sentence,Label
0,Scientists attribute the global warming trend ...,climate change
1,Long-lived gases that remain semi-permanently ...,climate change
2,"Gases, such as water vapor, which respond phys...",climate change
3,"Glaciers have shrunk, ice on rivers and lakes ...",climate change
4,The Intergovernmental Panel on Climate Change ...,climate change
...,...,...
88,The hurricane season of 2017 proved to be the ...,climate change
89,Some areas well suited to farming might become...,climate change
90,Long periods of drought could turn fertile lan...,climate change
91,Plants and animals might not be able to surviv...,climate change


3. Let's make preparation of data before modeling 

In [31]:
# import libraries
from sklearn.model_selection import train_test_split

train_split, test_split = train_test_split(new_df, train_size=0.8, test_size=0.2)
train_split

Unnamed: 0,Sentence,Label
63,Gallup's polling further indicates that people...,climate change
46,Deforestation has also played a role in this i...,climate change
40,Volcanic activity also affects the climate bec...,climate change
15,"For example, nights are warming even faster th...",climate change
25,"And in Australia, meteorologists have added a ...",climate change
...,...,...
42,Natural causes of climate change are referred ...,climate change
73,The science of climate change is so complex th...,climate change
16,And upper layers of the atmosphere have actual...,climate change
27,These are the foreshocks of climate change.\nT...,climate change


In [32]:
# vectorization of text
max_tokens = 10000
count = 0

for i in sentences:
  count+=len(i.split())
avg_tokens = round(count/len(sentences))
avg_tokens

29

In [33]:
text_vectorizer = TextVectorization(max_tokens=max_tokens, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=avg_tokens, # how long should the output sequence of tokens be?
                                    pad_to_max_tokens=True)
text_vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7f09146dfee0>

In [43]:
text_vectorizer(new_df['Sentence'])

<tf.Tensor: shape=(93, 29), dtype=int64, numpy=
array([[ 23, 850,   2, ..., 381,  46, 506],
       [611,  13,   7, ...,   0,   0,   0],
       [ 13,  43,  10, ...,  20,   2, 732],
       ...,
       [108, 549,   3, ...,   0,   0,   0],
       [226,   4, 182, ...,   0,   0,   0],
       [ 33,   2, 108, ...,   0,   0,   0]])>

In [44]:
print(f"Most Used: {text_vectorizer.get_vocabulary()[:5]}")
print(f"Most Unused: {text_vectorizer.get_vocabulary()[-5:]}")

Most Used: ['', '[UNK]', 'the', 'of', 'and']
Most Unused: ['125', '12000', '10', '1', '09']


In [45]:
# embedding of data
embedding = layers.Embedding(input_dim=max_tokens, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=avg_tokens) # how long is each input

embedding(text_vectorizer(new_df['Sentence']))

<tf.Tensor: shape=(93, 29, 128), dtype=float32, numpy=
array([[[ 0.00065888,  0.04247094, -0.03514342, ..., -0.00560534,
          0.04605092,  0.01225934],
        [ 0.02416494, -0.02070975,  0.01165458, ...,  0.04230498,
          0.04461965,  0.04143015],
        [ 0.03674496,  0.00927975,  0.04947234, ..., -0.00765027,
         -0.02319611, -0.04947633],
        ...,
        [ 0.01124585, -0.02888825,  0.01388128, ..., -0.02712025,
         -0.04408462, -0.04175515],
        [ 0.04929071,  0.02777014,  0.03310979, ...,  0.03279879,
         -0.01435507, -0.01501228],
        [-0.02788458, -0.00806959, -0.03326535, ...,  0.00063555,
          0.03255424, -0.03123465]],

       [[-0.00055509, -0.03222983, -0.00514112, ..., -0.00696825,
          0.01045809, -0.02509835],
        [-0.03609122, -0.01049735, -0.03204509, ...,  0.00409378,
         -0.01871764, -0.02677453],
        [-0.03731779, -0.04489025, -0.02256283, ...,  0.04711555,
         -0.04860559, -0.02804002],
        ...,