<a href="https://colab.research.google.com/github/waldekmaciejko/DETpy/blob/main/bert_tokenizer_using_sentiment140.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# https://www.tensorflow.org/datasets/catalog/sentiment140

#Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [2]:
!pip install bert-for-tf2

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-params>=0.9.6 (from bert-for-tf2)
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting params-flow>=0.8.0 (from bert-for-tf2)
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30509 sha256=3089c81ab590f3d902bce26c8349f33324747669b14bb32ea2aef66907083cb7
  Stored in directory: /root/.

In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
import tensorflow as tf
tf.__version__

'2.14.0'

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

# Loading files

In [6]:
drive.mount("/content/drive/")

Mounted at /content/drive/


In [13]:
cols = ['sentiment', 'id', 'date', 'query', 'user', 'text']

In [14]:
# data from help.sentiment140.com/for-students

data = pd.read_csv("/content/drive/MyDrive/Datasets/StanfordSentimentAnalysis/training.1600000.processed.noemoticon.csv",
                   header=None,
                   names=cols,
                   engine="python",
                   encoding='latin'
                   )

In [9]:
data.head(5)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Cleaning data

In [10]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [11]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, 'lxml').get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
  tweet = re.sub(r" +", ' ', tweet)
  return tweet

In [12]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

  tweet = BeautifulSoup(tweet, 'lxml').get_text()


In [15]:
np.unique(data.sentiment.values)
data_labels = data.sentiment.values
data_labels[data_labels==4] = 1

## Tokenizer - create BERT layer to have an acces to metadata from tokenizer

In [16]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
                            trainable=False)

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [17]:
def encode_sentence(sent):
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [18]:
tokenizer.tokenize("My dog loves strawberries")

['my', 'dog', 'loves', 'straw', '##berries']

In [19]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog loves strawberries"))

[2026, 3899, 7459, 13137, 20968]

In [20]:
data_inputs = [encode_sentence(sent) for sent in data_clean]

# Dataset creation - we will create paddet baches

In [26]:
# create data list with aprox the same length sentence

data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]

# we have to shuffle becouse in initial data file data were sorted acording to
# label (first half related to positive, 2nd to negative)
# - we have no batches with only positive and only negative
random.shuffle(data_with_len)

# sort acording to length of the sentence
data_with_len.sort(key=lambda x: x[2])

# take only sentence with len > 7
sorted_all = [(sent_lab[0], sent_lab[1])
                for sent_lab in data_with_len if sent_lab[2] > 7]

In [None]:
data_with_len

In [30]:
# create tf dataset generator
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [29]:
type(all_dataset)
#next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([2643, 1045, 1005, 1049, 2107, 1037, 6616, 2039], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [32]:
BATCH_SIZE = 32
# Combines consecutive elements of this dataset into padded batches.
# This transformation combines multiple consecutive elements of the input
# dataset into a single element.
all_batched = all_dataset.padded_batch(BATCH_SIZE,
                                       padded_shapes=((None, ), ()))

In [33]:
# math.ceil - smallest inetger
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)

NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [35]:
#type(test_dataset)
next(iter(test_dataset))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2643,  1045,  1005,  1049,  2107,  1037,  6616,  2039],
        [ 4165,  8235,  4299,  1045,  2071,  2022,  2045,   999],
        [ 2130,  2003,  2025, 12344,  1012,  1012,  1012,  1012],
        [ 8996,  2016,  2018,  2000,  2175,  2000, 10047,  2278],
        [ 1045,  2572,  3331,  2000,  2026,  6904,  2615,  2611],
        [ 2036,  2115,  2047,  4653,  3861,  2003, 10140,  1012],
        [ 2748,  1012,  1012,  2002,  2003,  2026,  8837,  3364],
        [ 2204,  2851, 10474,  3077,   999,  1060, 11636,  2080],
        [ 1045,  2293, 27669,  2015,  1998, 10930,  7677,  1012],
        [ 1045,  2064,  2156,  2870,  2006, 10294,  2015,  3898],
        [ 2026,  2197,  2154,  1999, 12436,  1038,  3726,  6776],
        [ 2292,  1005,  1055,  2831,  2043,  1045,  2131,  2067],
        [ 2735,  7245, 22104,  2051,  2005,  2147,  1012,  1012],
        [ 3602,  2000,  2870, 19387,  2008, 26772,  5914, 14686],
        [ 1045,  2123,  1005

# Model

In [38]:
class DCNN(tf.keras.Model):

  def __init__(self,
               vocab_size,
               emb_dim=128, # dim of vectors
               nb_filters=50, # number of convolution filters
               FFN_unites=512, # number of hidden units
               nb_classes=2, # number of classes out 1/0
               dropout_rate=0.1,
               traning=False,
               name="dcnn"):

    super(DCNN, self).__init__(name=name)

    # start to create embeding layers
    # embeding layer - input single tokens - single numbers,
    self.embeding  = layers.embeding(vocab_size,
                                     emb_dim)

    self.bigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding="valid",
                                activation="relu")

    self.trigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding="valid",
                                activation="relu")

    self.fourgram = layers.Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding="valid",
                                activation="relu")

    self.pool = layers.GlobalAveragePooling1D()

    self.dense_1 = layers.Dense(units=FFN_unites,
                                activation='relu')

    self.dropout = layers.Dropout(rate=dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1,
                                     activation='sigmoid')

    else:
      self.last_dense = layers.Dense(units=nb_classes,
                                     activation='softmax')

    # call method
    def call(self, inputs, traning):
      x = self.embeding(inputs)
      x_1 = self.bigram(x)
      x_1 = self.pool(x_1)
      x_2 = self.trigram(x_1)
      x_2 = self.pool(x_2)
      x_3 = self.fourgram(x_2)
      x_3 = self.pool(x_3) # (bach_size, nb_filters)

      merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3*nb_filters)
      merged = self.dense_1(merged)
      merged = self.dropout(merged, traning)
      output = self.last_dense(merged)

      return output
