In [2]:
pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 7.3 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3


In [3]:
pip install tqdm



In [4]:
pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.8 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.0.12 pyyaml-5.4.1 sacremoses-0.0.45 transformers-4.9.1


In [9]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 6.6 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
import plotly.express as pe
import tensorflow as tf
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *

In [20]:
data_train = pd.read_csv('/content/data_train.csv')
data_test = pd.read_csv('/content/data_test.csv')

In [21]:
data_train.data_correct = data_train.data_correct.fillna('unavailable')
data_test.data_correct = data_test.data_correct.fillna('unavailable')

In [22]:
dataset_train = data_train[['data_correct','Sentiment']]
dataset_test = data_test['data_correct']

In [23]:
train, validation = train_test_split(dataset_train, test_size=0.2, random_state=1)

In [24]:
print(train.shape)
print(validation.shape)

(35280, 2)
(8820, 2)


In [50]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, InputFeatures

In [61]:
model_bert = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=3)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_97']
You should probably TRAIN this model on a down-stream task to be able to use i

In [52]:
vocabulary = tokenizer.get_vocab()

In [53]:
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

In [42]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding()
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [59]:
def build_model(transformer, max_len=512):

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, :]
    out = Dense(3, activation='softmax')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-8), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    
    return model

In [44]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [87]:
AUTO = tf.data.experimental.AUTOTUNE

EPOCHS = 2
BATCH_SIZE = 16  * strategy.num_replicas_in_sync
MAX_LEN = 25

In [88]:
x_train = fast_encode(train.data_correct.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_valid = fast_encode(validation.data_correct.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(dataset_test.astype(str), fast_tokenizer, maxlen=MAX_LEN)

y_train = train.Sentiment.values
y_valid = validation.Sentiment.values

HBox(children=(FloatProgress(value=0.0, max=138.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




In [89]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(100)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

In [90]:
with strategy.scope():
    model = build_model(model_bert, max_len=MAX_LEN)
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 25)]              0         
_________________________________________________________________
tf_distil_bert_for_sequence_ TFSequenceClassifierOutpu 66955779  
_________________________________________________________________
tf.__operators__.getitem_9 ( (None, 3)                 0         
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 12        
Total params: 66,955,791
Trainable params: 66,955,791
Non-trainable params: 0
_________________________________________________________________


In [91]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/2
Epoch 2/2


In [None]:
# n_steps = x_valid.shape[0] // BATCH_SIZE
# train_history_2 = model.fit(
#     valid_dataset.repeat(),
#     steps_per_epoch=n_steps,
#     epochs=EPOCHS*2
# )

In [93]:
prediction = model.predict(test_dataset, verbose=1)



In [94]:
prediction

array([[0.25899985, 0.14527194, 0.5957282 ],
       [0.9011535 , 0.04158773, 0.05725873],
       [0.9891678 , 0.00247069, 0.00836144],
       ...,
       [0.8995481 , 0.06761614, 0.03283576],
       [0.8660745 , 0.0407711 , 0.09315434],
       [0.5225225 , 0.30499345, 0.17248394]], dtype=float32)

In [95]:
pred = pd.DataFrame(prediction)

In [96]:
pred.to_csv('pred1.csv',index=False)