In [1]:
#!/usr/bin/env python3
# 53907afe-531b-11ea-a595-00505601122b
# b7ea974c-d389-11e8-a4be-00505601122b

# 1. Setup

## 1.1. FS/OS Requirements

In [2]:
# Copy the dataset helper module and the `czech_facebook` data directory from the
# Kaggle input folder into /kaggle/working (the helper is imported further below as
# `text_classification_dataset`), then list the resulting directory tree.
# NOTE(review): these absolute paths only exist inside the Kaggle runtime.
!cp /kaggle/input/sentiment-analysis/text_classification_dataset.py /kaggle/working/text_classification_dataset.py
!cp -r /kaggle/input/sentiment-analysis/czech_facebook /kaggle/working/
!tree /kaggle/working

In [2]:
# Optional pinned install, left disabled; uncomment to (re)install the listed versions.
#!pip install -U tensorflow-gpu==2.8 tensorflow-addons==0.16.1 tensorflow-probability==0.16.0 tensorflow-hub==0.12.0 scipy
# Show which installed packages have "tensorflow" in their name, with versions.
!pip freeze | grep tensorflow

In [3]:
# Count the `processor` entries listed in /proc/cpuinfo.
!grep -c ^processor /proc/cpuinfo
# Print the distinct value(s) of the `cpu cores` field (4th whitespace-separated
# column of the matching lines).
!grep ^cpu\\scores /proc/cpuinfo | uniq |  awk '{print $4}'

## 1.2. Python imports

In [3]:
import argparse
import datetime
import functools
import os
import re

os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Report only TF errors by default

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import wandb
from wandb.keras import WandbCallback

try:
    import transformers
except Exception:
    raise RuntimeError("You need to install the `transformers` package")

from text_classification_dataset import TextClassificationDataset

## 1.3. Args

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=None, type=int, help="Batch size.")
parser.add_argument("--epochs", default=None, type=int, help="Number of epochs.")
parser.add_argument("--seed", default=42, type=int, help="Random seed.")
parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
parser.add_argument("--buffer_size", default=None, type=int, help="Dataset buffer size to load into memory. By default load the whole dataset.")
parser.add_argument("--checkpoints_period", default=None, type=int, help="Checkpoint callback period.")
parser.add_argument("--stopping_patience", default=None, type=int, help="Early stopping epochs patience.")
parser.add_argument("--label_smoothing", default=None, type=float, help="")
parser.add_argument("--learning_rate", default=0.01, type=float, help="Initial model learning rate.")
parser.add_argument("--decay_steps", default=None, type=int, help="Decay steps for cosine decay")

args = parser.parse_args([
    '--threads=2',
    '--batch_size=32',
    '--epochs=10',
    '--checkpoints_period=3',
    '--stopping_patience=3',
    '--learning_rate=0.00005',
    '--label_smoothing=0.1',
] if "__file__" not in globals() else None)

# Create logdir name
args.logdir = os.path.join(
    "logs",
    "{}-{}-{}".format(
        os.path.basename(globals().get("__file__", "notebook")),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(
            (
                "{}={}".format(re.sub("(.)[^_]*_?", r"\1", k), v)
                for k, v in sorted(vars(args).items())
            )
        ),
    ),
)

# Seed TensorFlow's global RNG and cap its inter-/intra-op thread pools.
# Author's note: developed against TF 2.6 because of GPU issues on TF 2.8.
tf.random.set_seed(args.seed)
tf.config.threading.set_inter_op_parallelism_threads(args.threads)
tf.config.threading.set_intra_op_parallelism_threads(args.threads)

# With --decay_steps set, replace the constant learning rate by a cosine decay
# schedule that starts at the configured rate.
if args.decay_steps:
    args.learning_rate = tf.keras.optimizers.schedules.CosineDecay(args.learning_rate, args.decay_steps)

# Convenience for re-running this cell after the data has been loaded: `args` was
# just rebuilt, so restore the default shuffle buffer (whole training set) here.
# On a fresh kernel `facebook` does not exist yet and the data-loading cell sets it.
if args.buffer_size is None and 'facebook' in globals():
    args.buffer_size = len(facebook.train.dataset)

args

## 1.4. Weights & Biases (wandb)

In [5]:
# Log in to Weights & Biases before a run is created in the next cell.
wandb.login()

In [7]:
# Record every parsed option plus the dataset name as the run configuration.
wandb_config = {**vars(args), "dataset": "czech_facebook"}

# resume='allow' is passed so that an existing run may be continued instead of
# always starting a new one.
run = wandb.init(
    project='sentiment_analysis_school_assignment',
    resume='allow',
    config=wandb_config,
)

# 2. Data

In [8]:
# Load the tokenizer of the `ufal/eleczech-lc-small` checkpoint; the same checkpoint
# name is used when the classification model is loaded later.
# NOTE(review): `padding=True` is given to `from_pretrained` here, while padding is
# also requested explicitly when the tokenizer is called in `create_dataset` —
# confirm whether this keyword has any effect at load time.
tokenizer = transformers.AutoTokenizer.from_pretrained("ufal/eleczech-lc-small", padding=True)
tokenizer

In [9]:
# Load the czech_facebook splits. The tokenizer is handed over only if one has been
# defined in the notebook namespace; otherwise None is passed.
facebook = TextClassificationDataset("czech_facebook", tokenizer=globals().get("tokenizer"))

# Without an explicit --buffer_size, shuffle over the whole training split.
if args.buffer_size is None:
    args.buffer_size = len(facebook.train.dataset)

# Report the number of labels and the size of every split.
for caption, count in (
    ("Num labels: ", facebook.train.label_mapping.vocabulary_size()),
    ("Train size: ", len(facebook.train.dataset)),
    ("Dev size: ", len(facebook.dev.dataset)),
    ("Test size: ", len(facebook.test.dataset)),
):
    print(caption, count)

In [11]:
# List the fields available in the training split's raw data (the cells below read
# "documents", "labels" and "tokens" from it).
facebook.train.data.keys()

In [16]:
# Peek at the first ten training examples, printed as "<label>: <document>".
train_data = facebook.train.data
for document, label in zip(train_data["documents"][:10], train_data["labels"][:10]):
    print("{}: {}".format(label, document))

In [12]:
# Show the "tokens" entries of the first ten training examples
# (presumably the tokenizer output for each document — verify against the loader).
first_token_rows = facebook.train.data["tokens"][:10]
for token_row in first_token_rows:
    print(token_row)

In [13]:
# Check that the test, train and dev splits map the labels "p", "n" and "0"
# to the same values; the three results are printed on one line in that order.
probe_labels = ["p", "n", "0"]
print(*(
    split.label_mapping(probe_labels)
    for split in (facebook.test, facebook.train, facebook.dev)
))

In [14]:
# Distribution of the labels in the training split.
fig, ax = plt.subplots(figsize=(14, 7))
sns.histplot(facebook.train.data["labels"], kde=True, discrete=True, ax=ax)
ax.set_title("Train labels distribution")

In [15]:
# Distribution of `len(document)` over the training split.
train_lengths = [len(document) for document in facebook.train.data["documents"]]
fig, ax = plt.subplots(figsize=(14, 7))
sns.histplot(train_lengths, kde=True, discrete=True, ax=ax)
ax.set_title("Train documents length distribution")

In [16]:
# Distribution of `len(document)` over the dev split.
dev_lengths = [len(document) for document in facebook.dev.data["documents"]]
fig, ax = plt.subplots(figsize=(14, 7))
sns.histplot(dev_lengths, kde=True, discrete=True, ax=ax)
ax.set_title("Dev documents length distribution")

In [17]:
# Distribution of `len(document)` over the test split.
test_lengths = [len(document) for document in facebook.test.data["documents"]]
fig, ax = plt.subplots(figsize=(14, 7))
sns.histplot(test_lengths, kde=True, discrete=True, ax=ax)
ax.set_title("Test documents length distribution")

In [14]:
# Scratch check of `tf.one_hot`: indices 2, 3 and 4 encoded with depth 5. The same
# call is used in `create_dataset` to encode labels when label smoothing is enabled.
tf.one_hot([2,3,4], 5)

In [10]:
def create_dataset(name):
    """Build the batched `tf.data.Dataset` for one split of `facebook`.

    Relies on the notebook-level `facebook`, `tokenizer` and `args` objects.

    Args:
        name: Attribute name of the split on `facebook`; "train", "dev" and "test"
            are the values used in this notebook.

    Returns:
        A dataset of `(features, labels)` batches of size `args.batch_size`.
        `features` is the tokenizer output converted to a plain dict. `labels` are
        int32 label ids, or their one-hot encoding when `args.label_smoothing` is
        set. Only the "train" split is shuffled.
    """
    split = getattr(facebook, name)

    # Tokenize all documents of the split in one call, padded, as TF tensors.
    features = tokenizer(split.data["documents"], padding=True, return_tensors="tf")

    # For the test split the stored labels are not used; every example gets the
    # placeholder label "0" instead.
    if name == "test":
        raw_labels = tf.fill([len(split.dataset)], "0")
    else:
        raw_labels = split.data["labels"]
    targets = tf.cast(split.label_mapping(raw_labels), tf.int32)

    # Label smoothing is applied on one-hot targets, so encode them when enabled.
    if args.label_smoothing:
        targets = tf.one_hot(targets, split.label_mapping.vocabulary_size())

    pipeline = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if name == "train":
        pipeline = pipeline.shuffle(args.buffer_size, seed=args.seed)
    return pipeline.batch(args.batch_size).prefetch(tf.data.AUTOTUNE)

# Materialise the three input pipelines (built in train, dev, test order) and show
# their element specs.
train, dev, test = map(create_dataset, ("train", "dev", "test"))
print(train, '\n', dev, '\n', test)

# 3. Model

In [11]:
# Load the `ufal/eleczech-lc-small` checkpoint as a sequence-classification model,
# with as many output labels as the training split's label vocabulary has entries.
eleczech = transformers.TFAutoModelForSequenceClassification.from_pretrained(
    "ufal/eleczech-lc-small",
    num_labels=facebook.train.label_mapping.vocabulary_size(),
)
eleczech

In [12]:
# The loss must match the label encoding produced by `create_dataset`:
#   * label smoothing on  -> labels are one-hot vectors -> CategoricalCrossentropy
#   * label smoothing off -> labels are integer ids     -> SparseCategoricalCrossentropy
# Both losses take raw logits, because the model output is not passed through softmax.
metrics = ['accuracy']

if args.label_smoothing:
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=args.label_smoothing)
    metrics.append('categorical_accuracy')
else:
    # Integer class ids need the sparse loss; the dense CategoricalCrossentropy
    # expects one-hot targets.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics.append('sparse_categorical_accuracy')

# `args.learning_rate` is either a float or the cosine decay schedule set up earlier.
eleczech.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
    loss=loss,
    metrics=metrics
)
eleczech.summary()

In [13]:
# Log training to W&B; the model itself is not uploaded (save_model=False).
wandb_logger = WandbCallback(
    labels=facebook.train.label_mapping.get_vocabulary(),
    save_model=False,
    validation_data=dev,
)

# Fine-tune on the training split and validate on dev after every epoch.
# shuffle=False is passed to fit(); the training pipeline already shuffles itself.
eleczech.fit(
    train,
    validation_data=dev,
    epochs=args.epochs,
    shuffle=False,
    callbacks=[wandb_logger],
)

In [70]:
# Sanity check: predict the first test batch and print the most probable label of
# every example in it.
label_strings = facebook.test.label_mapping.get_vocabulary()
first_batch_logits = eleczech.predict(test.take(1)).logits
first_batch_probs = tf.nn.softmax(first_batch_logits, axis=-1)
for example_probs in first_batch_probs:
    best_index = np.argmax(example_probs)
    print(label_strings[best_index])

In [14]:
# Predict the whole test split BEFORE opening the output file: opening with "w"
# truncates it, so a failure during prediction would otherwise destroy a previously
# written results file. Softmax is unnecessary here because
# argmax(logits) == argmax(softmax(logits)).
predictions = eleczech.predict(test).logits
label_strings = facebook.test.label_mapping.get_vocabulary()

# Write one predicted label string per line, in test-set order.
with open("/kaggle/working/sentiment_analysis.txt", "w", encoding="utf-8") as predictions_file:
    for sentence in predictions:
        print(label_strings[np.argmax(sentence)], file=predictions_file)