In [164]:
import tensorflow as tf
import pandas as pd
import re
import string
import numpy as np
import torch as nn
import transformers

In [5]:
tf.__version__

'2.1.0'

In [None]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    TRAIN_BATCH_SIZE=8,
    VALID_BATCH_SIZE = 4,
    LR=3e-4,
    EPOCHS=2,
    BERT_PATH = "../input/bert_base_uncased/",
    MODEL_PATH = "model.bin",
    MAX_LEN=100, # necessary to limit memory usage
    TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH,do_lower_case=True)
)

In [None]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
BERT_PATH = "../input/bert_base_uncased/"
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/imdb.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)

In [None]:
def replace_regex(targetval,replacedval,regex):
    return re.sub(regex, replacedval, targetval)

def remove_punctuation(s):
    translator = str.maketrans('', '', string.punctuation)
    return s.translate(translator)

In [None]:
def clean_data(df):
    df['SentimentText'] = df['SentimentText'].apply(lambda x: replace_regex(x,'','(http:\/\/(bit\.ly|t\.co|lnkd\.in|tcrn\.ch)\S*)'))
    df['SentimentText'] = df['SentimentText'].apply(lambda x: replace_regex(x,'','@\w+'))
    df['SentimentText'] = df['SentimentText'].apply(remove_punctuation)
    df['SentimentText'] = df['SentimentText'].apply(lambda x: replace_regex(x,' ',' +'))
    df['SentimentText'] = df['SentimentText'].str.strip()
    return df
    

In [None]:
class ROBERTADataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.review)
    
    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        padding_length = self.max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float)
        }

In [70]:
train = pd.read_csv('train.csv',encoding='latin-1')

In [71]:
train = clean_data(train)

In [73]:
train.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [80]:
train['SentimentText'].loc[90]

'jonas day is almost over'

In [95]:
from transformers import RobertaConfig, TFRobertaForSequenceClassification, RobertaTokenizer

In [126]:
configuration = RobertaConfig()
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

In [127]:
model.config_class

transformers.configuration_roberta.RobertaConfig

In [128]:
configuration = model.config
configuration

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "type_vocab_size": 1,
  "use

In [96]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [140]:
tokenizer.convert_ids_to_tokens(2335) 

'Ġdog'

In [139]:
tokenizer.convert_tokens_to_ids('ĠHello') 

20920

In [118]:
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
input_ids

<tf.Tensor: shape=(1, 8), dtype=int32, numpy=array([[    0, 20920,     6,   127,  2335,    16, 11962,     2]])>

In [124]:
input_ids = tf.constant(tokenizer.encode("The Hello, my dog austr is cute", add_special_tokens=True))[None, :]
input_ids

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=
array([[    0,    20, 20920,     6,   127,  2335, 28410,   338,    16,
        11962,     2]])>

In [168]:
input_ids = nn.torch.tensor(tokenizer.encode_plus("India won the world cup","hello this is inis", add_special_tokens=True))
input_ids


RuntimeError: Could not infer dtype of dict

In [119]:
tf.expand_dims(input_ids, 0) 

<tf.Tensor: shape=(1, 1, 8), dtype=int32, numpy=array([[[    0, 20920,     6,   127,  2335,    16, 11962,     2]]])>

In [120]:
labels = tf.constant([1])[None, :] 

In [121]:
outputs = model(input_ids)
logits = outputs[0]
logits

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.07220323,  0.03044231]], dtype=float32)>

In [141]:
outputs

(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.07220323,  0.03044231]], dtype=float32)>,)

In [142]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]

In [153]:
import numpy as np
t = np.array(outputs)[0][0][0]

In [154]:
t.shape

(50265,)