## Install transformers ##

In [None]:
!pip install transformers

In [None]:
import pandas as pd

## Load file ##

In [None]:
import csv

# The file is read as tab-separated with no header row, so column names are
# supplied explicitly. Quoting is disabled because the messages are raw text:
# with pandas' default quoting, a message that starts with an unbalanced `"`
# makes the parser absorb the following line(s) into the same field, which
# silently drops rows.
df = pd.read_csv(
    "/content/SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Feature Extraction ##

In [None]:
# Independent variable: the message texts as a plain Python list
# (this list is what gets split and passed to the tokenizer further down).
X = list(df['message'])
# Preview a few entries only; echoing the whole list floods the cell output.
X[:5]

In [None]:
# Dependent variable: the raw string labels ("ham" / "spam") as a Python list.
y = list(df['label'])
# Preview a few entries only; echoing the whole list floods the cell output.
y[:5]

### Map labels to integers (ham → 0, spam → 1) ###

In [None]:
# Encode the target as integers: 1 for "spam", 0 for "ham".
# The labels are derived from df['label'] instead of overwriting `y` from
# itself, so this cell can be re-run without raising KeyError('spam').
# The explicit integer cast matters because get_dummies yields boolean columns
# on recent pandas versions, which would turn into boolean label tensors.
y = df['label'].eq('spam').astype('int64').rename('spam')
y

## Train-test split ##

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Import Bert tokenizer ##

In [None]:
from transformers import BertTokenizer

In [None]:
# Checkpoint identifier; reused later when the model itself is loaded.
model_name = "bert-base-uncased"

# Load the tokenizer that belongs to the bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    model_name,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Get encodings ##
Tokenize the training and test messages into the input encodings that the BERT model expects.

In [None]:
# Tokenize both splits with identical settings: over-long messages are
# truncated and shorter ones are padded, each split being padded on its own.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (X_train, X_test)
)

In [None]:
#train_encodings
#test_encodings

## Convert encodings into Dataset objects ##

In [None]:
import tensorflow as tf


def _as_tf_dataset(encodings, labels):
    """Pair tokenizer encodings with their labels in a ``tf.data.Dataset``.

    The encodings are converted to a plain dict first, so each dataset
    element is a ``(features_dict, label)`` tuple.
    """
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _as_tf_dataset(train_encodings, y_train)
test_dataset = _as_tf_dataset(test_encodings, y_test)

In [None]:
#train_dataset

## Import BERT model ##

In [None]:
from transformers import  BertModel, TFTrainer, TFTrainingArguments

In [None]:
training_args = TFTrainingArguments(
    # Output location for checkpoints and logs.
    # NOTE(review): the directory name mentions IMDB although this notebook
    # works on the SMS spam data — consider renaming.
    output_dir="./bert-base-imdb",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on a step schedule (every 500 steps),
    # with a limit of 2 on retained checkpoints.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    # Logging and reporting.
    logging_steps=100,
    report_to="tensorboard",
    # Miscellaneous switches.
    remove_unused_columns=False,
    push_to_hub=False,
)

## Train the model with TFTrainer ##

In [None]:
# TFTrainer trains TensorFlow models, and spam detection needs a
# classification head on top of the encoder. `BertModel` is the bare PyTorch
# encoder, so load the TensorFlow sequence-classification variant of the same
# checkpoint instead.
from transformers import TFBertForSequenceClassification

# Instantiate the model inside the distribution strategy scope exposed by the
# training arguments, so its variables are created under that strategy.
with training_args.strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,  # binary task: ham vs. spam
    )

trainer = TFTrainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,    # evaluation dataset
)

trainer.train()

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


ValueError: ignored

In [None]:
# Evaluate the trained model on the test dataset; the returned value is
# displayed as the cell output.
trainer.evaluate(test_dataset)

In [None]:
# Run the model over the test dataset and display the raw prediction result.
trainer.predict(test_dataset)

In [None]:
# trainer.predict returns a (predictions, label_ids, metrics) named tuple.
# Index 1 is label_ids — the true labels that were fed in — not what the model
# predicted, so read the predictions field and reduce the per-class scores to
# predicted class ids (assumes predictions is an (n_samples, n_classes) array
# — TODO confirm against the model in use).
output = trainer.predict(test_dataset).predictions.argmax(axis=-1)
output