In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.1-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.8 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 44.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.4 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments, Trainer



# Define pretrained tokenizer and model
# `model_name` is the checkpoint identifier handed to `from_pretrained`; the same
# name is reused when the classification model is loaded.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)




Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Build the configuration once and hand it to the model, so the label count is
# declared in a single place rather than set on an unused config object and
# then repeated as a keyword argument.
config = BertConfig.from_pretrained(model_name)
config.num_labels = 2  # two target classes (labels are 0/1 after decoding)

# `from_pretrained` takes the checkpoint name first; the config goes in `config=`.
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Read data
# FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/training.1600000.processed.noemoticon.csv'
FILE_PATH = './Twitter_Data.csv'
# Column names are supplied explicitly; passing `names=` makes pandas treat the
# file as having no header row.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
# Stray `*+` characters after this literal made the cell a SyntaxError.
DATASET_ENCODING = "ISO-8859-1"


df = pd.read_csv(FILE_PATH, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)


In [None]:
# Reproducibility and tokenization settings shared by the cells below.
RANDOM_SEED = 42   # seed for NumPy, PyTorch and the `random_state` of the splits
MAX_LENGTH = 100   # `max_length` passed to every tokenizer call

# Seed both global random number generators with the same value.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def decode_sentiment(input):
    """Collapse a raw sentiment code onto the label range used for training.

    The raw code is divided by 4 and truncated toward zero, so 0 stays 0 and
    4 becomes 1.

    Args:
        input: Numeric raw label. The parameter name is kept for backward
            compatibility even though it shadows the ``input`` builtin.

    Returns:
        int: ``input / 4`` truncated toward zero.
    """
    scaled = input / 4
    return int(scaled)
# Only decode while raw codes (> 1) are still present. Running the conversion a
# second time would turn every 1 into int(1 / 4) == 0 and silently erase a class,
# so the guard makes this cell safe to re-run.
if df.target.max() > 1:
    df.target = df.target.apply(decode_sentiment)




In [None]:
# Sanity check: largest label left in the dataframe after decoding.
max(df.target)

1

In [None]:
# Fractions of the full dataframe sampled by the split cells.
# NOTE: despite its name, `trainRatio` is passed as `test_size` to
# train_test_split; the rows it selects become the train + validation pool.
trainRatio = 0.006
# Fraction used to draw the evaluation sample (one third of `trainRatio`).
testRatio = trainRatio / 3


In [None]:
# Carve a small working sample out of the full dataframe. Because `trainRatio`
# is passed as `test_size`, `df_a` is the sampled pool used for training and
# validation, and `df_train_val` is the (much larger) remainder.
df_train_val, df_a = train_test_split(df, test_size=trainRatio, random_state=RANDOM_SEED)
X = list(df_a.text)
y = list(df_a.target)
# Seed the train/validation split explicitly so it does not depend on the state
# of the global NumPy RNG, i.e. on which cells happened to run before this one.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
y[:20]

[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]

In [None]:
# Largest label present in the sampled pool.
max(y)

1

In [None]:
# Draw the evaluation sample from `df_train_val`, i.e. from the rows that were
# NOT selected into the train/validation pool `df_a`.
# Splitting the full `df` a second time with the same `random_state` reuses the
# same shuffle, so the evaluation rows overlapped with the training pool (the
# first 20 labels of both samples were identical) and the accuracy was inflated.
# `test_size` is now a fraction of the remainder, so the sample is marginally
# smaller than `testRatio * len(df)`.
df_bb, df_b = train_test_split(df_train_val, test_size=testRatio, random_state=RANDOM_SEED)
XX = list(df_b.text)
YY = list(df_b.target)
XX_tokenize = tokenizer(XX, padding=True, truncation=True, max_length=MAX_LENGTH)
YY[:20]



[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]

In [None]:
# Tokenize both splits with one shared set of options so they cannot drift apart.
tokenize_options = {"padding": True, "truncation": True, "max_length": MAX_LENGTH}
X_train_tokenized = tokenizer(X_train, **tokenize_options)
X_val_tokenized = tokenizer(X_val, **tokenize_options)



In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values. It must contain an ``"input_ids"`` entry, which defines
            the dataset length.
        labels: Optional per-example labels (list, NumPy array, tensor, ...).
            ``None`` or an empty sequence yields items without a
            ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if self.labels:`
        # raises "truth value is ambiguous" for NumPy arrays and tensors that
        # hold more than one element.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized splits together with their labels; these are the datasets
# handed to the Trainer for training and evaluation.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
# Unlabelled dataset used for prediction; no labels are passed, so items carry
# no "labels" entry.
# NOTE(review): the name looks like a typo for `test_dataset`; it is left as-is
# because the prediction cell refers to `teset_dataset`.
teset_dataset = Dataset(XX_tokenize)

In [None]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    """Score one evaluation pass for the Trainer.

    Args:
        p: A ``(predictions, labels)`` pair. The predicted class of each row
            is the argmax of the predictions along axis 1.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` computed by
        the corresponding scikit-learn scorers with their default settings.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric: scorer(y_true=labels, y_pred=predicted)
        for metric, scorer in scorers.items()
    }


In [None]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=100,
    # Checkpoint on the same cadence as evaluation. `load_best_model_at_end`
    # can only restore a checkpoint that was actually written, so the save
    # schedule is stated explicitly instead of being left to library defaults.
    save_strategy="steps",
    save_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once 3 evaluations in a row fail to improve on the best one so far.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Train pre-trained model
# Fine-tunes the `model` object given to the Trainer; checkpoints are written
# under the configured `output_dir` ("output").
trainer.train()

***** Running training *****
  Num examples = 7680
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1920


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,No log,0.534167,0.74375,0.788392,0.672859,0.726058
200,No log,0.536635,0.738021,0.685805,0.887513,0.773729
300,No log,0.481575,0.788021,0.782129,0.803922,0.792875
400,No log,0.521615,0.791146,0.800211,0.781218,0.790601
500,0.543600,0.480879,0.792708,0.833918,0.73581,0.781798
600,0.543600,0.52369,0.785417,0.753412,0.854489,0.800774
700,0.543600,0.454721,0.807292,0.792766,0.836945,0.814257
800,0.543600,0.494309,0.807813,0.814465,0.801858,0.808112
900,0.543600,0.492404,0.803646,0.827434,0.77193,0.798719
1000,0.444100,0.652841,0.798958,0.76621,0.865841,0.812984


***** Running Evaluation *****
  Num examples = 1920
  Batch size = 8
Saving model checkpoint to output/checkpoint-100
Configuration saved in output/checkpoint-100/config.json
Model weights saved in output/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1920
  Batch size = 8
Saving model checkpoint to output/checkpoint-200
Configuration saved in output/checkpoint-200/config.json
Model weights saved in output/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1920
  Batch size = 8
Saving model checkpoint to output/checkpoint-300
Configuration saved in output/checkpoint-300/config.json
Model weights saved in output/checkpoint-300/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1920
  Batch size = 8
Saving model checkpoint to output/checkpoint-400
Configuration saved in output/checkpoint-400/config.json
Model weights saved in output/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num example

TrainOutput(global_step=1000, training_loss=0.493853515625, metrics={'train_runtime': 689.4454, 'train_samples_per_second': 22.279, 'train_steps_per_second': 2.785, 'total_flos': 411111024000000.0, 'train_loss': 0.493853515625, 'epoch': 1.04})

In [None]:
# ----- 3. Predict -----#
# Load the fine-tuned model from a saved checkpoint directory.
# NOTE(review): the checkpoint step is hard-coded; it has to be updated by hand
# whenever training is re-run and a different step should be used.
model_path = "output/checkpoint-800"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)


loading configuration file output/checkpoint-800/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file output/checkpoint-800/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceCla

In [None]:
# A Trainer with default arguments is used purely as a batched inference loop.
test_trainer = Trainer(model)
# Only the raw model outputs are needed; label ids and metrics are not used.
raw_pred = test_trainer.predict(teset_dataset).predictions
# Predicted class = index of the largest score in each row.
y_pred = np.argmax(raw_pred, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 3200
  Batch size = 8


In [None]:
# First 20 predicted class ids.
y_pred[:20]

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0])

In [None]:
# First 20 reference labels, for a side-by-side look at the predictions.
YY[:20]

[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]

In [None]:
# Accuracy of the predictions against the evaluation-sample labels.
# The value's string form is reused when the save path is built.
a_s= accuracy_score(y_true=YY, y_pred=y_pred)
a_s

0.8778125

In [None]:
# Destination directory. The accuracy's string form, minus its first two
# characters (the leading "0."), is embedded in the directory name.
PATH = f"/content/drive/MyDrive/Colab Notebooks/models/210930_a{str(a_s)[2:]}/"
# The directory name changes with the accuracy, so it normally does not exist
# yet, and torch.save does not create missing parent directories.
os.makedirs(PATH, exist_ok=True)
torch.save(model.state_dict(), PATH + "state_dict.pt")


In [None]:
# List the saved files. IPython substitutes "$PATH" with the Python variable
# `PATH` before the command is handed to the shell.
! ls -all "$PATH"

total 427754
-rw------- 1 root root 438019245 Sep 29 18:35 state_dict.pt
