In [86]:
# %cd /samsung-4tb/cp-eng/pattern/course-projects
%cd /workspace/
import json
import re

import numpy as np
import pandas as pd
import torch
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from transformers import (AutoModelForSequenceClassification, 
                          AutoTokenizer, DataCollatorWithPadding, 
                          TrainingArguments, Trainer)
from tqdm.auto import tqdm

# Project-local baseline helpers.
from baseline.tokenizer import tokenize
from baseline.model import NbSVC, NbLogisticRegression
# NOTE(review): in the recorded run this import raised
# "ImportError: cannot import name 'calculate_score' from 'baseline.evaluate'";
# the Evaluate section of this notebook defines a local `calculate_score`.
from baseline.evaluate import calculate_score

# LIMESODA_DIR = "/samsung-4tb/cp-eng/pattern/course-projects/dataset/LimeSoda/"
# Base directory used to build the paths of the train/val/test JSONL files.
LIMESODA_DIR = "/workspace/dataset/LimeSoda"
# String used to join each document's token list into one text
# (passed to `read_limesoda`); the empty string concatenates tokens directly.
DELIMITER = ""

/workspace


ImportError: cannot import name 'calculate_score' from 'baseline.evaluate' (/workspace/baseline/evaluate.py)

## Load dataset

In [42]:
def read_limesoda(delimiter=" "):
    """Load the LimeSoda train/val/test JSONL splits into DataFrames.

    Every non-blank line of a split file is parsed as a JSON object that
    carries a "Document Tag" and a "Text" list of strings. The tag is mapped
    to an integer ``label`` ("Fake News" -> 0, "Fact News" -> 1) and the
    non-blank entries of "Text" are joined with ``delimiter`` into ``text``.
    The two source keys are removed; any other keys are kept as columns.

    All three splits are processed by the same rules, which are also the
    rules ``LimeSodaDataset`` applies, so that rows of both loaders line up:

    * documents whose tag is not in the label mapping are skipped;
    * whitespace-only tokens are dropped before joining.

    Args:
        delimiter: String inserted between the kept tokens of a document.

    Returns:
        dict with keys "train", "val" and "test", each a ``pd.DataFrame``
        holding one row per kept document.

    Raises:
        OSError: if a split file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks "Document Tag" or "Text".
    """
    mapper = {"Fake News": 0, "Fact News": 1}

    def _read_split(filename):
        """Parse one JSONL split file into a DataFrame of labelled records."""
        records = []
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(f"{LIMESODA_DIR}//../tempLimesoda/{filename}", "r", encoding="utf-8") as f:
            for raw in tqdm(f.readlines()):
                if not raw.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(raw)
                tag = record.pop("Document Tag")
                tokens = record.pop("Text")
                if tag not in mapper:
                    continue
                record["label"] = mapper[tag]
                record["text"] = delimiter.join(t for t in tokens if len(t.strip()) > 0)
                records.append(record)
        return pd.DataFrame(records)

    return {
        "train": _read_split("train_v1.jsonl"),
        "val": _read_split("val_v1.jsonl"),
        "test": _read_split("test_v1.jsonl"),
    }
            
# Load the three splits as DataFrames, joining each document's tokens with DELIMITER.
dataset = read_limesoda(delimiter=DELIMITER)

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2765 [00:00<?, ?it/s]


## Prepare Dataset

In [43]:
# Identifier of the pretrained checkpoint; reused below when loading the
# sequence-classification model.
model_name = 'airesearch/wangchanberta-base-att-spm-uncased'

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/616a9e2dfc52e9d019b75d219ed800a27158ed299bd4fad91363110fe93dfce1.27c4f6581fbedf3d12e9fae96d4fbb8bc3064cd88ae545414e7cffc7c5bbc52f
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19

In [44]:
class LimeSodaDataset(torch.utils.data.Dataset):
    """Map-style dataset over one LimeSoda JSONL split.

    The whole file is parsed once at construction time into ``self.data``
    (a ``pd.DataFrame`` with at least ``text`` and ``label`` columns);
    tokenization happens lazily, one example per ``__getitem__`` call.

    Args:
        label_path: Path of the JSONL file to load.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., max_length=..., truncation=True)``
            that returns a mapping of feature name to a list of ints.
        max_length: Length every example is padded/truncated to.
        delimiter: String inserted between the kept tokens of a document.
        device: Device the returned tensors are created on. ``None`` (the
            default) keeps the historical behaviour: "cuda" when available,
            otherwise "cpu". Pass ``"cpu"`` to leave device placement to the
            training loop, e.g. when loading with DataLoader worker processes.
    """

    def __init__(self, label_path, tokenizer, max_length=416, delimiter=" ", device=None):
        # max_length default: original author's note says "max wangchan 416 subwords".
        self.mapper = {"Fake News": 0, "Fact News": 1}
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.tokenizer = tokenizer
        self.label_path = label_path
        self.delimiter = delimiter
        self.max_length = max_length
        self.load_dataframe()

    def load_dataframe(self):
        """Parse ``self.label_path`` into ``self.data``.

        Blank lines and documents whose "Document Tag" is not in
        ``self.mapper`` are skipped. Whitespace-only entries of "Text" are
        dropped before joining with ``self.delimiter``.
        """
        print("Loading data...")
        records = []
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(self.label_path, "r", encoding="utf-8") as f:
            for raw in tqdm(f.readlines()):
                if not raw.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(raw)
                tag = record.pop("Document Tag")
                tokens = record.pop("Text")
                if tag not in self.mapper:
                    continue
                record["label"] = self.mapper[tag]
                record["text"] = self.delimiter.join(t for t in tokens if len(t.strip()) > 0)
                records.append(record)
        self.data = pd.DataFrame(records)

    def __len__(self):
        """Number of kept documents."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its features as tensors.

        Returns:
            dict holding one tensor per tokenizer output key plus a scalar
            ``labels`` tensor, all created on ``self.device``.
        """
        row = self.data.iloc[idx]
        encoded = self.tokenizer(
            row["text"],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        # Create tensors directly on the target device instead of building
        # them on the CPU first and copying afterwards.
        feature = {k: torch.tensor(v, device=self.device) for k, v in encoded.items()}
        feature["labels"] = torch.tensor(row["label"], device=self.device)
        return feature

In [45]:
# Build one tokenizing dataset per split from the same JSONL files that
# `read_limesoda` reads.
# NOTE(review): no `delimiter` is passed, so the class default " " is used,
# whereas `read_limesoda` is called with DELIMITER ("") — confirm which
# joining is intended for the model input.
train_dataset = LimeSodaDataset(f"{LIMESODA_DIR}/../tempLimesoda/train_v1.jsonl", tokenizer)
val_dataset = LimeSodaDataset(f"{LIMESODA_DIR}/../tempLimesoda/val_v1.jsonl", tokenizer)
test_dataset = LimeSodaDataset(f"{LIMESODA_DIR}/../tempLimesoda/test_v1.jsonl", tokenizer)

Loading data...


  0%|          | 0/2698 [00:00<?, ?it/s]

Loading data...


  0%|          | 0/300 [00:00<?, ?it/s]

Loading data...


  0%|          | 0/2765 [00:00<?, ?it/s]

## Prepare WangchanBERTa

In [11]:
# Collator handed to the Trainer below.
# NOTE(review): LimeSodaDataset already pads every example to `max_length`,
# so batch-level padding presumably has nothing left to do — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Two output labels, matching the {"Fake News": 0, "Fact News": 1} mapping.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

In [14]:
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
from datasets import load_metric


# Accuracy metric object used by `compute_metrics`.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    Args:
        eval_pred: Two-element iterable of class scores and reference labels.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    # The highest-scoring class along the last axis is the predicted label.
    predicted_labels = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_labels)

# Fine-tuning hyper-parameters; checkpoints are written under `output_dir`.
# NOTE(review): no evaluation strategy or seed is set explicitly here, so the
# library defaults apply — confirm that is intended.
training_args = TrainingArguments(
    output_dir="./results/wangchanberta",
    learning_rate=2e-6,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
)

# Trainer wiring: trains on `train_dataset`, evaluates on `val_dataset`, and
# reports `compute_metrics` (accuracy) on evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
# Run fine-tuning (the recorded run took about 1200 s for 1140 steps).
trainer.train()

***** Running training *****
  Num examples = 2698
  Num Epochs = 20
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 1140


Step,Training Loss
500,0.375
1000,0.319


Saving model checkpoint to ./results/wangchanberta/checkpoint-500
Configuration saved in ./results/wangchanberta/checkpoint-500/config.json
Model weights saved in ./results/wangchanberta/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/wangchanberta/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/wangchanberta/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/wangchanberta/checkpoint-1000
Configuration saved in ./results/wangchanberta/checkpoint-1000/config.json
Model weights saved in ./results/wangchanberta/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/wangchanberta/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/wangchanberta/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1140, training_loss=0.3415559266742907, metrics={'train_runtime': 1201.2585, 'train_samples_per_second': 44.92, 'train_steps_per_second': 0.949, 'total_flos': 1.15354464446208e+16, 'train_loss': 0.3415559266742907, 'epoch': 20.0})

In [102]:
# Persist the fine-tuned model; only the model is saved here, not the tokenizer.
model.save_pretrained("results/wangchanberta/final")

Configuration saved in results/wangchanberta/final/config.json
Model weights saved in results/wangchanberta/final/pytorch_model.bin


## Evaluate

In [88]:
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score)

def calculate_score(y, y_pred):
    """Compute classification scores and confusion matrices.

    Args:
        y: Array-like of true labels.
        y_pred: Array-like of predicted labels, same length as ``y``.

    Returns:
        dict with:
            "prediction": the predictions as a plain list;
            "accuracy": {"overall": accuracy, "average": mean of the diagonal
                of the row-normalized confusion matrix};
            "recall", "precision", "f1": {"micro": ..., "macro": ...};
            "confusion_matrix": {"none": raw counts, "true": normalized over
                true labels, "pred": normalized over predicted labels},
                each as nested lists.
    """
    # Accept lists as well as arrays; `.tolist()` below needs an ndarray.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)

    precision_micro = precision_score(y, y_pred, average='micro')
    precision_macro = precision_score(y, y_pred, average='macro')

    recall_micro = recall_score(y, y_pred, average='micro')
    recall_macro = recall_score(y, y_pred, average='macro')

    f1_micro = f1_score(y, y_pred, average='micro')
    f1_macro = f1_score(y, y_pred, average='macro')

    conf_matrix = confusion_matrix(y, y_pred, normalize=None)
    # `nan` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, not the NaN replacement value.
    norm_true_cm = np.nan_to_num(
        confusion_matrix(y, y_pred, normalize="true"), nan=0.0)
    norm_pred_cm = np.nan_to_num(
        confusion_matrix(y, y_pred, normalize="pred"), nan=0.0)

    overall_accuracy = accuracy_score(y, y_pred)
    # Mean of the per-class rates on the diagonal of the row-normalized matrix.
    average_accuracy = np.diag(norm_true_cm).mean()
    return {
        "prediction": y_pred.tolist(),
        "accuracy": {"overall": overall_accuracy, "average": average_accuracy},
        "recall": {"micro": recall_micro, "macro": recall_macro},
        "precision": {"micro": precision_micro, "macro": precision_macro},
        "f1": {"micro": f1_micro, "macro": f1_macro},
        "confusion_matrix": {
            "none": conf_matrix.tolist(),
            "true": norm_true_cm.tolist(),
            "pred": norm_pred_cm.tolist()
        }
    }

In [16]:
# Evaluate on the Trainer's `eval_dataset` (the validation split) and show the metrics.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 300
  Batch size = 32


{'eval_loss': 0.33064138889312744,
 'eval_accuracy': 0.8466666666666667,
 'eval_runtime': 3.3073,
 'eval_samples_per_second': 90.708,
 'eval_steps_per_second': 3.024,
 'epoch': 20.0}

In [70]:
# Score the validation split. The references come from `result.label_ids`,
# i.e. the labels of the very examples that were predicted, instead of a
# separately loaded DataFrame whose row order/filtering could silently differ.
result = trainer.predict(val_dataset)
val_result = calculate_score(result.label_ids, result.predictions.argmax(-1))
# Drop the per-example predictions so only the summary scores are displayed.
_ = val_result.pop("prediction")

***** Running Prediction *****
  Num examples = 300
  Batch size = 32


In [96]:
# Display the validation scores (the "prediction" entry was removed above).
val_result

{'accuracy': {'overall': 0.8466666666666667, 'average': 0.846728307254623},
 'recall': {'micro': 0.8466666666666667, 'macro': 0.846728307254623},
 'precision': {'micro': 0.8466666666666667, 'macro': 0.8466666666666667},
 'f1': {'micro': 0.8466666666666667, 'macro': 0.8466598515489577},
 'confusion_matrix': {'none': [[126, 22], [24, 128]],
  'true': [[0.8513513513513513, 0.14864864864864866],
   [0.15789473684210525, 0.8421052631578947]],
  'pred': [[0.84, 0.14666666666666667], [0.16, 0.8533333333333334]]}}

In [98]:
# Score the test split. The references come from `result.label_ids`, i.e. the
# labels of the very examples that were predicted, instead of a separately
# loaded DataFrame whose row order/filtering could silently differ.
result = trainer.predict(test_dataset)
test_result = calculate_score(result.label_ids, result.predictions.argmax(-1))
# Drop the per-example predictions so only the summary scores are displayed.
_ = test_result.pop("prediction")

***** Running Prediction *****
  Num examples = 438
  Batch size = 32


In [99]:
# Display the test scores (the "prediction" entry was removed above).
test_result

{'accuracy': {'overall': 0.815068493150685, 'average': 0.7886028172811215},
 'recall': {'micro': 0.815068493150685, 'macro': 0.7886028172811215},
 'precision': {'micro': 0.815068493150685, 'macro': 0.6266863905325444},
 'f1': {'micro': 0.815068493150685, 'macro': 0.6495757731398714},
 'confusion_matrix': {'none': [[329, 72], [9, 28]],
  'true': [[0.8204488778054863, 0.17955112219451372],
   [0.24324324324324326, 0.7567567567567568]],
  'pred': [[0.9733727810650887, 0.72], [0.026627218934911243, 0.28]]}}