# 8: Creating an NLP model that can predict the grade for a given expression

This notebook is meant to be run on Google Colab.

In [None]:
! pip install datasets transformers
! pip install --upgrade pandas
! apt install git-lfs

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the expressions pickle is read from under this mount point.
drive.mount('/content/drive')

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training run in this notebook is configured with push_to_hub=True.
notebook_login()

In [None]:
import transformers
import numpy as np
import pandas as pd
# Print the installed transformers version for reference.
print(transformers.__version__)

# Fine-tuning a model on a text classification task

## Loading the dataset

In [None]:
from datasets import load_dataset, load_metric

In [None]:
import pickle
import pandas as pd
# NOTE(security): unpickling can execute arbitrary code, so only load a file you trust.
# The context manager closes the file handle once the frame has been read.
with open('/content/drive/MyDrive/BCUFR/expressions.p', 'rb') as expressions_file:
    df = pickle.load(expressions_file)

# Keep only rows that carry a grade. The result has to be assigned back:
# dropna returns a new frame instead of modifying df in place.
df = df.dropna(subset=['grade'])
df = df.loc[df.grade != '']

# Cast grades to float so they can be range-checked, then keep only grades
# strictly between -1 and 10.
df = df.assign(grade=df.grade.astype(float))
df = df.loc[(df.grade > -1) & (df.grade < 10)]

# assign() builds a new frame, which avoids the SettingWithCopyWarning that
# column assignment on a filtered slice can trigger.
df = df.assign(
    grade=df.grade.astype(int),
    expression=df.expression.astype(str),
)

# Rename to the column names the tokenization step reads ('label' and 'text').
df = df.rename({'grade': 'label', 'expression': 'text'}, axis=1)
df.head()

To get a sense of what the data looks like, we first convert the DataFrame into a `Dataset` with a train/test split; the function defined below then shows some examples picked at random from that dataset.

In [None]:
from datasets import Dataset
# Build the Dataset from the DataFrame; preserve_index=False keeps the pandas index
# (non-contiguous after filtering) from being added as an extra column.
dataset = Dataset.from_pandas(df, preserve_index=False)
# A fixed seed makes the 70/30 train/test split reproducible across runs.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: The dataset to sample from. It is indexed with a list of row
            positions and must expose `features` (column name -> feature type).
        num_examples: How many distinct rows to show.

    Raises:
        AssertionError: If `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions directly, replacing the hand-rolled retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    frame = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer class ids; `names` is bound as a
            # default argument so the lambda does not depend on the loop variable.
            frame[column] = frame[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(frame.to_html()))

In [None]:
# Preview ten random rows of the training split.
show_random_elements(dataset['train'], num_examples=10)

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

BASE_MODEL = "distilbert-base-uncased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 20
# Grades are restricted to the integers 0..9 when the data is cleaned, i.e. ten classes.
NUM_LABELS = 10

# Name each class after its own index (0, 1, ..., 9).
id2label = {k: k for k in range(NUM_LABELS)}
label2id = {k: k for k in range(NUM_LABELS)}

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=id2label, label2id=label2id)

In [None]:
def preprocess_function(examples):
    """Tokenize the `text` field of `examples` and carry its `label` over.

    The text is truncated and padded to exactly MAX_LENGTH tokens, so the
    sequence length follows the notebook-level constant instead of a second
    hard-coded value.

    Args:
        examples: A mapping with a "text" entry to tokenize and a "label" entry.

    Returns:
        The tokenizer output with the original "label" added to it.
    """
    encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    encoding["label"] = examples["label"]
    return encoding

# Tokenize every split; the raw 'text' column is dropped once it has been encoded.
for split, split_data in dataset.items():
    dataset[split] = split_data.map(preprocess_function, remove_columns='text')

In [None]:
import numpy as np
from datasets import load_metric

# Accuracy metric used to score predictions during evaluation.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score evaluation predictions with the accuracy metric.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        The result of `metric.compute` for the arg-max predictions against `labels`.
    """
    raw_scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

## Fine-tuning the model

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=f"Heritage-in-Digital-Age-{BASE_MODEL}-expression-rating",
    # Optimisation settings.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.03,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best accuracy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Upload the run to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
from transformers import Trainer

# Wire the model, tokenizer, metric function and both splits into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Run fine-tuning; the training summary is displayed as the cell output.
trainer.train()

In [None]:
# Evaluate the model on the evaluation split given to the Trainer (dataset["test"]).
trainer.evaluate()

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

Now that the model is trained and pushed to the Hub, we wrap it in a custom `Pipeline` so that it can be used for inference:

In [None]:
from transformers import Pipeline
from transformers import AutoTokenizer
    

def softmax(outputs):
    """Return the softmax of `outputs` computed along the last axis.

    The per-row maximum is subtracted before exponentiating so that large
    values do not overflow; this does not change the result.
    """
    stabilised = outputs - np.max(outputs, axis=-1, keepdims=True)
    weights = np.exp(stabilised)
    return weights / np.sum(weights, axis=-1, keepdims=True)



class ExpressionRankingPipeline(Pipeline):
    """Pipeline that predicts the grade (class) of a text expression.

    An input is tokenized, run through the sequence-classification model and
    turned into a dict holding the best label, its probability and the raw logits.
    """

    # Checkpoint loaded only when the pipeline was created without a tokenizer.
    _FALLBACK_TOKENIZER = "distilbert-base-uncased"
    # Inputs are padded/truncated to this many tokens.
    _MAX_LENGTH = 256

    def _sanitize_parameters(self, **kwargs):
        """Split call-time kwargs into (preprocess, forward, postprocess) kwargs.

        No call-time parameter is forwarded, so all three dicts are empty.
        """
        preprocess_kwargs = {}
        return preprocess_kwargs, {}, {}

    def _resolve_tokenizer(self):
        """Return the pipeline's tokenizer, loading the fallback checkpoint once if it has none."""
        tokenizer = getattr(self, "tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self._FALLBACK_TOKENIZER, use_fast=True)
            # Cache it so that later calls do not reload the tokenizer.
            self.tokenizer = tokenizer
        return tokenizer

    def preprocess(self, inputs, maybe_arg=2):
        """Tokenize `inputs` into fixed-length PyTorch tensors.

        `maybe_arg` is kept for backward compatibility; it is not used.

        Returns:
            A dict whose "model_input" entry holds the tokenizer output.
        """
        tokenizer = self._resolve_tokenizer()
        model_input = tokenizer(
            inputs,
            truncation=True,
            padding="max_length",
            max_length=self._MAX_LENGTH,
            return_tensors="pt",
        )

        return {"model_input": model_input}

    def _forward(self, model_inputs):
        """Run the model on the tokenized input produced by `preprocess`."""
        return self.model(**model_inputs['model_input'])

    def postprocess(self, model_outputs):
        """Convert the model output into a label, its probability and the raw logits.

        Returns:
            A dict with "label" (looked up in the model config's id2label),
            "score" (softmax probability of that label) and "logits" (a list).
        """
        logits = model_outputs.logits[0].numpy()
        probabilities = softmax(logits)

        # Convert to a plain int so the id2label lookup does not rely on a numpy scalar key.
        best_class = int(np.argmax(probabilities))
        label = self.model.config.id2label[best_class]
        score = probabilities[best_class].item()

        return {"label": label, "score": score, "logits": logits.tolist()}


In [None]:
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

# Register the custom task. The "pt" default is a (model id, revision) pair, and the
# revision has to be a branch, tag or commit of the model repository.
PIPELINE_REGISTRY.register_pipeline(
    "expression-ranking",
    pipeline_class=ExpressionRankingPipeline,
    pt_model=AutoModelForSequenceClassification,
    default={"pt": ("tgieruc/Heritage-in-Digital-Age-distilbert-base-uncased-expression-rating", "main")},
)

In [None]:
from transformers import pipeline


# Build the custom pipeline from the fine-tuned checkpoint on the Hub.
classifier = pipeline(
    task="expression-ranking",
    model="tgieruc/Heritage-in-Digital-Age-distilbert-base-uncased-expression-rating",
)
