# The Kopi Latte Ratio Project: Model Training

The objective of this notebook is to train a text classification model that classifies a location as a kopitiam or a cafe based on its reviews.

## Load Labelled Model Training Dataset

In [1]:
import os

# The data path read by this notebook ('data/interim/...') is relative to the
# parent of the directory the notebook starts in. Only move up when that layout
# is not already visible from the current working directory, so re-running this
# cell is a no-op instead of climbing one more directory on every execution.
if not os.path.isdir(os.path.join('data', 'interim')):
    os.chdir('..')

In [2]:
import pandas as pd

# Labelled reviews used for training; the path is relative to the current
# working directory.
TRAIN_CSV_PATH = 'data/interim/reviews_train.csv'
reviews_train_df = pd.read_csv(TRAIN_CSV_PATH)

# Preview the first rows as a quick sanity check of the loaded columns.
reviews_train_df.head()

Unnamed: 0,text,label
0,Junyi fortune fish. 吉祥鱼\nDabao the salted veg ...,0
1,Five star for Fish So Nice shop selling grille...,0
2,Mixed Rice stall- i don’t even know how they c...,0
3,Crazy drink lady charged me $1 for this cup of...,0
4,Dim sum stall needs improvement.\nManagement s...,0


## Prepare Train Test Split

I will be using the Hugging Face toolkit (Datasets, Transformers), which provides an easy-to-use API for efficient training of NLP models.

In [3]:
from datasets import Dataset
from datasets import Features, Value, ClassLabel

# Fixed seed so the stratified split (and every metric computed on it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Define the schema with ClassLabel for classification columns
features = Features({
    'text': Value(dtype='string', id=None),
    'label': ClassLabel(num_classes=2),  # Assuming binary classification (0 or 1)
})

# Create the Dataset from the labelled reviews frame using the schema above.
reviews_dataset = Dataset.from_pandas(reviews_train_df, features=features)

# Hold out 20% of the rows, stratified on the label column. The result is kept
# under a separate name from the unsplit dataset so re-running this cell always
# starts from the same input.
dataset = reviews_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2644
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 662
    })
})

## Train DistilBERT model 

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased",
)


In [5]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: A mapping that provides the raw review text under ``"text"``.

    Returns:
        The tokenizer's output for that text, with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [6]:
# Apply preprocess_function to the split dataset in batched mode.
tokenized_data = dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 2644/2644 [00:00<00:00, 17659.95 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 20059.45 examples/s]


In [7]:
from transformers import DataCollatorWithPadding

# Padding collator built from the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [8]:
import evaluate

# Accuracy metric object loaded by name from the `evaluate` library.
accuracy = evaluate.load(path="accuracy")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions are reduced to class indices with an argmax over axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class indices
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [10]:
# Class index -> human-readable label name.
id2label = {
    0: "KOPITIAM",
    1: "CAFE",
}

# Inverse lookup (label name -> class index), derived from the mapping above so
# the two dictionaries cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the DistilBERT checkpoint as a two-label sequence classifier, passing the
# label-name mappings alongside the label count.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration: optimisation hyperparameters plus the
# evaluation/checkpoint schedule.
training_args = TrainingArguments(
    output_dir="kopi_latte",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, and reload the best checkpoint
    # when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, tokenized splits, padding collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()

                                                 
 20%|██        | 166/830 [00:19<01:21,  8.13it/s]

{'eval_loss': 0.2533736526966095, 'eval_accuracy': 0.9003021148036254, 'eval_runtime': 1.3771, 'eval_samples_per_second': 480.724, 'eval_steps_per_second': 30.499, 'epoch': 1.0}


                                                 
 40%|████      | 332/830 [00:39<01:05,  7.55it/s]

{'eval_loss': 0.2422170490026474, 'eval_accuracy': 0.9063444108761329, 'eval_runtime': 1.3928, 'eval_samples_per_second': 475.3, 'eval_steps_per_second': 30.155, 'epoch': 2.0}


                                                 
 60%|██████    | 498/830 [00:59<00:33,  9.92it/s]

{'eval_loss': 0.3031269907951355, 'eval_accuracy': 0.9093655589123867, 'eval_runtime': 1.3758, 'eval_samples_per_second': 481.178, 'eval_steps_per_second': 30.528, 'epoch': 3.0}


 60%|██████    | 500/830 [01:01<02:53,  1.90it/s]

{'loss': 0.2389, 'grad_norm': 21.62948989868164, 'learning_rate': 7.951807228915663e-06, 'epoch': 3.01}


                                                 
 80%|████████  | 664/830 [01:19<00:21,  7.77it/s]

{'eval_loss': 0.34027019143104553, 'eval_accuracy': 0.9123867069486404, 'eval_runtime': 1.375, 'eval_samples_per_second': 481.469, 'eval_steps_per_second': 30.546, 'epoch': 4.0}


                                                 
100%|██████████| 830/830 [01:40<00:00,  8.53it/s]

{'eval_loss': 0.36195605993270874, 'eval_accuracy': 0.9093655589123867, 'eval_runtime': 1.3796, 'eval_samples_per_second': 479.858, 'eval_steps_per_second': 30.444, 'epoch': 5.0}


100%|██████████| 830/830 [01:41<00:00,  8.16it/s]

{'train_runtime': 101.7675, 'train_samples_per_second': 129.904, 'train_steps_per_second': 8.156, 'train_loss': 0.16312801177243152, 'epoch': 5.0}





TrainOutput(global_step=830, training_loss=0.16312801177243152, metrics={'train_runtime': 101.7675, 'train_samples_per_second': 129.904, 'train_steps_per_second': 8.156, 'total_flos': 773260846328016.0, 'train_loss': 0.16312801177243152, 'epoch': 5.0})

In [None]:
# Persist the trained model to the 'kopi_latte' directory.
trainer.save_model(output_dir='kopi_latte')