# The Kopi Latte Ratio Project: Model Training

The objective of this notebook is to train a text classification model that classifies a location as a kopitiam or a café based on its reviews.

## Load Labelled Model Training Dataset

In [1]:
import os

# Later cells read paths such as 'data/interim/reviews_train.csv' relative to
# the working directory, so the notebook steps up one level before loading.
# Only do so when that data directory is not already reachable: an unguarded
# os.chdir('..') climbs one more level every time this cell is re-run in a
# live kernel, which silently breaks every relative path below.
if not os.path.isdir(os.path.join('data', 'interim')):
    os.chdir('..')

In [2]:
import pandas as pd

# Labelled reviews for training; the path is relative to the current working directory.
train_csv_path = 'data/interim/reviews_train.csv'
reviews_train_df = pd.read_csv(train_csv_path)

# Preview the first rows as a quick sanity check of the loaded columns.
reviews_train_df.head()

Unnamed: 0,text,label
0,Junyi fortune fish. 吉祥鱼\nDabao the salted veg ...,0
1,Five star for Fish So Nice shop selling grille...,0
2,Mixed Rice stall- i don’t even know how they c...,0
3,Crazy drink lady charged me $1 for this cup of...,0
4,Dim sum stall needs improvement.\nManagement s...,0


## Prepare Train Test Split

I will be using the Hugging Face toolkit (Datasets, Transformers), which provides an easy-to-use API for efficient training of NLP models.

In [3]:
from datasets import Dataset
from datasets import Features, Value, ClassLabel

# Fixed seed for the shuffle behind the train/test split. Without it every run
# draws a different split, so the evaluation numbers further down cannot be
# reproduced after a kernel restart.
SPLIT_SEED = 42

# Explicit schema: free-text reviews plus a two-class label (ids 0 and 1).
features = Features({
    'text': Value(dtype='string', id=None),
    'label': ClassLabel(num_classes=2),
})

# Build the full labelled dataset from the DataFrame using that schema.
labelled_dataset = Dataset.from_pandas(reviews_train_df, features=features)

# Hold out 20% of the rows for evaluation, stratified on the label column.
dataset = labelled_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2644
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 662
    })
})

## Train DistilBERT model 

In [4]:
from transformers import AutoTokenizer

# Tokenizer that matches the pretrained checkpoint used in this notebook.
tokenizer_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


In [5]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review text(s)
            to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``truncation=True``.
    """
    review_texts = examples["text"]
    encoded = tokenizer(review_texts, truncation=True)
    return encoded

In [6]:
# Tokenize both splits, passing examples to preprocess_function in batches.
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
)

Map: 100%|██████████| 2644/2644 [00:00<00:00, 17937.71 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 21799.03 examples/s]


In [7]:
from transformers import DataCollatorWithPadding

# Collator that handles padding when batches are assembled; the tokenization
# step above truncates sequences but does not pad them.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [8]:
import evaluate

# Metric object used by compute_metrics to score predictions against labels.
metric_name = "accuracy"
accuracy = evaluate.load(metric_name)

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item ``(predictions, labels)`` pair. ``predictions``
            holds one row of per-class scores per example; the highest-scoring
            column of each row is taken as the predicted class id.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids versus
        ``labels``.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [10]:
# Class id -> readable name (0 is KOPITIAM, 1 is CAFE).
id2label = dict(enumerate(("KOPITIAM", "CAFE")))

# Reverse lookup, derived from id2label so the two maps cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained DistilBERT checkpoint with a sequence-classification
# head sized to the label set; the id/label maps are passed along so the
# model carries readable class names.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tuning hyperparameters. Evaluation and checkpointing both happen once
# per epoch, and the best checkpoint is reloaded when training finishes.
# NOTE(review): no metric_for_best_model is set, so "best" presumably falls
# back to the library default (evaluation loss) rather than accuracy - confirm.
training_args = TrainingArguments(
    output_dir="kopi_latte",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, tokenized splits, collator and metric function together.
# NOTE(review): newer transformers releases appear to rename Trainer's
# `tokenizer` argument to `processing_class` - confirm against the installed
# version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()

                                                 
 20%|██        | 166/830 [00:20<01:21,  8.13it/s]

{'eval_loss': 0.22107505798339844, 'eval_accuracy': 0.9199395770392749, 'eval_runtime': 1.5176, 'eval_samples_per_second': 436.226, 'eval_steps_per_second': 27.676, 'epoch': 1.0}


                                                 
 40%|████      | 332/830 [00:40<00:46, 10.77it/s]

{'eval_loss': 0.2609260380268097, 'eval_accuracy': 0.9018126888217523, 'eval_runtime': 1.4163, 'eval_samples_per_second': 467.405, 'eval_steps_per_second': 29.654, 'epoch': 2.0}


                                                 
 60%|██████    | 498/830 [01:01<00:28, 11.78it/s]

{'eval_loss': 0.2515360116958618, 'eval_accuracy': 0.9214501510574018, 'eval_runtime': 1.4373, 'eval_samples_per_second': 460.574, 'eval_steps_per_second': 29.221, 'epoch': 3.0}


 60%|██████    | 501/830 [01:01<01:53,  2.90it/s]

{'loss': 0.2373, 'grad_norm': 0.6908215880393982, 'learning_rate': 7.951807228915663e-06, 'epoch': 3.01}


                                                 
 80%|████████  | 664/830 [01:21<00:17,  9.54it/s]

{'eval_loss': 0.28035953640937805, 'eval_accuracy': 0.9274924471299094, 'eval_runtime': 1.4614, 'eval_samples_per_second': 452.987, 'eval_steps_per_second': 28.739, 'epoch': 4.0}


                                                 
100%|██████████| 830/830 [01:41<00:00, 10.23it/s]

{'eval_loss': 0.28974005579948425, 'eval_accuracy': 0.9244712990936556, 'eval_runtime': 1.5169, 'eval_samples_per_second': 436.405, 'eval_steps_per_second': 27.687, 'epoch': 5.0}


100%|██████████| 830/830 [01:42<00:00,  8.09it/s]

{'train_runtime': 102.6382, 'train_samples_per_second': 128.802, 'train_steps_per_second': 8.087, 'train_loss': 0.16432236361216349, 'epoch': 5.0}





TrainOutput(global_step=830, training_loss=0.16432236361216349, metrics={'train_runtime': 102.6382, 'train_samples_per_second': 128.802, 'train_steps_per_second': 8.087, 'total_flos': 779370905091024.0, 'train_loss': 0.16432236361216349, 'epoch': 5.0})

In [13]:
# Persist the model currently held by the trainer to the 'kopi_latte' directory.
trainer.save_model(output_dir='kopi_latte')