#### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import EvalPrediction

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.metrics import roc_auc_score, f1_score, hamming_loss


In [None]:
!pip install -U accelerate
!pip install -U transformers

In [4]:
data = pd.read_csv("quotes.csv")
data.head()

Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"['change', 'deep-thoughts', 'thinking', 'world']"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"['abilities', 'choices']"
2,“There are only two ways to live your life. On...,Albert Einstein,"['inspirational', 'life', 'live', 'miracle', '..."
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"['aliteracy', 'books', 'classic', 'humor']"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"['be-yourself', 'inspirational']"


In [5]:
data['author'].value_counts()

author
Albert Einstein           10
J.K. Rowling               9
Marilyn Monroe             7
Dr. Seuss                  6
Mark Twain                 6
C.S. Lewis                 5
Jane Austen                5
Bob Marley                 3
Eleanor Roosevelt          2
Charles Bukowski           2
Suzanne Collins            2
George R.R. Martin         2
Ralph Waldo Emerson        2
Mother Teresa              2
Ernest Hemingway           2
J.D. Salinger              1
George Bernard Shaw        1
J.R.R. Tolkien             1
Alfred Tennyson            1
Terry Pratchett            1
John Lennon                1
George Carlin              1
W.C. Fields                1
Ayn Rand                   1
Jimi Hendrix               1
J.M. Barrie                1
E.E. Cummings              1
Khaled Hosseini            1
Harper Lee                 1
Helen Keller               1
Haruki Murakami            1
Stephenie Meyer            1
Garrison Keillor           1
Thomas A. Edison           1
Douglas

In [24]:
# Analysis dataset
o_df = data.copy()
o_df["word_count"] =data["quote"].apply(lambda x: len(str(x).split(" ")))
o_df["number_count"] = data["quote"].apply(lambda x: len([x for x in x.split() 
                                       if x.isdigit()]))

o_df.head(10)

Unnamed: 0,quote,author,tags,word_count,number_count
0,“The world as we have created it is a process ...,Albert Einstein,"['change', 'deep-thoughts', 'thinking', 'world']",21,0
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"['abilities', 'choices']",16,0
2,“There are only two ways to live your life. On...,Albert Einstein,"['inspirational', 'life', 'live', 'miracle', '...",26,0
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"['aliteracy', 'books', 'classic', 'humor']",19,0
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"['be-yourself', 'inspirational']",16,0
5,“Try not to become a man of success. Rather be...,Albert Einstein,"['adulthood', 'success', 'value']",14,0
6,“It is better to be hated for what you are tha...,André Gide,"['life', 'love']",19,0
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison,"['edison', 'failure', 'inspirational', 'paraph...",12,0
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt,['misattributed-eleanor-roosevelt'],19,0
9,"“A day without sunshine is like, you know, nig...",Steve Martin,"['humor', 'obvious', 'simile']",9,0


In [7]:
df = data.copy()
df

Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"['change', 'deep-thoughts', 'thinking', 'world']"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"['abilities', 'choices']"
2,“There are only two ways to live your life. On...,Albert Einstein,"['inspirational', 'life', 'live', 'miracle', '..."
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"['aliteracy', 'books', 'classic', 'humor']"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"['be-yourself', 'inspirational']"
...,...,...,...
95,“You never really understand a person until yo...,Harper Lee,['better-life-empathy']
96,“You have to write the book that wants to be w...,Madeleine L'Engle,"['books', 'children', 'difficult', 'grown-ups'..."
97,“Never tell the truth to people who are not wo...,Mark Twain,['truth']
98,"“A person's a person, no matter how small.”",Dr. Seuss,['inspirational']


## Label Encoder
This code prepares the data for a multi-label classification task where you have quotes as text data and multiple tags associated with each quote. The MultiLabelBinarizer ensures that the tags are represented in a suitable format for training a multi-label classification model.

In [9]:
multilabel = MultiLabelBinarizer()

labels = multilabel.fit_transform(df['tags']).astype('float32')

texts = df['quote'].tolist()

In [10]:
labels
texts[:5]

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”"]

### Model Building 

In [12]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels,
                                                                    test_size=0.2, random_state=42)

In [13]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels[0]),
                                                            problem_type="multi_label_classification")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }


In [15]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [26]:



def multi_labels_metrics(predictions, labels, threshold=0.3):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels

    metrics = {}

    if len(np.unique(y_true)) > 1:  # Check if there are more than one class in y_true
        f1 = f1_score(y_true, y_pred, average='macro')
        roc_auc = roc_auc_score(y_true, y_pred, average='macro')
        hamming = hamming_loss(y_true, y_pred)

        metrics = {
            "roc_auc": roc_auc,
            "hamming_loss": hamming,
            "f1": f1
        }

    return metrics
def compute_metrics(p:EvalPrediction):
   preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

   result = multi_labels_metrics(predictions=preds,
                                labels=p.label_ids)

   return result 

In [27]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir = './results',
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics=compute_metrics)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=50, training_loss=0.4995375442504883, metrics={'train_runtime': 126.3955, 'train_samples_per_second': 3.165, 'train_steps_per_second': 0.396, 'total_flos': 13253590732800.0, 'train_loss': 0.4995375442504883, 'epoch': 5.0})

In [None]:
trainer.evaluate()

# Questions

**Where could you use such a project?**
- Recently, with the increase in studies, many articles are published during the day. It can be used to categorise scientific articles into disciplines such as medicine, physics, biology, etc. It can be used to determine which research field or subfield the articles belong to.
- LLMs can understand and translate multiple languages. This capability allows you to use your DistilBERT model to classify multilingual texts. LLMs can understand the semantic similarities and differences of texts in different languages. This can help your DistilBERT model overcome language barriers and serve a global audience.

### Convert text data into numeric data
- What are the options? (At most 1 sentence for each technique is enough)
- Why did you choose your option ? 

- Label Encoding: Assigns a unique integer to each unique textual category.
- One-Hot Encoding: Creates a new binary column for each unique category, indicating presence or absence.
- Count Vectorization: Creates a matrix with each row representing a document and columns representing unique words; values indicate word frequencies.
- TF-IDF: Determines the importance of a word by calculating how often a word occurs in a document (term frequency) and how common that word is in all documents (inverse document frequency).



I chose Label encoder because it is faster than other encoders, which makes it an ideal choice for real-time applications. Also, since there are multiple labels in my dataset, Label encoding is not limited to a single label but can handle multiple labels directly. This helps me to better capture complex relationships in the dataset.