# Import necessary libraries

In [None]:
!pip install datasets



In [None]:
import pandas as pd  # Pandas for data manipulation and handling DataFrame objects
from sklearn.model_selection import train_test_split  # Scikit-learn's function to split data into train and test sets
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification  # HuggingFace Transformers for tokenization and model
from transformers import Trainer, TrainingArguments  # HuggingFace Trainer API for model training and fine-tuning
from datasets import Dataset  # HuggingFace datasets library to handle dataset objects and easy integration with models
import torch  # PyTorch for tensor operations and model handling
from sklearn.metrics import classification_report, confusion_matrix  # Scikit-learn for generating classification metrics

# Load Datasets

In [None]:
df = pd.read_csv("intent_train.csv")
df

Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist
...,...,...
13079,i want to eat choucroute at a brasserie for 8,BookRestaurant
13080,play funky heavy bluesy,PlayMusic
13081,rate the current album 2 points out of 6,RateBook
13082,go to the photograph the inflated tear,SearchCreativeWork


In [None]:
df['intent'].value_counts()

Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
PlayMusic,1914
GetWeather,1896
BookRestaurant,1881
RateBook,1876
SearchScreeningEvent,1852
SearchCreativeWork,1847
AddToPlaylist,1818


In [None]:
df = df.sample(n=3000)

In [None]:
df.isnull().sum()

Unnamed: 0,0
text,0
intent,0


# Cleaning

In [None]:
df['intent'] = df['intent'].apply(lambda x: x.lower())

In [None]:
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords (only need to do this once)
nltk.download('stopwords')
# Load the list of stopwords
stop_words = set(stopwords.words('english'))


# Preprocessing function: convert text to lowercase and remove stopwords
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove stopwords: split the text, filter out stopwords, and join back
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

# Example usage
text_input = 'Stop to play music'

# Preprocess the text
processed_text = preprocess_text(text_input)
print(f"Processed Text: {processed_text}")


Processed Text: stop play music


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df['text'] = df['text'].apply(preprocess_text)

# Encode Intent

In [None]:
# Check the unique labels in the 'Intent_Label' column
unique_labels = df['intent'].unique()
print("unique labels :", pd.unique)

# Create a mapping from labels to numbers (numeric encoding)
label_to_id = {label: i for i, label in enumerate(unique_labels)}

# Map the 'Intent_Label' to numeric labels in the 'Label' column
df['Label'] = df['intent'].map(label_to_id)

# Check the updated DataFrame
df.head()


unique labels : <function unique at 0x7b167d65ecb0>


Unnamed: 0,text,intent,Label
2680,use deezer service play opera,playmusic,0
8979,put album dubstep dangles dirty playlist,addtoplaylist,1
5560,would please play symphony 1995,playmusic,0
1561,time utamaro five women showing alamo drafthou...,searchscreeningevent,2
6987,play skin & bone jess stacy deezer,playmusic,0


# Tokenitation

In [None]:
# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")  # Load pre-trained DistilBERT tokenizer

# Calculate the maximum tokenized length from the dataset
max_length = max([len(tokenizer.encode(text)) for text in df['text']])  # Tokenize and count the length of each tokenized text
print("Max Length:", max_length)

# Tokenization function with labels
def tokenize_function(examples):  # Define a function to tokenize inputs and add labels
    tokenized_input = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)  # Use the max_length calculated above
    tokenized_input['labels'] = examples['Label']  # Add labels to the tokenized data for supervised training
    return tokenized_input  # Return tokenized data with labels


dataset = Dataset.from_pandas(df[['text', 'Label']])  # Convert the DataFrame into a HuggingFace Dataset

# Apply tokenization
dataset = dataset.map(tokenize_function, batched=True)  # Apply the tokenization function to the dataset

# Check tokenized data
dataset[0]  # Display the tokenized version of the first example in the dataset to verify the transformation


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Max Length: 21


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

{'text': 'use deezer service play opera',
 'Label': 0,
 '__index_level_0__': 2680,
 'input_ids': [101,
  2224,
  9266,
  6290,
  2326,
  2377,
  3850,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': 0}

# Fine Tune Model

In [None]:

# Initialize DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unique_labels))

# Move model to GPU if available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from transformers import TrainingArguments, Trainer  # Import necessary classes from Hugging Face

# Training arguments
training_args = TrainingArguments(  # Define the training configurations
    output_dir="./results",  # Directory to save results (model checkpoints, logs, etc.)
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    learning_rate=2e-5,  # Set learning rate for the optimizer
    per_device_train_batch_size=16,  # Batch size for training (number of examples per device)
    per_device_eval_batch_size=64,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs to train the model
    weight_decay=0.01,  # L2 regularization to avoid overfitting
    logging_dir="./logs",  # Directory to store training logs
    logging_steps=10,  # Log training information every 10 steps
)

# Trainer setup
trainer = Trainer(  # Initialize the Trainer with the model and training configurations
    model=model,  # Model to be trained
    args=training_args,  # Training arguments
    train_dataset=dataset,  # Training dataset
    eval_dataset=dataset,  # Validation dataset (in practice, this should be a separate dataset)
)

# Train the model
trainer.train()  # Start training the model based on the provided training arguments


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.1775,0.125945
2,0.0774,0.059546
3,0.0195,0.047547


TrainOutput(global_step=564, training_loss=0.3027312492700756, metrics={'train_runtime': 1653.6993, 'train_samples_per_second': 5.442, 'train_steps_per_second': 0.341, 'total_flos': 48903458562000.0, 'train_loss': 0.3027312492700756, 'epoch': 3.0})

# Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model on the test dataset
predictions, true_labels, _ = trainer.predict(dataset)

# Convert predictions to label indices
predicted_labels = predictions.argmax(axis=1)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(true_labels, predicted_labels))

print("Confusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       436
           1       1.00      1.00      1.00       422
           2       1.00      0.98      0.99       442
           3       0.99      1.00      1.00       454
           4       1.00      1.00      1.00       425
           5       1.00      1.00      1.00       406
           6       0.99      0.95      0.97       415

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000

Confusion Matrix:
[[434   2   0   0   0   0   0]
 [  1 421   0   0   0   0   0]
 [  1   0 434   0   1   0   6]
 [  0   0   0 454   0   0   0]
 [  0   0   0   2 423   0   0]
 [  0   0   0   1   0 405   0]
 [ 17   0   2   0   0   0 396]]


# Save Model

In [None]:
# Save the model and tokenizer
model.save_pretrained('./saved_fine_tuned_model')
tokenizer.save_pretrained('./saved_fine_tuned_model')

('./saved_fine_tuned_model/tokenizer_config.json',
 './saved_fine_tuned_model/special_tokens_map.json',
 './saved_fine_tuned_model/vocab.txt',
 './saved_fine_tuned_model/added_tokens.json')

# Inference: Prediction System

In [None]:
# Load the fine-tuned model and tokenizer for inference
model = DistilBertForSequenceClassification.from_pretrained('./saved_fine_tuned_model')
tokenizer = DistilBertTokenizer.from_pretrained('./saved_fine_tuned_model')

In [None]:
# Now create the reverse mapping for inference
id_to_label = {i: label for label, i in label_to_id.items()}  # Reverse the mapping
id_to_label

{0: 'playmusic',
 1: 'addtoplaylist',
 2: 'searchscreeningevent',
 3: 'bookrestaurant',
 4: 'getweather',
 5: 'ratebook',
 6: 'searchcreativework'}

In [None]:
# Function to make a prediction
def predict(text, model, tokenizer, max_length=21):
    # Preprocess the input text
    text = preprocess_text(text)
    # Tokenize the input text
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

    # Make prediction
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)  # Get model output
        logits = outputs.logits  # Get logits from the output

    # Get the predicted label (highest logit)
    predicted_class_id = torch.argmax(logits, dim=-1).item()  # Get the index of the max logit
    return predicted_class_id

In [None]:
# Define the intent labels
id_to_label = {
    0: 'getweather',
    1: 'searchcreativework',
    2: 'searchscreeningevent',
    3: 'addtoplaylist',
    4: 'bookrestaurant',
    5: 'ratebook',
    6: 'playmusic'
}

# Example messages for testing
test_messages = [
    "What's the weather like today?",
    "Find me a creative project about AI.",
    "Are there any events screening this weekend?",
    "Add this song to my playlist.",
    "I'd like to book a table for two.",
    "Rate the book I just finished reading.",
    "Play some relaxing music.",
    "Can you find a documentary on climate change?",
    "What time does the movie start tonight?",
    "Add the new album to my library."
]

# Test the model with the example messages
for message in test_messages:
    predicted_label = predict(message, model, tokenizer)
    predicted_intent = id_to_label.get(predicted_label, "Unknown Intent")
    print(f"Message: {message}")
    print(f"Predicted Label: {predicted_label}, Predicted Intent: {predicted_intent}\n")


Message: What's the weather like today?
Predicted Label: 4, Predicted Intent: bookrestaurant

Message: Find me a creative project about AI.
Predicted Label: 6, Predicted Intent: playmusic

Message: Are there any events screening this weekend?
Predicted Label: 2, Predicted Intent: searchscreeningevent

Message: Add this song to my playlist.
Predicted Label: 1, Predicted Intent: searchcreativework

Message: I'd like to book a table for two.
Predicted Label: 3, Predicted Intent: addtoplaylist

Message: Rate the book I just finished reading.
Predicted Label: 5, Predicted Intent: ratebook

Message: Play some relaxing music.
Predicted Label: 0, Predicted Intent: getweather

Message: Can you find a documentary on climate change?
Predicted Label: 6, Predicted Intent: playmusic

Message: What time does the movie start tonight?
Predicted Label: 2, Predicted Intent: searchscreeningevent

Message: Add the new album to my library.
Predicted Label: 1, Predicted Intent: searchcreativework



# Download directly to your pc

In [None]:
import shutil

# Path to save the model
model_dir = "/content/saved_fine_tuned_model"
model.save_pretrained(model_dir)  # Save the model
tokenizer.save_pretrained(model_dir)  # Save the tokenizer

# Zip the model folder
shutil.make_archive("distilbert_model", 'zip', model_dir)

# Download the zipped model
from google.colab import files
files.download("distilbert_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>