In [1]:
import json

In [4]:
# Read the JSON Lines file: one JSON document per line.
with open("../data/devel.jsonl", "r", encoding="utf-8") as f:
    # Blank / whitespace-only lines are skipped because json.loads would
    # raise JSONDecodeError on them and abort the whole load.
    data = [json.loads(line) for line in f if line.strip()]

In [27]:
# Project every record down to the five fields kept for the rest of the
# notebook; a record missing any of them raises KeyError.
filtered_data = [
    {
        field: entry[field]
        for field in ("sentence", "sentence_annotation", "intent", "action", "scenario")
    }
    for entry in data
]

In [28]:
import pandas as pd
# One row per record, one column per retained field.
df = pd.DataFrame(data=filtered_data)

In [None]:
from transformers import BertTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

def tokenize_function(examples, max_length=512):
    """Tokenize the ``"sentence"`` entry of ``examples`` with the global tokenizer.

    Args:
        examples: Mapping that has a ``"sentence"`` key; its value is passed
            straight to ``tokenizer`` (presumably a string or a list of
            strings -- confirm against the caller).
        max_length: Length every sequence is padded / truncated to. Defaults
            to 512, the value that used to be hard-coded, so existing callers
            see no change; pass a smaller value (e.g. 64) for short utterances.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Integer-encode the intent strings: each row gets the code of its intent
# within the column's categorical index.
df["labels"] = (
    df["intent"]
    .astype("category")
    .cat.codes
)

# Tokenize sentence by sentence; every element of the resulting Series is the
# tokenizer's own output object holding PyTorch tensors.
# NOTE(review): no max_length is given here, so the padded length is whatever
# the tokenizer defaults to -- confirm this is intended.
encoded_data = df["sentence"].apply(
    lambda sentence: tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
    )
)

In [31]:
import torch
from transformers import BertForSequenceClassification

# The classification head needs one output per distinct label code.
num_labels = df["labels"].nunique()

# Pretrained encoder plus a classification head sized to `num_labels`.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Display the frame to inspect its columns, including the integer `labels` codes.
df

Unnamed: 0,sentence,sentence_annotation,intent,action,scenario,labels
0,siri what is one american dollar in japanese yen,siri what is one [currency_name : american dol...,qa_currency,currency,qa,51
1,how many unread emails do i have,how many unread emails do i have,email_query,query,email,15
2,order me chinese food,order me [food_type : chinese] food,takeaway_order,order,takeaway,64
3,does the nearby chinese restaurant do delivery,does the nearby [food_type : chinese] [busines...,takeaway_query,query,takeaway,65
4,remove pepper from my grocery list,remove pepper from my [list_name : grocery] list,lists_remove,remove,lists,37
...,...,...,...,...,...,...
2028,can you give me local news on wayne county she...,can you give me local news on [news_topic : wa...,news_query,query,news,43
2029,every light of room increase its intensity,[device_type : every light] of [house_place : ...,iot_hue_lightup,hue_lightup,iot,31
2030,i would like some coffee now,i would like some coffee now,iot_coffee,coffee,iot,26
2031,what is the population of los angeles,what is the population of [place_name : los an...,qa_factoid,factoid,qa,53


In [None]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

# Number of distinct classes the classifier has to separate.
num_labels = df["labels"].nunique()

# Shift the codes so that the smallest one becomes 0.
df["labels"] = df["labels"].sub(df["labels"].min())

# Every code must be a valid class index in [0, num_labels - 1]; stop early
# with the offending values otherwise.
if (~df["labels"].between(0, num_labels - 1)).any():
    raise ValueError(f"Labels out of range! Found: {df['labels'].unique()}")

# Tokenizer and classifier are both loaded from the same checkpoint name.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Tokenize all sentences in one call; each is padded / truncated to 64 tokens
# and the result is returned as PyTorch tensors.
encoded_data = tokenizer(
    list(df["sentence"]),
    return_tensors="pt",
    max_length=64,
    truncation=True,
    padding="max_length",
)

# Tensors that feed the training dataset.
input_ids = encoded_data["input_ids"]
attention_masks = encoded_data["attention_mask"]
labels = torch.tensor(df["labels"].to_numpy(), dtype=torch.long)

# Mini-batches of 8 examples, drawn in random order.
batch_size = 8
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=RandomSampler(dataset),
)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Train on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# A single pass over the dataloader, printing the loss of every batch.
model.train()
for batch in dataloader:
    # Each batch is an (input_ids, attention_mask, labels) triple; move all
    # three tensors onto the training device.
    b_input_ids, b_attention_mask, b_labels = (tensor.to(device) for tensor in batch)
    b_labels = b_labels.to(torch.long)

    # Clear gradients left over from the previous step.
    model.zero_grad()

    # Passing `labels` makes the model return the loss with its outputs.
    outputs = model(
        input_ids=b_input_ids,
        attention_mask=b_attention_mask,
        labels=b_labels,
    )
    loss = outputs.loss

    loss.backward()
    optimizer.step()

    print(f"Loss: {loss.item()}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loss: 4.200831890106201
Loss: 4.286402225494385
Loss: 4.2241644859313965
Loss: 4.284727573394775
Loss: 4.27799129486084
Loss: 4.363480091094971
Loss: 4.417878150939941
Loss: 4.27524471282959
Loss: 4.272261142730713
Loss: 4.406830310821533
Loss: 4.35929536819458
Loss: 4.390708923339844
Loss: 4.300981521606445
Loss: 4.318387031555176
Loss: 4.247747421264648
Loss: 4.457508563995361
Loss: 4.210601806640625
Loss: 4.198469638824463
Loss: 4.258347511291504
Loss: 4.353784084320068
Loss: 4.230840682983398
Loss: 4.169021129608154
Loss: 4.229461669921875
Loss: 4.31201696395874
Loss: 4.296504974365234
Loss: 4.093482494354248
Loss: 4.088507175445557
Loss: 4.117852687835693
Loss: 4.426088333129883
Loss: 4.1382670402526855
Loss: 4.217711925506592
Loss: 4.005492687225342
Loss: 4.15946626663208
Loss: 4.237598896026611
Loss: 4.209317207336426
Loss: 4.105384826660156
Loss: 4.2845635414123535
Loss: 4.219868183135986
Loss: 4.212799072265625
Loss: 3.9636118412017822
Loss: 4.244555473327637
Loss: 4.122553825

In [58]:
# Persist the labelled frame next to the notebook.
df.to_csv(path_or_buf="processed_data.csv")

In [None]:
def predict_intent(text):
    """Predict the intent name for one utterance.

    Uses the notebook-level ``model``, ``tokenizer``, ``device`` and ``df``.
    The label-to-intent lookup is built row-wise from ``df`` so that a code is
    always paired with an intent taken from a row carrying that code (pairing
    two independently computed ``unique()`` arrays silently mis-matches as
    soon as the two columns are not strictly one-to-one). ``df`` itself is
    never modified.

    Args:
        text: Utterance to classify; it is padded / truncated to 64 tokens.

    Returns:
        The intent string for the highest-scoring class, the class code as a
        string when ``df`` has no ``"intent"`` column, or ``"Unknown Intent"``
        when the predicted code does not occur in ``df["labels"]``.

    Raises:
        KeyError: If ``df`` has no ``"labels"`` column.
    """
    # Validate before running the model so a broken frame fails fast.
    if "labels" not in df.columns:
        raise KeyError("The 'labels' column is missing from the DataFrame.")

    model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    if "intent" in df.columns:
        # One row per distinct code, keeping the intent found on that same row.
        pairs = df[["labels", "intent"]].drop_duplicates(subset="labels")
        intent_mapping = dict(zip(pairs["labels"], pairs["intent"]))
    else:
        print("⚠️ Warning: 'intent' column not found, using default labels.")
        # Fall back to the code rendered as text, without touching df.
        intent_mapping = dict(zip(df["labels"], df["labels"].astype(str)))

    return intent_mapping.get(predicted_class, "Unknown Intent")

# Quick check of the classifier on a short, incomplete utterance.
print(predict_intent(text="Could you put "))


calendar_set


In [53]:
# List the distinct label codes present in the frame.
df["labels"].unique()

array([51, 15, 64, 65, 37, 70,  6, 58, 16,  2, 17, 21, 52,  8, 20,  0, 47,
       40, 25, 62, 69, 46, 38, 55, 13, 63, 43, 44, 53, 67, 48, 59, 35, 49,
       11, 27, 56,  3,  7, 66, 68, 54, 50, 26, 28, 45, 36, 41, 60,  1,  9,
       14, 29,  5,  4, 33, 19, 24, 12, 57, 31, 42, 32, 30, 23, 10, 39, 18,
       61, 34, 22], dtype=int8)

In [55]:
import os

save_directory = "./bert_intent_model"

# Make sure the target folder exists (no error if it already does).
os.makedirs(save_directory, exist_ok=True)

# Write the model first, then the tokenizer, into the same folder so both can
# later be restored from a single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved in {save_directory}")

Model and tokenizer saved in ./bert_intent_model


In [56]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

load_directory = "./bert_intent_model"

# Restore the tokenizer and the classifier from the same folder.
tokenizer = BertTokenizer.from_pretrained(load_directory)
model = BertForSequenceClassification.from_pretrained(load_directory)

# Prefer the GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [57]:
# Try the classifier on a one-word greeting.
predict_intent(text="Hey")

'general_quirky'