In [1]:
# Step 1: Set up your development environment
# Install the necessary libraries if not already installed
#!pip install transformers

# Step 2: Import the required libraries
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer, pipeline
#import pandas as pd

# Step 3: Load and preprocess your data
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels aligned with ``texts``, or
            ``None`` for an unlabeled (inference-only) dataset.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every sequence is padded/truncated to.

    Each item is a dict with ``"input_ids"`` and ``"attention_mask"``; a
    ``"label"`` tensor is included only when labels were supplied.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors (plus label, if any)."""
        text = str(self.texts[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # return_tensors="pt" adds a leading batch dimension of size 1. Drop
        # only that dimension: a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        # Unlabeled datasets (labels=None) are valid for inference; indexing
        # None would raise TypeError, so the key is simply omitted.
        if self.labels is not None:
            item["label"] = torch.tensor(self.labels[idx])

        return item




In [6]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub; the call renders an
# interactive login widget in the cell output.
# NOTE(review): none of the visible cells push to the Hub or load a gated
# resource, so this login may be unnecessary — confirm before removing.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
import datasets


def clean_question(question):
    """Normalize one raw question string.

    Non-breaking spaces become regular spaces, surrounding whitespace is
    removed, and a single trailing "?" is dropped. Whitespace is stripped
    *before* the "?" check so that inputs such as "How many? " or
    "How many?\xa0" lose their question mark too.

    Args:
        question: Raw question text.

    Returns:
        The cleaned question text.
    """
    text = question.replace("\xa0", " ").strip()
    if text.endswith("?"):
        # rstrip again: removing the "?" can expose a space ("How many ?").
        text = text[:-1].rstrip()
    return text


# Load WikiSQL and keep handles on the train and test splits.
dataset = datasets.load_dataset("wikisql")
trainDataSet = dataset["train"]
testDataSet = dataset["test"]

# Inputs are the natural-language questions; the classification target is the
# "agg" field of each example's SQL annotation.
train_questions = trainDataSet["question"]
train_categories = [sql["agg"] for sql in trainDataSet["sql"]]
train_cleaned_questions = [clean_question(q) for q in train_questions]

# The test questions go through exactly the same cleaning as the train ones.
test_questions = testDataSet["question"]
test_cleaned_questions = [clean_question(q) for q in test_questions]

# Split the train dataset into train and validation sets (80/20, fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_cleaned_questions,
    train_categories,
    test_size=0.2,
    random_state=42
)


Found cached dataset wikisql (/home/studio-lab-user/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:

# Define the tokenizer and maximum sequence length
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_length = 128

# The test split carries the same SQL annotations as the train split, so take
# its labels from the "agg" field instead of passing None: CustomDataset
# indexes its labels on every item access, and real labels also make the test
# set usable for scoring.
test_labels = [sql["agg"] for sql in testDataSet["sql"]]

# Create instances of the CustomDataset for train, validation, and test sets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = CustomDataset(test_cleaned_questions, test_labels, tokenizer, max_length)

# Step 4: Fine-tune a pre-trained model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): num_labels=6 presumably matches the number of distinct "agg"
# values in WikiSQL — confirm against the dataset's label set.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model.to(device)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Fine-tuning hyperparameters.
batch_size = 32
epochs = 5

# Only the training split is shuffled; validation keeps dataset order so the
# collected predictions stay aligned with val_labels.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The loops below read the loss from the model output rather than calling
# this criterion; it is kept so the name stays defined for other cells.
loss_fn = torch.nn.CrossEntropyLoss()


def _forward(batch):
    """Move one batch to `device` and run the model on it with its labels."""
    return model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["label"].to(device),
    )


for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = _forward(batch).loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean of the per-batch losses.
    train_loss /= len(train_loader)

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    val_loss = 0.0
    val_predictions = []
    with torch.no_grad():
        for batch in val_loader:
            outputs = _forward(batch)
            val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit per example.
            val_predictions.extend(outputs.logits.argmax(dim=1).cpu().numpy())
    val_loss /= len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")


In [None]:

# Step 5: Evaluate your model
# Predict a class for every test question, then score the predictions against
# the test split's own "agg" labels.
model.eval()
test_predictions = []

# shuffle=False keeps predictions in dataset order so they line up with the
# labels read from testDataSet below.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # No labels are passed: only the logits are needed for prediction.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())

# Labels are extracted the same way as for the training split.
test_labels = [sql["agg"] for sql in testDataSet["sql"]]
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:

# Step 6: Save and use the model with Hugging Face pipeline
# Placeholder output directory; point this at a real location before running.
model_dir = "path/to/save/model"

# Save the tokenizer next to the model weights: when the pipeline is given
# only a directory it loads both the model and the tokenizer from it.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

text_classifier = pipeline("text-classification", model=model_dir)
example_text = "Example question"
predicted_label = text_classifier(example_text)

# The pipeline returns one dict per input; read the label of the first one.
print(f"Predicted label for '{example_text}': {predicted_label[0]['label']}")