In [1]:
# Upgrade transformers to latest version to avoid TrainingArguments errors
!pip install --upgrade -q transformers datasets scikit-learn accelerate evaluate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig


In [3]:
from google.colab import files

# Opens the Colab upload widget; the returned mapping is keyed by uploaded filename.
uploaded = files.upload()  # choose your CSV file
if not uploaded:
    # Without this guard a cancelled upload fails below with an opaque IndexError.
    raise ValueError("No file was uploaded; re-run this cell and choose your CSV file.")

# Only the first uploaded file is used as the dataset.
CSV_PATH = next(iter(uploaded))
print("Uploaded file:", CSV_PATH)

Saving inductAI_softskills_dataset.csv to inductAI_softskills_dataset.csv
Uploaded file: inductAI_softskills_dataset.csv


In [4]:
# Record the installed transformers version so the run can be reproduced later.
import transformers
print(transformers.__version__)

4.57.3


In [5]:
df = pd.read_csv(CSV_PATH)
print(df.head())

text_cols = ['question', 'answer']
label_cols = ['communication', 'confidence', 'teamwork', 'problem_solving']

# Ensure labels are numeric; unparseable values become NaN and are dropped below
for c in label_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Drop incomplete rows BEFORE building the text column: astype(str) turns NaN
# into the literal string "nan", so a dropna on the combined text never fires.
df = df.dropna(subset=label_cols + text_cols).reset_index(drop=True)

# Combine question and answer into one text input
df['text'] = "Q: " + df['question'].astype(str) + " \nA: " + df['answer'].astype(str)

# Split train and validation (fixed seed so the split is reproducible)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# preserve_index=False keeps the shuffled pandas index out of the Dataset columns
train_ds = Dataset.from_pandas(train_df[['text'] + label_cols], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[['text'] + label_cols], preserve_index=False)

dataset = DatasetDict({"train": train_ds, "validation": val_ds})
print(dataset)


                                            question  \
0  Describe a situation where you had to take a b...   
1  Describe a situation where you had to take a b...   
2  Describe a situation where you had to take a b...   
3  Describe a situation where you had to take a b...   
4  Describe a situation where you had to take a b...   

                                              answer  communication  \
0  I had to convince management to invest in a ne...              5   
1  Ahh, I once suggested a different approach in ...              5   
2  I recommended a client solution which was unus...              4   
3  Uh, I told my team to skip a minor step in a r...              3   
4  I suggested delaying a campaign due to market ...              4   

   confidence  problem_solving  teamwork  
0           5                4         4  
1           5                4         5  
2           5                4         4  
3           3                3         3  
4           4        

In [6]:
# Checkpoint identifier shared by the tokenizer and the models built in later cells.
MODEL_NAME = "microsoft/deberta-v3-small"
# Downloads (or loads from cache) the tokenizer files that belong to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(examples):
    """Tokenize a batch of examples and attach one float label vector per row.

    Args:
        examples: Batch dict with a 'text' list plus one list per name in the
            notebook-level ``label_cols``.

    Returns:
        The tokenizer output (padded/truncated to 256 tokens) with an added
        'labels' entry holding ``len(label_cols)`` floats per example.
    """
    tokenized = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)
    # Cast to float: the CSV scores are integers, but regression targets need
    # to be floating point to be compared against the model's float outputs.
    tokenized['labels'] = [
        [float(examples[c][i]) for c in label_cols]
        for i in range(len(examples['text']))
    ]
    return tokenized

# batched=True hands `preprocess` whole batches; remove_columns drops the original
# columns so only the tokenizer outputs and 'labels' remain.
# NOTE(review): `tokenized_ds` is not referenced by any later cell visible here - confirm it is still needed.
tokenized_ds = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/691 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [7]:
# Configure the Hugging Face sequence-classification head for regression with 4 outputs
# (the same count as the entries in `label_cols`).
# NOTE(review): `model` is reassigned to a custom SoftSkillModel in a later cell and no
# Trainer is constructed in the visible cells - confirm whether this cell is still needed.
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=4, problem_type="regression")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)


pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def compute_metrics(pred):
    """Compute overall and per-skill error metrics on clipped predictions.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` arrays whose
            columns follow the order of the notebook-level ``label_cols``.

    Returns:
        dict with 'mae_overall', 'mse_overall' and one 'mae_<name>' entry per
        label column.
    """
    targets = pred.label_ids
    # Predictions are clamped to the 1-5 range before scoring.
    clipped = np.clip(pred.predictions, 1.0, 5.0)

    flat_targets = targets.flatten()
    flat_clipped = clipped.flatten()
    metrics = {
        'mae_overall': mean_absolute_error(flat_targets, flat_clipped),
        'mse_overall': mean_squared_error(flat_targets, flat_clipped),
    }
    metrics.update(
        (f'mae_{name}', mean_absolute_error(targets[:, col], clipped[:, col]))
        for col, name in enumerate(label_cols)
    )
    return metrics


In [9]:
import torch
from torch.utils.data import DataLoader
# Aliased so the Hugging Face `datasets.Dataset` imported at the top of the notebook
# is not shadowed; otherwise re-running the data-prep cell after this one would call
# `from_pandas` on the torch class and fail.
from torch.utils.data import Dataset as TorchDataset


class SoftSkillDataset(TorchDataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its labels.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of per-example label lists (converted to float tensors).
        tokenizer: Callable tokenizer returning tensors with a leading batch axis
            when called with ``return_tensors='pt'``.
        max_len: Length every example is padded/truncated to (default 256).
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized fields for example ``idx`` plus a float 'labels' tensor."""
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        # The tokenizer returns tensors with a leading batch axis of size 1; drop it
        # so the DataLoader can stack items into batches itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = label
        return item


In [10]:
# Pull plain Python lists out of the train/validation frames.
train_texts, val_texts = (frame['text'].tolist() for frame in (train_df, val_df))
train_labels, val_labels = (
    frame[label_cols].values.tolist() for frame in (train_df, val_df)
)

# Wrap each split in the tokenizing dataset (default max_len).
train_dataset = SoftSkillDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = SoftSkillDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)


In [11]:
from transformers import AutoModel
import torch.nn as nn
import torch.optim as optim

class SoftSkillModel(nn.Module):
    """Pretrained encoder followed by a linear regression head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_labels: Number of regression outputs (default 4).
    """

    def __init__(self, model_name, num_labels=4):
        super().__init__()
        # Attribute names become the prefixes of the state_dict keys; keep them stable.
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and apply the head to the first token's hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0]  # CLS token
        return self.regressor(first_token)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# nn.Module.to() returns the module itself, so construction and placement can be chained.
model = SoftSkillModel(MODEL_NAME).to(device)

# Mean-squared-error regression objective, optimized with AdamW.
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)


In [12]:
epochs = 4

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        # Move only the tensors the model and loss consume onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader)}")


Epoch 1, Train Loss: 8.655356458642267
Epoch 2, Train Loss: 0.7845197929577394
Epoch 3, Train Loss: 0.7832007963549007
Epoch 4, Train Loss: 0.6974895806475119


# Task
Please execute the code in the cell with ID `TORH4L5JFkMU`.

## Execute Upgrade Cell

### Subtask:
Ensure cell `TORH4L5JFkMU` (the `!pip install --upgrade` command) is executed. You might need to explicitly run it if Colab's 'Run all' didn't catch it.


**Reasoning**:
The subtask is to execute the specified cell, which contains the pip install command. The command should be run directly.



In [13]:
# Persist only the weights (state_dict); reloading requires re-creating SoftSkillModel first.
torch.save(model.state_dict(), "/content/softskills_model.pt")
# Tokenizer files go to a sibling directory so inference can load them from disk.
tokenizer.save_pretrained("/content/softskills_model")
print("Model saved!")


Model saved!


In [16]:
# Mount Google Drive into the Colab filesystem.
# NOTE(review): no visible cell reads from or writes to /content/drive - confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# `files` is also imported in the upload cell; the repeat import is harmless.
from google.colab import files
# Triggers a browser download of the weights file only; the tokenizer directory
# saved next to it is not downloaded by this cell.
files.download("/content/softskills_model.pt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

MODEL_NAME = "microsoft/deberta-v3-small"


class SoftSkillModel(nn.Module):
    """Encoder-plus-linear-head regressor, re-declared so this cell can rebuild the network.

    The attribute names ``bert`` and ``regressor`` determine the state_dict keys
    and must match the ones used when the weights were saved.
    """

    def __init__(self, model_name, num_labels=4):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        encoder_width = self.bert.config.hidden_size
        self.regressor = nn.Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the hidden state at sequence position 0."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        cls_vector = hidden_states[:, 0]  # CLS token
        return self.regressor(cls_vector)


# Load tokenizer
# NOTE(review): the save cell wrote to absolute /content/... paths while these paths are
# relative - this presumably relies on the working directory being /content; confirm.
tokenizer = AutoTokenizer.from_pretrained("./softskills_model")  # folder path

# Load model
model = SoftSkillModel(MODEL_NAME)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load("softskills_model.pt", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
# Switch to inference mode (disables dropout).
model.eval()


SoftSkillModel(
  (bert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, e

In [19]:
def predict_softskills(question, answer, scorer=None, encoder=None):
    """Score one question/answer pair on the four soft-skill dimensions.

    Args:
        question: Interview question text.
        answer: Answer text to evaluate.
        scorer: Optional model called with ``input_ids`` and ``attention_mask``
            that returns four scores for the single input. Defaults to the
            notebook-level ``model``.
        encoder: Optional tokenizer callable. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        dict mapping 'communication', 'confidence', 'teamwork' and
        'problem_solving' to predictions clipped to [1, 5] and rounded to two
        decimals.
    """
    scorer = model if scorer is None else scorer
    encoder = tokenizer if encoder is None else encoder

    # Must match the training text format exactly, including the space before the newline.
    text = f"Q: {question} \nA: {answer}"

    encoding = encoder(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # Run on whatever device the model lives on so a GPU-resident model also works.
    first_param = next(iter(scorer.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        outputs = scorer(
            input_ids=encoding["input_ids"].to(device),
            attention_mask=encoding["attention_mask"].to(device)
        )

    # Flatten the single-example output to a plain list of four floats.
    preds = outputs.reshape(-1).tolist()

    # Same order as the label columns used for training.
    skill_names = ("communication", "confidence", "teamwork", "problem_solving")

    # Clip predictions to 1–5 range
    return {
        name: round(min(5.0, max(1.0, score)), 2)
        for name, score in zip(skill_names, preds)
    }


In [20]:
# Smoke test: score one hand-written question/answer pair with the reloaded model.
result = predict_softskills(
    "Describe a time when you had to speak up for yourself",
    "When I was wrong I have corrected the manager"
)

# Prints the dict of per-skill scores returned by predict_softskills.
print(result)


{'communication': 4.64, 'confidence': 4.58, 'teamwork': 4.25, 'problem_solving': 4.39}
