In [1]:
import pandas as pd
import numpy as np
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score

from transformers import (
    BertTokenizer,
    BertModel,
    get_linear_schedule_with_warmup
)

In [2]:
def load_questions(f_csv='Processed_data.csv', exclude_targets=('?', 'unit 2')):
    """Load questions from a CSV file and integer-encode their topic labels.

    Args:
        f_csv: Path to a CSV file with at least the columns ``question`` and
            ``target``.
        exclude_targets: Iterable of ``target`` values whose rows are dropped
            before encoding. Defaults to the two values the notebook has
            always filtered out.

    Returns:
        A 3-tuple ``(questions, labels, label_target)``:
        * ``questions`` - list with the ``question`` column of the kept rows;
        * ``labels`` - list of integer class ids, aligned with ``questions``;
        * ``label_target`` - dict mapping each class id to its original
          ``target`` value.
    """
    raw = pd.read_csv(f_csv)
    keep = ~raw['target'].isin(list(exclude_targets))
    # Take an explicit copy: adding a column to a filtered slice of ``raw``
    # is what triggers pandas' SettingWithCopyWarning.
    df = raw.loc[keep, ['question', 'target']].copy()

    encoder = LabelEncoder()
    df['class_label'] = encoder.fit_transform(df['target'])

    # ``classes_[i]`` is the original value encoded as ``i``, so enumerating
    # it gives the same id -> name mapping as inverse-transforming every id.
    label_target = dict(enumerate(encoder.classes_.tolist()))

    return df['question'].to_list(), df['class_label'].to_list(), label_target

In [4]:
# Load the question texts, their integer class ids and the id -> topic mapping.
questions, labels, label_target = load_questions()

In [5]:
# Inspect the class id -> topic name mapping returned by load_questions().
label_target

{0: 'entrepreneurs and leaders',
 1: 'managing people',
 2: 'marketing mix and strategy',
 3: 'meeting customer needs',
 4: 'the market'}

In [6]:
class QuestionsDataset(Dataset):
    """Map-style dataset that tokenizes one question per item, on demand.

    Each item is a dict with ``input_ids``, ``attention_mask`` (both flattened
    to 1-D) and ``label`` (the item's label wrapped in a tensor).
    """

    def __init__(self, questions, labels, tokenizer, max_length):
        self.questions, self.labels = questions, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.questions[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer is asked for tensors; flatten() collapses whatever
        # batch axis it adds so the DataLoader can stack items itself.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

In [7]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_dim = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (un-normalised) class logits for a batch."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output, regularised by dropout.
        return self.fc(self.dropout(encoder_out.pooler_output))

In [8]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        The cross-entropy loss averaged over all samples seen in the epoch,
        as a Python float; ``nan`` when ``data_loader`` yields no batches.
    """
    model.train()
    # The criterion is stateless, so build it once rather than per batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    n_samples = 0

    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Weight by batch size so a short final batch does not skew the mean.
        batch_n = labels.size(0)
        loss_sum += loss.item() * batch_n
        n_samples += batch_n

    return loss_sum / n_samples if n_samples else float('nan')

def evaluate(model, data_loader, num_labels, device):
    """Score ``model`` on ``data_loader`` with weighted F1 and weighted ROC-AUC.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return a logits tensor directly or an object exposing
            ``.logits``.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask`` and ``label``.
        num_labels: Total number of classes (class ids are ``0..num_labels-1``).
        device: Device the input tensors are moved to.

    Returns:
        ``{'f1': ..., 'roc_auc': ...}``. ``roc_auc`` is ``nan`` (with a
        warning) when scikit-learn reports it as undefined for this data.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    import warnings

    model.eval()
    all_labels = []
    all_probs = []
    all_preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Get the logits.
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)
            logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits

            # Probability distribution over classes.
            probs = F.softmax(logits, dim=1)

            # Hard predictions.
            preds = torch.argmax(probs, dim=1)

            # Labels are only consumed on the host, so they never need to
            # visit the compute device.
            all_labels.append(batch['label'].cpu().numpy())
            all_probs.append(probs.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    if not all_labels:
        raise ValueError("evaluate() received an empty data_loader; no metrics can be computed")

    # Concatenate the per-batch pieces into single arrays.
    all_labels = np.concatenate(all_labels, axis=0)
    all_probs = np.concatenate(all_probs, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    # 1) F1-score
    f1 = f1_score(all_labels, all_preds, average='weighted')

    # 2) ROC-AUC
    try:
        if num_labels == 2:
            # Binary targets are scored against the probability of class 1;
            # a one-column binarised target would not line up with the
            # two-column probability matrix.
            roc_auc = roc_auc_score(all_labels, all_probs[:, 1])
        else:
            # One-hot true labels, one column per class.
            labels_onehot = label_binarize(all_labels, classes=list(range(num_labels)))
            roc_auc = roc_auc_score(
                labels_onehot,
                all_probs,
                multi_class='ovr',
                average='weighted'
            )
    except ValueError as exc:
        # scikit-learn may refuse to compute ROC-AUC (for example when a class
        # never occurs among the true labels). Report NaN instead of aborting
        # the caller's loop; the warning keeps the reason visible.
        warnings.warn(f"ROC-AUC could not be computed: {exc}")
        roc_auc = float('nan')

    return {'f1': f1, 'roc_auc': roc_auc}

In [9]:
# Length of the longest question, measured in characters (not tokens).
max(map(len, questions))

289

In [10]:
# Hold out 20% of the data for validation; random_state pins the split.
(train_questions, val_questions,
 train_labels, val_labels) = train_test_split(
    questions,
    labels,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Model and training configuration.
bert_model_name = 'bert-base-uncased'
num_classes = len(np.unique(labels))
max_length = 300      # passed to the tokenizer as max_length for padding/truncation
batch_size = 16
num_epochs = 20
learning_rate = 2e-5

# Seed the RNGs before the DataLoader and the model are created; without this
# batch shuffling, dropout and the classifier-head initialisation differ on
# every run and the reported metrics cannot be reproduced.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

In [19]:
# Tokenizer matching the pretrained checkpoint used by the classifier.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Datasets tokenize lazily, one question per __getitem__ call.
train_dataset = QuestionsDataset(
    questions=train_questions,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = QuestionsDataset(
    questions=val_questions,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training batches are shuffled; validation keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [20]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the classifier, then move its parameters to the chosen device.
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

In [23]:
# The scheduler is stepped once per batch, so its horizon is batches x epochs.
total_steps = len(train_dataloader) * num_epochs
# The first 20% of the steps ramp the learning rate up; the rest decay it linearly.
warmup_steps = int(0.2 * total_steps)

optimizer = AdamW(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=0.01,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=warmup_steps,
)

In [24]:
# One pass over the training data per epoch, followed by validation scoring.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train(model, train_dataloader, optimizer, scheduler, device)

    metrics = evaluate(model, val_dataloader, num_classes, device)
    # One "name=value" entry per line.
    metric_lines = [f'{k}={v}' for k, v in metrics.items()]
    metrics_str = '\n'.join(metric_lines)
    print(f"Validation metrics: {metrics_str}")


Epoch 1/20
Validation metrics: f1=0.673245122224714
roc_auc=0.9482117979184778
Epoch 2/20
Validation metrics: f1=0.7113217098438773
roc_auc=0.9450241376229602
Epoch 3/20
Validation metrics: f1=0.8101820197390132
roc_auc=0.961999472752807
Epoch 4/20
Validation metrics: f1=0.7930099975212757
roc_auc=0.9657663414722168
Epoch 5/20
Validation metrics: f1=0.7164773601748392
roc_auc=0.9555277115051236
Epoch 6/20
Validation metrics: f1=0.8124477120040562
roc_auc=0.9681182688511777
Epoch 7/20
Validation metrics: f1=0.7555406045201963
roc_auc=0.9610374006625358
Epoch 8/20
Validation metrics: f1=0.7930099975212757
roc_auc=0.9726834901651674
Epoch 9/20
Validation metrics: f1=0.7930099975212757
roc_auc=0.9666672388046894
Epoch 10/20
Validation metrics: f1=0.7930099975212757
roc_auc=0.9671649988843309
Epoch 11/20
Validation metrics: f1=0.7930099975212757
roc_auc=0.9706836477231767
Epoch 12/20
Validation metrics: f1=0.7930099975212757
roc_auc=0.9763134858653303
Epoch 13/20
Validation metrics: f1=0.79

In [27]:
# NOTE: despite the variable name, this scores the validation loader again;
# no separate held-out test split is evaluated here.
test_metrics = evaluate(model, val_dataloader, num_classes, device)
print(*(f'{k} = {v}' for k, v in test_metrics.items()), sep='\n')

f1 = 0.7930099975212757
roc_auc = 0.9742022965620226


# ТЕСТ на новых вопросах

In [29]:
# Class id -> topic name mapping, shown again for reference before predicting.
label_target

{0: 'entrepreneurs and leaders',
 1: 'managing people',
 2: 'marketing mix and strategy',
 3: 'meeting customer needs',
 4: 'the market'}

In [30]:
# Inverse lookup: topic name -> class id.
target_label = dict(zip(label_target.values(), label_target.keys()))
target_label

{'entrepreneurs and leaders': 0,
 'managing people': 1,
 'marketing mix and strategy': 2,
 'meeting customer needs': 3,
 'the market': 4}

In [32]:
# Unseen questions to classify; only the 'question' column is used.
df = pd.read_excel('new_questions.xlsx')
new_questions = df['question'].tolist()
new_questions

['Define the term ‘quantitative data’ . (Extract A, line 12)',
 'Construct a supply and demand diagram to show the impact of increasing publicity on the biodegradable packaging market.',
 'Analyse two entrepreneurial characteristics shown by Vaibhav Anant that may have helped Bambrew to succeed.',
 'Discuss the benefits for Vaibhav Anant of using secondary market research when starting his business .',
 'Assess if there is likely to be a trade-off between an ethical stance and profit for Bambrew.',
 'Define the term ‘social trends’ . (Extract D, line 2)',
 'Using the data, calculate the increase in monthly revenue for Aldi  between January 2019 and January 2020. You are advised to show your workings.',
 'Analyse two benefits for Aldi  of having employee welfare as one of its main business objectives.',
 'Assess the benefits for Aldi  of sponsoring the UK Olympic team.',
 'Assess the likely impact of price comparison websites on supermarkets in the UK.']

In [35]:
def predict_topic(text, model, tokenizer, device, max_length=300, label_map=None):
    """Predict the topic name for a single question.

    Args:
        text: The question to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning a logits tensor with one row.
        tokenizer: Callable used to encode ``text`` into ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length the input is padded / truncated to.
        label_map: Optional mapping from class id to topic name. When omitted,
            the notebook-level ``label_target`` mapping is used, which keeps
            existing calls working unchanged.

    Returns:
        The topic name mapped from the highest-scoring class id.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
    pred_idx = torch.argmax(logits, dim=1).item()

    # Resolve the mapping late so the global is only touched when no explicit
    # mapping was supplied.
    mapping = label_target if label_map is None else label_map
    return mapping[pred_idx]

In [38]:
# Classify every new question and collect the results column-wise; the table
# is built once from the finished columns.
q_topic = {
    'question': list(new_questions),
    'target': [predict_topic(q, model, tokenizer, device) for q in new_questions],
}
df_new = pd.DataFrame(q_topic)
df_new

Unnamed: 0,question,target
0,Define the term ‘quantitative data’ . (Extract...,meeting customer needs
1,Construct a supply and demand diagram to show ...,the market
2,Analyse two entrepreneurial characteristics sh...,entrepreneurs and leaders
3,Discuss the benefits for Vaibhav Anant of usin...,meeting customer needs
4,Assess if there is likely to be a trade-off be...,entrepreneurs and leaders
5,"Define the term ‘social trends’ . (Extract D, ...",meeting customer needs
6,"Using the data, calculate the increase in mont...",meeting customer needs
7,Analyse two benefits for Aldi of having emplo...,managing people
8,Assess the benefits for Aldi of sponsoring th...,marketing mix and strategy
9,Assess the likely impact of price comparison w...,marketing mix and strategy


In [39]:
# Persist the predicted topics next to their questions as an Excel workbook.
df_new.to_excel('new_questions_pred_bert.xlsx')

In [40]:
# Save only the model's state_dict (its parameters/buffers), not the model object itself.
torch.save(model.state_dict(), "bert_classifier.pth")