In [1]:
# imports
import json
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [2]:
# Report whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
torch.cuda.is_available()

True

In [3]:
# Load the labelled reviews.
df = pd.read_csv('cleaned_labelled_data.csv')
text_column = 'review_text'
label_column = 'label'

# A missing review text is replaced by an empty string.
df[text_column] = df[text_column].fillna('')

# Target column, plus the raw feature columns (free text + review metadata).
y = df[label_column]
X = df[[
    text_column,
    'rating_person',
    'main_category',
    'can_claim',
    'is_local_guide',
    'sentiment_polarity',
    'sentiment_subjectivity',
]]

# One-hot encode the categorical metadata (drop_first removes one level per column).
X = pd.get_dummies(
    X,
    columns=['main_category', 'can_claim', 'is_local_guide'],
    drop_first=True,
)

# The text column feeds the transformer; every remaining column is a numeric input.
text_features = X[text_column]
numerical_features = X.drop(columns=[text_column])

# Stratified 80/20 split, applied jointly so the three objects stay row-aligned.
(
    text_train, text_test,
    numerical_train, numerical_test,
    y_train, y_test,
) = train_test_split(
    text_features,
    numerical_features,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# Rebuild a single training frame: text + label next to the numeric columns.
# Both parts get a fresh 0..n-1 index so the column-wise concat pairs rows by position.
train_df = pd.concat(
    [
        pd.DataFrame({'review_text': text_train, 'label': y_train}).reset_index(drop=True),
        numerical_train.reset_index(drop=True),
    ],
    axis=1,
)

# Every class is drawn up to the size of the most frequent class.
unique_labels = train_df['label'].unique()
max_count = max(train_df['label'].value_counts())

# Sample each class with replacement until it has exactly `max_count` rows.
resampled_dfs = [
    resample(
        train_df[train_df['label'] == class_label],
        replace=True,
        n_samples=max_count,
        random_state=42,
    )
    for class_label in unique_labels
]
train_df_resampled = pd.concat(resampled_dfs)

y_train_resampled = train_df_resampled['label']
X_train_resampled = train_df_resampled.drop(columns=['label'])

print("Resampled training label distribution:", Counter(y_train_resampled))
print(f"Resampled features shape: {X_train_resampled.shape}")

Resampled training label distribution: Counter({'Relevant': 986, 'Vague': 986, 'Relevant and quality': 986, 'no review': 986, 'Advertisement': 986, 'Rants without visit': 986, 'Irrelevant content': 986})
Resampled features shape: (6902, 82)


In [5]:
class CustomDataset(Dataset):
    """Map-style dataset pairing each text with its label and numeric feature row.

    Tokenization happens lazily in ``__getitem__``; every text is padded or
    truncated to 512 tokens.
    """

    def __init__(self, texts, labels, numerical_features, tokenizer, label_map):
        """Store the inputs in positionally indexable form.

        Args:
            texts: object with ``.tolist()`` (e.g. a pandas Series) holding the raw texts.
            labels: object with ``.tolist()`` holding one label per text.
            numerical_features: object exposing ``.values`` (e.g. a DataFrame),
                indexed by position alongside ``texts``.
            tokenizer: callable invoked as ``tokenizer(text, padding=..., ...)``
                that returns a mapping with 'input_ids' and 'attention_mask'.
            label_map: mapping from label value to integer class id.
        """
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.numerical_features = numerical_features.values

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the model inputs (as tensors) for the example at position ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        class_id = self.label_map[self.labels[idx]]
        feature_row = self.numerical_features[idx]

        # squeeze() drops size-1 axes — presumably the batch axis the tokenizer
        # adds when encoding a single text with return_tensors="pt".
        item = {
            key: encoding[key].squeeze()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(class_id)
        item['numerical_features'] = torch.tensor(feature_row, dtype=torch.float)
        return item

class CustomClassifier(nn.Module):
    """Classifier that fuses a DistilBERT text embedding with extra numeric features.

    The numeric features are projected to 64 dimensions by a linear layer,
    concatenated with the encoder's hidden state at sequence position 0, and
    passed to a final linear layer that produces the class logits.
    """

    def __init__(self, text_model_name, num_labels, num_additional_features):
        """Build the text encoder and the two linear layers.

        Args:
            text_model_name: checkpoint name or path handed to
                ``AutoModelForSequenceClassification.from_pretrained``. The
                loaded model must expose a ``.distilbert`` encoder.
            num_labels: number of output classes.
            num_additional_features: width of the numeric feature vector.
        """
        super().__init__()

        self.text_model = AutoModelForSequenceClassification.from_pretrained(
            text_model_name,
            num_labels=num_labels
        )

        self.additional_features_fc = nn.Linear(num_additional_features, 64)

        # Take the encoder width from the loaded checkpoint's config instead of
        # assuming 768, so checkpoints with a different hidden size still work.
        text_hidden_size = self.text_model.config.hidden_size
        self.final_classifier = nn.Linear(
            text_hidden_size + self.additional_features_fc.out_features,
            num_labels
        )

    def forward(self, input_ids, attention_mask, numerical_features, labels=None):
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Args:
            input_ids: token ids forwarded to the DistilBERT encoder.
            attention_mask: attention mask forwarded to the DistilBERT encoder.
            numerical_features: tensor whose second axis is the feature axis
                (presumably ``(batch, num_additional_features)``).
            labels: optional integer class ids; enables the loss computation.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``(logits,)``.
        """
        # Only the bare encoder is used; the sequence-classification head that
        # comes with ``self.text_model`` is bypassed.
        text_outputs = self.text_model.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # First encoder output, sequence position 0 (presumably the [CLS] token).
        cls_hidden_state = text_outputs[0][:, 0, :]

        # Pass the numerical features through their linear layer
        numerical_output = self.additional_features_fc(numerical_features.view(-1, numerical_features.shape[1]))

        # Concatenate the text and numerical outputs
        combined_output = torch.cat((cls_hidden_state, numerical_output), dim=1)

        logits = self.final_classifier(combined_output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.text_model.config.num_labels), labels.view(-1))
            return (loss, logits)
        return (logits,)

In [6]:
def compute_metrics(p):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids``.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    true_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=1)

    # The fourth element returned by the scorer is not needed here.
    weighted_scores = precision_recall_fscore_support(
        true_ids, predicted_ids, average='weighted', zero_division=0
    )
    precision, recall, f1 = weighted_scores[:3]

    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
# Re-create the training dataset with the oversampled data
text_train_resampled = X_train_resampled['review_text']
numerical_train_resampled = X_train_resampled.drop(columns=['review_text'])

# Cast every non-text column to float32 before the frames are handed to the datasets.
numerical_train_resampled = numerical_train_resampled.astype(np.float32)
numerical_test = numerical_test.astype(np.float32)

# label -> class id (ids follow first appearance in `y`) and the reverse lookup.
label_map = {label: i for i, label in enumerate(y.unique())}
id_map = {i: label for label, i in label_map.items()}

train_dataset = CustomDataset(
    text_train_resampled,
    y_train_resampled,
    numerical_train_resampled,
    tokenizer,
    label_map
)

test_dataset = CustomDataset(
    text_test,
    y_test,
    numerical_test,
    tokenizer,
    label_map
)

# Seed the torch RNG before the model is built: its newly created layers are
# initialised at construction time, so without this two runs of the notebook
# start from different weights.
torch.manual_seed(42)

# Derive both sizes from the objects that are actually fed to the model.
num_additional_features = numerical_train_resampled.shape[1]
model = CustomClassifier(
    "distilbert-base-uncased",
    num_labels=len(label_map),
    num_additional_features=num_additional_features,
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tuning configuration: 4 epochs, batches of 8, evaluation and a checkpoint
# after every epoch, and the best checkpoint reloaded once training ends.
training_args = TrainingArguments(
    # output locations / reporting
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_pin_memory=False,
    # per-epoch evaluation and checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize the Trainer with the oversampled dataset and model.
# NOTE(review): the evaluation set is the held-out test split, so the checkpoint
# picked by load_best_model_at_end is selected on the same data that is reported
# later — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [9]:
# Run the training loop of `trainer`; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.596,0.608188,0.829942,0.824449,0.836728,0.829942
2,0.1664,0.674651,0.857558,0.850919,0.854802,0.857558
3,0.1143,0.729275,0.851744,0.843874,0.847334,0.851744
4,0.1122,0.722432,0.856105,0.847686,0.850722,0.856105


TrainOutput(global_step=3452, training_loss=0.21285675132813348, metrics={'train_runtime': 1334.9038, 'train_samples_per_second': 20.682, 'train_steps_per_second': 2.586, 'total_flos': 0.0, 'train_loss': 0.21285675132813348, 'epoch': 4.0})

In [10]:
# Score the test dataset with the trainer's model.
predictions = trainer.predict(test_dataset)
y_true_distilbert = predictions.label_ids
y_pred_distilbert = np.argmax(predictions.predictions, axis=1)

# Translate integer class ids back to their label values for a readable report.
y_true_distilbert_labels = list(map(id_map.__getitem__, y_true_distilbert))
y_pred_distilbert_labels = list(map(id_map.__getitem__, y_pred_distilbert))

print(classification_report(y_true_distilbert_labels, y_pred_distilbert_labels))

                      precision    recall  f1-score   support

       Advertisement       0.75      0.75      0.75        12
  Irrelevant content       0.80      0.67      0.73         6
 Rants without visit       0.48      0.34      0.40        29
            Relevant       0.90      0.69      0.78       247
Relevant and quality       0.78      0.91      0.84       193
               Vague       0.68      1.00      0.81        45
           no review       0.94      1.00      0.97       156

            accuracy                           0.83       688
           macro avg       0.76      0.77      0.75       688
        weighted avg       0.84      0.83      0.82       688



In [11]:
save_directory = "./new-fine-tuned-distilbert2"

# Model weights (through the Trainer) and the tokenizer files.
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

# The label <-> id mapping and the layout of the numeric / one-hot columns exist
# only in this kernel. Store them next to the weights so predictions can be
# decoded and inputs rebuilt in the same column order after a restart.
inference_metadata = {
    "label_map": label_map,
    "numerical_feature_columns": [str(col) for col in numerical_train.columns],
}
with open(f"{save_directory}/inference_metadata.json", "w", encoding="utf-8") as metadata_file:
    json.dump(inference_metadata, metadata_file, indent=2)

print(f"\nModel and tokenizer saved to {save_directory}")


Model and tokenizer saved to ./new-fine-tuned-distilbert2


In [12]:
    # Recursively archive /content/results (the checkpoint-* folders listed in the output) into results.zip.
    !zip -r /content/results.zip /content/results

  adding: content/results/ (stored 0%)
  adding: content/results/checkpoint-863/ (stored 0%)
  adding: content/results/checkpoint-863/rng_state.pth (deflated 26%)
  adding: content/results/checkpoint-863/trainer_state.json (deflated 57%)
  adding: content/results/checkpoint-863/scheduler.pt (deflated 61%)
  adding: content/results/checkpoint-863/optimizer.pt (deflated 34%)
  adding: content/results/checkpoint-863/model.safetensors (deflated 8%)
  adding: content/results/checkpoint-863/training_args.bin (deflated 54%)
  adding: content/results/checkpoint-1726/ (stored 0%)
  adding: content/results/checkpoint-1726/rng_state.pth (deflated 26%)
  adding: content/results/checkpoint-1726/trainer_state.json (deflated 64%)
  adding: content/results/checkpoint-1726/scheduler.pt (deflated 61%)
  adding: content/results/checkpoint-1726/optimizer.pt (deflated 34%)
  adding: content/results/checkpoint-1726/model.safetensors (deflated 8%)
  adding: content/results/checkpoint-1726/training_args.bin (

In [None]:
    !zip -r /content/fine-tuned-distilbert.zip /content/fine-tuned-distilbert

  adding: content/new-fine-tuned-distilbert2/ (stored 0%)
  adding: content/new-fine-tuned-distilbert2/vocab.txt (deflated 53%)
  adding: content/new-fine-tuned-distilbert2/special_tokens_map.json (deflated 42%)
  adding: content/new-fine-tuned-distilbert2/tokenizer_config.json (deflated 75%)
  adding: content/new-fine-tuned-distilbert2/tokenizer.json (deflated 71%)
  adding: content/new-fine-tuned-distilbert2/model.safetensors (deflated 8%)
  adding: content/new-fine-tuned-distilbert2/training_args.bin (deflated 54%)
