In [None]:
! pip install evaluate
! pip install shap
! pip install accelerate -U

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import evaluate
import torch
import shap
from evaluate import evaluator
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, \
    DataCollatorWithPadding, Trainer, TrainingArguments, \
    AutoModelForSequenceClassification, AutoTokenizer, create_optimizer, \
    TFAutoModelForSequenceClassification, pipeline
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt

In [None]:
# Pick the GPU when PyTorch reports CUDA support, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Input files for the two classes and the pretrained checkpoint to fine-tune.
dep, nondep = "dp_posts.tsv", "nondp_posts.tsv"
model_name = "distilbert-base-uncased"


def read_file(filename):
    """Read a text file and return its non-empty lines as a list of strings.

    Each line has its tab characters removed and trailing whitespace
    (including the newline) stripped. Lines that are empty after this
    cleaning are skipped, so every returned entry contains some text.

    Args:
        filename: Path of the file to read. The file is decoded as UTF-8
            regardless of the platform's default encoding.

    Returns:
        list[str]: The cleaned lines, in file order.
    """
    text = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # NOTE(review): tabs are deleted rather than replaced by a space,
            # so adjacent tab-separated fields are fused together - confirm
            # this is intended for the .tsv inputs.
            cleaned = line.replace("\t", "").rstrip()
            # Skip lines that held only tabs/whitespace; otherwise they would
            # become empty-string examples.
            if cleaned:
                text.append(cleaned)
    return text


# Load one list of post strings per class.
depressed_posts, non_depressed_posts = read_file(dep), read_file(nondep)

# Label convention: 1 for posts from the depressed file, 0 for the other file.
depressed_labels = [1 for _ in depressed_posts]
non_depressed_labels = [0 for _ in non_depressed_posts]

# Concatenate posts and labels in the same order so they stay aligned.
all_posts = [*depressed_posts, *non_depressed_posts]
all_labels = [*depressed_labels, *non_depressed_labels]

# Column-oriented dataset: parallel "text" and "label" lists.
data = dict(text=all_posts, label=all_labels)


In [None]:
# Hold out 20% of the examples as the test split; the fixed random_state
# makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
)

# Column dictionaries for each split.
train_data = dict(text=train_texts, label=train_labels)
test_data = dict(text=test_texts, label=test_labels)

# Wrap each split in a Dataset (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_dict(columns) for columns in (train_data, test_data)
)

# Bundle both splits under the keys "train" and "test".
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Show the first test example as a quick sanity check.
print(dataset_dict["test"][0])


{'text': 'Betting you probably pay all the bills too.  What he has done is crazy & invasive, which means he is dangerous.  Get out now, take all important papers, computer and such.   All your jewelry and anything of major value.  Then have him legally evicted.', 'label': 0}


In [None]:
# Tokenizer that belongs to the pretrained checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the "text" column of a batch with truncation enabled.

    Args:
        examples: A batch from the dataset; only its "text" entry is read.

    Returns:
        The tokenizer's output for the batch.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True)


# Collator built from the same tokenizer; it is handed to the training code later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split of the DatasetDict, processing examples in batches.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/6799 [00:00<?, ? examples/s]

Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

In [None]:
# Accuracy metric object loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced with an argmax over axis 1 (presumably per-class scores
            of shape (n_examples, n_classes) - TODO confirm).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Class id -> human-readable name, and the inverse mapping derived from it so
# the two dictionaries cannot drift apart.
id2label = dict(enumerate(("non-depressed", "depressed")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Load the pretrained checkpoint with a 2-class classification head and place
# it on the device selected earlier in the notebook. Using `.to(device)`
# instead of a hard-coded `.cuda()` keeps this cell runnable on CPU-only
# machines.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id).to(device)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # No metric_for_best_model is set, so the Trainer default applies and the
    # "best" checkpoint is the one with the lowest evaluation loss - not
    # necessarily the one with the highest accuracy.
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): the "test" split doubles as the evaluation set used for
# checkpoint selection, so later scores on it are not from unseen data -
# consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# save the model (and its tokenizer) so later cells can reload them from disk
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.257913,0.883529
2,0.345100,0.228745,0.915294
3,0.149400,0.276471,0.92
4,0.070900,0.322433,0.924118
5,0.039700,0.330257,0.924118


('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')

**TensorFlow model below**\
(If you are using the PyTorch model, skip ahead to the "Evaluate the PyTorch Model" section.)

In [None]:
# Hyperparameters and optimizer for fine-tuning the TensorFlow model
batch_size, num_epochs = 16, 5

# Number of full batches per epoch and the resulting total step count.
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

# Legacy Keras Adam optimizer with a constant learning rate.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5)

In [None]:
# Load the TensorFlow variant of the classifier. This rebinds the name `model`,
# replacing whatever it referred to before this cell ran.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Convert datasets to the tf.data.Dataset format. Both splits use the same
# batch size and collator; only the training split is shuffled.
tf_train_set, tf_validation_set = (
    model.prepare_tf_dataset(
        tokenized_data[split_name],
        shuffle=should_shuffle,
        batch_size=4,
        collate_fn=data_collator,
    )
    for split_name, should_shuffle in (("train", True), ("test", False))
)

In [None]:
# Callback that runs compute_metrics on the validation set.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_validation_set,
)

# Configure the model for training; only the optimizer is passed to compile.
model.compile(optimizer=optimizer)


In [None]:
# Keep only the weights of the best epoch.
# The model is compiled without Keras metrics, so no 'val_accuracy' entry is
# ever logged. The accuracy computed on the validation set is written to the
# epoch logs by KerasMetricCallback under the key returned by compute_metrics,
# i.e. 'accuracy'. Monitoring 'val_accuracy' made save_best_only skip every
# epoch, so no checkpoint was written.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./my_model',
    save_weights_only=True,
    save_best_only=True,
    monitor='accuracy',
    mode='max'
)

# Order matters: metric_callback must run first so that 'accuracy' is already
# in the logs when checkpoint_callback inspects them.
callbacks = [metric_callback, checkpoint_callback]

# Start training the model
# NOTE(review): epochs=3 here differs from num_epochs = 5 defined in the
# hyperparameter cell - confirm which value is intended.
model.fit(x=tf_train_set, validation_data=tf_validation_set,
          epochs=3, callbacks=callbacks)

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported



Epoch 2/3



Epoch 3/3





KeyboardInterrupt: 

**Evaluate the PyTorch Model**

In [None]:
# Reload the fine-tuned PyTorch model from disk and place it on the device
# chosen earlier in the notebook (no hard-coded .cuda(), so this also runs on
# CPU-only machines).
model = AutoModelForSequenceClassification.from_pretrained("my_model").to(device)

# Score the saved model on the held-out texts. The evaluator receives the
# model's path (via the public `name_or_path` config property rather than the
# private `_name_or_path` attribute) and the raw-text test set; label_mapping
# translates the predicted label names back to the integer ids stored in the
# dataset.
task_evaluator = evaluator("text-classification")
results = task_evaluator.compute(
    model_or_pipeline=model.config.name_or_path,
    data=test_dataset,
    metric="accuracy",
    label_mapping=model.config.label2id,
    # Bootstrap resampling of the score, seeded for repeatability.
    strategy="bootstrap",
    n_resamples=10,
    random_state=42
)
print(results["accuracy"]["score"])

0.9152941176470588


**Use SHAP to explain the PyTorch model**

In [None]:
# Preview the first ten test posts (slice the rows, then take the text column).
texts = test_dataset[:10]["text"]
print(texts)

['Betting you probably pay all the bills too.  What he has done is crazy & invasive, which means he is dangerous.  Get out now, take all important papers, computer and such.   All your jewelry and anything of major value.  Then have him legally evicted.', 'Sorry if this is a really stupid question, but although I have used the simpler functions of LaTeX for a year or so, starting to write an actual book is massive for me.', "She was shocked that a guy would have sex with me. I confronted her later and she profusely apologized. She said I misinterpreted her intentions. Well, I didn't. She must think I am an unfuckable monster.", 'My hope for you is that by the time the SHTF for the both of them, you will have healed enough that you can confidently tell him to go to hell when he comes crawling back.You will win the war but it will take some time, your sister rushing to hurt you with pics and everything shows to me that stealing your man was one of his main interest, which might lead to d

In [None]:
# Reload the tokenizer saved next to the fine-tuned model and switch the model
# to inference mode.
tokenizer = AutoTokenizer.from_pretrained("my_model")
model.eval()

# Explain the first five test posts.
texts = test_dataset["text"][:5]

# top_k=None makes the pipeline return the score of every class for each
# input. By default only the top label is returned, in which case SHAP sees a
# score of 0 for the other class and the attributions are distorted.
# The device is passed explicitly so the pipeline runs where the notebook's
# `device` points.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    device=device,
)

# Create a SHAP explainer
explainer = shap.Explainer(classifier)

# Compute SHAP values for the inputs
shap_values = explainer(texts)

# Visualize the explanations
shap.plots.text(shap_values)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
PartitionExplainer explainer: 6it [00:22,  5.63s/it]
