# 1. Activate GPU and Install Dependencies

In [1]:
# On Colab, enable a GPU for faster training: 'Runtime' > 'Change runtime type',
# then select GPU as the hardware accelerator.
# The check below reports CUDA availability only; it returns False on machines
# without a CUDA-capable GPU.
import torch
torch.cuda.is_available()

False

# Load the IMDb dataset for training

In [2]:
from datasets import load_dataset
imdb = load_dataset("imdb")

Found cached dataset imdb (/Users/kartiksharma/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

# Check one sample

In [3]:
# Keep a handle on each split and peek at the first training example.
train, test = imdb['train'], imdb['test']
print(train[0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

# Connect to MongoDB and insert some data into the database

In [5]:
import os

import pymongo

# Read the connection string from the environment so that real credentials
# never have to be committed with the notebook. The fallback is the local
# test instance this notebook was originally written against.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb://testmongodbuser:testmongodbpassword@localhost:27018/admin",
)

# Keep the client and the database under separate names: re-using `db` for
# both made the cell confusing and hid the client object after the first line.
mongo_client = pymongo.MongoClient(MONGODB_URI)
db = mongo_client['test_db']

# Wrap the database with SuperDuperDB

In [6]:
import superduperdb
from superduperdb import superduper


db = superduper(db)

INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.


In [None]:
db

In [17]:
from superduperdb.core.documents import Document as D
from superduperdb.datalayer.mongodb.query import Collection

In [None]:

# Convert the training split to pandas and keep only the first 100 rows
# so the demo stays small.
train_df = train.to_pandas().head(100)

In [None]:
# Wrap every row in a Document; column 0 is stored as "text" and column 1 as "label".
data = []
for row in train_df.to_numpy():
    data.append(D({"text": row[0], "label": row[1]}))

In [None]:
len(data)

In [None]:
# Insert the prepared documents into the 'documents' collection via the wrapped `db`.
db.execute(Collection('documents').insert_many(data))

# Create a tokenizer

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Create a preprocessing function

In [157]:
def transform(sample):
    """Tokenize ``sample['text']`` and merge the tokenizer output into ``sample``.

    The mapping passed in is updated in place with every key produced by the
    module-level ``tokenizer`` (called with ``truncation=True``), and the same
    object is returned.
    """
    encoded = tokenizer(sample['text'], truncation=True)
    sample.update(**encoded)
    return sample

In [10]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create Model (DistilBert)

## DistilBERT is a smaller, distilled version of BERT

In [11]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.

In [None]:
# Define the evaluation metrics
import numpy as np
from datasets import load_metric

# Metrics are loaded lazily and cached here so that repeated evaluation calls
# do not re-load them every time.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package; consider migrating.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the metric called ``name``, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scores = {}
    for name in ("accuracy", "f1"):
        result = _get_metric(name).compute(predictions=predictions, references=labels)
        scores[name] = result[name]
    return scores

In [278]:
from superduperdb.models.transformers.wrapper import TransformersTrainerConfiguration, Pipeline

In [None]:
from transformers import TrainingArguments, Trainer
# The same name is used as the configuration identifier and the output directory.
repo_name = "superduperdb-sentiment-analysis"
training_args = TransformersTrainerConfiguration(
    identifier=repo_name,
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    # Only request the Apple MPS backend where it actually exists, so the
    # notebook also runs on CUDA or CPU-only machines.
    use_mps_device=torch.backends.mps.is_available(),
)

In [None]:
# Pick the best available accelerator instead of assuming Apple MPS:
# CUDA first, then MPS, falling back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)

In [281]:
# Build the pipeline around the model and the `transform` preprocessing
# function. The device string is taken from the `device` object the model was
# moved to, so the two cannot drift apart.
trainer = Pipeline(
    identifier='my-sentiment-analysis',
    tokenizer=transform,
    object=model,
    train_X='text',
    train_y='label',
    device=device.type,
)

In [280]:
from superduperdb.core.dataset import Dataset
# Fine-tune on the 'documents' collection and evaluate on the documents whose
# `_fold` field is 'valid'.
# NOTE(review): the training `select` has no `_fold` filter, so confirm that
# validation documents are excluded from training.
trainer.fit(
    X='text',
    y='label',
    db=db,
    select=Collection('documents').find(),
    configuration=training_args,
    validation_sets=[
        Dataset(
            identifier='my-eval',
            select=Collection(name='documents').find({'_fold': 'valid'}),
            db=db,
        )
    ],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Step,Training Loss


KeyboardInterrupt: 