In [8]:
from datasets import load_dataset 
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer

In [9]:
df = pd.read_csv("./data/disco_vs_not_tokenized.csv").rename(columns={"song": "text"}) 
df_train, df_test = train_test_split(df, test_size=0.33)
df_test.to_csv("./data/disco_vs_not_tokenized_test.csv", index=False)
df_train.to_csv("./data/disco_vs_not_tokenized_train.csv", index=False)

In [10]:
del df_train, df_test, df

In [11]:
ds = load_dataset("csv", data_files={"train": "./data/disco_vs_not_tokenized_train.csv",
                                     "test": "./data/disco_vs_not_tokenized_test.csv"}) 
ds

Using custom data configuration default-0dd63ec2b6567fad


Downloading and preparing dataset csv/default to /home/jupyter/.cache/huggingface/datasets/csv/default-0dd63ec2b6567fad/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/csv/default-0dd63ec2b6567fad/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2303
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1135
    })
})

In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
ds = ds.map(tokenize_function, batched=True)
ds

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/jupyter/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve/

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2303
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1135
    })
})

In [8]:
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

training_args = TrainingArguments(
    output_dir='./results_disco_vs_not',         
    num_train_epochs=3,              
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,               
    weight_decay=0.01,               
    logging_dir='./logs_disco_vs_not',          
)

trainer = Trainer(
    model=model,                        
    args=training_args,                  
    train_dataset=ds['train'],        
    eval_dataset=ds['test']         
)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/jupyter/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-base-cased/resolve/main/pytorch_model.bin from ca

In [9]:
trainer.train()
model.save_pretrained('disco_vs_not_disco')

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2303
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1728


Step,Training Loss
500,0.5748
1000,0.5443
1500,0.5378


Saving model checkpoint to ./results_disco_vs_not/checkpoint-500
Configuration saved in ./results_disco_vs_not/checkpoint-500/config.json
Model weights saved in ./results_disco_vs_not/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results_disco_vs_not/checkpoint-1000
Configuration saved in ./results_disco_vs_not/checkpoint-1000/config.json
Model weights saved in ./results_disco_vs_not/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results_disco_vs_not/checkpoint-1500
Configuration saved in ./results_disco_vs_not/checkpoint-1500/config.json
Model weights saved in ./results_disco_vs_not/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in disco_vs_not_disco/config.json
Model weights saved in disco_vs_not_disco/pytorch_model.bin


In [13]:
from datasets import load_metric
metric = load_metric('accuracy')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred 
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
model = BertForSequenceClassification.from_pretrained("disco_vs_not_disco")

training_args = TrainingArguments(
    output_dir='./results_disco_vs_not',         
    num_train_epochs=3,              
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,               
    weight_decay=0.01,               
    logging_dir='./logs_disco_vs_not',          
)
trainer = Trainer(
    model=model,                        
    args=training_args,                  
    train_dataset=ds['train'],        
    eval_dataset=ds['test'],
    compute_metrics=compute_metrics
    
)

loading configuration file disco_vs_not_disco/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file disco_vs_not_disco/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassificat

In [14]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1135
  Batch size = 8


{'eval_loss': 0.4603591561317444,
 'eval_accuracy': 0.8405286343612335,
 'eval_runtime': 981.0585,
 'eval_samples_per_second': 1.157,
 'eval_steps_per_second': 0.145}