In [1]:
path = '/home/hv2237/GYAFC_Corpus/Entertainment_Music'
data = {}
for split in ['train']:
    data[split] = []
    for f, i in zip(open(f'{path}/{split}/formal').readlines(),open(f'{path}/{split}/informal').readlines()):
        data[split].append({'formal':f[:-1], 'informal':i[:-1]})
        
for split in ['tune', 'test']:
    data[split] = []
    refs = [open(f'{path}/{split}/formal.ref{i}').readlines() for i in range(4)]
    inp = open(f'{path}/{split}/informal').readlines()
    for f in range(len(inp)):
        temp = {}
        temp['informal'] = inp[f][:-1]
        for i in range(4):
            temp[f'formal.ref{i}'] = refs[i][f][:-1]
        data[split].append(temp)

## Labels
- Formal -> 1 
- Infromal -> 0 

In [3]:
from datasets import Dataset
import pandas as pd
datasets = {}
for split in ['train', 'test', 'tune']:
    data_split = data[split]
    if split == 'train':
        formal = list(map(lambda x: x['formal'], data_split))
    else:
        formal = []
        for i in range(4):
            formal += list(map(lambda x: x[f'formal.ref{i}'], data_split))
    informal = list(map(lambda x: x['informal'], data_split))
    df = pd.DataFrame({'text':formal+informal, 'labels':[1]*len(formal)+[0]*len(informal)})
    datasets[split] = Dataset.from_pandas(df)

In [4]:
datasets['train']

Dataset({
    features: ['text', 'labels'],
    num_rows: 105190
})

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer 

In [9]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True,max_length=30)

In [11]:
tokenized_datasets = {}
for split in datasets:
    tokenized_datasets[split] = datasets[split].map(tokenize_function, batched=True)

  0%|          | 0/106 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

In [12]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [15]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="style-classifier",
                                 num_train_epochs=3,              # total number of training epochs
                                per_device_train_batch_size=16,  # batch size per device during training
                                per_device_eval_batch_size=64,   # batch size for evaluation
                                warmup_steps=500,                # number of warmup steps for learning rate scheduler
                                weight_decay=0.01,  
                                  evaluation_strategy="steps",
                                save_strategy="steps",
                                eval_steps=500,
                                save_steps=1000,# strength of weight decay
                                 )

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['tune'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [17]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 105190
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 19725
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.42,0.2253,0.914216
1000,0.3413,0.264018,0.902398
1500,0.3282,0.290302,0.895586
2000,0.3161,0.207435,0.919847
2500,0.3066,0.220456,0.913938
3000,0.3075,0.196656,0.922072
3500,0.2861,0.258222,0.880222
4000,0.3043,0.212694,0.915328
4500,0.2976,0.227165,0.911575
5000,0.2856,0.213606,0.911088


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14385
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 14385
  Batch size = 64
Saving model checkpoint to style-classifier/checkpoint-1000
Configuration saved in style-classifier/checkpoint-1000/config.json
Model weights saved in style-classifier/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassific

KeyboardInterrupt: 

In [39]:
1

1

In [31]:
out1 = tokenizer(['Example']*3, truncation=True, padding=True, max_length=30, return_tensors='pt').to('cuda:0')

In [32]:
model(**out1)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.4465,  0.7403],
        [-0.1843,  0.3337],
        [ 0.0056,  0.0948]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)