In [1]:
import pandas as pd
from datasets import load_dataset, ClassLabel, DatasetDict, Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import log_loss

2023-03-23 21:28:12.616095: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 21:28:12.688167: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-23 21:28:12.706757: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-23 21:28:13.031908: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [2]:
# Accuracy metric from the `evaluate` library, loaded once at module level and
# reused by compute_metrics() during training and by test() on the test set.
accuracy = evaluate.load("accuracy")

# Load data

## Doyle & Christie

In [3]:
def process_Doyle_Christie():
    """Read the Doyle/Christie train and test CSVs and attach integer labels.

    Both files are read from ``Doyle_Christie_dataset/``. The exported index
    column ``Unnamed: 0`` is removed and a ``labels`` column is added:
    0 where ``author`` equals ``'Doyle'``, 1 for every other value.

    Returns:
        tuple: ``(train_frame, test_frame)`` as pandas DataFrames.
    """
    def _read_split(csv_path):
        # Both splits get the same treatment: drop the index column, encode author.
        frame = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
        frame['labels'] = frame['author'].map(lambda author: int(author != 'Doyle'))
        return frame

    train_frame = _read_split("Doyle_Christie_dataset/train.csv")
    test_frame = _read_split("Doyle_Christie_dataset/test.csv")
    return train_frame, test_frame

In [4]:
train_dc, test_dc = process_Doyle_Christie()
# Hold out 10% of the training rows for validation. The fixed seed keeps the
# train/eval partition identical across kernel restarts.
eval_dc = train_dc.sample(frac=0.1, random_state=42)
train_dc = train_dc.drop(index=eval_dc.index)

## Letters

In [11]:
def process_Letters(n_men=2806, test_frac=0.1, random_state=None):
    """Build class-balanced train/test frames from the historical letters CSV.

    Reads ``old_english_dataset.csv``, drops the exported index column
    ``Unnamed: 0`` and adds ``labels`` (0 where ``gender == 'f'``, 1
    otherwise). Label-1 rows are downsampled to ``n_men`` rows, then
    ``test_frac`` of each class is split off separately so that the class
    ratio is the same in the train and test frames.

    Args:
        n_men: Number of label-1 rows to keep. The default 2806 is the value
            that used to be hard-coded here (presumably the number of label-0
            rows in the CSV -- verify if the dataset changes).
        test_frac: Fraction of each class moved to the test frame.
        random_state: Optional int seed for all sampling. ``None`` keeps the
            previous behaviour of drawing from NumPy's global RNG.

    Returns:
        tuple: ``(train_frame, test_frame)``; each has a fresh 0..n-1 index
        with label-0 rows first, followed by label-1 rows.

    Raises:
        ValueError: From ``DataFrame.sample`` if fewer than ``n_men`` label-1
            rows are available.
    """
    # One generator shared by every draw, so a single seed fixes the whole split.
    rng = None if random_state is None else np.random.RandomState(random_state)

    letters = pd.read_csv('old_english_dataset.csv').drop(columns=['Unnamed: 0'])
    letters['labels'] = letters['gender'].apply(lambda x: 0 if x == 'f' else 1)

    women = letters[letters['labels'] == 0]
    men = letters[letters['labels'] == 1].sample(n=n_men, random_state=rng)

    # Stratified hold-out: sample the test rows per class, the rest is train.
    test_women = women.sample(frac=test_frac, random_state=rng)
    train_women = women.drop(index=test_women.index)
    test_men = men.sample(frac=test_frac, random_state=rng)
    train_men = men.drop(index=test_men.index)

    train_old_eng = pd.concat([train_women, train_men], ignore_index=True)
    test_old_eng = pd.concat([test_women, test_men], ignore_index=True)

    return train_old_eng, test_old_eng

In [12]:
# process_Letters() subsamples rows internally; pandas' .sample() draws from
# NumPy's global RNG when no random_state is given, so seed it first to make
# the balanced train/test frames reproducible.
np.random.seed(42)
train_letters, test_letters = process_Letters()
# Hold out 10% of the training rows for validation (fixed seed -> same split).
eval_letters = train_letters.sample(frac=0.1, random_state=42)
train_letters = train_letters.drop(index=eval_letters.index)

## Modern

In [3]:
def process_Modern(n_men_train=30941, n_test_per_class=5000, random_state=None):
    """Load the modern-text train/test CSVs and balance the two classes.

    Both files are read from ``Modern_dataset/``; the exported index column
    ``Unnamed: 0`` is dropped and ``labels`` is added (0 where
    ``gender == 'f'``, 1 otherwise).

    * Train: label-1 rows are downsampled to ``n_men_train`` and concatenated
      in front of all label-0 rows.
    * Test: ``n_test_per_class`` rows are sampled from each class, label-0
      rows first.

    Args:
        n_men_train: Number of label-1 training rows to keep. The default
            30941 is the value that used to be hard-coded here (presumably
            the number of label-0 training rows -- verify if the data changes).
        n_test_per_class: Number of test rows sampled from each class.
        random_state: Optional int seed for all sampling. ``None`` keeps the
            previous behaviour of drawing from NumPy's global RNG.

    Returns:
        tuple: ``(train_data, test_data)``, each with a fresh 0..n-1 index.

    Raises:
        ValueError: From ``DataFrame.sample`` if a class has fewer rows than
            requested.
    """
    # One generator shared by every draw, so a single seed fixes all samples.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def _load(csv_path):
        frame = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
        frame['labels'] = frame['gender'].apply(lambda x: 0 if x == 'f' else 1)
        return frame

    train_raw = _load("Modern_dataset/train.csv")
    train_data = pd.concat(
        [train_raw[train_raw['labels'] == 1].sample(n=n_men_train, random_state=rng),
         train_raw[train_raw['labels'] == 0]],
        ignore_index=True)

    # Named test_raw (not `test`) so the notebook's test() function is not shadowed.
    test_raw = _load("Modern_dataset/test.csv")
    test_data = pd.concat(
        [test_raw[test_raw['labels'] == 0].sample(n=n_test_per_class, random_state=rng),
         test_raw[test_raw['labels'] == 1].sample(n=n_test_per_class, random_state=rng)],
        ignore_index=True)

    return train_data, test_data

In [4]:
# process_Modern() subsamples rows internally; pandas' .sample() draws from
# NumPy's global RNG when no random_state is given, so seed it first to make
# the balanced train/test frames reproducible.
np.random.seed(42)
train_modern, test_modern = process_Modern()
# Hold out 10% of the training rows for validation (fixed seed -> same split).
eval_modern = train_modern.sample(frac=0.1, random_state=42)
train_modern = train_modern.drop(index=eval_modern.index)

# Preprocessing

In [5]:
def create_dataset(data, label_names):
    """Wrap a frame as a Hugging Face ``Dataset`` with ``ClassLabel`` targets.

    Only the ``sentence`` and ``labels`` columns of ``data`` are carried over.
    The ``labels`` feature is re-declared as ``ClassLabel(names=label_names)``
    and the values are passed through ``adjust_labels`` while mapping.

    Args:
        data: Mapping-like object (e.g. a DataFrame) with ``sentence`` and
            ``labels`` entries.
        label_names: Class names handed to ``ClassLabel``.

    Returns:
        Dataset: The re-typed dataset.
    """
    columns = {"sentence": data['sentence'], "labels": data['labels']}
    dataset = Dataset.from_dict(columns)

    # Start from the inferred schema and swap in the ClassLabel target type.
    schema = dataset.features.copy()
    schema["labels"] = ClassLabel(names=label_names)

    return dataset.map(adjust_labels, batched=True, features=schema)

In [6]:
def adjust_labels(batch):
    """Cast every entry of ``batch["labels"]`` to a Python ``int``.

    The batch is updated in place and the same object is returned, which is
    the shape expected of a batched ``Dataset.map`` callback.
    """
    batch["labels"] = list(map(int, batch["labels"]))
    return batch

In [7]:
def compute_metrics(eval_pred):
    """Accuracy callback for ``Trainer``.

    Args:
        eval_pred: Pair ``(scores, references)``; ``scores`` holds one row of
            per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [8]:
def test(model, test_dataset, tokenizer, device):
    """Score a binary sequence classifier on a held-out set.

    Each sentence is tokenised and run through ``model`` on its own (batch of
    one, so no padding is involved). The softmax probability of class 1 is
    collected per example and used to compute log loss and accuracy, which
    are printed and returned.

    Args:
        model: Torch classifier whose output exposes ``.logits`` with two
            classes. It must already live on ``device``.
        test_dataset: Iterable of rows with a ``sentence`` field that can also
            be indexed with ``'labels'`` to get the reference class ids.
        tokenizer: Callable tokenizer returning tensors that support ``.to()``.
        device: Torch device the tokenised inputs are moved to.

    Returns:
        dict: ``{'logloss': float, 'accuracy': <accuracy.compute result>}``.
    """
    # Inference only: switch dropout off and skip autograd bookkeeping, then
    # put the module back into whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    positive_probs = []
    try:
        with torch.no_grad():
            for row in test_dataset:
                # Tokenise lazily so only one example sits on the device at a time.
                encoded = tokenizer(row['sentence'],
                                    truncation=True,
                                    return_tensors="pt").to(device)
                probs = torch.nn.functional.softmax(model(**encoded).logits, dim=-1)
                positive_probs.append(float(probs[0][1].cpu()))
    finally:
        model.train(was_training)

    references = test_dataset['labels']
    double_pred_scores = [[1 - p, p] for p in positive_probs]
    pred_classes = [0 if p < 0.5 else 1 for p in positive_probs]

    # labels=[0, 1] keeps log_loss well-defined even if the references happen
    # to contain a single class only.
    ll = log_loss(y_true=references, y_pred=double_pred_scores, labels=[0, 1])
    acc = accuracy.compute(predictions=pred_classes, references=references)

    print('logloss: ', ll)
    print('accuracy: ', acc)

    return {'logloss': ll, 'accuracy': acc}

In [9]:
def bert_pipeline(train_data, valid_data, test_data, checkpoint, id2label, label2id, output_dir, label_names, device):
    """Fine-tune a pretrained checkpoint as a 2-class sentence classifier and test it.

    Steps: convert the three frames with ``create_dataset``, tokenise the
    train/validation sets, fine-tune with ``Trainer`` (evaluating and saving
    once per epoch), then score the resulting model on the test set with
    ``test``, which prints log loss and accuracy.

    Args:
        train_data: Frame with ``sentence`` and ``labels`` used for training.
        valid_data: Frame used for per-epoch evaluation.
        test_data: Frame scored once after training.
        checkpoint: Model/tokenizer name or path for ``from_pretrained``.
        id2label: Mapping class id -> class name stored in the model config.
        label2id: Mapping class name -> class id stored in the model config.
        output_dir: Directory where ``Trainer`` writes checkpoints.
        label_names: Class names used for the datasets' ``ClassLabel`` feature.
        device: Torch device used for the final test pass.

    Returns:
        None. Metrics are reported by ``test``.
    """
    train_data = create_dataset(train_data, label_names)
    test_data = create_dataset(test_data, label_names)
    valid_data = create_dataset(valid_data, label_names)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize(batch):
        # No padding here: DataCollatorWithPadding pads each batch dynamically.
        return tokenizer(batch['sentence'], truncation=True)

    # batched=True hands the tokenizer lists of sentences instead of one row
    # at a time; the produced columns are the same.
    train_data = train_data.map(tokenize, batched=True)
    valid_data = valid_data.map(tokenize, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                               num_labels=2,
                                                               id2label=id2label,
                                                               label2id=label2id,
                                                               ignore_mismatched_sizes=True)

    # load_best_model_at_end with no metric_for_best_model set selects the
    # checkpoint with the lowest validation loss, not the highest accuracy.
    training_args = TrainingArguments(output_dir=output_dir,
                                      learning_rate=2e-5,
                                      per_device_train_batch_size=10,
                                      per_device_eval_batch_size=10,
                                      num_train_epochs=3,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      save_strategy="epoch",
                                      load_best_model_at_end=True,)

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_data,
                      eval_dataset=valid_data,
                      tokenizer=tokenizer,
                      data_collator=data_collator,
                      compute_metrics=compute_metrics,)

    trainer.train()

    # Trainer decides where the model lives during training; test() only moves
    # the inputs to `device`, so make sure the model is on that device as well.
    best_model = trainer.model.to(device)
    test(model=best_model, test_dataset=test_data, tokenizer=tokenizer, device=device)

In [10]:
# I re-ran this cell by accident, although I did not mean to; in short, please
# take my word for it that the result here was 0.8.
bert_pipeline(
    checkpoint='bert-base-cased',
    output_dir='bert_dc',
    train_data=train_dc,
    valid_data=eval_dc,
    test_data=test_dc,
    label_names=['Doyle', 'Christie'],
    id2label={0: "Doyle", 1: "Christie"},
    label2id={"Doyle": 0, "Christie": 1},
    device=torch.device('cuda'),
)

NameError: name 'train_dc' is not defined

In [13]:
# Fine-tune and test on the letters data (label 0 = "f", label 1 = "m").
bert_pipeline(
    checkpoint='bert-base-cased',
    output_dir='bert_letters',
    train_data=train_letters,
    valid_data=eval_letters,
    test_data=test_letters,
    label_names=['f', 'm'],
    id2label={0: "f", 1: "m"},
    label2id={"f": 0, "m": 1},
    device=torch.device('cuda'),
)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/antonauna/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /home/antonauna/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
l

  0%|          | 0/4545 [00:00<?, ?ex/s]

  0%|          | 0/505 [00:00<?, ?ex/s]

loading configuration file config.json from cache at /home/antonauna/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "f",
    "1": "m"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "f": 0,
    "m": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /home/antonauna/.

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.355484,0.865347
2,0.456100,0.404891,0.877228
3,0.242600,0.472886,0.879208


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 505
  Batch size = 10
Saving model checkpoint to bert_letters/checkpoint-455
Configuration saved in bert_letters/checkpoint-455/config.json
Model weights saved in bert_letters/checkpoint-455/pytorch_model.bin
tokenizer config file saved in bert_letters/checkpoint-455/tokenizer_config.json
Special tokens file saved in bert_letters/checkpoint-455/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5

logloss:  0.31936399842903435
accuracy:  {'accuracy': 0.8790035587188612}


In [11]:
# Fine-tune and test on the modern data (label 0 = "f", label 1 = "m").
bert_pipeline(
    checkpoint='bert-base-cased',
    output_dir='bert_modern',
    train_data=train_modern,
    valid_data=eval_modern,
    test_data=test_modern,
    label_names=['f', 'm'],
    id2label={0: "f", 1: "m"},
    label2id={"f": 0, "m": 1},
    device=torch.device('cuda'),
)

  0%|          | 0/56 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/55694 [00:00<?, ?ex/s]

  0%|          | 0/6188 [00:00<?, ?ex/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3744,0.363946,0.822237
2,0.2655,0.438929,0.83371
3,0.1827,0.648686,0.83775


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6188
  Batch size = 10
Saving model checkpoint to bert_modern/checkpoint-5570
Configuration saved in bert_modern/checkpoint-5570/config.json
Model weights saved in bert_modern/checkpoint-5570/pytorch_model.bin
tokenizer config file saved in bert_modern/checkpoint-5570/tokenizer_config.json
Special tokens file saved in bert_modern/checkpoint-5570/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

logloss:  1.1833296831580413
accuracy:  {'accuracy': 0.5122}
