In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import torch
import transformers
import json
from tqdm import tqdm
import logging
import pandas as pd

logging.basicConfig(level=logging.ERROR)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [2]:
with open('../data/twibot20/processed/train.json') as f:
    train_data = json.loads(f.read())

with open('../data/twibot20/processed/val.json') as f:
    val_data = json.loads(f.read())
    
def get_dataset(data):
    data = [[x['description'], 1 if x['label']=='bot' else 0] for x in data]
    df = pd.DataFrame(data)
    df.columns = ["text", "labels"]
    return df

train_data = get_dataset(train_data)
val_data = get_dataset(val_data)

In [38]:
model_args = ClassificationArgs(num_train_epochs=1, 
                                do_lower_case=False,
                                early_stopping_consider_epochs=False,
                                eval_batch_size=8,
                                evaluate_during_training=False,
                                learning_rate=4e-5,
                                output_dir='outputs/',
                                overwrite_output_dir=False,
                                manual_seed=None,
                                use_early_stopping=True,
                                sliding_window=False,
                                stride=0.8
                                )


In [41]:
model_args

ClassificationArgs(adafactor_beta1={'num_train_epochs': 1}, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_betas=(0.9, 0.999), adam_epsilon=1e-08, best_model_dir='outputs/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, eval_batch_size=8, evaluate_during_training=False, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=False, evaluate_each_epoch=True, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_ar

In [3]:
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=1)

# Create a ClassificationModel
model = ClassificationModel(
    "distilbert", "distilbert-base-uncased", args=model_args, use_cuda=False
)

# Train the model
model.train_model(train_data)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

  0%|          | 0/8278 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1035 [00:00<?, ?it/s]

(1035, 0.6462568436267871)

In [5]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(val_data)

  0%|          | 0/2365 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Running Evaluation:   0%|          | 0/296 [00:00<?, ?it/s]

In [13]:
def get_metrics(result):
    
    acc = (result['tp'] + result['tn']) / (result['tp'] + result['tn'] + result['fp'] + result['fn'])
    pr = result['tp'] / (result['tp'] + result['fp'])
    rec = result['tp'] / (result['tp'] + result['fn'])
    
    print("Accuracy = ", acc)
    print("Precision = ", pr)
    print("Recall = ", rec)  

In [14]:
get_metrics(result)

Accuracy =  0.6684989429175475
Precision =  0.6620861961274204
Recall =  0.8135072908672295


In [40]:
result

{'mcc': 0.32344156034537175,
 'tp': 1060,
 'tn': 521,
 'fp': 541,
 'fn': 243,
 'auroc': 0.7197843452672595,
 'auprc': 0.7444564388873873,
 'eval_loss': 0.6104240386771995}