## *For Google Colab*

In [None]:
!pip install transformers
!pip install wordfreq

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 31.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
cd drive/MyDrive/Sujan_Dutta/BERT-issue-classifier/

/content/drive/MyDrive/Sujan_Dutta/BERT-issue-classifier


## *Importng libraries*

In [None]:
from bs4 import BeautifulSoup as soup
import re
import requests
import pandas as pd
import os
from nltk import tokenize
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import time
import random

import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from wordfreq import zipf_frequency

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## *Loading data ($D_{unbalanced}$)*

In [None]:
data = pd.read_csv("github-labels-data.csv", header=None)
data.head()
data = data.rename(columns={0: 'issue'})
data.head()
data["label"] = data["issue"].apply(lambda x : x.split(" ")[0])
data["text"] = data["issue"].apply(lambda x : " ".join(x.split(" ")[1:]))

## *Data cleaning*

In [None]:
def clean(txt):

    # remove emojis
    emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  
    u"(\ud83c[\udf00-\uffff])|"  
    u"(\ud83d[\u0000-\uddff])|"  
    u"(\ud83d[\ude80-\udeff])|"
    u"(\ud83c[\udde0-\uddff])"  
    "+", flags=re.UNICODE)
    
    # remove code lines
    regex_code = "```[^```]*```"
    # remove image/gif
    regex_image = r"!\[[^!\[\)]*\)"
    # remove links
    regex_link = r"http\S+"
    # remove markdown comments
    regex_comm = r"<!--[^!]*-->"
    # remove version
    regex_ver_4 =  r"(\d{,4}\.\d{,4}\.\d{,4}\.\d{,4})" 
    regex_ver_3 =  r"\d{,4}\.\d{,4}\.\d{,4}"   
    # remove @user
    regex_usr = r"\B@[a-zA-Z0-9_-]{,39}"
    # remove punctuations
    regex_punc = r"[->:<@#*!`\"’\(\_)\[\]/=\\${}%&+~,;|─-]+"
    
    txt = re.sub(emoji_pattern, ' ', txt)
    txt = re.sub(regex_code, ' ', txt)
    txt = re.sub(regex_image, 'IMAGE', txt)
    txt = re.sub(regex_link, 'LINK', txt)
    txt = re.sub(regex_comm, ' ', txt)
    txt = re.sub(regex_ver_4, ' ', txt)
    txt = re.sub(regex_ver_3, ' ', txt)
    txt = re.sub(regex_usr, 'USER', txt)
    
    
    
    txt = re.sub(regex_punc, ' ', txt)
    
    txt = re.sub('\n+', ' ', txt)
    txt = re.sub(' +', ' ', txt)
    txt = re.sub(' \.+', '.', txt)

    return txt.strip()

def clean_2(txt):
    # removes a data point if it too few meaningful english wowrds
    arr = txt.split(" ")
    n = len(arr)

    if n==1 and arr[0] in {"LINK", "USER", "IMAGE"}: return "NULL"

    thr = min(n*0.5, 5) 
    cnt = 0
    for word in arr:
        if zipf_frequency(word, 'en')>3 and word not in {"LINK", "USER", "IMAGE"}:
            cnt+=1
    
    return txt if cnt>thr else "NULL"

In [None]:
data["cleaned_text"] = data["text"].apply(lambda x: clean(x))
# remove duplicates
data = data.drop_duplicates(subset=["cleaned_text"], keep="first")
data["label"].value_counts()

__label__bug            16056
__label__enhancement    13662
__label__question        3110
Name: label, dtype: int64

In [None]:
data["cleaned_text"] = data["cleaned_text"].apply(lambda x: clean_2(x))
df = data[data["cleaned_text"]!="NULL"]
df["label"].value_counts()

__label__bug            15392
__label__enhancement    12965
__label__question        3010
Name: label, dtype: int64

## *Under sampling*

In [None]:
bug = df[df["label"]=="__label__bug"].sample(n=5000, random_state=42)
enh = df[df["label"]=="__label__enhancement"].sample(n=5000, random_state=42)
que = df[df["label"]=="__label__question"].sample(n=3000, random_state=42)
df = pd.concat([bug, enh, que])

## *Encoding the target variable*

In [None]:
X = list(df["cleaned_text"].values)
dic = {'__label__bug':0, '__label__enhancement':1, '__label__question':2}
df["label"] = df["label"].apply(lambda x : dic[x])
Y = list(df["label"].values)

## *Training the BERT model*

In [None]:
model_name = "bert-base-uncased"
max_length = 512
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_texts, valid_texts, train_labels, valid_labels = train_test_split(X, Y, test_size=0.2, random_state=16022022)

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = NewsDataset(train_encodings, train_labels)
valid_dataset = NewsDataset(valid_encodings, valid_labels)

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3).to("cuda")

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  f1_macro = f1_score(labels, preds, average="macro")
  length = len(labels)
  return {
      'accuracy': acc,
      'F1': f1_macro,
      'len' : length,
  }
 
training_args = TrainingArguments(
    output_dir='./BERT_classification_results',          # output directory
    num_train_epochs=6,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./BERT_classification_logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    metric_for_best_model="accuracy",
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

trainer.train()

trainer.evaluate() 

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model_best = BertForSequenceClassification.from_pretrained("BERT_classification_results/model_best/")
model_best.to(device)
model_best.eval()

cuda:0


loading configuration file BERT_classification_results/model_best/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
preds = []
for i in range(len(valid_texts)):
    tokenized_text = tokenizer.tokenize(valid_texts[i], truncation=True, padding=True, max_length=max_length)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Predict all tokens
    with torch.no_grad():
        predictions = model_best(tokens_tensor.to(device), segments_tensors.to(device))[0]
        x = int(torch.argmax(torch.nn.functional.softmax(predictions)).cpu().numpy())
        # if valid_labels[i] != x:
        print(valid_texts[i])
        print("model :", x )
        print("ground truth :", valid_labels[i] )
        print("------------------------------------------------")
        preds.append(x)

## *Computing MCC*

In [None]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score
print(matthews_corrcoef(valid_labels, preds))

0.6692367113726212


## *Computing odd's ratio between BERT and FastText*

In [None]:
res_df = pd.DataFrame()
res_df["actual"] = valid_labels
res_df["bert"] = preds
# pred_numbs.csv contains fasttext predictions 
ft = pd.read_csv("preds_numbers.csv")
res_df["ft"] = ft.iloc[:, 1].values

In [None]:
print(len(res_df[ (res_df["actual"]  == res_df["bert"]) & (res_df["actual"] == res_df["ft"]) ]))
print(len(res_df[ (res_df["actual"]  == res_df["bert"]) & (res_df["actual"] != res_df["ft"]) ]))
print(len(res_df[ (res_df["actual"]  != res_df["bert"]) & (res_df["actual"] == res_df["ft"]) ]))
print(len(res_df[ (res_df["actual"]  != res_df["bert"]) & (res_df["actual"] != res_df["ft"]) ]))

1555
475
253
317


In [None]:
(1555*317)/(253*475)

4.101809860619929

## *McNemar's test*

In [None]:
from statsmodels.stats.contingency_tables import mcnemar
table = [[1555, 475],
		 [253, 317]]

result = mcnemar(table, exact=False, correction=True)
print('statistic=%.5f, p-value=%.5f' % (result.statistic, result.pvalue))

statistic=67.08929, p-value=0.00000


## *Confusion matrix*

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(valid_labels, preds)

array([[799, 102,  96],
       [161, 710, 110],
       [ 53,  48, 521]])

## *Classification report*

In [None]:
print(classification_report(valid_labels, preds))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80       997
           1       0.83      0.72      0.77       981
           2       0.72      0.84      0.77       622

    accuracy                           0.78      2600
   macro avg       0.78      0.79      0.78      2600
weighted avg       0.79      0.78      0.78      2600



## *Creating $D_{domain-specific}$ data*



In [None]:
train = pd.read_csv("github-labels-top3-803k-train.csv")
test = pd.read_csv("github-labels-top3-803k-test.csv")
data_big = pd.concat([train, test])

In [None]:
# list of opensource text editors 
text_editors = [
    "/vscode",
    "/brackets",
    "/lime",
    "/textmate",
    "/neovim",
    "/overleaf",
    "/slap",
    "/Caret",
    "/KomodoEdit",
    "/micro",
    "/SpaceVim",
    "/atom",
    "/LightTable",
    "/kakoune",
    "/Notepads",
    "/alm",
    "/spacemacs",
    "/leo-editor",
    "/zed",
    "/neoeedit",
    "/notepad-plus-plus",
    "/intellij-community",
    "/kate",
    "/textadept",
    "/Notepad3",
    "/neditor",
]

text_editors_df = pd.DataFrame()
for repo in text_editors:
    text_editors_df = pd.concat([text_editors_df, data_big[data_big["repository_url"].str.contains(repo)]])

In [None]:
text_editors_df["label"] = text_editors_df["issue_label"].apply(lambda x : "__label__"+x)
text_editors_df["text"] = text_editors_df.apply(lambda x : str(x.issue_title) + " " + str(x.issue_body), axis=1)
text_editors_df = text_editors_df[["text", "label"]]
text_editors_df.head(5)

Unnamed: 0,text,label
0,Welcome screen on every editor window is very ...,__label__bug
1274,Semantic prompt failed Type: Debugger\r\n<!---...,__label__bug
1328,Feature: detect custom component suffixes in E...,__label__enhancement
1695,Extension causes high cpu load - Issue Type: `...,__label__question
1710,"""Open Bundle"" command: Match on bundle parent ...",__label__enhancement


In [None]:
text_editors_df["label"].value_counts()

__label__bug            12701
__label__enhancement     6381
__label__question        1947
Name: label, dtype: int64

## *Data cleaning*

In [None]:
text_editors_df["cleaned_text"] = text_editors_df["text"].apply(lambda x: clean(x))
text_editors_df["label"].value_counts()

__label__bug            12701
__label__enhancement     6381
__label__question        1947
Name: label, dtype: int64

In [None]:
text_editors_df["cleaned_text"] = text_editors_df["cleaned_text"].apply(lambda x: clean_2(x))
text_editors_df = text_editors_df.drop_duplicates(subset=["cleaned_text"], keep="first")
df = text_editors_df[text_editors_df["cleaned_text"]!="NULL"]
df["label"].value_counts()

__label__bug            9370
__label__enhancement    3435
__label__question       1782
Name: label, dtype: int64

In [None]:
df.isnull().values.any()

False

In [None]:
df.head(5)

Unnamed: 0,text,label,cleaned_text
0,Welcome screen on every editor window is very ...,__label__bug,Welcome screen on every editor window is very ...
1274,Semantic prompt failed Type: Debugger\r\n<!---...,__label__bug,Semantic prompt failed Type Debugger\r \r \r \...
1328,Feature: detect custom component suffixes in E...,__label__enhancement,Feature detect custom component suffixes in ES...
1695,Extension causes high cpu load - Issue Type: `...,__label__question,Extension causes high cpu load Issue Type Perf...
1710,"""Open Bundle"" command: Match on bundle parent ...",__label__enhancement,Open Bundle command Match on bundle parent pat...


## *Under sampling*

In [None]:
bug = df[df["label"]=="__label__bug"].sample(n=3000, random_state=42)
enh = df[df["label"]=="__label__enhancement"].sample(n=3000, random_state=42)
que = df[df["label"]=="__label__question"].sample(n=1750, random_state=42)
df = pd.concat([bug, enh, que])

X = list(df["cleaned_text"].values)
dic = {'__label__bug':0, '__label__enhancement':1, '__label__question':2}
df["label"] = df["label"].apply(lambda x : dic[x])
Y = list(df["label"].values)

## *Training and evaluating BERT on $D_{domain-specific}$*

In [None]:
model_name = "bert-base-uncased"
max_length = 512
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_texts, valid_texts, train_labels, valid_labels = train_test_split(X, Y, test_size=0.2, random_state=16022022)

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = NewsDataset(train_encodings, train_labels)
valid_dataset = NewsDataset(valid_encodings, valid_labels)

torch.cuda.empty_cache()
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3).to("cuda")

from sklearn.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  f1_macro = f1_score(labels, preds, average="macro")
  length = len(labels)
#   report = classification_report(labels, preds)
  return {
      'accuracy': acc,
      'F1': f1_macro,
      'len' : length,
    #   'report' : report
  }
 
training_args = TrainingArguments(
    output_dir='./BERT_classification_results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./BERT_classification_logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    metric_for_best_model="accuracy",
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

trainer.train()
trainer.evaluate() 

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Accuracy,F1,Len
400,0.7567,0.521456,0.810323,0.802551,1550
800,0.4917,0.520517,0.833548,0.825435,1550
1200,0.3194,0.590419,0.828387,0.823344,1550
1600,0.2051,0.806377,0.825161,0.819095,1550
2000,0.1108,0.998484,0.812903,0.807917,1550
2400,0.0723,1.054247,0.831613,0.825344,1550
2800,0.0427,1.1794,0.823226,0.816609,1550
3200,0.0146,1.226916,0.828387,0.820094,1550
3600,0.0045,1.239658,0.829032,0.822068,1550


***** Running Evaluation *****
  Num examples = 1550
  Batch size = 20
Saving model checkpoint to ./BERT_classification_results/checkpoint-400
Configuration saved in ./BERT_classification_results/checkpoint-400/config.json
Model weights saved in ./BERT_classification_results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1550
  Batch size = 20
Saving model checkpoint to ./BERT_classification_results/checkpoint-800
Configuration saved in ./BERT_classification_results/checkpoint-800/config.json
Model weights saved in ./BERT_classification_results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1550
  Batch size = 20
Saving model checkpoint to ./BERT_classification_results/checkpoint-1200
Configuration saved in ./BERT_classification_results/checkpoint-1200/config.json
Model weights saved in ./BERT_classification_results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1550
  Batch size = 20

{'epoch': 10.0,
 'eval_F1': 0.8254352179383074,
 'eval_accuracy': 0.8335483870967741,
 'eval_len': 1550,
 'eval_loss': 0.5205166935920715,
 'eval_runtime': 28.4986,
 'eval_samples_per_second': 54.389,
 'eval_steps_per_second': 2.737}