**1. Mount drive and install libararies**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**2. Import libraries**

In [None]:
!pip install torch torchvision
!pip install transformers==2.10.0
!pip install seqeval
!pip install tensorboardx
!pip install simpletransformers==0.9.1

import pandas as pd
import numpy as np

import gc
import requests
import random 
import os

from sklearn.model_selection import KFold
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from scipy.special import softmax

import torch

print("Cuda available" if torch.cuda.is_available() is True else "CPU")
print("PyTorch version: ", torch.__version__)

**3. Download the data into PD DataFrame**

In [None]:
# read base training set (two coders)

path = '/content/drive/MyDrive/spap_state/spap_state_vaccine/classification/' # set own path
df_random_with_non_relevant = pd.read_csv(path + 'state_twitter_vaccine_hand_label_9200_19_april.csv', encoding = 'latin1') # this contains tweets unrelated to vaccine
df_random_with_non_relevant.rename({'full_text_final_entire': 'full_text_final'}, # column name change
                                   axis = 1, 
                                   inplace = True)

df_random_with_non_relevant = df_random_with_non_relevant[df_random_with_non_relevant['full_text_final'].isna()==False] # remove junk rows where text cannot be read

df_random = df_random_with_non_relevant[df_random_with_non_relevant['stance_final'] != 9] # remove tweets unrelated to vaccine

print(df_random['stance_final'].value_counts()) # three-fold labels

df_random['stance_final_anti'] = np.where(df_random['stance_final'] == -1, 1, 0) # anti-vax is re-labeled as 1 while rest as 0
print(df_random['stance_final_anti'].value_counts()) # two-fold labels (anti vs. rest)

df_random = df_random[['full_text_final', 'stance_final_anti']] # subset only relevant columns
df_random['label_anti'] = df_random['stance_final_anti'].astype(float)
df_random = df_random[['full_text_final','label_anti']]

# read training set from active learning (one coder)

df_active_all = []
round = 23

for i in range(1, round + 1):
  df_active_with_non_relevant = pd.read_csv(path + 'active_learning_round_' + str(i) + '.csv', encoding = 'latin1')
  df_active_with_non_relevant['full_text_final'] = df_active_with_non_relevant['full_text'] 
  df_active = df_active_with_non_relevant[df_active_with_non_relevant['stance_final'] != 9]
  df_active['label_anti'] = np.where(df_active['stance_final'] == -1, 1, 0)
  df_active = df_active[['full_text_final', 'label_anti']]
  df_active_all.append(df_active)

df_active_all = pd.concat(df_active_all)

# join two training sets

df = pd.concat([df_random, df_active_all])
df['full_text_alphabet'] = df.full_text_final.str.replace('[^a-zA-Z]', '')
df = df.drop_duplicates(subset = 'full_text_alphabet', keep = 'first', inplace = False, ignore_index = False) # drop duplicates
del df['full_text_alphabet']
df = df.sample(frac = 1).reset_index(drop = True) # shuffle

print(len(df['label_anti']))
print(df['label_anti'].value_counts(normalize=True))
print(df['label_anti'].value_counts(normalize=False))

**4. Performance metrics**

In [6]:
def report_results(A, B):
  A_name = A.name
  B_name = B.name
    
  df = pd.DataFrame({'A':A,
                       'B':B})
  df = df.dropna()
  A = df['A']
  B = df['B']
    
  acc = accuracy_score(B, A)
  f1 = f1_score(B, A)
  prec = precision_score(B, A)
  rec = recall_score(B, A)
  ROC = roc_auc_score(B, A)
    
  print('Candidate: '+A_name+' | Ground Truth: '+B_name+'\n')
  print('accuracy: %0.2f \nprecision: %0.2f \nrecall: %0.2f \nF1 score: %0.2f \nROC AUC: %0.2f \n' % (acc, prec, rec, f1, ROC))

  performance = [prec, rec, f1, ROC, acc]

  return performance

**5. Set seed for deterministic modeling**

In [9]:
def set_seed(seed):
  
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  
  np.random.seed(seed)
  
  random.seed(seed)
  
  os.environ['PYTHONHASHSEED'] = str(seed)

**6. Base model with CV (K=5)**

In [None]:
for learning_rate in [3e-5, 4e-5, 5e-5]:

  for num_train_epochs in [3, 4, 5]:

    print('\n\nWorking on', learning_rate, '+', num_train_epochs)

    args = {
      'output_dir': 'outputs/',
      'cache_dir': 'cache/',
      'fp16': False,
      'fp16_opt_level': 'O1',
      'max_seq_length': 250,
      'train_batch_size': 8,
      'eval_batch_size': 8,
      'gradient_accumulation_steps': 1,
      'num_train_epochs': num_train_epochs,
      'weight_decay': 0,
      'learning_rate': learning_rate,
      'adam_epsilon': 1e-8,
      'warmup_ratio': 0.06,
      'warmup_steps': 0,
      'max_grad_norm': 1.0,
      'logging_steps': 50,
      'evaluate_during_training': False,
      'save_steps': 2000,
      'eval_all_checkpoints': True,
      'use_tensorboard': True,
      'overwrite_output_dir': True,
      'reprocess_input_data': True}

    set_seed = 888
    n = 5
    kf = KFold(n_splits = n, random_state = 888, shuffle = True)

    performance_cv = []
    for train_index, val_index in kf.split(df):
      # splitting Dataframe (dataset not included)
        train_df = df.iloc[train_index]
        val_df = df.iloc[val_index]
      # Defining Model
        model = ClassificationModel('bert', 'bert-base-uncased', args = args)
      # train the model
        model.train_model(train_df)
      # validate the model 
        result, model_outputs, wrong_predictions = model.eval_model(val_df)
        val_df['BERT_label'] = np.argmax(model_outputs, axis = 1)
      # performance
        performance = report_results(val_df['BERT_label'], val_df['label_anti'])
        performance_cv.append(performance)

    df_performance = pd.DataFrame(performance_cv)
    df_performance.columns = ['prec', 'rec', 'f1', 'ROC', 'acc']

    print('mean', df_performance['prec'].mean(), df_performance['rec'].mean())
    print('sd', df_performance['prec'].std(), df_performance['rec'].std(), '\n\n\n\n')



Working on 3e-05 + 3
Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.298483



Running loss: 0.568151

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.005575

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.797977Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.85 
precision: 0.67 
recall: 0.48 
F1 score: 0.55 
ROC AUC: 0.71 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.564060



Running loss: 0.177525

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.243113

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.001129Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.84 
precision: 0.72 
recall: 0.47 
F1 score: 0.57 
ROC AUC: 0.71 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.604911



Running loss: 0.415425

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.696544

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.382166Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.83 
precision: 0.61 
recall: 0.50 
F1 score: 0.55 
ROC AUC: 0.71 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.527006



Running loss: 0.220180

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.168169

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.580041Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.84 
precision: 0.77 
recall: 0.56 
F1 score: 0.65 
ROC AUC: 0.75 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.513460



Running loss: 0.183898

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.930305

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.005334Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.83 
precision: 0.57 
recall: 0.55 
F1 score: 0.56 
ROC AUC: 0.73 

mean 0.6659330200557252 0.5107581587950567
sd 0.08215804186947377 0.042027258485916416 






Working on 3e-05 + 4
Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.378015



Running loss: 0.545482

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.326266

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.001294

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.000355Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.85 
precision: 0.63 
recall: 0.56 
F1 score: 0.59 
ROC AUC: 0.74 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.751731



Running loss: 0.605857

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 1.397240

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.645528

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.000755Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.83 
precision: 0.65 
recall: 0.51 
F1 score: 0.57 
ROC AUC: 0.71 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.559677



Running loss: 0.244916

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.108682

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.491483

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.002704Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.85 
precision: 0.66 
recall: 0.55 
F1 score: 0.60 
ROC AUC: 0.74 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.764951



Running loss: 0.564294

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.473724

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.003961

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.000465Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.82 
precision: 0.68 
recall: 0.59 
F1 score: 0.63 
ROC AUC: 0.75 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.572635



Running loss: 0.409567

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.668915

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.004165

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.001901Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.86 
precision: 0.64 
recall: 0.58 
F1 score: 0.61 
ROC AUC: 0.75 

mean 0.6500875491979402 0.5587270529109076
sd 0.018705609584389445 0.03217888622895066 






Working on 3e-05 + 5
Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.919158



Running loss: 0.485468

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.114329

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.004373

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.643838

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.000690Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Candidate: BERT_label | Ground Truth: label_anti

accuracy: 0.86 
precision: 0.71 
recall: 0.52 
F1 score: 0.60 
ROC AUC: 0.73 

Converting to features started.


  0%|          | 0/3600 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.567827



Running loss: 0.347673

Current iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Running loss: 0.269326