**1. Mount drive**

In [None]:
# Mount Google Drive (Colab-only helper) at /content/drive; the training CSV
# is read from a path under this mount point further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**2. Import libraries**

In [None]:
!pip install torch torchvision
!pip install transformers==2.10.0
!pip install seqeval
!pip install tensorboardx
!pip install simpletransformers==0.9.1

import pandas as pd
import numpy as np

import requests
import gc
import os
import random

from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from scipy.special import softmax

import torch

# Report whether a CUDA device is visible and which PyTorch build is in use.
print("Cuda available" if torch.cuda.is_available() else "CPU")
print("PyTorch version: ", torch.__version__)

Cuda available
PyTorch version:  1.10.0+cu111


**3. Read training data**

In [None]:
# Location of the project data on the mounted Drive.
path = '/content/drive/MyDrive/spap_state/spap_state_vaccine/data/'

# Read the labelled file and keep only the text column and the label column,
# in that order.
df = (
    pd.read_csv(path + 'spap_vaccine_training_vaccine.csv')
      .loc[:, ['full_text_final', 'label_final']]
)

# Number of rows, then the relative frequency of each label value.
print(df.shape[0])
print(df['label_final'].value_counts(normalize = True))

8400
0    0.909881
1    0.090119
Name: label_final, dtype: float64


**4. Performance metrics**

In [None]:
def report_results(A, B):
  """Score predicted labels against reference labels.

  Args:
    A: predicted labels; passed to the sklearn metrics as ``y_pred``.
    B: reference labels; passed to the sklearn metrics as ``y_true``.

  The two inputs are paired in a DataFrame and any pair with a missing value
  on either side is dropped before scoring.

  Returns:
    list: ``[precision, recall, f1, accuracy]`` in that order.
  """
  # Pair the inputs so that dropna() removes a row from both sides at once.
  paired = pd.DataFrame({'A': A,
                         'B': B}).dropna()
  y_pred = paired['A']
  y_true = paired['B']

  prec = precision_score(y_true, y_pred)
  rec = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  acc = accuracy_score(y_true, y_pred)

  return [prec, rec, f1, acc]

**4. Set seed for deterministic modeling**

In [None]:
def set_seed(seed):
  """Seed the random number generators used in this notebook.

  Applies ``seed`` to Python's ``random`` module, NumPy's global generator and
  PyTorch (``manual_seed`` and ``cuda.manual_seed_all``), sets the
  ``PYTHONHASHSEED`` environment variable, and switches the cuDNN backend to
  deterministic mode with benchmarking disabled.

  Args:
    seed: integer seed value.
  """
  # Python-level generators and the hash-seed environment variable.
  # NOTE(review): PYTHONHASHSEED set at runtime presumably only affects
  # processes started afterwards - confirm if hash ordering matters here.
  os.environ['PYTHONHASHSEED'] = str(seed)
  random.seed(seed)
  np.random.seed(seed)

  # PyTorch generators.
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # cuDNN flags.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

**5. Base model with CV (K=5)**

In [None]:
# hyper-parameters handed to ClassificationModel for every fold

args = {
   'output_dir': 'outputs/',
   'cache_dir': 'cache/',
   'fp16': False,
   'fp16_opt_level': 'O1',
   'max_seq_length': 250,
   'train_batch_size': 8,
   'eval_batch_size': 8,
   'gradient_accumulation_steps': 1,
   'num_train_epochs': 2,
   'weight_decay': 0,
   'learning_rate': 4e-5,
   'adam_epsilon': 1e-8,
   'warmup_ratio': 0.06,
   'warmup_steps': 0,
   'max_grad_norm': 1.0,
   'logging_steps': 50,
   'evaluate_during_training': False,
   'save_steps': 2000,
   'eval_all_checkpoints': True,
   'use_tensorboard': True,
   'overwrite_output_dir': True,
   'reprocess_input_data': True
   }


# set seed number

set_seed(777)


# cross validate: shuffled K-fold split, one freshly initialised model per fold

n = 5
kf = KFold(n_splits = n, random_state = 777, shuffle = True)

cv_results = []
for train_index, val_index in kf.split(df):
    # splitting dataframe
    train_df = df.iloc[train_index]
    # .copy() gives the validation fold its own frame, so adding the
    # prediction column below does not write into a slice of df (this is what
    # raised pandas' SettingWithCopyWarning on every fold).
    val_df = df.iloc[val_index].copy()
    # defining model
    model = ClassificationModel('bert', 'bert-base-uncased', args = args)
    # train model
    model.train_model(train_df)
    # validate model
    result, model_outputs, wrong_predictions = model.eval_model(val_df)
    # predicted class = index of the largest value in each row of model_outputs
    val_df['label_final_BERT'] = np.argmax(model_outputs, axis = 1)
    # performance: [precision, recall, f1, accuracy] for this fold
    performance = report_results(val_df['label_final_BERT'], val_df['label_final'])
    print(performance)
    cv_results.append(performance)

# report results: mean and standard deviation of each metric across folds

df_cv_results = pd.DataFrame(cv_results, columns = ['precision', 'recall', 'f-1', 'accuracy'])
print(round(df_cv_results.mean(), 4))
print(round(df_cv_results.std(), 4))

Converting to features started.


  0%|          | 0/6720 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.757197



Running loss: 0.000420

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.000295Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/1680 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0.9863013698630136, 0.9664429530201343, 0.9762711864406781, 0.9958333333333333]
Converting to features started.


  0%|          | 0/6720 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.481332



Running loss: 0.000306

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.000183Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/1680 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0.9932885906040269, 0.9736842105263158, 0.9833887043189369, 0.9970238095238095]
Converting to features started.


  0%|          | 0/6720 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.095133



Running loss: 0.000901

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.000301Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/1680 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1.0, 0.9733333333333334, 0.9864864864864865, 0.9976190476190476]
Converting to features started.


  0%|          | 0/6720 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.573580



Running loss: 0.606543

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.000270Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/1680 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0.9865771812080537, 0.9932432432432432, 0.9898989898989898, 0.9982142857142857]
Converting to features started.


  0%|          | 0/6720 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.449497



Running loss: 0.000449

Current iteration:   0%|          | 0/840 [00:00<?, ?it/s]

Running loss: 0.000306Training of bert model complete. Saved to outputs/.
Converting to features started.


  0%|          | 0/1680 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

[0.9935483870967742, 0.9746835443037974, 0.9840255591054313, 0.9970238095238095]
precision    0.9919
recall       0.9763
f-1          0.9840
accuracy     0.9971
dtype: float64
precision    0.0057
recall       0.0100
f-1          0.0050
accuracy     0.0009
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


**6. Prediction on new tweets**

In [None]:
# hyper-parameters for the final model (same values as in the CV cell)
args = {
   'output_dir': 'outputs/',
   'cache_dir': 'cache/',
   'fp16': False,
   'fp16_opt_level': 'O1',
   'max_seq_length': 250,
   'train_batch_size': 8,
   'eval_batch_size': 8,
   'gradient_accumulation_steps': 1,
   'num_train_epochs': 2,
   'weight_decay': 0,
   'learning_rate': 4e-5,
   'adam_epsilon': 1e-8,
   'warmup_ratio': 0.06,
   'warmup_steps': 0,
   'max_grad_norm': 1.0,
   'logging_steps': 50,
   'evaluate_during_training': False,
   'save_steps': 2000,
   'eval_all_checkpoints': True,
   'use_tensorboard': True,
   'overwrite_output_dir': True,
   'reprocess_input_data': True
   }

# Train the final model on the full labelled set.
model = ClassificationModel('bert', 'bert-base-uncased', args = args)
model.train_model(df)

# predict() needs the list of raw texts to classify; calling it with no
# argument raises TypeError.
# TODO: the source of the new tweets is not shown in this notebook. Replace
# the placeholder below with the real unlabeled tweet texts (list of str).
new_tweets = [
    'PLACEHOLDER - replace with the tweets to classify',
]

predictions, raw_outputs = model.predict(new_tweets)