<a href="https://colab.research.google.com/github/tahaShm/ACTon-compiler/blob/master/Vaccine_opposition_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vaccine (Opposition, Neutral, Acceptance) Classification

## Required packages

In [None]:
# Install required packages

!pip install transformers
!pip install datasets
!pip install fairseq
!pip install sentencepiece



In [None]:
# Download xlmr.large model
!wget https://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz
!tar -xzvf xlmr.large.tar.gz

--2022-02-17 16:46:06--  https://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1028340964 (981M) [application/x-tar]
Saving to: ‘xlmr.large.tar.gz.1’


2022-02-17 16:46:32 (38.3 MB/s) - ‘xlmr.large.tar.gz.1’ saved [1028340964/1028340964]

xlmr.large/
xlmr.large/dict.txt
xlmr.large/model.pt
xlmr.large/sentencepiece.bpe.model


## Initialization

In [None]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import shuffle

from numpy.lib.function_base import average

from tqdm.notebook import tqdm

from collections import Counter

import os
import re
import json
import copy
import collections
import time

from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from fairseq.models.roberta import XLMRModel

In [None]:
# Register tqdm with pandas so that `progress_apply` (used when cleaning the labels) is available.
tqdm.pandas()

## Train, evaluation, and test sets

In [None]:
# Raw label -> class id. '#' is treated as '0' and '-' as '-1'; the values (-1, 0, 1) are
# then shifted by one to (0, 1, 2) so they can serve as class indices.
LABEL_TO_ID = {'-1': 0, '-': 0, '0': 1, '#': 1, '1': 2}

train_raw = pd.read_csv('/content/drive/MyDrive/Chaman/train_vaccine_labeled.csv', usecols = ['origin_tweet', 'vaccine_state'])

# Any label that is not a key of LABEL_TO_ID (including a missing one) maps to NaN and the
# row is dropped. Building the cleaned frame in one chain replaces three row-by-row passes
# and avoids assigning into a filtered slice (SettingWithCopyWarning).
train = (
    train_raw
    .assign(vaccine_state=train_raw['vaccine_state'].map(LABEL_TO_ID))
    .dropna(subset=['vaccine_state'])
    .astype({'vaccine_state': int})
)

train.head()

  0%|          | 0/5002 [00:00<?, ?it/s]

  0%|          | 0/5002 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

Unnamed: 0,origin_tweet,vaccine_state
0,@Lazerz_AR اونی میتونه رعایت کنه که بدونه تو ب...,1
1,صندوق ذخیره ارزی کشور مال ما مردم نیست! حواسمو...,1
2,@____te94 فازش حمایت از واکسیناسیونه. خیلی از ...,2
3,واردات و تولید #واکسن و گسترش دامنه #واکسیناسی...,2
4,اگه همین ما بودیم واکسن آلوده صادر کرده بودیم ...,0


In [None]:
# Load the test tweets; only the `origin_tweet` column is read from the CSV.
test = pd.read_csv('/content/drive/MyDrive/Chaman/test_vaccine_data.csv', usecols = ['origin_tweet'])
test.head()

Unnamed: 0,origin_tweet
0,@rezashah2021 نکنه اینام میخوان ادای آمریکا در...
1,میگن هنوز مشخص نیست که آیا واکسن های موجود در ...
2,@hasan_81075 اگه داد ببین واکسن هاری زده دیوص 😂😂😂
3,هیچکس: دکترا تو سوشال مدیا: خب حالا میخوام خیل...
4,ببینید رفقا! تولید در فضای آزمایشگاهی و کارگاه...


In [None]:
# Pull the tweet texts and labels out of the frames as plain Python lists.
x_train = train['origin_tweet'].tolist()
y_train = train['vaccine_state'].tolist()
x_test = test['origin_tweet'].tolist()
# The test tweets get a constant label of 1 each, one per tweet.
y_test = [1 for _ in x_test]

In [None]:
# Hold out 20% of the labeled tweets for evaluation, stratified on the label, with a fixed
# random_state so the split is repeatable.
# The inputs are read from `train` (not from x_train / y_train, which this cell overwrites),
# so re-running the cell does not split an already reduced training set a second time.
all_texts = train['origin_tweet'].values.tolist()
all_labels = train['vaccine_state'].values.tolist()
x_train, x_eval, y_train, y_eval = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

## Configuration values

In [None]:
# general config
# Passed to the tokenizer as `max_length` (together with truncation=True).
MAX_LEN = 128
# Per-device batch sizes handed to TrainingArguments for training and evaluation.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 128
# NOTE(review): TEST_BATCH_SIZE is not referenced by any cell visible in this notebook.
TEST_BATCH_SIZE = 128

# Number of training epochs and learning rate handed to TrainingArguments.
EPOCHS = 5
# Passed to TrainingArguments as `eval_steps`.
EVERY_EPOCH = 500
LEARNING_RATE = 2e-5

# NOTE(review): the model and tokenizer cells load 'xlm-roberta-large' directly, so this
# value is not the checkpoint that gets fine-tuned — confirm which one is intended.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
# Passed to TrainingArguments as `output_dir`.
OUTPUT_PATH = '/content/drive/MyDrive/Chaman/vaccine_model1.bin'

In [None]:
# Load the 'xlm-roberta-large' checkpoint with a sequence-classification head for 3 labels.
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=3)
# Alternative (disabled): load the fairseq checkpoint unpacked under /content/xlmr.large.
# model = XLMRModel.from_pretrained('/content/xlmr.large', checkpoint_file='model.pt', num_labels=3)

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [None]:
# Load the tokenizer from the same 'xlm-roberta-large' checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
# tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

## Creating dataset

In [None]:
class VaccineDataset(torch.utils.data.Dataset):
    """Dataset over tokenizer encodings, optionally paired with labels.

    Args:
        encodings: mapping from field name (e.g. 'input_ids') to a sequence holding
            one entry per example.
        labels: optional sequence with one label per example. When omitted, items
            carry no 'labels' entry, so unlabeled data needs no placeholder labels.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus 'labels' when labels were given)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: len(labels) when given, else the length of the first encoding field."""
        if self.labels is not None:
            return len(self.labels)
        return len(next(iter(self.encodings.values())))

## Tokenization

In [None]:
# Tokenize the training tweets with padding and truncation enabled, using MAX_LEN as max_length.
train_encodings = tokenizer(x_train, padding=True, truncation=True, max_length=MAX_LEN)

In [None]:
# Show which fields the tokenizer produced.
train_encodings.keys()
# train_encodings['input_ids'][0]

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Tokenize the evaluation tweets with the same settings as the training tweets.
eval_encodings = tokenizer(x_eval, padding=True, truncation=True, max_length=MAX_LEN)

In [None]:
# Tokenize the test tweets and print how long it took, in seconds.
# perf_counter() is a monotonic clock meant for measuring intervals; time.time() follows
# the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()

test_encodings = tokenizer(x_test, padding=True, truncation=True, max_length=MAX_LEN)

end_time = time.perf_counter()
print(end_time - start_time)

80.11198616027832


In [None]:
# Pair the training encodings with their labels.
train_dataset = VaccineDataset(train_encodings, y_train)

In [None]:
# Pair the evaluation encodings with their labels.
eval_dataset = VaccineDataset(eval_encodings, y_eval)

In [None]:
# Wrap the test encodings; y_test holds a constant label of 1 per tweet, not real annotations.
test_dataset = VaccineDataset(test_encodings, y_test)

## Fine tuning

In [None]:
# training_args = TrainingArguments("test_trainer")
# Training configuration assembled from the constants in the configuration cell.
# NOTE(review): `eval_steps` looks redundant next to evaluation_strategy="epoch"
# (the training log shows one evaluation per epoch) — confirm against the
# TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir=OUTPUT_PATH,
    evaluation_strategy="epoch",
    eval_steps=EVERY_EPOCH,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE
)
# Display the full resolved configuration.
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=2e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=/content/drive/MyDriv

In [None]:
def compute_metrics(p):
    """Compute evaluation metrics from a ``(predictions, labels)`` pair.

    Args:
        p: pair unpacked as ``(pred, labels)``. ``pred`` holds one row of per-class
            scores per example (the predicted class is the argmax over axis 1);
            ``labels`` holds the true class ids.

    Returns:
        dict with:
          - "accuracy" and "f1_score" (micro-averaged F1), unchanged from before.
            For single-label multi-class data micro-F1 equals accuracy, so these
            two always carry the same number.
          - "f1_macro", "precision_macro", "recall_macro": unweighted means over
            the classes, which expose per-class weaknesses that accuracy hides.
    """
    logits, labels = p
    pred = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "f1_score": f1_score(labels, pred, average='micro'),
        # zero_division=0 scores a class that is never predicted as 0 without a warning.
        "f1_macro": f1_score(labels, pred, average='macro', zero_division=0),
        "precision_macro": precision_score(y_true=labels, y_pred=pred, average='macro', zero_division=0),
        "recall_macro": recall_score(y_true=labels, y_pred=pred, average='macro', zero_division=0),
    }

In [None]:
# Bundle the model, training arguments, train/eval datasets and metric function into a Trainer.
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1250


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,1.084911,0.436563,0.436563


***** Running Evaluation *****
  Num examples = 1001
  Batch size = 128


## Evaluation

In [None]:
# Evaluate the fine-tuned model on the evaluation set and display the metrics dict.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1001
  Batch size = 128


{'epoch': 5.0,
 'eval_accuracy': 0.5694305694305695,
 'eval_f1_score': 0.5694305694305695,
 'eval_loss': 1.5240815877914429,
 'eval_runtime': 3.7146,
 'eval_samples_per_second': 269.474,
 'eval_steps_per_second': 2.154}

## Prediction

In [None]:
# Run the fine-tuned model over the test dataset.
predictions = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 1032438
  Batch size = 128


In [None]:
# First element of the prediction output: the array of per-class scores, one row per test tweet.
temp_preds = predictions[0]

In [None]:
# Display the score array (three scores per row).
temp_preds

array([[ 1.4602524 ,  0.40381682, -2.192508  ],
       [ 1.6709789 , -3.2045212 ,  1.5367272 ],
       [-1.4007853 ,  3.9812024 , -2.578633  ],
       ...,
       [-2.214426  ,  3.4278464 , -0.981017  ],
       [ 0.6800879 , -2.6010213 ,  1.6187539 ],
       [-0.771452  , -2.6521053 ,  3.756805  ]], dtype=float32)

In [None]:
# Predicted class per tweet: index of the highest score in each row.
results = np.argmax(temp_preds,axis=1)

In [None]:
# Tally how many test tweets fall into each predicted class, counting only once.
class_counts = Counter(results)
keys, values = class_counts.keys(), class_counts.values()  # distinct classes / their frequencies
print(keys)
print(values)

dict_keys([0, 1, 2])
dict_values([214515, 511735, 306188])


In [None]:
# Put the predicted class ids into a one-column frame and preview the first 50 rows.
pred_df = pd.DataFrame(results,columns=['predictions'])
pred_df.head(50)

Unnamed: 0,predictions
0,0
1,0
2,1
3,0
4,1
5,0
6,2
7,2
8,2
9,1


In [None]:
# Write the predictions to Drive; the frame's index is written as the first CSV column.
pred_df.to_csv('/content/drive/MyDrive/Chaman/prediction_1.csv')