# Main imports and code

In [16]:
import os

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import sys
#PROJECT_DIR = '..'
PROJECT_DIR = '/content/drive/MyDrive/SemEval-2022'
sys.path.insert(0, PROJECT_DIR)

from scripts.pcl.dont_patronize_me import DontPatronizeMe

DATASET_DIR = 'dataset/pcl'
DATASET_DIR_PCL = f'{PROJECT_DIR}/{DATASET_DIR}'
ROBERTA_DIR = f'{PROJECT_DIR}/roberta'

In [6]:
# check which gpu we're using
!nvidia-smi

Wed Dec  8 23:59:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
%%capture
!pip install simpletransformers
!pip install tensorboardx

In [8]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [9]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [10]:
if cuda_available:
    import tensorflow as tf
    # Get the GPU device name.
    device_name = tf.test.gpu_device_name()
    # The device name should look like the following:
    if device_name == '/device:GPU:0':
        print('Found GPU at: {}'.format(device_name))
    else:
        raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


# Fetch Don't Patronize Me! data manager module

In [None]:
#module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
#module_name = module_url.split('/')[-1]
#print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
#with request.urlopen(module_url) as f, open(module_name,'w') as outf:
#  a = f.read()
#  outf.write(a.decode('utf-8'))

In [11]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [12]:
dpm = DontPatronizeMe(DATASET_DIR_PCL, None)

In [13]:
dpm.load_task1()

# Load paragraph IDs

In [17]:
trids = pd.read_csv(os.path.join(DATASET_DIR_PCL, 'practice_splits', 'train_semeval_parids-labels.csv'))
teids = pd.read_csv(os.path.join(DATASET_DIR_PCL, 'practice_splits', 'dev_semeval_parids-labels.csv'))

In [18]:
trids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [19]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

# Rebuild training set (Task 1)

In [20]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):  
    parid = trids.par_id[idx]
    #print(parid)
    # select row from original dataset to retrieve `text` and binary label
    text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
    label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
    rows.append({
        'par_id':parid,
        'text':text,
        'label':label
    })
  

In [21]:
trdf1 = pd.DataFrame(rows)
trdf1.tail()

Unnamed: 0,par_id,text,label
8370,8380,rescue teams search for survivors on the rubbl...,0
8371,8381,the launch of ' happy birthday ' took place la...,0
8372,8382,"the unrest has left at least 20,000 people dea...",0
8373,8383,you have to see it from my perspective . i may...,0
8374,8384,yet there was one occasion when we went to the...,0


# Rebuild test set (Task 1)

In [22]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):  
    parid = teids.par_id[idx]
    #print(parid)
    # select row from original dataset
    text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
    label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
    rows.append({
        'par_id':parid,
        'text':text,
        'label':label
    })
  

In [23]:
len(rows)

2094

In [24]:
tedf1 = pd.DataFrame(rows)
tedf1.tail()

Unnamed: 0,par_id,text,label
2089,10462,"the sad spectacle , which occurred on saturday...",0
2090,10463,""""""" the pakistani police came to our house and...",0
2091,10464,"""when marie o'donoghue went looking for a spec...",0
2092,10465,"""sri lankan norms and culture inhibit women fr...",0
2093,10466,he added that the afp will continue to bank on...,0


# RoBERTa Baseline for Task 1

In [25]:
# positive vs negative sample in original dataset
len(trdf1[trdf1.label==1]), len(trdf1[trdf1.label==0])

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


(794, 7581)

In [26]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf, trdf1[trdf1.label==0][: npos*2]])

In [27]:
# positive vs negative sample in training dataset after downsampling negative instances
len(training_set1[training_set1.label==1]), len(training_set1[training_set1.label==0])

(794, 1588)

In [49]:
task1_model_args = ClassificationArgs(num_train_epochs=10, 
                                      no_save=False, 
                                      no_cache=False, 
                                      overwrite_output_dir=True,
                                      manual_seed=1,
                                      train_batch_size=32,
                                      use_cached_eval_features=True,
                                      evaluate_during_training_verbose=True,
                                      max_seq_length=512,
                                      sliding_window=True,
                                      tie_value=0)
task1_model = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/2382 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_2_2


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/75 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [50]:
Counter(preds_task1)

Counter({0: 1729, 1: 365})

In [30]:
!mkdir /content/drive/MyDrive/SemEval-2022/roberta/{ref,res}

In [51]:
# store reference labels
labels2file(tedf1.label.apply(lambda x:[x]).tolist(), os.path.join(f'{ROBERTA_DIR}/ref/', 'task1.txt'))
# store predicted labels
labels2file([[k] for k in preds_task1], os.path.join(f'{ROBERTA_DIR}/res/', 'task1.txt'))

## Prepare submission

In [52]:
!cat /content/drive/MyDrive/SemEval-2022/roberta/res/task1.txt | head -n 10

0
1
0
1
0
0
1
1
0
0


In [53]:
!python /content/drive/MyDrive/SemEval-2022/scripts/pcl/evaluation.py /content/drive/MyDrive/SemEval-2022/roberta /content/drive/MyDrive/SemEval-2022/roberta

In [None]:
!zip submission.zip task1.txt

  adding: task1.txt (deflated 92%)
  adding: task2.txt (deflated 97%)
