# Main imports and code

In [116]:
# check which gpu we're using
!nvidia-smi

Tue Dec 14 22:53:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    74W / 149W |   8255MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [117]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [118]:
%cd /content/
!cp /content/drive/MyDrive/task4-hw4/sushma/nlp-final-project/dontpatronizeme_categories.tsv ./
!cp /content/drive/MyDrive/task4-hw4/sushma/nlp-final-project/dontpatronizeme_pcl.tsv ./
!cp /content/drive/MyDrive/task4-hw4/sushma/nlp-final-project/README.txt ./

/content


In [119]:
!pip install simpletransformers
!pip install tensorboardx



In [120]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [121]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [122]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


# Fetch Don't Patronize Me! data manager module

In [123]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [124]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [125]:
from dont_patronize_me import DontPatronizeMe

In [126]:
dpm = DontPatronizeMe('.', '.')

In [127]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


In [128]:
! git clone https://github.com/Perez-AlmendrosC/dontpatronizeme.git

fatal: destination path 'dontpatronizeme' already exists and is not an empty directory.


# Load paragraph IDs

In [129]:
trids = pd.read_csv('/content/dontpatronizeme/semeval-2022/practice splits/train_semeval_parids-labels.csv')
teids = pd.read_csv('/content/dontpatronizeme/semeval-2022/practice splits/dev_semeval_parids-labels.csv')

In [130]:
trids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [131]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

# Rebuild training set (Task 1)

In [132]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):  
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [133]:
trdf1 = pd.DataFrame(rows)

# Rebuild test set (Task 1)

In [134]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):  
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [135]:
len(rows)

2094

In [136]:
tedf1 = pd.DataFrame(rows)

# SpanBERT Baseline for Task 1

In [137]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [138]:
training_set1

Unnamed: 0,par_id,text,label
0,4341,"the scheme saw an estimated 150,000 children f...",1
1,4136,durban 's homeless communities reconciliation ...,1
2,10352,the next immediate problem that cropped up was...,1
3,8279,far more important than the implications for t...,1
4,1164,to strengthen child-sensitive social protectio...,1
...,...,...,...
2377,1775,last but not the least element of culpability ...,0
2378,1776,"then , taking the art of counter-intuitive non...",0
2379,1777,kagunga village was reported to lack necessary...,0
2380,1778,"""after her parents high-profile divorce after ...",0


In [139]:

task1_model_args = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("bert", 
                                  'SpanBERT/spanbert-base-cased', 
                                  args = task1_model_args, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2382 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/298 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [140]:
Counter(preds_task1)

Counter({1: 337, 0: 1757})

In [141]:
labels2file([[k] for k in preds_task1], 'task1.txt')

# Rebuild training set (Task 2)

In [None]:
rows2 = [] # will contain par_id, label and text
for idx in range(len(trids)):  
  parid = trids.par_id[idx]
  label = trids.label[idx]
  # select row from original dataset to retrieve the `text` value
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  rows2.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [None]:
trdf2 = pd.DataFrame(rows2)

In [None]:
trdf2

Unnamed: 0,par_id,text,label
0,4341,"the scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,the next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,to strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...
8370,8380,rescue teams search for survivors on the rubbl...,"[0, 0, 0, 0, 0, 0, 0]"
8371,8381,the launch of ' happy birthday ' took place la...,"[0, 0, 0, 0, 0, 0, 0]"
8372,8382,"the unrest has left at least 20,000 people dea...","[0, 0, 0, 0, 0, 0, 0]"
8373,8383,you have to see it from my perspective . i may...,"[0, 0, 0, 0, 0, 0, 0]"


In [None]:
trdf2.label = trdf2.label.apply(literal_eval)

# Rebuild test set (Task 2)

In [None]:
rows2 = [] # will contain par_id, label and text
for idx in range(len(teids)):  
  parid = teids.par_id[idx]
  label = teids.label[idx]
  #print(parid)
  # select row from original dataset to access the `text` value
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  rows2.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [None]:
tedf2 = pd.DataFrame(rows2)

In [None]:
tedf2

Unnamed: 0,par_id,text,label
0,4046,we also know that they can benefit by receivin...,"[1, 0, 0, 1, 0, 0, 0]"
1,1279,pope francis washed and kissed the feet of mus...,"[0, 1, 0, 0, 0, 0, 0]"
2,8330,many refugees do n't want to be resettled anyw...,"[0, 0, 1, 0, 0, 0, 0]"
3,4063,"""budding chefs , like """" fred """" , """" winston ...","[1, 0, 0, 1, 1, 1, 0]"
4,4089,"""in a 90-degree view of his constituency , one...","[1, 0, 0, 0, 0, 0, 0]"
...,...,...,...
2089,10462,"the sad spectacle , which occurred on saturday...","[0, 0, 0, 0, 0, 0, 0]"
2090,10463,""""""" the pakistani police came to our house and...","[0, 0, 0, 0, 0, 0, 0]"
2091,10464,"""when marie o'donoghue went looking for a spec...","[0, 0, 0, 0, 0, 0, 0]"
2092,10465,"""sri lankan norms and culture inhibit women fr...","[0, 0, 0, 0, 0, 0, 0]"


In [None]:
tedf2.label = tedf2.label.apply(literal_eval)

In [None]:
all_negs = trdf2[trdf2.label.apply(lambda x:sum(x) == 0)]
all_pos = trdf2[trdf2.label.apply(lambda x:sum(x) > 0)]

training_set2 = pd.concat([all_pos,all_negs[:round(len(all_pos)*0.5)]])

In [None]:
training_set2

Unnamed: 0,par_id,text,label
0,4341,"the scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,the next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,to strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...
1186,434,""""""" i was absolutely useless at school , hopel...","[0, 0, 0, 0, 0, 0, 0]"
1187,435,i also noticed the change in socio-economic le...,"[0, 0, 0, 0, 0, 0, 0]"
1188,436,"can donald trump win ? it 's possible , but ce...","[0, 0, 0, 0, 0, 0, 0]"
1189,437,he added that any introduction of new law must...,"[0, 0, 0, 0, 0, 0, 0]"


In [None]:
task2_model_args = MultiLabelClassificationArgs(num_train_epochs=1,
                                                no_save=True, 
                                                no_cache=True, 
                                                overwrite_output_dir=True
                                                )
task2_model = MultiLabelClassificationModel("roberta", 
                                            'roberta-base', 
                                            num_labels=7,
                                            args = task2_model_args, 
                                            use_cuda=cuda_available)
# train model
task2_model.train_model(training_set2[['text', 'label']])
# run predictions
preds_task2, _ = task2_model.predict(tedf2.text.tolist())

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'cla

  0%|          | 0/1191 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/149 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
labels2file(preds_task2, 'task2.txt')

## Prepare submission

In [None]:
!cat task1.txt | head -n 10

1
1
0
1
0
0
1
1
0
1


In [None]:
!cat task2.txt | head -n 10

1,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,1,0
0,0,0,0,0,0,0
0,0,0,0,0,1,0
1,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,1,0


In [None]:
!zip submission.zip task1.txt task2.txt

  adding: task1.txt (deflated 91%)
  adding: task2.txt (deflated 97%)


# SpanBERT baseline for Task 2

In [None]:
task2_model_args = MultiLabelClassificationArgs(num_train_epochs=5,
                                                no_save=True, 
                                                no_cache=True, 
                                                overwrite_output_dir=True
                                                )
task2_model = MultiLabelClassificationModel("bert", 
                                            'SpanBERT/spanbert-base-cased', 
                                            num_labels=7,
                                            args = task2_model_args, 
                                            use_cuda=cuda_available)
# train model
task2_model.train_model(training_set2[['text', 'label']])
# run predictions
preds_task2_sp, _ = task2_model.predict(tedf2.text.tolist())

Some weights of BertForMultiLabelSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/1191 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
labels2file(preds_task2_sp, 'task2.txt')

## Evaluate and Generate submissions

In [None]:
!cat task2.txt | head -n 10

1,0,0,0,0,0,0
1,1,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,1,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,0,0
1,1,0,0,0,0,0
0,0,0,0,0,1,0
0,0,0,0,0,1,0


In [None]:
!zip submission.zip task1.txt task2.txt

updating: task1.txt (deflated 91%)
updating: task2.txt (deflated 97%)


In [None]:
!mkdir ref res

In [None]:
import os
labels2file(tedf2.label.tolist(), os.path.join('ref/', 'task2.txt'))
#labels2file(tedf2.label.apply(lambda x:[x]).tolist(), os.path.join('ref/', 'task2.txt'))

In [None]:
labels2file(preds_task2_sp, os.path.join('res/', 'task2.txt'))

In [None]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


In [None]:
!python3 evaluation.py . .

In [None]:
!cat scores.txt

task2_unb:0.37521514629948366
task2_sha:0.17647058823529413
task2_pre:0.29166666666666663
task2_aut:0.0
task2_met:0.05797101449275363
task2_com:0.34810126582278483
task2_the:0.0
task2_avg:0.17848924021671184


In [142]:
import os
labels2file([[k] for k in tedf1.label.tolist()], os.path.join('ref/', 'task1.txt'))

In [143]:
labels2file([[k] for k in preds_task1], 'res/task1.txt')

In [150]:
!python3 evaluation.py . .

In [151]:
!cat scores.txt

task1_precision:0.3798219584569733
task1_recall:0.6432160804020101
task1_f1:0.4776119402985075
task2_unb:0.37521514629948366
task2_sha:0.17647058823529413
task2_pre:0.29166666666666663
task2_aut:0.0
task2_met:0.05797101449275363
task2_com:0.34810126582278483
task2_the:0.0
task2_avg:0.17848924021671184


In [146]:
!cp -r /content/outputs/ /content/drive/MyDrive/ColabNotebooks/

In [147]:
!cp -r /content/res/ /content/drive/MyDrive/ColabNotebooks/outputs/

In [148]:
!cp -r /content/ref/ /content/drive/MyDrive/ColabNotebooks/outputs/

In [149]:
!cp -r /content/scores.txt /content/drive/MyDrive/ColabNotebooks/outputs/

In [None]:
! cp submission.zip /content/drive/MyDrive/ColabNotebooks/outputs/