<a href="https://colab.research.google.com/github/wilsoncwc/dontpatronizeme/blob/main/NLP_coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Helpful Documentation Links

Huggingface https://huggingface.co/docs/transformers/v4.16.2/en/index

Simple Transformers https://simpletransformers.ai/docs/classification-models/

Pandas https://pandas.pydata.org/docs/reference/frame.html




# Main imports and code

In [None]:
# check which gpu we're using
!nvidia-smi

Sat Feb 12 21:22:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers simpletransformers tensorboardx requests nlpaug matplotlib==3.4.0



In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from ast import literal_eval
import spacy

# print the entire text
pd.set_option('display.max_colwidth', None)

In [None]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [None]:
if cuda_available:
  # Sanity-check that TensorFlow also sees the GPU; `cuda_available` itself
  # comes from PyTorch. Within the visible notebook TensorFlow is only used here.
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      # Stop the notebook early instead of continuing without the expected GPU.
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


2022-02-12 21:24:35.872892: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.


# Fetch Don't Patronize Me! data manager module

In [None]:
def load_module(module_url):
  """Download the resource at `module_url` into the current working directory.

  The local file name is the last '/'-separated segment of the URL. The payload
  is written byte-for-byte (binary mode), so the saved file is identical to the
  remote one regardless of its encoding or the platform's default text encoding
  and newline handling.

  Args:
    module_url: URL of the file to fetch (anything `urllib.request.urlopen` accepts).
  """
  module_name = module_url.split('/')[-1]
  print(f'Fetching {module_url}')
  with request.urlopen(module_url) as response, open(module_name, 'wb') as outf:
    outf.write(response.read())
load_module("https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py")

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	"""Write predictions to `outf_path`, one row per line.

	`p` is an iterable of rows; each row is an iterable of labels, which are
	converted with `str` and joined by commas.
	"""
	rendered_rows = (','.join(map(str, row)) + '\n' for row in p)
	with open(outf_path, 'w') as sink:
		sink.writelines(rendered_rows)

In [None]:
from dont_patronize_me import DontPatronizeMe

In [None]:
dpm = DontPatronizeMe('.', '.')

In [None]:
# Load dataset and splits
baseurl = 'https://raw.githubusercontent.com/wilsoncwc/dontpatronizeme/main/'
filenames = ['data/dontpatronizeme_pcl.tsv',
             'data/dontpatronizeme_categories.tsv',
             'practice%20splits/train_semeval_parids-labels.csv',
             'practice%20splits/dev_semeval_parids-labels.csv']
for filename in filenames:
  load_module(f'{baseurl}{filename}')

Fetching https://raw.githubusercontent.com/wilsoncwc/dontpatronizeme/main/data/dontpatronizeme_pcl.tsv
Fetching https://raw.githubusercontent.com/wilsoncwc/dontpatronizeme/main/data/dontpatronizeme_categories.tsv
Fetching https://raw.githubusercontent.com/wilsoncwc/dontpatronizeme/main/practice%20splits/train_semeval_parids-labels.csv
Fetching https://raw.githubusercontent.com/wilsoncwc/dontpatronizeme/main/practice%20splits/dev_semeval_parids-labels.csv


In [None]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [None]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [None]:
trids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [None]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

# Rebuild training set (Task 1)

In [None]:
# Collect par_id, text and binary label for every paragraph id in the training split.
rows = [] # will contain par_id, label and text
for parid in trids.par_id:
  # Filter the full dataset once per paragraph and read both fields from that
  # single match (first matching row), instead of scanning the frame twice.
  match = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid]
  text = match.text.values[0]
  label = match.label.values[0]
  rows.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [None]:
trdf1 = pd.DataFrame(rows)

# Rebuild test set (Task 1)

In [None]:
# Collect par_id, text and binary label for every paragraph id in the held-out split.
rows = [] # will contain par_id, label and text
for parid in teids.par_id:
  # Filter the full dataset once per paragraph and read both fields from that
  # single match (first matching row), instead of scanning the frame twice.
  match = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid]
  text = match.text.values[0]
  label = match.label.values[0]
  rows.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [None]:
len(rows)

2094

In [None]:
tedf1 = pd.DataFrame(rows)

# Data Analysis

Found this cheatsheet useful: https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/cheatsheet/Pandas_Cheat_Sheet.pdf

In [None]:
from transformers import AutoTokenizer

base_model = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(base_model)
def tokenize_function(examples):
    """Run the module-level `tokenizer` over `examples["text"]`.

    Padding uses the "max_length" strategy and truncation is enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# def tokenizer(text): 
#   doc = spacy_nlp(text)
#   # Remove stop words, punctuation symbols and non alphabetic characters
#   tokens = [token.text.lower() for token in doc if not token.is_stop 
#             and not token.is_punct
#             and token.is_alpha] #keep only alphabetic characters
#   return tokens
tokenizer.tokenize("I have a new GPU!")


In [None]:
df = dpm.train_task1_df
print(f'Number of paragraphs: {len(df)}')
positive_df = df.query("label == 1")
print(f'Number of positive paragraphs: {len(positive_df)}')
print(f'Number of negative paragraphs: {len(df) - len(positive_df)}')
print(f'Columns {df.columns}')

def plot_stacked_counts(data, group, title=""):
  """Plot a horizontal stacked bar chart of paragraph counts per `group` value, split by label.

  Args:
    data: DataFrame with a `label` column (values 0 and 1) and a column named `group`.
    group: name of the column to group by, e.g. 'keyword' or 'country'; its
      capitalised form is used as the y-axis label.
    title: optional figure title; no title is drawn when empty.

  Bars are sorted by total paragraph count and each bar is annotated with the
  percentage of label-0 paragraphs in that group.
  """
  # fill_value / reindex guarantee that both label columns exist and hold 0
  # (not NaN) when a group, or the whole frame, contains only one label.
  counts = (data.groupby([group, 'label']).size()
            .unstack(fill_value=0)
            .reindex(columns=[1, 0], fill_value=0))
  # Sort by the per-group total, then keep only the two label columns.
  counts = counts.assign(total=counts[1] + counts[0]).sort_values(by='total')[[1, 0]]
  percentages = (100 * counts[0] / (counts[1] + counts[0])).round(2)
  percent_strs = [f'{percent}%' for percent in percentages]
  ax = counts.plot.barh(stacked=True, figsize=(10,10))
  if title:
    plt.title(title, fontsize=16)
  # containers[1] holds the bars of the second plotted column (label 0),
  # which is the label the percentages refer to.
  ax.bar_label(ax.containers[1], labels=percent_strs, label_type='center')
  plt.ylabel(group.capitalize(), fontsize=12)
  plt.xlabel('Paragraph count', fontsize=12)
  plt.legend(title='Label', fontsize=12, loc='lower right')
  # plt.savefig(f'{group}-stacked-counts')
  plt.show()

def plot_analysis_graphs():
  """Draw the exploratory plots for the module-level dataframe `df`.

  First plots the stacked keyword/label counts. Then stores the number of
  tokens per paragraph (according to the module-level `tokenizer`) in a new
  `token_count` column of `df` and shows its distribution per keyword as a
  log-scale box plot, which is also saved as 'token-count-keyword'.
  """
  plot_stacked_counts(df, 'keyword', title='Number of paragraphs in the dataset by keyword and label')
  # plot_stacked_counts(df, 'country')

  # NOTE: the column is written into the shared `df` in place.
  df['token_count'] = df['text'].map(lambda paragraph: len(tokenizer.tokenize(paragraph)))

  df.boxplot('token_count', by=['keyword'], figsize=(15,10))
  plt.suptitle('')
  plt.title('Distribution of token counts by keyword', fontsize=16)
  plt.xlabel('Keyword', fontsize=12)
  plt.ylabel('Token count (log scale)', fontsize=12)
  plt.yscale('log')
  plt.savefig('token-count-keyword')
  plt.show()


  # df.boxplot('token_count', by=['orig_label'], figsize=(15,10))
  # plt.yscale('log')
  # plt.show()

# plot_analysis_graphs()

# Augmentation & Spellcheck

In [None]:
# import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
# import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
""" augmenters """
# can take a while to download models

synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=3)

back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',  # translate to german
    to_model_name='facebook/wmt19-de-en',
    # fall back to CPU so the cell still runs on a runtime without a GPU
    device='cuda' if torch.cuda.is_available() else 'cpu',
    max_length=512 # sub with our own max length??
)

# Every augmenter gets its own name so that all of them stay usable;
# re-using a single name would keep only the last one constructed.

# insert a contextually appropriate word
contextual_insert_aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="insert")

# random swap
random_swap_aug = naw.RandomWordAug(action="swap")

# random delete
random_delete_aug = naw.RandomWordAug()

# `aug` refers to the random-delete augmenter, the last one constructed in this cell.
aug = random_delete_aug

In [None]:
df = dpm.train_task1_df
df.head()
df_plain = df[df['label'] == 1].sample(20)

### Synonyms

In [None]:
df_aug = df_plain.copy()
df_aug['text'] = df_aug['text'].apply(lambda x: synonym_aug.augment(x))

# put the diffs side by side
df_tgt = pd.concat([df_plain["text"], df_aug["text"]], keys=["original", "augmented"], axis=1)

In [None]:
# examples of augmentation
df_tgt.head()

Unnamed: 0,original,augmented
9463,""""""" There was concern about the level of unemployment in the area and also poverty . We 've seen in other projects , as soon as poverty ... or a feeling of desperation or hopelessness appears in a community , people start drinking more , """" Olivier said .""",""" "" "" There was business concern about the level of unemployment in the area and also poverty. We ' ve seen in other projects, as soon as poverty. .. or a feeling of desperation or hopelessness come out in a community, masses start drinking more, "" "" Olivier said. """
10165,"The figures compiled by the Guardian , which include an average of more than one death a week in 2017 , are likely to be a substantial underestimate , as no part of the UK government records homeless death statistics at a national level , and local authorities are not required to count rough sleeper deaths .","The figures compiled by the Guardian, which include an average of more than one death a week in 2017, are likely to be a substantial underestimate, as no part of the UK government records homeless death statistics at a national level, and local authorities are not required to count rough sleeper deaths."
8071,"The LGBTI community , asylum seekers and refugees , people who misuse alcohol and people who are homeless are identified as those most at risk by the HSE .","The LGBTI community, asylum seekers and refugees, people who misuse alcohol and hoi polloi who are homeless are identified as those most at risk of exposure by the HSE."
10188,"Antoine Griezmann would have left Atl ? tico Madrid this summer were it not for a transfer ban which prevented Atleti from replacing him . So he stayed , likely with the understanding that he could join Manchester United in summer 2018 when Atleti could sign players again and would be in need of a new start as Diego Simeone 's contract would have expired then too .","Antoine Griezmann would have left Atl? tico Madrid this summer were it not for a conveyance forbiddance which prevented Atleti from replacing him. So he stayed, likely with the understanding that he could join Manchester United in summer 2018 when Atleti could sign players again and would be in need of a new start as Diego Simeone ' s contract would have expired then too."
10173,United are in need of midfield reinforcements with Michael Carrick retiring and Marouane Fellaini out of contract .,United are in need of midfield reinforcements with Michael Carrick retiring and Marouane Fellaini knocked out of contract.


### Back-translation (random)

In [None]:
df_aug = df_plain.copy()
# it's very slow
df_aug['text'] = df_aug['text'].apply(lambda x: back_translation_aug.augment(x))

# put the diffs side by side
df_back_tgt = pd.concat([df_plain["text"], df_aug["text"]], keys=["original", "augmented"], axis=1)

In [None]:
df_back_tgt

Unnamed: 0,original,augmented
9922,"The first point worth mentioning is low levels of living , characterized by low incomes , inequality , poor health and inadequate education . In developing countries , the levels of living tend to be very low for the vast majority of people . These low levels of living are manifested quantitatively and qualitatively in the form of low life and work expectancies , high infant mortality rate and in many cases a general sense of malaise and hopelessness .","The first point worth mentioning is the low standard of living, which is characterised by low income, inequality, poor health and inadequate education. In developing countries, the level of living for the vast majority of people tends to be very low, and this low level of living manifests itself in quantitative and qualitative terms in the form of low life expectancy and working life expectancy, high infant mortality and, in many cases, general malaise and hopelessness."
468,"""For Mexicans , the question of colonization had two profoundly different meanings . For a country that had long perceived itself at risk of being swallowed up by its more populous northern neighbor , colonization signified the attracting of new immigrants . As the Mexico City newspaper El Siglo Diez y Nueve asserted in 1881 , colonization was one of Mexico 's """" great projects . """" New immigrants would """" not only increase the scant population that we possess , """" but also aid in """" the exploitation of our agricultural elements , whose richness will pour out . . . in the principal markets of the world . """"""","""For Mexicans, the question of colonization had two profoundly different meanings: for a country that for a long time was in danger of being swallowed up by its more populous northern neighbor, colonization meant attracting new immigrants. As the newspaper El Siglo Diez y Nueve in Mexico City asserted in 1881, colonization was one of Mexico's"" great projects. ""New immigrants would"" not only increase the sparse population we have, ""but also"" help exploit our agricultural elements, the wealth of which... will flow into the most important markets in the world. """""
5280,"The temperature is significantly rising from the last week . It does not go below 30 Degree Celsius since June 29 . However , most of the causalities in Montreal were men with age 50 and above , who were living in vulnerable conditions .","The temperature has not risen below 30 degrees Celsius since June 29. However, most of the deaths in Montreal were men aged 50 and over who lived in vulnerable conditions."
8407,"She said , on the commendations of the women wing , the PHF has barred women players of over thirty years of age from taking part in all national level hockey events to give ample opportunity to the young hockey talent to play hockey .",She said the PHF has excluded women over the age of 30 from participating in all national hockey events to give young hockey talents ample opportunity to play hockey.
7080,"That 's right : In a world where millions of talented people are hopelessly idle , a shortage of qualified workers threatens Germany 's economic performance .","That is true: in a world where millions of talented people are hopelessly idle, a shortage of skilled labor threatens Germany's economic performance."
2214,Former Hungarian refugee tells story of escape to UP,Former Hungarian refugee tells of escape to UP
9307,""""""" Based on the data gathered from Orang Asli Development Department ( JAKOA ) , a total of 12,467 hardcore poor families are entitled to receive the aid , """" he told the Dewan Rakyat on Thursday .""","""Based on data collected by the Orangutan Asli Development Department (JAKOA), a total of 12,467 hardcore poor families are eligible for assistance,"" he told the Dewan Rakyat on Thursday."
4034,"Soon after , Green was a regular with the Relief Gang on off days , doing everything from unloading supplies at a warehouse the city temporarily allowed them to use , to meeting with families in need and working with the Rockets to secure corporate assistance from companies such as Rooms To Go .","Soon after, Green was regularly with the relief gang on days off to do everything from unloading supplies at a warehouse the city had temporarily allowed them to meeting with needy families and working with the Rockets to secure the support of companies like Rooms To Go."
9002,"A 20-year-old man indicted in an alleged immigrant smuggling scheme was under federal supervision when he led authorities in Texas on a high-speed chase and crashed an SUV , killing five of the 14 people inside , according to court records .","A 20-year-old man charged in an alleged immigration smuggling program was under federal surveillance when he led authorities in Texas on a high-speed chase and rammed an SUV, killing five of the 14 people, according to court records."
4924,"The world is no longer safe for women , Vice President Leni Robredo lamented and have become more vulnerable in social media . The digital era , she said have made safety and protection difficult to achieve .","The world is no longer safe for women, complained Vice President Leni Robredo, and has become more vulnerable on social media. Robredo said the digital age has made security and protection more difficult."


### Back-translation (PCL)

In [None]:
df_aug = df_plain.copy()
# it's very slow
df_aug['text'] = df_aug['text'].apply(lambda x: back_translation_aug.augment(x))

# put the diffs side by side
df_back_2 = pd.concat([df_plain["text"], df_aug["text"]], keys=["original", "augmented"], axis=1)

In [None]:
df_back_2

Unnamed: 0,original,augmented
10466,""""""" She has one huge platform , and information can go out to places that really need to know what 's going on in her hometown . She has always availed not only her platform , her voice , but also resources -- tangible resources -- to help those most in need in Houston and around . """"""",""""" She has a huge platform, and information can get to places that really need to know what's going on in her hometown. She has always used not only her platform, her voice, but also resources - material resources - to help those most in need in and around Houston. """""
4683,"While male prisoners in Pakistan also suffer , the female prisoners ' plight is truly worrying . Like male offenders , the majority of female offenders are poor . However , women enter the criminal justice system with a host of unique medical , psychological , and financial problems and needs that distinguish them from male offenders . Addiction , poverty , unemployment , physical and mental illness , physical and sexual abuse , and homelessness trap women in a cycle of hopelessness and crime . In particular , female offenders are often young mothers who face the additional trauma of threatened separation from their children .","Although male prisoners also suffer in Pakistan, the plight of female prisoners is truly worrying. Like male offenders, most female offenders are poor. However, women enter the criminal justice system with a variety of unique medical, psychological and financial problems and needs that distinguish them from male offenders. Addiction, poverty, unemployment, physical and mental illness, physical and sexual abuse, and homelessness trap women in a cycle of hopelessness and criminality."
9625,"""In his final year as president , Mr S R Nathan - together with a few of his close friends - started discussing with me the idea of starting a philanthropic fund to help """" uplift """" children from poor families .""","""In his last year as President, Mr. S. R. Nathan - along with some of his close friends - began discussing with me the idea of creating a philanthropic fund to help"" children from poor families. """
8796,"School for the Blind , Deaf and Dumb , Isulo , Anambra State , which parades a number of beautiful structures , is one of the schools battling with lack of facilities to meet the special educational needs of the children . According to Felix Nwaochi , President-General of Isulo Community , the school is seriously in need of water supply as many of the blind students have to fetch water from a stream to survive in the school .","The school for the blind, deaf and mute in Isulo, in the state of Anambra, which boasts a number of beautiful buildings, is one of the schools struggling with the lack of facilities to meet the special educational needs of the children. Felix Nwaochi, the general president of the municipality of Isulo, says that the school is in urgent need of water supply, as many blind pupils have to fetch water from a stream in order to survive in school."
4135,Durban 's homeless communities reconciliation lunch,Reconciliation of homeless communities in Durban
4929,""""""" New Zealand could expand its sponsored refugee trial to accommodate those refugees with whom America has broken faith . This need not be at any particularly large cost to the Government . All the government needs to do is let caring New Zealanders help """" - see : Doing good , and doing well as a consequence .""",""""" New Zealand could expand its sponsored refugee process to accommodate those refugees with whom America has broken faith. This need not come at a particularly high cost to the government. All the government needs to do is let the caring New Zealanders help ""- see: Doing Good and subsequently Doing Good."""
2716,"The Central Market of Thessaloniki ( CMT ) is working on the launch of a project with Bulgarian organizations , which is meant to reduce the waste of fruits and vegetables . The goal of the project is to collect fresh produce that is unfit for commerce but is still perfectly fine for consumption . These volumes of produce are then redistributed to initiatives for the support of people in need , such as food banks . The project still has to be approved by the European Commission .","The Central Market of Thessaloniki (CMT) is working with Bulgarian organisations on a project to reduce the waste of fruit and vegetables. The aim of the project is to collect fresh products that are not commercially viable but are still perfectly fit for consumption, and these quantities will then be passed on to initiatives supporting people in need, such as food banks, subject to approval by the European Commission."
8227,"About the same time , she gave an interview to the Invisible People project , which documents the lives of the homeless ; she seems hopeless , resigned to her fate and dubious future .","Around the same time, she gave an interview to the Invisible People project documenting the lives of the homeless; she seems hopeless, resigned to her fate and her doubtful future."
440,It is often a painful and frustrating experience for your visa to be refused . This is even more worrying in immigrant visa cases where the applicant 's plans of permanently settling with their family member in the U.S. or the DV lottery winner 's plans of making a new life in the U.S. takes a hit .,"Refusing a visa is often a painful and frustrating experience. This is even more worrying in cases of immigration visas, where the applicant's plans to settle permanently with his family member in the US or the DV lottery winner's plans to start a new life in the US suffer a setback."
1265,"""This discarded corpse , latterly a boy who had been taught to sing about being """" a bundle of potentiality """" , she says , was erratic in school attendance and behaviour , most often hungry and unkempt and demonstrably lacking in affirmation and attention at home , foisted on teachers , who , obsessed with curriculum , hopefully willing but hopelessly ill-equipped to save him from his ( and our ) final resting place .""","""This discarded corpse, recently a boy who had been taught to sing about being"" a bundle of potential, """" she says, ""was unpredictable in terms of school attendance and behavior, mostly hungry and unkempt and demonstrably without confirmation or attention at home, and imposed on teachers who, obsessed with the curriculum, were hopefully willing but hopelessly ill-equipped to save him from his (and our) final resting place."""


## Spellcheck

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker
spell = SpellChecker()
df_spellcheck = df.copy()

def has_typos(line):
    """Return the tokens of `line` that the module-level spell checker does not know.

    Despite the name, the result is the collection returned by `spell.unknown`
    (not a boolean); an empty result means no unknown tokens were found.
    """
    # split() without an argument splits on any whitespace run and never yields
    # empty strings, whereas split(" ") produces '' for repeated, leading or
    # trailing spaces, which would then be reported as "unknown words".
    return spell.unknown(line.split())

df_spellcheck['text'] = df_spellcheck['text'].apply(lambda x: has_typos(x))

In [None]:
# browse a random selection of the "typos"
# honestly there aren't any I can see
df_spellcheck.sample(10)

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
1780,1781,@@17146573,women,hk,{},0,0
1998,1999,@@22549718,women,hk,"{""toback, ."", """", toback}",0,0
5195,5196,@@1789276,homeless,gh,{},0,1
8049,8050,@@2612281,immigrant,gb,"{""it, """""", jc, """", 's, jacobson}",0,0
7106,7107,@@8223039,poor-families,ie,"{temer, rousseff, 's}",0,0
5920,5921,@@9005999,in-need,bd,"{."", """", """"""}",1,3
911,912,@@21891082,immigrant,my,{jazlan},0,0
1187,1188,@@15822119,in-need,ie,"{johannah, ."", n't, """"""}",1,4
4018,4019,@@17147273,immigrant,sg,"{550,000, 1990s, 's}",0,0
6284,6285,@@24827308,homeless,bd,"{eight-year, hazera, mid-1970s}",0,1


## Augment dataset

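Build an augmented training set for Task 1: the PCL-positive paragraphs are upsampled by adding augmented copies of each one, and the negative paragraphs are downsampled to twice the number of original positives.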

In [None]:
all_df = trdf1.copy()
pcldf = all_df[all_df.label==1]

def _augment_text(augmenter, text):
    """Apply `augmenter` to a single text and return one augmented string."""
    augmented = augmenter.augment(text)
    # Depending on the installed nlpaug version, augment() returns either a
    # string or a list of strings; normalise to a single string.
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else text
    return augmented

def augment_multiple(df_plain, n=2, back_translation=False):
    """Return `df_plain` together with `n` augmented copies of it.

    Args:
        df_plain: DataFrame with a `text` column; it is not modified.
        n: number of augmented copies to make. The result holds (n + 1)
            versions of the data, including the original.
        back_translation: when True, the last of the `n` copies is produced with
            `back_translation_aug` (slow) instead of `synonym_aug`. The default
            keeps every copy synonym-based.

    Returns:
        The augmented copies followed by the original rows, concatenated. Index
        values of the original rows are repeated in each copy.
    """
    # TODO: haven't added the other augs

    dfs = []
    for copy_idx in range(n):
        use_back_translation = back_translation and copy_idx == n - 1
        augmenter = back_translation_aug if use_back_translation else synonym_aug
        df_aug = df_plain.copy()
        df_aug['text'] = df_aug['text'].apply(lambda x: _augment_text(augmenter, x))
        dfs.append(df_aug)

    dfs.append(df_plain)
    return pd.concat(dfs)

# get an augmented dataset
# (the second argument is the number of copies, not an augmenter)
upsampled_poss = augment_multiple(pcldf, n=2)
npos = len(pcldf)
downsampled_negs = all_df[all_df.label==0][:npos*2]

training_set_aug = pd.concat([upsampled_poss, downsampled_negs])

# Task 1

In [None]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [None]:
training_set1

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .",1
1,4136,Durban 's homeless communities reconciliation lunch,1
2,10352,"The next immediate problem that cropped up was how to assist the unfortunate couple , as neither of them possessed a birth certificate , a marriage certificate , or even an identity card . The Samurdhi Officer Dhanapala lamented explaining how agonizing it was for him to bear , when he came across the majority of poor families in the village did not possess even an ID to assist them officially .",1
3,8279,"Far more important than the implications for the Economy Gods ( is the dollar up or down ? ) last night 's outcome will also mean many , many vulnerable New Zealanders will have a better life over the next three years at least .",1
4,1164,"To strengthen child-sensitive social protection systems , including cash transfer programmes that directly help poor families to pay for food , health care , education and other services that protect children from the impact of poverty and improve their chances of breaking the cycle in their own lives .",1
...,...,...,...
2377,1775,Last but not the least element of culpability relates to isolating professional groups from publicly voicing the grievances and ill-treatment of the refugees .,0
2378,1776,"Then , taking the art of counter-intuitive nonsense to supreme heights , we had people saying this is proof that we need to take in more Syrian refugees - conveniently ignoring all the links to Syria shared by the attackers in both Paris and Brussels .",0
2379,1777,"Kagunga village was reported to lack necessary social services to meet the growing demand of refugees . The village has neither reliable , clean and safe water nor sanitation facilities that include latrines and critical medical services .",0
2380,1778,"""After her parents high-profile divorce after over 40 years of being married , in which her father was ordered by the UK court to pay her mother ? 64mil ( RM355mil ) in settlement , and despite Angeline feeling happy over all she has achieved in life , she admits that it is sad that things have not quite worked out for her """" vulnerable """" father regardless all the riches in the world .""",0


# RoBERTa Baseline

In [None]:
def train_baseline():
  """Train the one-epoch `roberta-base` baseline for Task 1 and write its predictions.

  The model is trained on the `text`/`label` columns of `training_set1`, then
  used to predict labels for `tedf1`. The predicted label distribution is
  printed and the predictions are written to 'task1.txt', one label per line.

  Returns:
    The trained ClassificationModel.
  """
  task1_model_args = ClassificationArgs(num_train_epochs=1, 
                                        no_save=True, 
                                        no_cache=True, 
                                        overwrite_output_dir=True)
  task1_model = ClassificationModel("roberta", 
                                    'roberta-base', 
                                    args = task1_model_args, 
                                    num_labels=2, 
                                    use_cuda=cuda_available)
  # train model
  task1_model.train_model(training_set1[['text', 'label']])
  
  # run predictions on test data
  preds_task1, _ = task1_model.predict(tedf1.text.tolist())
  # A bare expression is not displayed inside a function, so print the
  # distribution explicitly.
  print(f'Predicted label distribution: {Counter(preds_task1)}')
  labels2file([[k] for k in preds_task1], 'task1.txt')

  return task1_model

In [None]:
# Task 1 Evaluation
from sklearn.metrics import classification_report, precision_score

def print_base_evaluation(model, dataset, analysis=False):
  """Print a classification report for `model` on `dataset`, optionally with an error analysis.

  Args:
    model: object whose `predict(list_of_texts)` returns `(predictions, raw_outputs)`.
    dataset: DataFrame with `text` and `label` columns.
    analysis: when True, the misclassified rows of `dataset` are joined with the
      module-level `df` (on their shared columns) and their counts are plotted
      per keyword and per country.
  """
  preds, _ = model.predict(dataset.text.tolist())
  preds = np.asarray(preds)
  gold = np.array(dataset['label'].tolist())

  # Note the official evaluator only considers label 1 score
  print(classification_report(gold, preds))

  if analysis:
    # Get texts with wrongly predicted labels
    print('Data with wrongly predicted labels')
    # Positions refer to rows of the evaluated `dataset`, so index that frame
    # (not a fixed global one) to stay correct for any dataset passed in.
    w_indices = np.flatnonzero(preds != gold)
    w_df = dataset.iloc[w_indices].merge(df)

    plot_stacked_counts(w_df, 'keyword')
    plot_stacked_counts(w_df, 'country')

# Don't run baseline to save gpu memory. TOGGLE TO OVERRIDE
run_baseline = False

if run_baseline:
  task1_model = train_baseline()
  print_base_evaluation(task1_model, tedf1, analysis=True)

# HuggingFace Pipeline

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the shared "NLP CW" Drive folder so that
# relative paths used later (e.g. saved models) resolve inside it.
# NOTE(review): assumes the folder (or a shortcut to it) exists at this path in
# the mounted Drive; adjust the path if the %cd below fails.
%cd '/content/drive/MyDrive/NLP CW'

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1Rca6I-sv-ayMNUzCRMui3bEnvTl9W2d9/NLP CW


In [None]:
# Dataset preparation
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset

# Checkpoint name; reused for the tokenizer, the model and the save directory.
base_model = "microsoft/deberta-v3-base"
# Sequence length that tokenize_function pads and truncates every example to.
max_length = 512

# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Every example is padded and truncated to the module-level `max_length`.

    NOTE(review): padding everything to `max_length` makes all sequences the
    same (long) size; consider whether dynamic per-batch padding would do.
    """
    encoding_options = {
        "padding": "max_length",
        "max_length": max_length,
        "truncation": True,
    }
    return tokenizer(examples["text"], **encoding_options)


# Training split: keep only the text/label columns, then tokenize in batches.
train_dataset = Dataset.from_pandas(training_set1[['text', 'label']])
tokenized_dataset = train_dataset.map(tokenize_function, batched=True)

# Evaluation split, built the same way in two explicit steps.
eval_dataset = Dataset.from_pandas(tedf1[['text', 'label']])
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# see the effect of tokenizer: show the first tokenized example of each split
for tokenized_split in (tokenized_dataset, eval_dataset):
    print(tokenized_split[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

{'text': 'The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .', 'label': 1, '__index_level_0__': 0, 'input_ids': [1, 279, 3630, 1053, 299, 3502, 3732, 261, 528, 572, 292, 1970, 1549, 411, 1253, 264, 1273, 265, 262, 1668, 6251, 457, 8547, 263, 9723, 366, 293, 2765, 3146, 263, 11872, 328, 357, 306, 338, 917, 493, 1131, 323, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Model fine-tuning
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric
import os

# Disable the Weights & Biases integration (per the original note, training
# breaks if this is not set).
os.environ["WANDB_DISABLED"] = "true"

# HuggingFace sequence classifier with a 2-label head on top of the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

# evaluation strategy: one metric object each for F1, precision and recall
metric_f, metric_p, metric_r = (
    load_metric(metric_name) for metric_name in ("f1", "precision", "recall")
)
def compute_metrics(eval_pred):
    """Compute precision, recall and F1 from a `(logits, labels)` pair.

    Predictions are the argmax of the logits over the last axis. Each score is
    computed by the corresponding module-level metric object (`metric_p`,
    `metric_r`, `metric_f`) and returned under its own name.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    scorers = (
        ("precision", metric_p),
        ("recall", metric_r),
        ("f1", metric_f),
    )
    return {
        score_name: scorer.compute(predictions=predicted, references=labels)[score_name]
        for score_name, scorer in scorers
    }

# hyperparameters
learning_rate = 2e-5
epochs = 5
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir="results",
    # optimisation settings
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=epochs,
    # small batches for both training and evaluation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluate after every epoch; no checkpointing (takes up way too much space)
    evaluation_strategy="epoch",
    save_strategy="no",
)

# Wire the model, the data splits and the metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then store the resulting model under ./models/<checkpoint name>/.
trainer.train()
trainer.save_model(f'./models/{base_model}/')

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5691,0.302072,0.348837,0.753769,0.476948
2,0.4892,0.663835,0.320896,0.864322,0.468027
3,0.3887,0.543315,0.385366,0.79397,0.518883
4,0.2973,0.518664,0.430769,0.703518,0.534351
5,0.182,0.630348,0.416201,0.748744,0.535009


The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 4
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 4
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 4
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2094
  Batch size = 4
The following columns in the evaluation set  don't have a corresponding argument in `Deb

In [None]:
# def print_evaluation(model, dataset, analysis=False):
#   texts = dataset['text'].tolist()
#   tokens = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt")
#   tokens = tokens.to('cuda:0')
#   logits = model(**tokens).logits
#   results = torch.softmax(logits, dim=1).tolist()[0]
#   print(results.shape)
#   gold = np.array(dataset['label'].tolist())
#   print(gold.shape)

#   # Note the official evaluator only considers label 1 score
#   print(classification_report(gold, preds))

#   if analysis:
#     # Get texts with wrongly predicted labels
#     print('Data with wrongly predicted labels')
#     w_indices = (preds != gold).nonzero()
#     w_df = tedf1.iloc[w_indices].merge(df)

#     plot_stacked_counts(w_df, 'keyword')
#     plot_stacked_counts(w_df, 'country')

# print_evaluation(model, tedf1, analysis=True)

# trainer.evaluate()