Using kernel `conda_pytorch_latest_p36`

In [1]:
# !pip install transformers

In [2]:
import torch

In [3]:
from pathlib import Path
import os
import random

In [4]:
import pandas as pd
import numpy as np
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.metrics import classification_report

In [5]:
data = Path('data_prep/final_data/en/')

In [6]:
raw = pd.read_csv('data_prep/data/entries_raw.csv')

In [7]:
train = pd.read_csv(data / 'sentences_en_train.csv')
test = pd.read_csv(data / 'sentences_en_test.csv')

In [8]:
train

Unnamed: 0,doc_id,sentence_id,sentence_text,is_relevant,sector_ids
0,51787,38,Climate Change and Agriculture: Subsistence Fa...,0,[]
1,51787,44,"Bohorquez-Penuela, C., & Otero-Cortes, A (2020).",0,[]
2,51787,45,Blame it on the Rain: The Effects of Weather S...,0,[]
3,51787,49,Increasing frequency of extreme El Ni o events...,0,[]
4,51787,53,What do we learn from the weather?,0,[]
...,...,...,...,...,...
191932,34512,112,Emergency shelters in India had reportedly bee...,0,[]
191933,34512,116,In these situations humanitarian operations ma...,0,[]
191934,34512,120,Governments of EU member states and several Af...,0,[]
191935,34512,124,You can find an overview of all ACAPS resource...,0,[]


In [9]:
def process_for_sector(df, sector, train):
    relevant_train = df[df.is_relevant == 1]
    relevant_train.sector_ids = relevant_train.sector_ids.apply(eval)
    relevant_train = relevant_train[relevant_train.sector_ids.apply(len) > 0]
    
    positive_train = relevant_train[relevant_train.sector_ids.apply(lambda x: sector in x)]
    negative_train = relevant_train[relevant_train.sector_ids.apply(lambda x: sector not in x)]

    positive_train.sector_ids = 1
    negative_train.sector_ids = 0
    train_df = pd.concat([positive_train, negative_train])
    train_df = train_df.sample(frac=1).reset_index(drop=True)
    
    train_df['is_valid'] = False if train else True
        
    return train_df

In [10]:
train_df = process_for_sector(train, 4, True)
test_df = process_for_sector(test, 4, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [11]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [12]:
train_df

Unnamed: 0,doc_id,sentence_id,sentence_text,is_relevant,sector_ids,is_valid
0,43292,130,Global WASH Cluster critical times Context-spe...,1,0,False
1,45535,5,"In March, shortly after the country reported i...",1,0,False
2,40202,18,COVID-19 messages reached 283 479 persons betw...,1,1,False
3,47614,12,"Sources say, Bangladesh can now look forward t...",1,0,False
4,50728,217,While it is likely that there are numerous und...,1,1,False
...,...,...,...,...,...,...
22967,42360,763,Recommendations To the Government of Banglades...,1,0,False
22968,43681,121,This will give opportunity to the children to ...,1,0,False
22969,37852,2,"Difficult field access, the destruction of pro...",1,0,False
22970,41382,1,The ministry added in a statement that the tot...,1,1,False


In [13]:
train_encodings = tokenizer(list(train_df.sentence_text), truncation=True, padding=True)
train_labels = list(train_df.sector_ids)

In [14]:
test_encodings = tokenizer(list(test_df.sentence_text), truncation=True, padding=True)
test_labels = list(test_df.sector_ids)

In [15]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
test_dataset = Dataset(test_encodings, test_labels)

In [16]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=train_dataset            # evaluation dataset
)

In [17]:
len(train_dataset)

22972

In [18]:
torch.cuda.is_available()

True

In [None]:
trainer.train()

Step,Training Loss


In [None]:
outputs = trainer.predict(test_dataset)

In [None]:
preds = np.argmax(outputs.predictions, axis=1)
labels = outputs.label_ids

In [None]:
rep = classification_report(labels, preds)

In [24]:
print(rep)

              precision    recall  f1-score   support

           0       0.86      0.93      0.89      1646
           1       0.90      0.80      0.85      1260

    accuracy                           0.88      2906
   macro avg       0.88      0.87      0.87      2906
weighted avg       0.88      0.88      0.87      2906

