In [1]:
import pandas as pd

import json

In [2]:
def load_data(path, type):

  df = []
  with open(path) as f:
    for i in f:
      i = json.loads(i)
      tweet = i['postText']
      article_title = i['targetTitle']
      article = ' '.join(i['targetParagraphs'])
      label = i['tags']

      tweet = tweet[0]
      label = label[0]

      final_label = 0

      if type == "1vs1" and label != 'multi':
        final_label = (label == 'phrase' or label == 'phrases') 
      elif type == "012":
        if label == 'passage':
          final_label = 1
        elif label == 'mutli':
          final_label = 2
      else:
        continue

      df += [
          {
              'text': tweet + " - " + article_title + article,
              'labels': final_label
          }
      ]

    return pd.DataFrame(df)

In [5]:
train_dataset_path = "/content/train.jsonl"
validation_dataset_path = "/content/validation.jsonl"

class_type = "1vs1"

train_dataset = load_data(train_dataset_path, class_type)
train_dataset

Unnamed: 0,text,labels
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",False
1,NASA sets date for full recovery of ozone hole...,True
2,This is what makes employees happy -- and it's...,True
3,The perfect way to cook rice so that it's perf...,True
4,What happens if your new AirPods get lost or s...,False
...,...,...
2636,If You See A Purple Butterfly Sticker At The H...,False
2637,Has Facebook's video explosion completely shak...,False
2638,Cop Is Eating At A Chili's When Teen Hands Him...,False
2639,You need to see this Twitter account that pred...,True


In [6]:
validation_dataset = load_data(validation_dataset_path, class_type)
validation_dataset

Unnamed: 0,text,labels
0,Five Nights at Freddy’s Sequel Delayed for Wei...,False
1,Here’s how much you should be tipping your hai...,True
2,A man swallowed a microSD card and you won't b...,False
3,This popular soda could cure your hangovers sc...,True
4,The anytime snack you won't feel guilty about ...,True
...,...,...
652,"Dog Dies One Hour After Hiking With His Owner,...",False
653,This is what happens when you leave a hotel cl...,False
654,This Texas GOP elector announces that he won't...,True
655,WikiLeaks' Julian Assange Reported Dead - Wiki...,False


In [13]:
!pip install simpletransformers
from simpletransformers.classification import ClassificationModel
import sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import torch
torch.cuda.empty_cache()

final_model = None

config = {
    'overwrite_output_dir': True,
    'num_train_epochs': 5,
    'fp16': False,
    'train_batch_size': 8,
    'gradient_accumulation_steps': 4,
    'evaluate_during_training': True,
    'max_seq_length': 512,
    'learning_rate': 4e-05,
    'early_stopping_consider_epochs': True,
    'early_stopping_delta': 0.01,
    'early_stopping_metric': 'acc',
    'early_stopping_metric_minimize': False,
    'early_stopping_patience': 3,
    'evaluate_during_training_steps': 331,
    'output_dir': 'outputs/',
    'warmup_ratio': 0.06,
    'save_steps': 2000,
    'manual_seed': 12345,
    }

model = ClassificationModel("roberta", "roberta-base", args = config)
model.train_model(train_dataset, eval_df=train_dataset, acc=sklearn.metrics.accuracy_score)
result = model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

  0%|          | 0/657 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/83 [00:00<?, ?it/s]

In [None]:
result

({'mcc': 0.5708036060628551,
  'tp': 262,
  'tn': 254,
  'fp': 68,
  'fn': 73,
  'auroc': 0.8435987763048113,
  'auprc': 0.8529141567588812,
  'acc': 0.7853881278538812,
  'eval_loss': 0.6808889564631664},
 array([[ 1.60195839, -1.466519  ],
        [-0.36549085,  0.60118169],
        [ 1.80777204, -1.87308753],
        ...,
        [-2.48731208,  3.09903932],
        [ 0.59462023, -0.45115712],
        [-2.49748158,  3.06732607]]),
 [{'guid': 10, 'text_a': 'What are mosquito-control workers spraying in Miami? - What are mosquito-control workers spraying in Miami?(CNN) After the first locally transmitted cases of the Zika virus were identified in a small area north of downtown Miami, officials began to spray an insecticide over a 10-square-mile portion of Miami-Dade County. But what was in the insecticide, and how does it work? The main ingredient of the insecticide, Dibrom, is the chemical naled. It works by killing mosquitoes on contact. Sprayers produce very fine droplets that are s