In [1]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import PreTrainedModel
from transformers.pipelines.pt_utils import KeyDataset
import torch

from tqdm.auto import tqdm

from datasets import Dataset, DatasetDict, load_dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import pandas as pd
import numpy as np
import logging
from glob import glob
from os import path
import functools

from IPython.display import HTML, display

import spacy
from spacy import displacy

In [2]:
# Hub identifier of the base DistilBERT checkpoint; the tokenizer is loaded from it.
model_checkpoint = "distilbert-base-uncased"

# Class index -> label name. Handed to the model as `id2label` so predictions
# are reported with readable names instead of bare indices.
# NOTE(review): the index order presumably has to match the label encoding used
# during fine-tuning - confirm against the training notebook.
category_codes = {
    0: 'Claim',
    1: 'Concluding Statement',
    2: 'Counterclaim',
    3: 'Evidence',
    4: 'Lead',
    5: 'Position',
    6: 'Rebuttal',
}
# Tokenizer for the base checkpoint named by `model_checkpoint`;
# `use_fast=True` asks for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [3]:
# Local directory holding the fine-tuned sequence-classification checkpoint.
model_path = r"models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626"

# Override both label mappings on the loaded config. Passing only `id2label`
# would leave `label2id` at whatever the checkpoint stored, so the two mappings
# could disagree (e.g. if the model is later saved or labels are looked up by name).
loaded_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    id2label=category_codes,
    label2id={label: index for index, label in category_codes.items()},
)
loaded_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [4]:
# Bundle the fine-tuned classifier with its tokenizer so raw strings can be
# classified in a single call.
pipe = pipeline(
    task="text-classification",
    model=loaded_model,
    tokenizer=tokenizer,
)
pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x1f3b8de5c70>

In [5]:
# Read the article with the generic "text" loader and take the single split
# declared in `data_files`; the result exposes one `text` column.
articles_raw = load_dataset(
    "text",
    data_files={"valid": [r"datagen/nytimes/metaverse-politics-disinformation-society.txt"]},
)["valid"]

# Drop rows whose text is the empty string. `Dataset.filter` is documented to
# take a bool-returning function, so the truthiness test is made explicit
# rather than returning the raw string.
articles_dataset = articles_raw.filter(lambda example: bool(example["text"]))

articles_dataset

Using custom data configuration default-d6e247263eac0dbb


Downloading and preparing dataset text/default to C:\Users\Prannaya\.cache\huggingface\datasets\text\default-d6e247263eac0dbb\0.0.0\08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to C:\Users\Prannaya\.cache\huggingface\datasets\text\default-d6e247263eac0dbb\0.0.0\08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['text'],
    num_rows: 21
})