In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset,load_dataset,dataset_dict
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from pathlib import Path

In [3]:
path =Path("D:\workspace\kaggle\data\\feedback-prize-effectiveness")
train = path/"train.csv"
data = pd.read_csv(train)

In [4]:
data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [5]:
data.iloc[0].discourse_text

"Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. "

In [6]:
data.iloc[1].discourse_text

"On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I'll be talking about how I think that is is a natural landform "

In [7]:
data["label"] = data.discourse_effectiveness.astype("category").cat.codes

In [9]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
data["inputs"] = data.discourse_type+tokenizer.sep_token+data.discourse_text

In [11]:
data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,label,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,0,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,0,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,0,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,0,Counterclaim[SEP]People thought that the face ...


In [12]:
dataset = Dataset.from_pandas(data)

In [13]:
def tokenize_function(source):
    return tokenizer(source["inputs"],truncation=True)

In [14]:
tokenized_dataset = dataset.map(tokenize_function,batched=True,remove_columns = ["discourse_id",\
                                                                              "essay_id",\
                                                                             "discourse_text",\
                                                                              "discourse_type",\
                                                                             "discourse_effectiveness",\
                                                                             ])

  0%|          | 0/37 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
essay_id = data.essay_id.unique()
np.random.shuffle(essay_id)
split = int(len(essay_id)*0.9)

val_essay = essay_id[split:]
val_id = np.isin(data["essay_id"],val_essay)
train_id = ~val_id
idx = np.arange(data.shape[0])
train_idx = idx[train_id]
val_idx = idx[val_id]

In [16]:
train_idx

array([    0,     1,     2, ..., 36762, 36763, 36764])

In [17]:
val_idx

array([  130,   131,   132, ..., 36727, 36728, 36729])

In [18]:
train_Dataset = tokenized_dataset.select(train_idx)

In [19]:
val_Dataset = tokenized_dataset.select(val_idx)

In [20]:
train_Dataset

Dataset({
    features: ['label', 'inputs', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 33111
})

In [21]:
train_Dataset.save_to_disk("train")
val_Dataset.save_to_disk("val")

Flattening the indices:   0%|          | 0/34 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/4 [00:00<?, ?ba/s]

In [22]:
from datasets import load_from_disk
dd = load_from_disk("train")

In [23]:
dd

Dataset({
    features: ['label', 'inputs', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 33111
})