In [1]:
import torch
from torch.utils.data import Dataset

import pandas as pd
import numpy as np

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MedsDataset(Dataset):
    """Medications mention dataset"""

    def __init__(self, data_path, max_len):
        """
        Args:
            data_path (string): Path to the file containg texts with labels.
        """
        self.df = pd.read_csv(data_path, header=None)
        self.MAX_LEN = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """
            Output:
            text: 
            attention_mask: 
            label: 
        """
        label = self.df.iloc[idx, 0]
        text = self.df.iloc[idx, 1]
        tokens = tokenizer.encode(text, add_special_tokens=True)
        padded = np.array(tokens + [0]*(self.MAX_LEN-len(tokens)))
        text = torch.tensor(padded)
        attention_mask = np.where(padded != 0, 1, 0)
        attention_mask = torch.tensor(attention_mask)
        return (label, text, attention_mask)

In [2]:
train = MedsDataset('../data/processed/train.csv', 128)
valid = MedsDataset('../data/processed/valid.csv', 128)

In [14]:
train[1016]

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [16]:
df_train = pd.read_csv('../data/processed/train.csv', header=None)

In [17]:
df_train.shape

(55419, 2)

In [18]:
text = df_train.iloc[1016, 1]

In [19]:
text

nan

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokens = tokenizer.encode(text, add_special_tokens=True)

In [30]:
tokens

[101,
 2122,
 2047,
 3424,
 1011,
 10089,
 19960,
 2015,
 2191,
 2033,
 2061,
 17056,
 2025,
 2469,
 2065,
 2009,
 2097,
 2147,
 2041,
 2005,
 2033,
 1012,
 1012,
 2021,
 1045,
 1005,
 1049,
 2061,
 5458,
 1997,
 2467,
 11991,
 19960,
 2015,
 102]

In [27]:
label = df_train.iloc[0, 0]

In [28]:
label

1