In [11]:
import argparse
import numpy as np
import random
from model import build_model_tokenizer
from dataset import FeedbackPrizeDataset
import pandas as pd
from torch.utils.data import DataLoader

from data_proprocess import preprocess
from config import *


# parser = argparse.ArgumentParser()


# parser.add_argument('--data_path', type=str, default="")
# parser.add_argument("--is_training", type=bool, default=True)
# parser.add_argument("--model_name", type=str, default="roberta-base")
# parser.add_argument("--epochs", type=int, default=2)
# parser.add_argument("--n_folds", type=int, default=5)
# parser.add_argument("--train_batch_size", type=int, default=4)
# parser.add_argument("--valid_batch_size", type=int, default=4)
# 
# args = parser.parse_args()
class Args:
    def __init__(self):
        self.data_path = ""
        self.is_training = True
        self.model_name = "roberta-base"
        self.epochs = 2
        self.n_folds = 5
        self.train_batch_size = 4
        self.valid_batch_size = 4


args = Args()

config = Config(args)


In [2]:
def seed_everything(seed=64):
    # os.environ['PYTHONSEED'] = str(seed)
    np.random.seed(seed % (2 ** 32 - 1))
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything()

In [3]:

IGNORE_INDEX = -100
all_train_df = pd.read_csv(config.data_dir + "/corrected_train.csv")
all_train_texts = preprocess(config)

100%|██████████| 15595/15595 [00:02<00:00, 6029.29it/s]


Completed tokenizing texts.


100%|██████████| 15595/15595 [02:00<00:00, 129.80it/s]

Completed mapping discourse to each token.





In [6]:
oof = pd.DataFrame()

i_fold = 1
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(config.model_name, add_prefix_space=True)


In [47]:
config.max_length

512

In [7]:

df_train = all_train_texts[all_train_texts['fold'] != i_fold].reset_index(drop=True)
df_train

Unnamed: 0,id,text,text_split,entities,fold
0,3321A3E87AD3,I do agree that some students would benefit fr...,"[I, do, agree, that, some, students, would, be...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",0.0
1,DFEAEC512BAB,Should students design a summer project for sc...,"[Should, students, design, a, summer, project,...","[O, O, O, O, O, O, O, O, B-Position, I-Positio...",2.0
2,EB6C2AF20BFE,People sometimes have a different opinion than...,"[People, sometimes, have, a, different, opinio...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",2.0
3,A91A08E523D5,"Dear senator,\n\nAs you know the Electoral Col...","[Dear, senator,, As, you, know, the, Electoral...","[O, O, B-Lead, I-Lead, I-Lead, I-Lead, I-Lead,...",0.0
4,616F8E0EFABF,"""Can you imagine a time in the future when no ...","[""Can, you, imagine, a, time, in, the, future,...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",0.0
...,...,...,...,...,...
12471,ACCD71550365,Do you think it is a good idea to succeed? I b...,"[Do, you, think, it, is, a, good, idea, to, su...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",3.0
12472,418A275E8556,"Dear Principle,\n\nI understand that some stud...","[Dear, Principle,, I, understand, that, some, ...","[O, O, B-Lead, I-Lead, I-Lead, I-Lead, I-Lead,...",3.0
12473,4453444AF383,There has been a strong arguement going on wea...,"[There, has, been, a, strong, arguement, going...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",2.0
12474,EF0D75BF48DA,I favor in to changing election by popular vot...,"[I, favor, in, to, changing, election, by, pop...","[B-Position, I-Position, I-Position, I-Positio...",4.0


In [64]:
from torch.utils.data import Dataset
from config import *


class FeedbackPrizeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, has_labels, config):
        super(FeedbackPrizeDataset, self).__init__()
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_labels = has_labels
        self.config = config

    def __getitem__(self, index):
        text = self.data.text[index]
        encoded_text = self.tokenizer(text.split(),
                                      is_split_into_words=True,
                                      truncation=True,
                                      max_length=self.max_len,
                                      padding="max_length",
                                      return_tensors="pt",
                                      )
        # encoded_text = self.tokenizer(text,
        #                               truncation=True,
        #                               max_length=self.max_len,
        #                               padding=True,
        #                               return_tensors="pt",
        #                               )
        print(encoded_text['input_ids'].shape)
        word_ids = encoded_text.word_ids()

        if self.has_labels:
            word_labels = self.data.entities[index]
            prev_word_idx = None
            labels_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    labels_ids.append(IGNORE_INDEX)
                elif word_idx != prev_word_idx:
                    labels_ids.append(LABELS_TO_IDS[word_labels[word_idx]])
                else:
                    if self.config.label_subtokens:
                        labels_ids.append(LABELS_TO_IDS[word_labels[word_idx]])
                    else:
                        labels_ids.append(IGNORE_INDEX)

                prev_word_idx = word_idx
            encoded_text['label'] = labels_ids

        item = {k: torch.as_tensor(v) for k, v in encoded_text.items()}
        if index == 6: print(item)
        word_ids2 = [w if w is not None else NON_LABEL for w in word_ids]
        item['word_ids'] = torch.as_tensor(word_ids2)
        return item

    def __len__(self):
        return self.len


ds_train = FeedbackPrizeDataset(df_train, tokenizer, config.max_length, True, config)
dl_train = DataLoader(ds_train, batch_size=config.train_batch_size, num_workers=0, shuffle=False)


In [65]:
encoded_text = tokenizer(df_train.text[6].split(), is_split_into_words=True,
                         truncation=True,
                         max_length=512,
                         padding="max_length",
                         return_tensors="pt", )

item = {k: torch.as_tensor(v) for k, v in encoded_text.items()}

item

{'input_ids': tensor([[    0,    89,    32,   171, 12340,     7,    45,   519,    50, 11361,
            512,  9453,    13,  4327,     6,    96,    10,  1139,    11,   821,
           7043,   219,   373,     6,    22, 25546,  1792,   282,   113,    24,
             16, 27686,    13,    10,  6239,     7,    33,    10,   512,   114,
             51,    32,     7,   697,    11,    42,  1139,     4,    85,   189,
           2369,  5373,    53,     5,  2286,  1760, 28108,   101,    42,  2178,
              4,    96,     5,  1808,  3569,  2242,  1073,  9772,   130,    24,
            579,  1626,    14,     6,    22,  3083,   135,     9, 13205,  1792,
           1253,  1232,   109,    45,   308,  1677,     8,  4981,   135,  1088,
             89,  1677,     7,   517,    11,     5,     5,  1139,   845,   509,
           6239,    26,    22, 14746,   939,    56,    10,   512,   939,    21,
            460,    98, 13554,  4356,    98,   203, 16265,    42,   169,   845,
            407,  2563,    

In [66]:
for idx, item in enumerate(ds_train):
    # print(idx)
    # if idx == 6:
        # print(item['input_ids'])

    # for key, val in item.items():
    #     print(key,val)
    # print(key, val.shape)
    if idx == 6: break

torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
{'input_ids': tensor([[    0,    89,    32,   171, 12340,     7,    45,   519,    50, 11361,
           512,  9453,    13,  4327,     6,    96,    10,  1139,    11,   821,
          7043,   219,   373,     6,    22, 25546,  1792,   282,   113,    24,
            16, 27686,    13,    10,  6239,     7,    33,    10,   512,   114,
            51,    32,     7,   697,    11,    42,  1139,     4,    85,   189,
          2369,  5373,    53,     5,  2286,  1760, 28108,   101,    42,  2178,
             4,    96,     5,  1808,  3569,  2242,  1073,  9772,   130,    24,
           579,  1626,    14,     6,    22,  3083,   135,     9, 13205,  1792,
          1253,  1232,   109,    45,   308,  1677,     8,  4981,   135,  1088,
            89,  1677,     7,   517,    11,     5,     5,  1139,   845,   509,
          6239,    26,    22, 14746,   939,    56

In [None]:
for val in dl_train:
    print(val)


torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
{'input_ids': tensor([[[    0,    38,   109,  ...,    55,     4,     2]],

        [[    0,  7698,   521,  ...,   567,    31,     2]],

        [[    0,  1806,  2128,  ...,     8,   283,     2]],

        [[    0, 12191,  6704,  ..., 12481,     6,     2]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]]]), 'label': tensor([[-100,    1,    2,  ...,   14,   14, -100],
        [-100,    0,    0,  ...,   10,   10, -100],
        [-100,    1,    2,  ...,   12,   12, -100],
        [-100,    0,    0,  ...,   14,   14, -100]]), 'word_ids': tensor([[ -1,   0,   1,  ..., 451, 451,  -1],
        [ -1,   0,   1,  ..., 474, 475,  -1],
        [ -1,   0,   1,  ..., 465, 466,  -1],
        [ -1,   0,   1,  ..., 432, 432,  -1]])}
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
{'input_id

In [ ]:
df_val = all_train_texts[all_train_texts['fold'] == i_fold].reset_index(drop=True)
val_idlist = df_val['id'].unique().tolist()
df_val_eval = all_train_df.query('id==@val_idlist').reset_index(drop=True)
ds_val = FeedbackPrizeDataset(df_val, tokenizer, config.max_length, True, config)
dl_val = DataLoader(ds_val, batch_size=config.valid_batch_size, shuffle=False, num_workers=2, pin_memory=True)

In [ ]:




oof = pd.DataFrame()
for i_fold in range(config.n_fold):
    print(f'=== fold{i_fold} training ===')

    df_train = all_train_texts[all_train_texts['fold'] != i_fold].reset_index(drop=True)
    ds_train = FeedbackPrizeDataset(df_train, tokenizer, config.max_length, True, config)
    df_val = all_train_texts[all_train_texts['fold'] == i_fold].reset_index(drop=True)
    val_idlist = df_val['id'].unique().tolist()
    df_val_eval = all_train_df.query('id==@val_idlist').reset_index(drop=True)
    ds_val = FeedbackPrizeDataset(df_val, tokenizer, config.max_length, True, config)
    dl_train = DataLoader(ds_train, batch_size=config.train_batch_size, shuffle=True, num_workers=2, pin_memory=True)
    dl_val = DataLoader(ds_val, batch_size=config.valid_batch_size, shuffle=False, num_workers=2, pin_memory=True)

    for idx, batch_data in enumerate(dl_val):
        print(idx)
        print(batch_data)
        break


In [78]:
text = df_train.text[6]
text

'there are many advantages to not having or limiting car usage for instance, In a town in germany called, "Vaubn" it is forbidden for a citizen to have a car if they are to live in this town. It may sound crazy but the citizens acturally like this rule. In the artical paragrah three it sates that, "70 percent of vaubans families do not own cars and 57 percent sold there cars to move in the the town". One citizen said "when i had a car i was always so tense\xa0 im so much happier this way". So clearly the the reason why the citizens enjoy not having a car was maybe because, they didnt have to worry about all the expences like gas and car insurance to take care of a car.\n\nAnother reason why its good to limit the use of cars is because of the nasty Co2\xa0 gas that come out of the of the muffler and polluts the air. Car pollution became so bad that the city of paris put a partial ban on driving to clear the the air of the city. By doing this the air pollution "was down by 60" witch the 

In [69]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
outputs = tokenizer(text.split(),
                    is_split_into_words=True,
                    truncation=True,
                    max_length=512,
                    padding="max_length",
                    return_tensors="pt", )

In [70]:
outputs.input_ids.shape

torch.Size([1, 512])

In [79]:
outputs = tokenizer(text.split(),
                    is_split_into_words=True,
                    truncation=True,
                    max_length=512,
                    padding="max_length")
outputs.input_ids.shape


torch.Size([1, 512])