**import the necessary library**

In [1]:
import os

import random
import numpy as np

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import (
    AdamW,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import gc

**using hugging face roberta-large model , one of my favourite model that made state of the art result**

In [2]:
model_name = "roberta-large"

tokenizer = AutoTokenizer.from_pretrained(model_name, ignore_mismatched_sizes=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

**preparing the necessary params with suitable values according to the dataset**

In [3]:
lr = 2e-5
epochs =  4
batch_size = 2
max_seq_len = 256

**prepare the env**

In [4]:
import os

def set_seed(seed=106052):
    """Set seed for reproducibility.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

**process the datatset : encode it and and made attention mask**

In [5]:
class CEFRDataset(Dataset):
    """Classification dataset, built on top of pytorch dataset object
    """
    
    def __init__(self, texts, labels):
        
        self.encoder = LabelEncoder()
        print(self.encoder.__dict__)
        self.texts = texts
#         self.labels = self.encoder.fit_transform(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        labels = self.labels[index]
        encoded_text = tokenizer(
            text,
            padding="max_length",
            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        encoded_text["input_ids"] = encoded_text["input_ids"].squeeze()
        encoded_text["attention_mask"] = encoded_text["attention_mask"].squeeze()
        labels = torch.tensor(labels)

        return {
            "input_ids": encoded_text["input_ids"],
            "attention_mask": encoded_text["attention_mask"],
            "labels": labels,
        }

    def get_labels(self):
        return self.labels

**fine tuning the model so it could give us better result**

In [6]:
def train(train_set, valid_set, epochs=2, warmup_size=0.1, lr=1e-3, batch_size=2):
    model = get_model(model_name)
    optim = AdamW(model.parameters(), lr=lr)
    scheduler = get_scheduler(
        optim, warmup_size, round(len(train_set) / batch_size * epochs)
    )
    training_args = get_training_args(epochs, batch_size)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=[optim, scheduler],
        compute_metrics=compute_accuracy
    )
    trainer.train()
    trainer.save_model()
    return trainer

In [7]:
def get_model(pretrained_checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_checkpoint, num_labels=207, ignore_mismatched_sizes=True
    )
    return model.to(device)

In [8]:
os.environ["WANDB_DISABLED"] = "true"


def get_scheduler(optimizer, warmup_size, total_steps):
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    return scheduler


def get_training_args(epochs, batch_size):
    return TrainingArguments(
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        fp16=False,
        evaluation_strategy="epoch",
        eval_accumulation_steps=1,
        report_to=None,
#         save_total_limit=1,
#         load_best_model_at_end=True,
        save_strategy = 'epoch',
        output_dir='/',
    )


def compute_accuracy(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}



In [9]:
train_set_df = pd.read_csv("/kaggle/input/murderdata/Mysterious_Affair_at_Styles_Train_Data.csv")
train_set_df.columns=["labels","text"]

In [50]:
train_set_df.text[10]

'whiteness of the\nhand she held out to claim her tea. With dark eyes and eyelashes she\nwould have been a beauty.\n\nShe flung herself down on the ground beside John, and as I handed her a\nplate of sandwiches she smiled up at me.\n\n"Sit down here on the grass, do. It\'s ever so much nicer."\n\nI dropped down obediently.\n\n"You work at Tadminster, don\'t you, Miss Murdoch?"\n\nShe nodded.\n\n"For my sins."\n\n"Do they bully you, then?" I asked, smiling.\n\n"I should like to see them!" cried Cynthia with dignity.\n\n"I have got a cousin who is nursing," I remarked. "And she is terrified\nof \'Sisters\'."\n\n"I don\'t wonder. Sisters _are_, you know, Mr. Hastings. They simp-ly\n_are_! You\'ve no idea! But I\'m not a nurse, thank heaven, I work in the\ndispensary."\n\n"How many people do you poison?" I asked, smiling.\n\nCynthia smiled too.\n\n"Oh, hundreds!" she said.\n\n"Cynthia," called Mrs. Inglethorp, "do you think you could write a few\nnotes for me?"\n\n"Certainly, Aunt Emily."\

In [49]:
test.Text[0]

'I sit down alone at the appointed table and take\nup my pen to give all whom it may con- cern an\nexact account of what may happen. Call me\nnervous, call me fey, if you will; at least this\nlittle pen, this mottled black and silver\nAquarius, with its nib specially tempered to my\norder in Amsterdam, is greedy. It has not had\nmuch work since it flew so nimbly for the dead\nold man. As I watch the sea, Casy Ferris passes\nwith down-dropped eyes. Of course, to-day is\nthe day. Her father reminds me of a valetudinarian walrus. But she has, I suppose, to have\nsomebody. St. Lazarus-in-the-Chine is full,\nno doubt, already. I think she is rash ; but it\nis none of my business. Where about the graves\nof the martyrs the whaups are crying, my heart\nremembers how. Strange that he comes into\nmy head so much to-day. I hope it�s over some\nflotsam fish that the birds are making whaupee. But all the nice gulls love a sailor. Ugh.'

**split data**

In [16]:
def split_valid(df):
    
    val = pd.DataFrame()
    val["text"] = ""
    val["labels"] = -1
    
    for i in df.labels[:206]:
        val = df[206:]
        
    return df[:206].reset_index(drop=True) , val.reset_index(drop=True)


In [17]:
train_set_df, valid_set_df = split_valid(train_set_df)

In [18]:
train_set_df

Unnamed: 0,labels,text
0,1,CHAPTER I.\nI GO TO STYLES\n\n\nThe intense in...
1,2,"looked his forty-five years. As a boy, though,..."
2,3,at the\ntime of their father's remarriage that...
3,4,"black beard,\nand wears patent leather boots i..."
4,5,"""I'm afraid you'll find it very quiet down her..."
...,...,...
201,202,would do so afterwards. But for your lucky rem...
202,203,hers! It concealed a very opposite emotion.\nT...
203,204,them to us. There was really more\nevidence ag...
204,205,something about the matter. He was nearly\ndri...


**encoding the label**

In [19]:
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# train_set_df.label = le.fit_transform(train_set_df.label)
# valid_set_df.label = le.transform(valid_set_df.label)

In [20]:
valid_set_df

Unnamed: 0,labels,text
0,207,"""Yes?""\n\nCynthia fidgeted with a little tasse..."


In [30]:
from tqdm import tqdm 

def predict(model, text):
    
    preds = []
    
    for i in tqdm(range(len(text))):
        tokenized = tokenizer(text[i:i+1], return_tensors="pt", truncation=True, max_length=512).to("cuda")
        pred = model(**tokenized)
        preds.append(pred.logits.argmax(-1).item())

    return preds

**collect the cache**

In [22]:
import gc
torch.cuda.empty_cache()
gc.collect()

344

**train the model**

In [23]:
train_set = CEFRDataset(train_set_df["text"], train_set_df["labels"])
valid_set = CEFRDataset(valid_set_df["text"], valid_set_df["labels"])


trainer_second = train(train_set, valid_set, epochs=4, warmup_size=0.1, lr=lr, batch_size=1)
model = trainer_second.model

{}
{}


Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

Epoch,Training Loss,Validation Loss,Accuracy
1,5.6696,5.241325,0.0
2,5.5153,5.159233,0.0
3,5.456,5.129877,0.0
4,5.3918,5.137756,0.0


***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to /checkpoint-206
Configuration saved in /checkpoint-206/config.json
Model weights saved in /checkpoint-206/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to /checkpoint-412
Configuration saved in /checkpoint-412/config.json
Model weights saved in /checkpoint-412/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to /checkpoint-618
Configuration saved in /checkpoint-618/config.json
Model weights saved in /checkpoint-618/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to /checkpoint-824
Configuration saved in /checkpoint-824/config.json
Model weights saved in /checkpoint-824/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /
Configuration

In [24]:
test = pd.read_csv("/kaggle/input/murderdata/Cain_s_Jawbone_Test_data.csv")
sub=pd.read_csv("/kaggle/input/murderdata/SampleSubmission.csv")


In [31]:
import gc
torch.cuda.empty_cache()
gc.collect()

666

In [43]:
from tqdm import tqdm 

def predict(model, text):
    
    preds = []
    
    for i in tqdm(range(len(text))):
        tokenized = tokenizer(text[i:i+1], return_tensors="pt", truncation=True, max_length=512).to("cuda")
        pred = model(**tokenized)
        print(pred.logits.argmax(-1).item())
        preds.append(pred.logits.argmax(-1).item())

    return preds

In [44]:
sub["CorrectPage"] = (predict(model, test.Text.tolist()))

  8%|▊         | 6/75 [00:00<00:02, 26.05it/s]

168
168
168
168
168
168


 16%|█▌        | 12/75 [00:00<00:02, 25.63it/s]

168
168
168
168
168
168


 24%|██▍       | 18/75 [00:00<00:02, 26.40it/s]

168
168
168
168
168
168


 32%|███▏      | 24/75 [00:00<00:01, 25.57it/s]

168
168
168
168
168
168


 40%|████      | 30/75 [00:01<00:01, 25.61it/s]

168
168
168
168
168
168


 48%|████▊     | 36/75 [00:01<00:01, 26.00it/s]

168
168
168
168
168
168


 56%|█████▌    | 42/75 [00:01<00:01, 26.05it/s]

168
168
168
168
168
168


 64%|██████▍   | 48/75 [00:01<00:00, 27.30it/s]

168
168
168
168
168
168


 68%|██████▊   | 51/75 [00:01<00:00, 26.22it/s]

168
168
168
168
168
168


 80%|████████  | 60/75 [00:02<00:00, 26.57it/s]

168
168
168
168
168
168


 88%|████████▊ | 66/75 [00:02<00:00, 26.34it/s]

168
168
168
168
168
168


 96%|█████████▌| 72/75 [00:02<00:00, 27.43it/s]

168
168
168
168
168
168


100%|██████████| 75/75 [00:02<00:00, 26.22it/s]

168
168
168





In [57]:
import random
for i in range(0,207):
    sub["CorrectPage"].loc[i]=random.randint(1, 207)


In [58]:
sub

Unnamed: 0,OriginalPage,CorrectPage
0,1,193
1,2,19
2,3,3
3,4,118
4,5,160
...,...,...
70,71,110
71,72,144
72,73,3
73,74,29


In [59]:
sub.CorrectPage.nunique()

62

In [62]:
sub.to_csv("sub1.csv", index=False)