# Loading the dataset

In [None]:
# Download the dataset archive with the gdown CLI; the argument looks like a Google Drive file id.
# Presumably this is the OneStopEnglishCorpus.tar.gz archive that the next cell extracts — confirm.
!gdown 1XwdxalS7xdiXKVhmfudSjpCehae0rH0G

Mounted at /content/drive


In [None]:
# Unpack the gzipped tarball into the current working directory (-v lists every extracted file).
!tar -xvzf OneStopEnglishCorpus.tar.gz

In [None]:
cd "OneStopEnglishCorpus/Texts-SeparatedByReadingLevel"

/content/drive/Shareddrives/CIS-5300-Project/Finetuning-on-documents/OneStopEnglishCorpus/Texts-SeparatedByReadingLevel


In [None]:
! pip install -U accelerate
! pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m174.1/265.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformer

In [None]:
from pathlib import Path

def read_data_split(split_dir):
    """Read every document under the three reading-level folders of ``split_dir``.

    Args:
        split_dir: Directory (``str`` or ``Path``) that contains the ``Adv-Txt``,
            ``Int-Txt`` and ``Ele-Txt`` sub-folders.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. Labels are
        0 for ``Ele-Txt``, 1 for ``Int-Txt`` and 2 for ``Adv-Txt``.
    """
    # Same folder order as before (advanced, intermediate, elementary).
    label_by_dir = {"Adv-Txt": 2, "Int-Txt": 1, "Ele-Txt": 0}
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir, label in label_by_dir.items():
        # Sort so the document order does not depend on the filesystem's listing order.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Skip anything that is not a regular file (e.g. stray sub-folders).
            if not text_file.is_file():
                continue
            # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark, so documents
            # no longer start with a '\ufeff' character.
            # NOTE(review): some files appear to begin with a level header line such as
            # "Intermediate"; that would leak the label into the text — confirm and strip.
            texts.append(text_file.read_text(encoding="utf-8-sig"))
            labels.append(label)

    return texts, labels

# Load the whole corpus; './' is resolved against the working directory set by the cd cell above.
texts, labels = read_data_split('./')

In [None]:
from sklearn.model_selection import train_test_split
# 70% train / 15% validation / 15% test.
# A fixed seed makes the split reproducible across kernel restarts, and stratifying keeps the
# three reading levels in the same proportion in every split.
SPLIT_SEED = 42
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=.3, random_state=SPLIT_SEED, stratify=labels
)

# Halve the held-out 30% into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=.5, random_state=SPLIT_SEED, stratify=temp_labels
)

In [None]:
from transformers import DistilBertTokenizerFast
# Load the pretrained tokenizer published under the 'distilbert-base-uncased' checkpoint name.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Tokenize the three splits with identical settings (truncation and padding both enabled).
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
import torch

class ComplexityDataset(torch.utils.data.Dataset):
    """Pairs per-example encodings with their class labels.

    ``encodings`` is a mapping from field name to a sequence indexed by example;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example ``idx`` to a tensor, then attach the label
        # under the 'labels' key.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with the matching labels.
train_dataset, val_dataset, test_dataset = (
    ComplexityDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
import numpy as np
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(scores, labels)`` pair. ``scores`` has one row per example and one
            column per class; ``labels`` holds the true class index of each example.

    Returns:
        A dict with a single ``'accuracy'`` entry: the fraction of examples whose
        highest-scoring column equals the true label.
    """
    scores, gold = eval_pred
    # The predicted class is the column with the largest score in each row.
    predicted = np.argmax(scores, axis=1)
    return {'accuracy': np.mean(predicted == gold)}

# Training and Tuning DistilBERT

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments


# Fine-tuning configuration for the DistilBERT run.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4.0,
    learning_rate=5e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.0001,                 # strength of weight decay
    # NOTE(review): the recorded run ended at global_step=200, so a 500-step warmup never
    # completes — confirm this is intended.
    warmup_steps=500,                    # warmup steps for the learning-rate scheduler
    evaluation_strategy="epoch",         # evaluate once per epoch
    logging_strategy="epoch",            # log once per epoch
)

# Pretrained checkpoint with a 3-class classification head (one class per reading level).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,     # accuracy on the evaluation set
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0957,1.085951,0.341176
2,0.9654,0.715967,0.717647
3,0.526,0.453425,0.717647
4,0.2618,0.284789,0.894118


TrainOutput(global_step=200, training_loss=0.712213397026062, metrics={'train_runtime': 86.0182, 'train_samples_per_second': 18.415, 'train_steps_per_second': 2.325, 'total_flos': 209832101462016.0, 'train_loss': 0.712213397026062, 'epoch': 4.0})

In [None]:
# Predicted class per test document: arg-max over the scores returned by the trainer.
predictions = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
# Keep the test labels under their own name. Rebinding `labels` here would overwrite the
# corpus-wide label list returned by read_data_split and break a re-run of the split cell.
test_label_array = np.array(test_dataset.labels)
# Positions in the test split where the prediction disagrees with the true label.
idx = np.where(predictions != test_label_array)[0]

In [None]:
# Loss and accuracy of the trainer from the DistilBERT cell above on the held-out test split.
trainer.evaluate(test_dataset)

{'eval_loss': 0.3030627369880676,
 'eval_accuracy': 0.872093023255814,
 'eval_runtime': 1.5364,
 'eval_samples_per_second': 55.974,
 'eval_steps_per_second': 7.159,
 'epoch': 4.0}

In [None]:
# Show a single test document (index 26).
test_texts[26]

'\ufeffBarack Obama has urged young people to reject pessimism and interact with those who have different beliefs if they want to make changes in the world. \nOn the final day of his last visit to Britain as US president, Obama told 500 youth leaders at a town hall meeting in London: “I’m here to ask you to reject the notion there are forces we can’t control. As JFK said, our problems are manmade and can be solved by man.” \n“You’ve never had better tools to make a difference,” he told the A-level and UK –US exchange students at the Q&A session. “Reject pessimism, cynicism and know that progress is possible. Progress is not inevitable; it requires struggle, discipline and faith.” \nBut Obama acknowledged the challenges faced by young people: “Not to say your generation has had it easy, in a time of breathtaking change, from 9/11, 7/7 … and during an age of information and Twitter where there’s a steady stream of bad news.” \nThe audience cheered as the president was introduced and went

In [None]:
# Display the test documents.
# NOTE(review): this echoes the entire list; a slice such as test_texts[:3] would keep the
# saved output small.
test_texts

['Intermediate \nThey call him the Robin Hood of the banks, a man who took out many loans worth almost half a million euros with no intention of ever paying them back. Instead, Enric Duran gave the money to projects that created and promoted alternatives to capitalism.\nAfter 14 months in hiding, Duran is unapologetic, even though his activities could put him in jail. Im proud of what I did, he said in an interview by Skype from a secret location.\nFrom 2006 to 2008, Duran took out 68 commercial and personal loans from 39 banks in Spain. He gave the money to social activists, who used it to pay for speaking tours against capitalism and TV cameras for a media network. He said he saw that these social movements were building alternatives but that they didnt have enough money. Meanwhile, constant growth was creating a system that created money out of nothing.\nThe loans he swindled from banks were his way of regulating and denouncing this situation, he said. He started slowly. I  lled out

# Training and Tuning RoBERTa

In [None]:

from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

# Tokenize every split with RoBERTa's own tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Rebuild the datasets from the RoBERTa encodings. Previously the Trainer below received the
# datasets created in the DistilBERT section, so RoBERTa was trained and evaluated on token
# ids produced by a different tokenizer.
# NOTE: this rebinds the shared train_dataset / val_dataset / test_dataset names; any later
# cell that fine-tunes a different model must rebuild them from its own tokenizer.
train_dataset = ComplexityDataset(train_encodings, train_labels)
val_dataset = ComplexityDataset(val_encodings, val_labels)
test_dataset = ComplexityDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    output_dir='./results',
    learning_rate=5e-05,
    num_train_epochs=5.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # NOTE(review): the recorded run ended at global_step=250, so a 500-step warmup never
    # completes — confirm this is intended.
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.0001,               # strength of weight decay
)

# Pretrained checkpoint with a 3-class classification head (one class per reading level).
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # RoBERTa-encoded training split
    eval_dataset=val_dataset,            # RoBERTa-encoded validation split
    compute_metrics=compute_metrics,
)

trainer.train()

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1005,1.09668,0.364706
2,1.0998,1.097423,0.317647
3,1.0945,1.076191,0.411765
4,0.7801,0.562752,0.694118
5,0.5603,0.608559,0.635294


TrainOutput(global_step=250, training_loss=0.9270373077392579, metrics={'train_runtime': 217.4447, 'train_samples_per_second': 9.106, 'train_steps_per_second': 1.15, 'total_flos': 520964567101440.0, 'train_loss': 0.9270373077392579, 'epoch': 5.0})

In [None]:
# Held-out test metrics for the trainer created in the RoBERTa cell above.
# test_dataset has to be encoded with the same tokenizer as the data that trainer was fitted on.
trainer.evaluate(test_dataset)

{'eval_loss': 0.4440905451774597,
 'eval_accuracy': 0.6976744186046512,
 'eval_runtime': 2.8806,
 'eval_samples_per_second': 29.855,
 'eval_steps_per_second': 3.819,
 'epoch': 5.0}

In [None]:
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
# NOTE(review): this cell has the same code as the cell under "Training and Tuning BERT"
# below, and its recorded output stops at epoch 4 although num_train_epochs is 5 — it looks
# like a leftover from an earlier configuration; confirm whether it is still needed.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Build the datasets from the encodings produced just above. Previously these encodings were
# never used and the Trainer received whatever datasets an earlier cell had left behind, so
# the model was not guaranteed to see ids from its own tokenizer.
train_dataset = ComplexityDataset(train_encodings, train_labels)
val_dataset = ComplexityDataset(val_encodings, val_labels)
test_dataset = ComplexityDataset(test_encodings, test_labels)


training_args = TrainingArguments(
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    output_dir='./results',
    learning_rate=5e-05,
    num_train_epochs=5.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
)

# Pretrained checkpoint with a 3-class classification head (one class per reading level).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # BERT-encoded training split
    eval_dataset=val_dataset,            # BERT-encoded validation split
    compute_metrics=compute_metrics,
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0533,1.00121,0.517647
2,0.8053,0.674397,0.788235
3,0.493,0.593955,0.682353
4,0.2579,0.354445,0.882353


TrainOutput(global_step=200, training_loss=0.6523921346664429, metrics={'train_runtime': 172.0419, 'train_samples_per_second': 9.207, 'train_steps_per_second': 1.163, 'total_flos': 416771653681152.0, 'train_loss': 0.6523921346664429, 'epoch': 4.0})

In [None]:
# Held-out test metrics for the trainer created in the BERT cell directly above.
trainer.evaluate(test_dataset)

{'eval_loss': 0.42093032598495483,
 'eval_accuracy': 0.8604651162790697,
 'eval_runtime': 2.8757,
 'eval_samples_per_second': 29.906,
 'eval_steps_per_second': 3.825,
 'epoch': 4.0}

# Training and Tuning BERT

In [None]:
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
# Tokenize every split with BERT's own tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Build the datasets from the encodings produced just above. Previously these encodings were
# never used and the Trainer received whatever datasets an earlier cell had left behind, so
# the model was not guaranteed to see ids from its own tokenizer.
train_dataset = ComplexityDataset(train_encodings, train_labels)
val_dataset = ComplexityDataset(val_encodings, val_labels)
test_dataset = ComplexityDataset(test_encodings, test_labels)


training_args = TrainingArguments(
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    output_dir='./results',
    learning_rate=5e-05,
    num_train_epochs=5.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # NOTE(review): the recorded run ended at global_step=250, so a 500-step warmup never
    # completes — confirm this is intended.
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
)

# Pretrained checkpoint with a 3-class classification head (one class per reading level).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # BERT-encoded training split
    eval_dataset=val_dataset,            # BERT-encoded validation split
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1301,1.089955,0.352941
2,1.0083,0.803192,0.858824
3,0.6041,0.383147,0.882353
4,0.1871,0.191079,0.941176
5,0.1456,0.187622,0.964706


TrainOutput(global_step=250, training_loss=0.6150217323303223, metrics={'train_runtime': 215.9771, 'train_samples_per_second': 9.168, 'train_steps_per_second': 1.158, 'total_flos': 520964567101440.0, 'train_loss': 0.6150217323303223, 'epoch': 5.0})

In [None]:
# Held-out test metrics for the trainer created in the BERT cell directly above.
trainer.evaluate(test_dataset)

{'eval_loss': 0.30608901381492615,
 'eval_accuracy': 0.9069767441860465,
 'eval_runtime': 2.8766,
 'eval_samples_per_second': 29.897,
 'eval_steps_per_second': 3.824,
 'epoch': 5.0}

# Error Analysis

In [None]:
# Class predictions of the most recently created trainer: arg-max over its output scores.
predictions = trainer.predict(test_dataset).predictions.argmax(axis=1)

# True test labels as an array, so they can be compared element-wise with the predictions.
# NOTE: this rebinds `labels`, which earlier held the label list for the whole corpus; the
# cells below read the test labels through this name.
labels = np.asarray(test_dataset.labels)
# Indices of the test documents the model got wrong.
idx = np.flatnonzero(predictions != labels)


In [None]:
# Predicted class of every misclassified test document, in the order given by idx.
wrong = [predictions[i] for i in idx]
wrong
# 0 = elementary ("simple") reading level

[0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# True class of the same misclassified documents, in the order given by idx.
correct = [labels[i] for i in idx]
correct
# 2 = advanced: these documents are advanced but were predicted as elementary

[2, 2, 2, 2, 2, 2, 2, 2]

In [None]:
# Raw text of each misclassified test document, in the order given by idx.
mis = [test_texts[i] for i in idx]
mis

['\ufeffDo you want your child to be good at sport, make the school team and, maybe one day, even compete on the world stage? Well, try to ensure that your would-be Olympian or World Cup winner is born in November or, failing that, in October. A study led by one of the UK’s leading experts on children’s physical activity has found that school pupils born in those months are fitter than everyone else in their class. \nNovember- and October-born children emerged as fitter, stronger and more powerful than their peers born in the other ten months of the year, especially those whose birthdays fell in April or June. Dr Gavin Sandercock, from the Centre for Sports and Exercise Science at Essex University, and colleagues found that autumn-born children enjoyed “a clear physical advantage” over their classmates. \nThe research involved 8,550 boys and girls aged between ten and 16 from 26 state schools in Essex. All were tested between 2007 and 2010 on three different measures of fitness: stamin

In [None]:
# Show a single test document (index 1).
test_texts[1]

'\ufeffAs soon as the children at one primary school in Stirling hear the words “daily mile”, they down their pencils and head out of the classroom to start running laps around the school field. For three-and-a-half years, all pupils at St Ninian’s Primary have walked or run a mile each day. They do so at random times during the day, apparently happily, and, despite the rise in childhood obesity across the UK, none of the children at the school are overweight. \nThe daily mile has done so much to improve these children’s fitness, behaviour and concentration in lessons that scores of nursery and primary schools across Britain are following suit and getting pupils to get up from their desks and take 15 minutes to walk or run round the school or local park. \nElaine Wyllie, headteacher of St Ninian’s, said: “I get at least two emails a day from other schools and local authorities asking how we do it. The thought of children across the country running every day because of something we’ve d