In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import os
import shutil
import os.path as osp

import torch
from torch import nn
import torch.nn.functional as F

from accelerate import Accelerator
from accelerate.utils import LoggerType

from transformers import AdamW
from transformers import AlbertConfig, AlbertModel
from accelerate import DistributedDataParallelKwargs

from model import MultiTaskModel
from mydataloader import build_dataloader
from utils import length_to_mask, scan_checkpoint

from datasets import load_from_disk

from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset, concatenate_datasets
import wandb

import yaml
import pickle


# Path of the YAML run configuration; point this at a different file if needed.
config_path = "/home/PL-BERT/multilingual-pl-bert/config.yml"
# Read the config inside a context manager so the file handle is closed promptly.
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Load the pickled token-map object whose path is given in the config.
# NOTE(review): pickle.load can execute arbitrary code; only load token-map files from a trusted source.
with open(config['dataset_params']['token_maps'], 'rb') as handle:
    token_maps = pickle.load(handle)

print(len(token_maps))

from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer'])

# Cross-entropy (classification) loss; the evaluation loops apply it to both the
# word predictions and the token predictions.
criterion = nn.CrossEntropyLoss()

best_loss = float('inf')  # best test loss
start_epoch = 0  # start from epoch 0 or last checkpoint epoch
loss_train_record = []
loss_test_record = []

num_steps = config['num_steps']
log_interval = config['log_interval']
save_interval = config['save_interval']


def build_dataloaders(dataset, batch_size, num_workers, dataset_config, train_fraction=0.9, seed=None):
    """Randomly split ``dataset`` into train/validation subsets and wrap each in a loader.

    Args:
        dataset: Sized dataset accepted by ``torch.utils.data.random_split``.
        batch_size: Forwarded to ``build_dataloader`` for both loaders.
        num_workers: Forwarded to ``build_dataloader`` for both loaders.
        dataset_config: Forwarded to ``build_dataloader`` as ``dataset_config``.
        train_fraction: Share of examples assigned to the training subset. The
            default of 0.9 matches the previously hard-coded split.
        seed: Optional integer seed. When given, the split is drawn from a
            dedicated ``torch.Generator`` so it is reproducible across runs;
            when ``None`` the global torch RNG is used, as before.

    Returns:
        Tuple ``(train_loader, val_loader)``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction}")

    # Whatever is not assigned to training goes to validation, so the two sizes
    # always sum to len(dataset).
    train_size = int(train_fraction * len(dataset))
    val_size = len(dataset) - train_size

    # Only pass a generator when a seed was requested; otherwise keep the default RNG.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)

    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size], **split_kwargs
    )
    print(len(train_dataset), len(val_dataset))

    # Both loaders are built with identical settings.
    train_loader = build_dataloader(train_dataset,
                                    batch_size=batch_size,
                                    num_workers=num_workers,
                                    dataset_config=dataset_config)

    val_loader = build_dataloader(val_dataset,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  dataset_config=dataset_config)

    return train_loader, val_loader

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


81677




In [5]:
# for _, batch in enumerate(train_loader):   
#     words, labels, phonemes, input_lengths, masked_indices = batch

import time

# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Running count of processed batches, incremented by the evaluation loops.
curr_steps = 0

# Load the two on-disk corpora and concatenate them (first `hi`, then `en_ds`).
dataset_hi = load_from_disk("/home/PL-BERT/data/hi")
dataset_en = load_from_disk("/home/PL-BERT/data/en_ds")
dataset = concatenate_datasets([dataset_hi, dataset_en])

# Make sure the log directory exists and keep a copy of the config alongside the logs.
log_dir = config['log_dir']
if not osp.exists(log_dir):
    os.makedirs(log_dir, exist_ok=True)
shutil.copy(config_path, osp.join(log_dir, osp.basename(config_path)))

# Single loader over the whole concatenated dataset.
# NOTE(review): later cells use the loader step number as a row index into `dataset`;
# that assumes `build_dataloader` yields one example per batch in dataset order — confirm.
batch_size = config["batch_size"]
train_loader = build_dataloader(
    dataset,
    batch_size=batch_size,
    num_workers=0,
    dataset_config=config['dataset_params'],
)

albert_base_configuration = AlbertConfig(**config['model_params'])

# Size of the word-prediction output: one more than the largest 'token' id in the token maps.
num_vocab = 1 + max(m['token'] for m in token_maps.values())
print("Vocab Size: ", num_vocab)
print(len(token_maps))

# Wrap the ALBERT encoder in the multi-task model and move it to the selected device.
bert = MultiTaskModel(
    AlbertModel(albert_base_configuration),
    num_vocab=num_vocab,
    num_tokens=config['model_params']['vocab_size'],
    hidden_size=config['model_params']['hidden_size'],
).to(device)
    
# Find the highest step number among the "step_<N>..." files in log_dir.
# `iters` stays 0 when the directory cannot be read, holds no such files, or a
# file name does not parse as an integer step.
try:
    step_numbers = [
        int(name.split('_')[-1].split('.')[0])
        for name in os.listdir(log_dir)
        if name.startswith("step_") and os.path.isfile(os.path.join(log_dir, name))
    ]
except (OSError, ValueError):
    step_numbers = []
iters = max(step_numbers, default=0)

optimizer = AdamW(bert.parameters(), lr=1e-4)

# The checkpoint that is actually loaded is this fixed file, independent of `iters`.
CHECKPOINT_PATH = "/home/PL-BERT/en_hi_phonemes/step_19301.t7"

load = True
if load:
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location='cpu')
    state_dict = checkpoint['net']
    from collections import OrderedDict

    # Strip the `module.` prefix only from keys that actually carry it, so keys
    # saved without the prefix are not truncated.
    prefix = "module."
    new_state_dict = OrderedDict(
        (k[len(prefix):] if k.startswith(prefix) else k, v)
        for k, v in state_dict.items()
    )

    # strict=False tolerates key mismatches; report them so a checkpoint that
    # matched few or no parameters does not pass silently.
    load_result = bert.load_state_dict(new_state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Warning: {len(load_result.missing_keys)} missing and "
            f"{len(load_result.unexpected_keys)} unexpected keys while loading the checkpoint."
        )

    print('Checkpoint loaded.')


177
Vocab Size:  65803
81677


  checkpoint = torch.load("/home/PL-BERT/en_hi_phonemes/step_19301.t7", map_location='cpu')


Checkpoint loaded.


In [6]:
def get_text(input_ids):
    """Decode ``input_ids`` with the module-level ``tokenizer`` and join the pieces into one string.

    Each element returned by ``tokenizer.batch_decode`` is handled as one piece:
    a piece starting with ``'# # '`` is appended without a separating space after
    its first two characters are dropped; any other piece is appended after a
    single space. Finally the result is stripped of surrounding whitespace and
    every remaining ``"# "`` occurrence is removed.
    """
    pieces = []
    for piece in tokenizer.batch_decode(input_ids, skip_special_tokens=True):
        if piece.startswith('# # '):
            # Continuation piece: drop the first two characters, no leading space.
            pieces.append(piece[2:])
        else:
            # New word: separate it from the previous piece with a space.
            pieces.append(' ' + piece)

    # Trim the outer whitespace, then delete every leftover "# " marker.
    return ''.join(pieces).strip().replace("# ", "")

In [7]:
from tqdm import tqdm

# One record per batch: step index, total loss, decoded prediction, and word ids.
batch_losses = []

# This loop only measures losses (no backward pass or optimizer step), so put the
# model in eval mode and disable autograd tracking. Eval mode keeps stochastic
# layers such as dropout (if the model has any) from adding noise to the losses.
bert.eval()

# NOTE(review): later cells treat `step` as a row index into `dataset`. That only holds if
# `build_dataloader` yields one example per batch in dataset order (no shuffling) — confirm.
with torch.no_grad():
    for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc="Training Steps"):
        curr_steps += 1

        words, labels, phonemes, input_lengths, masked_indices = [x.to(device) if torch.is_tensor(x) else x for x in batch]
        text_mask = length_to_mask(torch.Tensor(input_lengths)).to(device)

        # Forward pass; the attention mask is the logical inverse of `text_mask`.
        tokens_pred, words_pred = bert(phonemes, attention_mask=(~text_mask).int())

        # Word-level loss: per-sequence cross-entropy over the first `_text_length`
        # positions, averaged over the batch.
        loss_vocab = 0
        for _s2s_pred, _text_input, _text_length, _ in zip(words_pred, words, input_lengths, masked_indices):
            loss_vocab += criterion(_s2s_pred[:_text_length], _text_input[:_text_length])
        loss_vocab /= words.size(0)

        # Token-level loss, computed only at the masked positions of each sequence.
        loss_token = 0
        sizes = 1  # starts at 1, so the divisor is (sequences with masked positions + 1) and never zero
        for _s2s_pred, _text_input, _text_length, _masked_indices in zip(tokens_pred, labels, input_lengths, masked_indices):
            if len(_masked_indices) > 0:
                _masked_target = _text_input[:_text_length][_masked_indices]
                loss_token += criterion(_s2s_pred[:_text_length][_masked_indices], _masked_target)
                sizes += 1
        loss_token /= sizes

        # Total loss
        loss = loss_vocab + loss_token

        # Predicted word ids for the first sequence of the batch (argmax over the class axis).
        predicted_word_ids = torch.argmax(words_pred[0], axis=1)

        # Store CPU copies so the accumulated records do not hold on to device memory.
        batch_losses.append({
            'batch': step,
            'loss': loss.item(),
            "text": get_text(predicted_word_ids),
            "tokens": predicted_word_ids.cpu(),
            "words": words.cpu(),
        })
        
    # # Print the loss for the current step
    # print(f"Step {step}: Loss = {loss.item()}")

    # # Break after one step for debugging or analysis

    

Training Steps:   0%|          | 0/68950 [00:00<?, ?it/s]

Training Steps: 100%|██████████| 68950/68950 [33:32<00:00, 34.26it/s]


In [19]:
# Display whatever `batch` currently holds in the kernel — presumably the last batch
# yielded by the most recently run loader loop.
batch

(tensor([[64638, 64638, 64638, 64638, 62993, 62993, 62993,     0,     2,     0,
          62971, 62971, 62971, 62971, 62971,     0, 62313, 62313, 62313,     0,
          62902, 62902, 62902, 62902, 63237, 63237,     0, 62492, 62492, 62492,
          62492, 62492, 62492, 62492, 62492,     0, 63146, 63146, 63146, 63146,
          63146,     0, 62196, 62196, 62196,     0, 60207, 60207, 60207, 60207,
          60207, 60207, 60207, 60207, 60207, 60207, 60207, 60207, 60207,     0,
          61148, 61148, 61148, 61148, 61148, 61148, 61148,     0, 63475, 63475,
          63475, 63475, 63475,     0,     2,     0, 64638, 64638, 64638,  7746,
           7746,     0,     2,     0, 62950, 62950, 62950,     0, 63377, 63377,
          63377, 63377, 63377, 63377, 63377,     0, 62196, 62196, 62196,     0,
          60207, 60207, 60207, 60207, 60207, 60207, 60207, 60207, 60207, 60207,
          60207, 60207, 60207,     0, 60172, 65208, 65208, 65208, 65208, 65208,
          63880, 63880, 63880,     0, 60

In [11]:
import json 
import numpy as np
# # Save batch_losses to a JSON file
# with open('batch_losses.json', 'w') as f:
#     json.dump(batch_losses, f)

# # Load the batch_losses from the JSON file
# with open('batch_losses.json', 'r') as f:
#     batch_losses = json.load(f)

# Pull the loss and the step index out of every collected record.
losses = list(map(lambda record: record['loss'], batch_losses))
steps = list(map(lambda record: record['batch'], batch_losses))

# Loss value at the 90th percentile of all recorded losses.
percentile_90 = np.percentile(losses, 90)

# Step indices whose loss is at or above that threshold, in recording order.
steps_90_percentile = [
    step_index
    for step_index, loss_value in zip(steps, losses)
    if loss_value >= percentile_90
]

In [16]:
# Show the 90th-percentile loss threshold.
percentile_90

np.float64(2.2216085910797116)

In [14]:
# Number of steps whose loss is at or above the 90th-percentile threshold.
len(steps_90_percentile)

6895

In [12]:
# Drop every row whose index was flagged as a high-loss step.
# A set makes each membership test O(1); with the plain list, every example
# triggered a linear scan over all flagged steps.
# NOTE(review): this treats a loader step number as a dataset row index, which assumes
# one example per batch in dataset order — confirm against `build_dataloader`.
high_loss_steps = set(steps_90_percentile)
filtered_dataset = dataset.filter(lambda example, idx: idx not in high_loss_steps, with_indices=True)

# Save the filtered dataset to disk
filtered_dataset.save_to_disk("/home/PL-BERT/data/filtered_en_hi")

Filter:   6%|▌         | 4000/68950 [00:04<01:15, 863.82 examples/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f04cb2cde40>>
Traceback (most recent call last):
  File "/home/PL-BERT/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Filter: 100%|██████████| 68950/68950 [02:04<00:00, 555.85 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 62055/62055 [00:01<00:00, 53166.01 examples/s]


In [17]:
# Write `filtered_dataset` to disk again (same target path as the filtering cell uses).
filtered_dataset.save_to_disk("/home/PL-BERT/data/filtered_en_hi")

Saving the dataset (2/2 shards): 100%|██████████| 62055/62055 [00:01<00:00, 56499.09 examples/s]


In [6]:
# Rank every recorded batch by loss, highest first, and keep the worst TOP_K for inspection.
TOP_K = 100
top_losses = sorted(batch_losses, key=lambda x: x['loss'], reverse=True)[:TOP_K]

# The header reports the number of entries actually listed (it used to say "Top 5"
# while 100 were printed).
print(f"Top {len(top_losses)} Batches with Highest Loss: ")
for entry in top_losses:
    print(f"Batch {entry['batch']}: Loss = {entry['loss']}")

Top 5 Batches with Highest Loss: 
Batch 10370: Loss = 18.341266632080078
Batch 9660: Loss = 17.180313110351562
Batch 61710: Loss = 9.36107349395752
Batch 22984: Loss = 8.742210388183594
Batch 56126: Loss = 8.567729949951172
Batch 68546: Loss = 8.446359634399414
Batch 51811: Loss = 8.404273986816406
Batch 64127: Loss = 8.164405822753906
Batch 31241: Loss = 8.087174415588379
Batch 38157: Loss = 7.937072277069092
Batch 4557: Loss = 7.325591564178467
Batch 41506: Loss = 7.322220325469971
Batch 55280: Loss = 7.228742599487305
Batch 2792: Loss = 7.19148063659668
Batch 26485: Loss = 7.0198235511779785
Batch 28078: Loss = 6.9907636642456055
Batch 6684: Loss = 6.917701721191406
Batch 41922: Loss = 6.901057720184326
Batch 35889: Loss = 6.875587463378906
Batch 28622: Loss = 6.823854446411133
Batch 23620: Loss = 6.812102317810059
Batch 44983: Loss = 6.775235176086426
Batch 45264: Loss = 6.686344146728516
Batch 49334: Loss = 6.639467239379883
Batch 65108: Loss = 6.432596206665039
Batch 31824: Loss 

In [9]:
# Total loss recorded for the first entry in `batch_losses`.
batch_losses[0]['loss']

1.508223533630371

In [7]:
# The `words` tensor stored with the first record (the targets used for the word-level loss).
batch_losses[0]['words']

tensor([[ 8427,  8427,     0,  6540,  6540,     0,    38,     0,  6974,  6974,
          6974,  6974,  6974,  6974,  6974,     0,  6525,  6525,     0,    56,
            56,    56,     0,     3,     3,     3,     3, 11606, 11606, 11606,
         11606, 11606, 11606, 11606,  6539,  6539,     0, 17771, 17771, 17771,
         17771, 17771,     0,     3,     3,     3,     3,     3,     3,     3,
            56,    56,    56,    56,     0,    14,     0,  6677,  6677,     0,
          6550,  6550,  6550,     0,  8617,  8617,  8617,  8617,  8617,  8617,
          8617,  8617,  8617,  8617,     0,  6571,  6571,  6571,  6571,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,  6521,
          6521,     0, 40658, 40658, 40658, 40658, 40658, 40658,     0,     3,
             3,     3,     3,     3,  6803,  6803,  6803,  6803,  6803,  6524,
          6524,     0,  6523,  6523,  6523,     0, 12791, 12791, 12791, 12791,
         12791,     0,     3,     3,     3,     3, 2

In [8]:
# Predicted word ids stored with the first record (argmax of `words_pred[0]` for that batch).
batch_losses[0]['tokens']

tensor([    0,  6523,     0,  6522,  6540,     0,    38,     0,  6974,  6974,
         6974,  6974,  6974,  6974,  6974,     0,  6525,  6525,     0,    56,
           56,    56,     0,     3,     3,     3,     3, 11606, 11606, 11606,
        11606, 11606, 11606, 11606,  6539, 11606,     0, 17771, 17771, 17771,
        17771, 17771,     0,     3,     3,     3,     3,     3,     3,     3,
           56,    56,    56,    56,     0,    14,     0,  6677,  6677,     0,
         6550,  6550,  6550,     0,  8617,  8617,  8617,  8617,  8617,  8617,
         8617,  8617,  8617,  8617,     0,  6571,  6571,  6571,  6571,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,  6521,
         6521,     0,     0,     0,     0,     0,     0,     0,     0,     3,
            3,     3,     3,     3,  6803,  6803,  6803,  6803,  6803,  6524,
         6524,     0,  6523,  6523,  6523,     0, 12791, 12791, 12791, 12791,
        12791,     0,     3,     3,     3,     3, 20012, 20012, 

In [15]:
# for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc="Training Steps"):
#     step_start_time = time.time()  # Start timing for the step
#     curr_steps += 1

#     words, labels, phonemes, input_lengths, masked_indices = [x.to(device) if torch.is_tensor(x) else x for x in batch]
#     break

Training Steps:   0%|          | 0/68950 [00:00<?, ?it/s]


In [10]:
# Step indices (the 'batch' field) of the highest-loss records, in descending-loss order.
# Despite the name, these are step numbers, not loss values.
top_losses_values = [x['batch'] for x in top_losses]


In [11]:
# Display the step indices of the highest-loss records.
top_losses_values

[10370,
 9660,
 61710,
 22984,
 56126,
 68546,
 51811,
 64127,
 31241,
 38157,
 4557,
 41506,
 55280,
 2792,
 26485,
 28078,
 6684,
 41922,
 35889,
 28622,
 23620,
 44983,
 45264,
 49334,
 65108,
 31824,
 63856,
 52086,
 24966,
 33417,
 13326,
 12731,
 34984,
 28617,
 30134,
 17892,
 25625,
 6259,
 1702,
 16179,
 48110,
 66088,
 42838,
 31383,
 42146,
 37471,
 27526,
 58883,
 63971,
 58720,
 45109,
 46401,
 42564,
 46172,
 54282,
 62108,
 42028,
 25605,
 3570,
 12927,
 63710,
 55065,
 38264,
 60725,
 50222,
 49225,
 45501,
 5623,
 18501,
 46584,
 24663,
 20154,
 10898,
 43774,
 13325,
 20602,
 11054,
 51032,
 56990,
 59811,
 35201,
 65185,
 64057,
 35722,
 21679,
 4097,
 49988,
 48701,
 54622,
 19761,
 10119,
 53955,
 27950,
 4730,
 62134,
 55325,
 14874,
 20592,
 57771,
 40549]

In [12]:
# Accumulator for one dict per high-loss entry (decoded text and loss).
top_loss_sentences = []

In [13]:

# Start from an empty list so re-running this cell does not append duplicate rows.
top_loss_sentences = []

for entry in top_losses:
    # NOTE(review): uses the loader step number as a row index into `dataset`; this assumes
    # one example per batch in dataset order — confirm against `build_dataloader`.
    example = dataset[entry['batch']]

    # Decode once and reuse the string for both the printout and the stored record.
    ground_truth = get_text(example['input_ids'])
    print("Ground Truth Text: ", ground_truth)

    # Include the phonemes so each row fills every column of the CSV export
    # ("text", "loss", "phonemes"), matching the lowest-loss records.
    top_loss_sentences.append({
        "text": ground_truth,
        "loss": entry["loss"],
        "phonemes": example['phonemes'],
    })

Ground Truth Text:  ए ॰ के ॰ ऐस ॰ नंबर अभाज्यता टेस्ट पहला पोलीनोमिअल टाइम है अल्गोरिद्म ( कंप्यूटर विज्ञान में पोलीनोमिअल टाइम अल्गोरिद्मों को तेज माना जाता है ) जो बताता है कि कोई नंबर अभाज्य है या नहीं । इसका आविष्कार 2002 में भारतीय प्रौद्योगिकी संस्थान कानपुर के तीन कंप्यूटर वैज्ञानिकों  मणीन्द्र अग्रवाल , नीरज कयाल और नितिन सक्सेना ने किया था । किसी भी अल्गोरिद्म के लिए चार आवश्यकताएँ होती है : 1 ) वो हर इनपुट के लिए आउटपुट देता हो , 2 ) वो जल्दी उत्तर देता हो ( more precisely : वो पोलीनोमिअल टाइम में उत्तर देता हो ) , 3 ) वो कभी गलत उत्तर न देता हो और 4 ) वो किसी अप्रमाणित परिकल्पना पर निर्भर न करता हो । नंबर की अभाज्यता जांचने के लिए इस से पहले के सभी अल्गोरिद्म इन चार में से अधिक से अधिक तीन आवश्यकताओं को पूर्ण करते थे । ये पहला अल्गोरिद्म है जो इन चारों आवश्यकताओं को पूर्ण करता है । इस अल्गोरिद्म का प्रयोग किसी भी नंबर भी अभाज्यता जानने के लिए किया जा सकता है । इस से पहले के कई अल्गोरिद्म केवल खास श्रेणियों के नम्बरों के लिए काम करते हैं । उदाहरण के लिए लुकास - लेहमर टेस्ट सि

In [14]:
import csv

# Destination file for the high-loss sentence table.
hi_csv_file_path = "top_loss_sentences.csv"

# Emit a header row followed by one row per collected record.
# newline='' lets the csv module control line endings itself.
with open(hi_csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=["text", "loss", "phonemes"])
    csv_writer.writeheader()
    for row in top_loss_sentences:
        csv_writer.writerow(row)

print(f"Data saved to {hi_csv_file_path}")

Data saved to top_loss_sentences.csv


In [2]:
# The 100 records with the smallest loss, in ascending order (sorted() is ascending by default).
lowest_losses = sorted(batch_losses, key=lambda record: record['loss'])[:100]

# Filled by the following cell with one dict per low-loss entry.
lowest_loss_sentences = list()

NameError: name 'batch_losses' is not defined

In [1]:
# Start from an empty list so re-running this cell does not append duplicate rows.
lowest_loss_sentences = []

for entry in lowest_losses:
    # NOTE(review): uses the loader step number as a row index into `dataset`; this assumes
    # one example per batch in dataset order — confirm against `build_dataloader`.
    example = dataset[entry['batch']]

    # Decode the reference ids so the printout is readable text rather than raw ids.
    ground_truth = get_text(example['input_ids'])
    print("Ground Truth Text: ", ground_truth)

    # The model's decoded prediction was stored on the record when the loss was
    # computed; the previous code printed the whole `input_ids` column here instead.
    print("Predicted Text: ", entry['text'])

    lowest_loss_sentences.append({
        "text": ground_truth,
        "loss": entry["loss"],
        "phonemes": example['phonemes'],
    })

NameError: name 'lowest_losses' is not defined

In [None]:
# Step indices (the 'batch' field) of the lowest-loss records, in ascending-loss order.
# Despite the name, these are step numbers, not loss values.
lowest_losses_values = [x['batch'] for x in lowest_losses]

In [None]:
import csv

# Destination file for the low-loss sentence table.
csv_file_path = "lowest_loss_sentences.csv"

# Emit a header row followed by one row per collected record.
# newline='' lets the csv module control line endings itself.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=["text", "loss", "phonemes"])
    csv_writer.writeheader()
    for row in lowest_loss_sentences:
        csv_writer.writerow(row)

print(f"Data saved to {csv_file_path}")

In [None]:
loss_values = [entry['loss'] for entry in batch_losses]

# Horizontal box plot of the per-batch losses. Passing the values as `x` puts the
# loss on the x-axis, so the 'Loss' label sits on the axis that carries the values.
# A box plot has no frequency axis, so the old 'Frequency' y-label is dropped.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=loss_values, ax=ax)

# Set plot title and labels
ax.set_title('Distribution of Batch Losses')
ax.set_xlabel('Loss')

# Show the plot
plt.show()

In [None]:
import numpy as np
# Convert to an ndarray so the values can be filtered with a boolean mask.
loss_values = np.array(loss_values)

In [None]:
# Percentile used as the high-loss cut-off. The code has always computed the 90th
# percentile here; only the old variable name and comment said "75".
HIGH_LOSS_PERCENTILE = 90

# Boolean-mask indexing below needs an ndarray; asarray is a no-op if it already is one.
loss_values = np.asarray(loss_values)

high_loss_threshold = np.percentile(loss_values, HIGH_LOSS_PERCENTILE)
# Legacy alias kept so any cell still reading `percentile_75` keeps working;
# the value is the 90th percentile.
percentile_75 = high_loss_threshold
print(high_loss_threshold)

# Loss values strictly above the threshold.
high_loss_values = loss_values[loss_values > high_loss_threshold]


In [None]:
# Number of loss values strictly above the threshold.
len(high_loss_values)

In [None]:
from tqdm import tqdm

# Records for the re-evaluated highest-loss steps.
top_loss_output = []

# Set membership keeps the per-step skip check O(1).
target_steps = set(top_losses_values)

# Loss measurement only (no backward pass or optimizer step): use eval mode and
# disable autograd tracking.
bert.eval()

# NOTE(review): this matches batches by step number on a fresh pass over the loader, which
# only revisits the same examples if `build_dataloader` iterates in a fixed order — confirm.
with torch.no_grad():
    for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc="Training Steps"):
        if step not in target_steps:
            continue
        curr_steps += 1

        words, labels, phonemes, input_lengths, masked_indices = [x.to(device) if torch.is_tensor(x) else x for x in batch]
        text_mask = length_to_mask(torch.Tensor(input_lengths)).to(device)

        # Forward pass; the attention mask is the logical inverse of `text_mask`.
        tokens_pred, words_pred = bert(phonemes, attention_mask=(~text_mask).int())

        # Word-level loss: per-sequence cross-entropy over the first `_text_length`
        # positions, averaged over the batch.
        loss_vocab = 0
        for _s2s_pred, _text_input, _text_length, _ in zip(words_pred, words, input_lengths, masked_indices):
            loss_vocab += criterion(_s2s_pred[:_text_length], _text_input[:_text_length])
        loss_vocab /= words.size(0)

        # Token-level loss, computed only at the masked positions of each sequence.
        loss_token = 0
        sizes = 1  # starts at 1, so the divisor is (sequences with masked positions + 1) and never zero
        for _s2s_pred, _text_input, _text_length, _masked_indices in zip(tokens_pred, labels, input_lengths, masked_indices):
            if len(_masked_indices) > 0:
                _masked_target = _text_input[:_text_length][_masked_indices]
                loss_token += criterion(_s2s_pred[:_text_length][_masked_indices], _masked_target)
                sizes += 1
        loss_token /= sizes

        # Total loss
        loss = loss_vocab + loss_token

        # Decode the word predictions of the first sequence (argmax over the class axis),
        # the same quantity the main loss loop records. The previous code took the argmax
        # of the whole `tokens_pred` batch along axis 1 and decoded that instead.
        predicted_word_ids = torch.argmax(words_pred[0], axis=1)
        top_loss_output.append({'batch': step, 'loss': loss.item(), "text": get_text(predicted_word_ids)})

        # Every requested step has been processed; the remaining batches would all be skipped.
        if len(top_loss_output) == len(target_steps):
            break

In [None]:
from tqdm import tqdm

# Records for the re-evaluated lowest-loss steps.
lowest_loss_output = []

# Set membership keeps the per-step skip check O(1).
target_steps = set(lowest_losses_values)

# Loss measurement only (no backward pass or optimizer step): use eval mode and
# disable autograd tracking.
bert.eval()

# NOTE(review): this matches batches by step number on a fresh pass over the loader, which
# only revisits the same examples if `build_dataloader` iterates in a fixed order — confirm.
with torch.no_grad():
    for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc="Training Steps"):
        if step not in target_steps:
            continue
        curr_steps += 1

        words, labels, phonemes, input_lengths, masked_indices = [x.to(device) if torch.is_tensor(x) else x for x in batch]
        text_mask = length_to_mask(torch.Tensor(input_lengths)).to(device)

        # Forward pass; the attention mask is the logical inverse of `text_mask`.
        tokens_pred, words_pred = bert(phonemes, attention_mask=(~text_mask).int())

        # Word-level loss: per-sequence cross-entropy over the first `_text_length`
        # positions, averaged over the batch.
        loss_vocab = 0
        for _s2s_pred, _text_input, _text_length, _ in zip(words_pred, words, input_lengths, masked_indices):
            loss_vocab += criterion(_s2s_pred[:_text_length], _text_input[:_text_length])
        loss_vocab /= words.size(0)

        # Token-level loss, computed only at the masked positions of each sequence.
        loss_token = 0
        sizes = 1  # starts at 1, so the divisor is (sequences with masked positions + 1) and never zero
        for _s2s_pred, _text_input, _text_length, _masked_indices in zip(tokens_pred, labels, input_lengths, masked_indices):
            if len(_masked_indices) > 0:
                _masked_target = _text_input[:_text_length][_masked_indices]
                loss_token += criterion(_s2s_pred[:_text_length][_masked_indices], _masked_target)
                sizes += 1
        loss_token /= sizes

        # Total loss
        loss = loss_vocab + loss_token

        # Decode the word predictions of the first sequence (argmax over the class axis),
        # the same quantity the main loss loop records. The previous code took the argmax
        # of the whole `tokens_pred` batch along axis 1 and decoded that instead.
        predicted_word_ids = torch.argmax(words_pred[0], axis=1)
        lowest_loss_output.append({'batch': step, 'loss': loss.item(), "text": get_text(predicted_word_ids)})

        # Every requested step has been processed; the remaining batches would all be skipped.
        if len(lowest_loss_output) == len(target_steps):
            break