In [5]:
# Mount Google Drive at /content/drive; the other cells read and write under /content/drive/MyDrive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


**Tokenize the Dataset with the Indic NLP Tokenizer**

In [3]:
1.1

# Install Indic NLP Library

!pip install indic-nlp-library



Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K   [90m━

In [6]:
1.2

# Download Indic NLP Resources

!wget https://github.com/anoopkunchukuttan/indic_nlp_resources/archive/refs/heads/master.zip -O /content/drive/MyDrive/Indic_GPT2/indic_nlp_resources.zip


--2024-12-24 20:15:15--  https://github.com/anoopkunchukuttan/indic_nlp_resources/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/anoopkunchukuttan/indic_nlp_resources/zip/refs/heads/master [following]
--2024-12-24 20:15:15--  https://codeload.github.com/anoopkunchukuttan/indic_nlp_resources/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.114.9
Connecting to codeload.github.com (codeload.github.com)|140.82.114.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘/content/drive/MyDrive/Indic_GPT2/indic_nlp_resources.zip’

/content/drive/MyDr     [              <=>   ]  48.49M  5.48MB/s    in 8.8s    

2024-12-24 20:15:24 (5.48 MB/s) - ‘/content/drive/MyDrive/Indic_GPT2/indic_nlp_resources.zip’ saved [

In [8]:
1.3

# Extract files
# Unpacks the downloaded Indic NLP resources archive into a folder on Drive.

import zipfile

# Archive written by the download cell, and the folder it is unpacked into.
zip_path = "/content/drive/MyDrive/Indic_GPT2/indic_nlp_resources.zip"
extract_path = "/content/drive/MyDrive/Indic_GPT2/indic_nlp_resources"

# Extract the zip file
# NOTE(review): GitHub branch archives normally wrap their contents in a
# top-level "<repo>-<branch>/" folder, so the resource files probably end up in
# extract_path/indic_nlp_resources-master rather than directly in extract_path
# — confirm before pointing the library at extract_path.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction complete! Resources are saved in:", extract_path)


Extraction complete! Resources are saved in: /content/drive/MyDrive/Indic_GPT2/indic_nlp_resources


In [9]:
1.4

# Set up Resource Path
# Point the Indic NLP library at the folder that actually contains the
# resource files.

import os

from indicnlp import common

# Folder the resources archive was extracted into.
_resources_root = "/content/drive/MyDrive/Indic_GPT2/indic_nlp_resources"

# A GitHub branch archive unpacks into a "<repo>-<branch>" subfolder, so the
# resource files normally sit one level below the extraction folder. Use that
# subfolder when it exists; otherwise fall back to the extraction folder itself.
_nested_root = os.path.join(_resources_root, "indic_nlp_resources-master")

# Set the path to the extracted resources
INDIC_NLP_RESOURCES = _nested_root if os.path.isdir(_nested_root) else _resources_root
common.set_resources_path(INDIC_NLP_RESOURCES)

print("Resource path set successfully!")


Resource path set successfully!


In [10]:
1.5

# Define a Tokenization Function

from indicnlp.tokenize import indic_tokenize

# Define a function to tokenize text
def tokenize_text(example, lang="hi"):
    """Add an Indic NLP tokenization of ``example["text"]`` to the example.

    Args:
        example: Mapping with a "text" entry (one dataset row when used
            with ``Dataset.map``).
        lang: Language code handed to the Indic NLP tokenizer, e.g. "hi"
            for Hindi, "ta" for Tamil, "bn" for Bengali. Defaults to "hi",
            the value that used to be hard-coded here.

    Returns:
        The same ``example`` object with a new "tokenized_text" entry: the
        tokens joined by single spaces.
    """
    # A missing text (None) is treated as an empty string instead of raising
    # inside the tokenizer.
    text = example["text"] or ""
    tokens = indic_tokenize.trivial_tokenize(text, lang=lang)
    example["tokenized_text"] = " ".join(tokens)  # Join tokens with spaces
    return example

print("Tokenization function is ready!")


Tokenization function is ready!


In [12]:
1.6

# Install the datasets library
!pip install datasets





In [15]:
1.7

from datasets import load_from_disk

# Load the dataset previously saved on Drive in the datasets on-disk format
dataset = load_from_disk("/content/drive/MyDrive/Indic_GPT2/AI4Bharat_Dataset/Verified_Hindi_p1_combined")

# Apply IndicNLP tokenization to the dataset: tokenize_text is called per
# example and adds a "tokenized_text" field; the work is spread over 4 processes.
tokenized_dataset = dataset.map(tokenize_text, num_proc=4)  # Adjust num_proc based on resources

print("Tokenization completed!")

# Save the IndicNLP tokenized dataset to Google Drive
save_path = "/content/drive/MyDrive/Indic_GPT2/AI4B010_IndicNLP_tokenized_dataset"
tokenized_dataset.save_to_disk(save_path)

print(f"Tokenized dataset saved successfully at {save_path}")




Loading dataset from disk:   0%|          | 0/20 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/1768640 [00:00<?, ? examples/s]

Tokenization completed!


Saving the dataset (0/40 shards):   0%|          | 0/1768640 [00:00<?, ? examples/s]

Tokenized dataset saved successfully at /content/drive/MyDrive/Indic_GPT2/AI4B010_IndicNLP_tokenized_dataset


**Encode the IndicNLP-Tokenized Dataset with the GPT-2 Tokenizer**

In [16]:
2.1

# Install Hugging Face Transformers

!pip install transformers




In [17]:
2.2

# Load the GPT-2 Tokenizer with Padding
# The mapping cell pads every sequence to a fixed length, so the tokenizer
# needs a padding token before it is used there.

from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the padding token to be the same as the EOS token
# NOTE(review): presumably the stock GPT-2 tokenizer ships without a pad token — confirm.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Verify the padding token
print("Padding token set to:", gpt2_tokenizer.pad_token)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Padding token set to: <|endoftext|>


In [None]:
2.3

# Map the IndicNLP Tokenized Dataset to GPT-2 Tokenizer with Padding

def map_to_gpt2_tokenizer(example, max_length=128):
    """Encode a batch of IndicNLP-tokenized strings with the GPT-2 tokenizer.

    Written for ``Dataset.map(..., batched=True)``: ``example`` is then a
    batch whose "tokenized_text" entry is a list of strings.

    Args:
        example: Batch mapping with a "tokenized_text" entry.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        Dict with "input_ids" and "attention_mask"; for a batch, each holds
        one list of ``max_length`` ints per input string.
    """
    # No return_tensors: the tokenizer then returns plain Python lists, so the
    # per-batch NumPy arrays (and the .tolist() calls that undid them) are gone.
    encoded = gpt2_tokenizer(
        example["tokenized_text"],
        truncation=True,
        padding="max_length",  # Pad to max_length
        max_length=max_length,
    )
    # Return the input IDs and attention masks
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }

# Apply mapping to the entire dataset
# batched=True hands map_to_gpt2_tokenizer a batch of rows per call instead of one row.
gpt2_mapped_dataset = tokenized_dataset.map(map_to_gpt2_tokenizer, batched=True)

print("Mapping to GPT-2 tokenizer with padding completed!")

# Save the mapped dataset to the specified folder
# Note: this rebinds save_path, which the IndicNLP tokenization cell also assigns.
save_path = "/content/drive/MyDrive/Indic_GPT2/gpt2_ai4b010_tokenized"
gpt2_mapped_dataset.save_to_disk(save_path)

print(f"Tokenized dataset saved to: {save_path}")



Map:   0%|          | 0/1768640 [00:00<?, ? examples/s]

In [None]:
2.4

# Save the GPT-2 Compatible Dataset
# NOTE(review): the mapping cell already saves gpt2_mapped_dataset under
# .../Indic_GPT2/gpt2_ai4b010_tokenized; this writes the same dataset a second
# time under a different folder — confirm both copies are needed.

gpt2_save_path = "/content/drive/MyDrive/Indic_GPT2/gpt2_mapped_dataset_ai4b010"
gpt2_mapped_dataset.save_to_disk(gpt2_save_path)

print(f"GPT-2 compatible dataset saved successfully at {gpt2_save_path}")

