In [1]:
import tensorflow as tf
print("GPU Available:", tf.test.is_gpu_available())


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


GPU Available: False


In [2]:
gpu_device = tf.config.experimental.list_physical_devices('GPU')
if gpu_device:
    tf.config.experimental.set_memory_growth(gpu_device[0], True)
    print(f"Memory Growth Enabled for GPU: {gpu_device[0]}")
else:
    print("No GPU found.")


No GPU found.


In [3]:
import tensorflow as tf
import time

# Create two large random matrices
matrix1 = tf.random.normal([10000, 10000])
matrix2 = tf.random.normal([10000, 10000])

# Perform a matrix multiplication
start_time = time.time()
result = tf.matmul(matrix1, matrix2)
end_time = time.time()

print("Matrix multiplication time: {:.4f} seconds".format(end_time - start_time))


Matrix multiplication time: 54.9536 seconds


In [None]:
!nvidia-smi


Sat Nov 23 10:08:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0              27W /  70W |   2165MiB / 15360MiB |     12%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset
import torch
from tqdm.auto import tqdm
import re
from collections import Counter
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings

In [None]:
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
class TextPreprocessor:
    def __init__(self, max_length=512, model_name="gpt2"):
        """
        Initialize the text preprocessor with specified parameters.

        Args:
            max_length (int): Maximum sequence length for tokenization
            model_name (str): Name of the pretrained tokenizer to use
        """
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Set padding token
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """
        Clean and normalize text data.

        Args:
            text (str): Input text to clean

        Returns:
            str: Cleaned text
        """
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def tokenize_and_lemmatize(self, text):
        """
        Tokenize and lemmatize text, removing stopwords.

        Args:
            text (str): Input text to process

        Returns:
            list: List of processed tokens
        """
        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token)
                  for token in tokens
                  if token.lower() not in self.stop_words
                  and len(token) > 1]  # Remove single-character tokens

        return tokens

    def create_vocabulary(self, texts, min_freq=5, max_vocab_size=50000):
        """
        Create vocabulary from the dataset.

        Args:
            texts (list): List of text documents
            min_freq (int): Minimum frequency for a token to be included
            max_vocab_size (int): Maximum vocabulary size

        Returns:
            dict: Word to index mapping
        """
        # Count all tokens
        token_counts = Counter()

        for text in tqdm(texts, desc="Building vocabulary"):
            cleaned_text = self.clean_text(text)
            tokens = self.tokenize_and_lemmatize(cleaned_text)
            token_counts.update(tokens)

        # Filter by frequency and vocab size
        filtered_tokens = [token for token, count in token_counts.most_common(max_vocab_size)
                            if count >= min_freq]

        # Create vocabulary
        vocab = {token: idx for idx, token in enumerate(filtered_tokens, start=1)}
        vocab['[PAD]'] = 0
        vocab['[UNK]'] = len(vocab)

        return vocab

    def encode_text(self, text, vocab, max_length=None):
        """
        Encode text using the vocabulary.

        Args:
            text (str): Input text to encode
            vocab (dict): Vocabulary mapping
            max_length (int): Maximum sequence length

        Returns:
            list: Encoded sequence
        """
        if max_length is None:
            max_length = self.max_length

        cleaned_text = self.clean_text(text)
        tokens = self.tokenize_and_lemmatize(cleaned_text)

        # Convert tokens to indices
        encoded = [vocab.get(token, vocab['[UNK]']) for token in tokens]

        # Pad or truncate to max_length
        if len(encoded) < max_length:
            encoded = encoded + [vocab['[PAD]']] * (max_length - len(encoded))
        else:
            encoded = encoded[:max_length]

        return encoded

    def transform_bert(self, text):
        """
        Transform text using BERT tokenizer.

        Args:
            text (str): Input text

        Returns:
            dict: Encoded inputs for BERT
        """
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def prepare_dataset(self, df, vocab=None, min_freq=5, max_vocab_size=50000):
        """
        Prepare dataset for training.

        Args:
            df (pandas.DataFrame): Input DataFrame
            vocab (dict): Optional existing vocabulary
            min_freq (int): Minimum frequency for vocabulary creation
            max_vocab_size (int): Maximum vocabulary size

        Returns:
            tuple: Processed data and vocabulary
        """
        texts = df['text'].tolist()

        # Create or use vocabulary
        if vocab is None:
            vocab = self.create_vocabulary(texts, min_freq, max_vocab_size)

        # Encode all texts
        encoded_texts = []
        for text in tqdm(texts, desc="Encoding texts"):
            encoded = self.encode_text(text, vocab)
            encoded_texts.append(encoded)

        # Convert to numpy array
        encoded_texts = np.array(encoded_texts)

        return encoded_texts, vocab

    def prepare_bert_dataset(self, df, test_size=0.2, random_state=42):
        """
        Prepare dataset for BERT-based models.

        Args:
            df (pandas.DataFrame): Input DataFrame
            test_size (float): Proportion of test set
            random_state (int): Random seed

        Returns:
            tuple: Training and testing datasets
        """
        # Split data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
            stratify=df['main_category'] if 'main_category' in df.columns else None
        )

        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        test_dataset = Dataset.from_pandas(test_df)

        # Tokenize function for mapping
        def tokenize_function(examples):
            model_inputs = self.tokenizer(
                examples['text'],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            # Create labels by shifting input_ids
            labels = model_inputs['input_ids'].clone()
            labels[labels == self.tokenizer.pad_token_id] = -100  # Ignore padding tokens
            model_inputs['labels'] = labels
            return model_inputs

        # Apply tokenization
        train_dataset = train_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )

        test_dataset = test_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=test_dataset.column_names
        )

        return train_dataset, test_dataset

# Load the processed dataset
df = pd.read_parquet('processed_dataset.parquet')

# Initialize preprocessor
preprocessor = TextPreprocessor(max_length=512)

# 1. Basic preprocessing example
print("Preprocessing example:")
sample_text = df['text'].iloc[0]
cleaned_text = preprocessor.clean_text(sample_text)
tokens = preprocessor.tokenize_and_lemmatize(cleaned_text)
print(f"First 10 tokens: {tokens[:10]}")

# 2. Create vocabulary and encode texts
print("\nCreating vocabulary and encoding texts...")
encoded_texts, vocab = preprocessor.prepare_dataset(
    df,  # Using the entire dataset
    min_freq=2,
    max_vocab_size=10000
)
print(f"Vocabulary size: {len(vocab)}")
print(f"Encoded shape: {encoded_texts.shape}")

# 3. Prepare BERT dataset
print("\nPreparing BERT dataset...")
train_dataset, test_dataset = preprocessor.prepare_bert_dataset(
    df  # Using the entire dataset
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Additional check: Print the first few tokenized examples
for i in range(3):
    print(f"Example {i+1}:")
    print(f"Input IDs: {train_dataset[i]['input_ids']}")
    print(f"Attention Mask: {train_dataset[i]['attention_mask']}")
    print(f"Labels: {train_dataset[i]['labels']}")
    print()

# Additional check: Print the first few original texts and their tokenized versions
for i in range(3):
    print(f"Original Text {i+1}:")
    print(df.iloc[i]['text'])
    print(f"Tokenized Input IDs: {train_dataset[i]['input_ids']}")
    print(f"Tokenized Attention Mask: {train_dataset[i]['attention_mask']}")
    print(f"Labels: {train_dataset[i]['labels']}")
    print()

# Prepare datasets dictionary
datasets = DatasetDict({
    'train': train_dataset,
    'eval': test_dataset
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Preprocessing example:
First 10 tokens: ['australian', 'bureau', 'statistic', 'celebrating', 'international', 'year', 'statistic', '2013', 'ab', 'home']

Creating vocabulary and encoding texts...


Building vocabulary:   0%|          | 0/18987 [00:00<?, ?it/s]

Encoding texts:   0%|          | 0/18987 [00:00<?, ?it/s]

Vocabulary size: 10002
Encoded shape: (18987, 512)

Preparing BERT dataset...


Map:   0%|          | 0/15189 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

Train dataset size: 15189
Test dataset size: 3798
Example 1:
Input IDs: [27034, 13, 22162, 5728, 66, 5857, 9502, 837, 399, 13, 39, 13, 837, 910, 3772, 835, 11818, 75, 330, 15032, 1535, 837, 1394, 5149, 661, 764, 3772, 835, 11818, 75, 330, 15032, 1535, 837, 1394, 5149, 661, 837, 531, 9074, 13, 22162, 5728, 66, 5857, 837, 767, 400, 20956, 4675, 837, 9502, 837, 968, 13910, 837, 1110, 2084, 764, 6989, 890, 10726, 773, 328, 395, 295, 760, 1650, 2883, 2060, 9799, 764, 11384, 11234, 595, 24071, 2107, 7646, 395, 5496, 837, 2147, 3947, 4236, 764, 640, 8033, 3947, 2005, 837, 561, 8659, 22121, 1711, 640, 764, 2626, 4202, 3190, 2627, 4939, 1057, 12, 2902, 561, 1577, 1811, 640, 1110, 561, 2245, 1334, 764, 4457, 10927, 14709, 540, 837, 2936, 588, 561, 2270, 3190, 764, 1881, 1110, 2497, 2643, 1545, 6164, 531, 11818, 75, 330, 4193, 33138, 837, 9431, 4745, 540, 9007, 2540, 2263, 3393, 837, 10607, 37196, 1255, 764, 20788, 6596, 826, 717, 837, 12361, 773, 328, 395, 295, 6989, 890, 5000, 12120, 764, 5201,

In [None]:
# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

#  Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  # Log every 10 steps
    evaluation_strategy="epoch",
    fp16=True,  # Enable mixed precision training
    save_total_limit=2,  # Limit the total amount of checkpoints. Delete the older checkpoints.
)

#  Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['eval'],
)

#  Train the model
trainer.train()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,4.5373,4.384528
2,4.1091,4.305603
3,4.0042,4.282674


TrainOutput(global_step=22785, training_loss=4.3938881887268915, metrics={'train_runtime': 4708.7313, 'train_samples_per_second': 9.677, 'train_steps_per_second': 4.839, 'total_flos': 1.1906294022144e+16, 'train_loss': 4.3938881887268915, 'epoch': 3.0})

In [None]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Evaluation results: {'eval_loss': 4.282674312591553, 'eval_runtime': 95.9401, 'eval_samples_per_second': 39.587, 'eval_steps_per_second': 19.794, 'epoch': 3.0}


In [None]:
#  Define the model, tokenizer, and datasets
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
#  Save the model
model.save_pretrained('./nano_gpt_model')
tokenizer.save_pretrained('./nano_gpt_model')

('./nano_gpt_model/tokenizer_config.json',
 './nano_gpt_model/special_tokens_map.json',
 './nano_gpt_model/vocab.json',
 './nano_gpt_model/merges.txt',
 './nano_gpt_model/added_tokens.json')

In [None]:
import torch

In [None]:
#  Set the padding token
tokenizer.pad_token = tokenizer.eos_token

#  Prepare the input text
input_text = "generaete a short 10 stroy ,Once upon a time in a faraway land, there lived a brave knight."

#  Tokenize the input text
inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)

#  Generate predictions with adjusted parameters
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,
        num_return_sequences=1,
        do_sample=True,  # Set do_sample to True
        temperature=0.7,  # Increase temperature for more randomness
        top_k=50,  # Use top-k sampling
        top_p=0.9,  # Use top-p sampling
        repetition_penalty=1.2  # Penalize repetitive tokens
    )

#  Decode the generated tokens to text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)