<a href="https://colab.research.google.com/github/zulesu/probable-giggle/blob/main/copy_of_gcp_auth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Authenticate to GCP

Colab allows you to perform operations on Google Cloud Platform via GCP APIs. You must first authenticate to a project for which you have the appropriate permissions. Learn more about GCP projects [here](https://cloud.google.com/resource-manager/docs/creating-managing-projects).

In [None]:
# Authenticate this Colab runtime against the selected GCP project.
from google.colab import auth
# Project to authenticate to; the trailing @param marker exposes it as a Colab form field.
PROJECT_ID = "gen-lang-client-0361273389" # @param {type: "string"}
auth.authenticate_user(project_id=PROJECT_ID)

In [None]:
# Remove the preinstalled copies so the pinned versions below replace them.
!pip uninstall -y datasets gcsfs bigframes
# Pin fsspec first (the recorded output shows 2024.10.0 being replaced by 2024.9.0).
!pip install fsspec==2024.9.0
# Pin the libraries installed on top of it; presumably chosen to be compatible with the fsspec pin - TODO confirm.
!pip install datasets==3.1.0 gcsfs==2024.9.0 bigframes==1.27.0
# System package for the Cairo Python bindings; no visible cell imports it - TODO confirm it is still needed.
!apt-get install -y python3-cairo


[0mFound existing installation: gcsfs 2024.10.0
Uninstalling gcsfs-2024.10.0:
  Successfully uninstalled gcsfs-2024.10.0
Found existing installation: bigframes 1.29.0
Uninstalling bigframes-1.29.0:
  Successfully uninstalled bigframes-1.29.0
Collecting fsspec==2024.9.0
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
Successfully installed fsspec-2024.9.0
Collecting datasets==3.1.0
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting gcsfs==2024.9.0
  Downloading gcsfs-2024.9.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting bigframes==1.27.0
  Downloading bigframes-1.27.0

In [None]:
import logging
import torch
import wandb
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, IntervalStrategy

# W&B credentials.
# The API key must never be stored in the notebook: a key that was embedded
# here in an earlier revision has to be considered leaked and should be
# revoked in the W&B account settings.  The key is taken from the
# environment when it is already set and is otherwise requested
# interactively without echoing it.
if not os.environ.get('WANDB_API_KEY'):
    import getpass
    os.environ['WANDB_API_KEY'] = getpass.getpass('W&B API key: ')

# Log in to W&B (picks up WANDB_API_KEY from the environment)
wandb.login()

# Start the W&B run under the expected entity and project
wandb.init(project="openAI-gpt2", entity="rudicstipan5-lipilipic", settings=wandb.Settings(init_timeout=120))

# Logger setup
# Without an explicit level the logger inherits WARNING from the root logger
# and every logger.info(...) call below is silently dropped.
logging.basicConfig(level=logging.INFO)  # no-op when the root logger already has handlers
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.info("Starting script execution...")

# Environment variable intended to reduce CUDA memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Create the required directories
# NOTE(review): both paths live under /content/drive, but the drive.mount()
# cell appears later in the notebook - confirm Drive is mounted before this
# cell runs.
output_dir = '/content/drive/My Drive/gpt2'
checkpoint_dir = os.path.join(output_dir, 'results')

for directory in [output_dir, checkpoint_dir]:
    if not os.path.exists(directory):
        # exist_ok avoids a crash if the directory appears between the check and the call
        os.makedirs(directory, exist_ok=True)
        logger.info(f"Created directory: {directory}")


# Find the most recent valid checkpoint
def get_last_valid_checkpoint(directory):
    """Return the path of the latest ``checkpoint-<step>`` directory, or None.

    Only sub-directories of *directory* whose name is ``checkpoint-``
    followed by a decimal step number are considered, and the one with the
    highest step number wins.  File timestamps are deliberately not used:
    they do not reliably reflect training order, and a stray file whose name
    starts with ``checkpoint-`` must not hide a usable checkpoint.

    Args:
        directory: Directory that is scanned for checkpoints.

    Returns:
        The joined path of the latest checkpoint directory, or None when
        *directory* is missing or contains no valid checkpoint.
    """
    logger.info(f"Searching for the last valid checkpoint in the output directory {directory}.")
    if not os.path.isdir(directory):
        logger.error(f"Output directory {directory} does not exist.")
        return None
    prefix = 'checkpoint-'
    named = [entry for entry in os.listdir(directory) if entry.startswith(prefix)]
    if not named:
        logger.info("No checkpoints found.")
        return None
    # Keep only real checkpoint directories, paired with their step number.
    candidates = []
    for entry in named:
        step = entry[len(prefix):]
        path = os.path.join(directory, entry)
        if step.isdecimal() and os.path.isdir(path):
            candidates.append((int(step), path))
    if not candidates:
        logger.info("Valid checkpoint directory not found.")
        return None
    last_checkpoint = max(candidates)[1]
    logger.info(f"Last valid checkpoint: {last_checkpoint}")
    return last_checkpoint

# Load the dataset
dataset = load_dataset('Triangle104/Guilherme34-uncensor')
logger.info("Dataset loaded successfully.")
print(dataset)

# Load the tokenizer and model
model_name = 'gpt2-medium'  # Use the larger model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
logger.info("Tokenizer and model loaded successfully.")
# Reuse EOS as the pad token so that padding='max_length' in tokenize_function has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``messages`` column for causal-LM training.

    Every conversation (a list of dicts with a ``'content'`` key) is
    flattened into one string by joining the message contents with single
    spaces, then padded/truncated to 256 tokens.

    The labels mirror ``input_ids`` except that padded positions are set to
    -100, the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    The pad token of this notebook is the EOS token, so without the mask the
    loss would also be computed on the padding.

    Args:
        examples: A batch (or a single example) as passed by ``dataset.map``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    messages = examples['messages']
    # Guarded peek at the first element so empty inputs cannot raise IndexError.
    first = messages[0] if isinstance(messages, list) and messages else None
    if isinstance(first, list) and first and isinstance(first[0], dict):
        # Batched call: one conversation per example.
        messages = [" ".join(msg['content'] for msg in conversation) for conversation in messages]
    elif isinstance(first, dict):
        # Un-batched call: a single conversation.
        messages = " ".join(msg['content'] for msg in messages)
    elif not isinstance(messages, list):
        messages = [messages]
    tokenized_inputs = tokenizer(
        messages,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_attention_mask=True,
    )

    def mask_padding(ids, attention):
        # Keep real tokens, replace padded positions with the ignore index.
        return [token if keep else -100 for token, keep in zip(ids, attention)]

    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs["attention_mask"]
    if input_ids and isinstance(input_ids[0], list):
        labels = [mask_padding(ids, attention) for ids, attention in zip(input_ids, attention_mask)]
    else:
        # A single string was tokenized, so the encodings are flat lists.
        labels = mask_padding(input_ids, attention_mask)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
logger.info("Tokenization completed.")
# Evaluate only when the dataset provides a 'validation' split.
eval_dataset = tokenized_dataset['validation'] if 'validation' in tokenized_dataset else None
eval_strategy = IntervalStrategy.NO if eval_dataset is None else IntervalStrategy.STEPS

# Check whether a last valid checkpoint exists
# NOTE(review): checkpoints are looked up in output_dir; checkpoint_dir is created earlier in this cell but not used here.
last_checkpoint = get_last_valid_checkpoint(output_dir)
logger.info(f"Last valid checkpoint: {last_checkpoint}")
print(f"Last valid checkpoint: {last_checkpoint}")

# Free cached GPU memory before creating the trainer instance
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Define the training parameters
# NOTE(review): the recorded output of this cell only shows steps up to 200; with
# warmup_steps=500 and save_steps=500 such a run presumably never finishes warmup
# and never writes a checkpoint - confirm these values against the dataset size.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=4,  # Increased batch size
    gradient_accumulation_steps=8,  # Reduced accumulation steps
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    eval_strategy=eval_strategy,  # Use eval_strategy instead of evaluation_strategy
    save_steps=500,
    save_total_limit=2,
    disable_tqdm=False,
    fp16=True,
    learning_rate=2e-5
)


# Subclass of `Trainer`
class CustomTrainer(Trainer):
    """Trainer whose loss is an explicit shifted cross-entropy over the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        """Run the model on *inputs* and return the next-token loss.

        Extra positional and keyword arguments are accepted and ignored.
        Returns ``(loss, outputs)`` when *return_outputs* is true, otherwise
        just the loss.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.get("logits")
        # Position i predicts token i + 1: drop the last score and the first target.
        predicted = scores[..., :-1, :].contiguous()
        expected = targets[..., 1:].contiguous()
        vocab_size = predicted.size(-1)
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(predicted.view(-1, vocab_size), expected.view(-1))
        if return_outputs:
            return loss, outputs
        return loss


# Save the trained model
def save_pretrained_model(model, tokenizer, directory):
    """Save the model, the tokenizer and a plain-text vocabulary to *directory*.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)`` and
            ``get_vocab()`` (a token -> id mapping).
        directory: Target directory; created when missing.

    Besides the files written by the two ``save_pretrained`` calls, a
    ``vocab.txt`` with one token per line is written, ordered by token id.
    """
    # exist_ok keeps the call safe when the directory already exists.
    os.makedirs(directory, exist_ok=True)
    model.save_pretrained(directory)
    tokenizer.save_pretrained(directory)
    # The ids themselves are not written, so order the tokens by id: the
    # line order is then deterministic and follows the ids instead of
    # whatever order the mapping happens to iterate in.
    vocab = tokenizer.get_vocab()
    with open(os.path.join(directory, 'vocab.txt'), 'w', encoding='utf-8') as vocab_file:
        vocab_file.writelines(f"{token}\n" for token in sorted(vocab, key=vocab.get))
    logger.info(f"Model, tokenizer, and vocabulary saved to {directory}.")


# Build the CustomTrainer instance
trainer = CustomTrainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=eval_dataset)

# Train from scratch unless a last valid checkpoint was found, in which case resume from it
if not last_checkpoint:
    logger.info("Starting training from scratch.")
    trainer.train()
else:
    logger.info("Resuming training from last valid checkpoint.")
    trainer.train(resume_from_checkpoint=last_checkpoint)


# Save the model
logger.info("Saving the trained model...")
save_pretrained_model(model, tokenizer, output_dir)
model_path = os.path.join(output_dir, 'pytorch_model.bin')
torch.save(model.state_dict(), model_path)

# Verify that the weights file was written
if not os.path.isfile(model_path):
    logger.error(f"Model file not found at {model_path}")
else:
    logger.info(f"Model saved successfully at {model_path}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrudicstipan5[0m ([33mrudicstipan5-lipilipic[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


exported (1).jsonl:   0%|          | 0.00/3.17M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/935 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['messages', 'tools', 'split'],
        num_rows: 935
    })
})


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/935 [00:00<?, ? examples/s]

Last valid checkpoint: None




Step,Training Loss
20,3.5276
40,3.2998
60,2.848
80,2.1325
100,1.0205
120,0.1456
140,0.0068
160,0.0012
180,0.0007
200,0.0006


In [None]:
# Mount Google Drive; the training cells read and write under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import logging
import torch
import wandb
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, IntervalStrategy

# W&B credentials.
# The API key must never be stored in the notebook: a key that was embedded
# here in an earlier revision has to be considered leaked and should be
# revoked in the W&B account settings.  The key is taken from the
# environment when it is already set and is otherwise requested
# interactively without echoing it.
if not os.environ.get('WANDB_API_KEY'):
    import getpass
    os.environ['WANDB_API_KEY'] = getpass.getpass('W&B API key: ')

# Log in to W&B (picks up WANDB_API_KEY from the environment)
wandb.login()

# Start the W&B run under the expected entity and project
wandb.init(project="openAI-gpt2", entity="rudicstipan5-lipilipic", settings=wandb.Settings(init_timeout=120))

# Logger setup
# Without an explicit level the logger inherits WARNING from the root logger
# and every logger.info(...) call below is silently dropped.
logging.basicConfig(level=logging.INFO)  # no-op when the root logger already has handlers
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.info("Starting script execution...")

# Environment variable intended to reduce CUDA memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Create the required directories
output_dir = '/content/drive/My Drive/gpt2-unc'
new_model_output_dir = '/content/drive/My Drive/gpt2-unc-nsfw-unc'
# os.path.join discards every component that precedes an absolute one, so the
# former join(output_dir, new_model_output_dir, 'results') already resolved to
# this path; it is now spelled out.
# NOTE(review): confirm whether 'results' was meant to live under output_dir instead.
checkpoint_dir = os.path.join(new_model_output_dir, 'results')

for directory in [output_dir, checkpoint_dir]:
    if not os.path.exists(directory):
        # exist_ok avoids a crash if the directory appears between the check and the call
        os.makedirs(directory, exist_ok=True)
        logger.info(f"Created directory: {directory}")

# Find the most recent valid checkpoint
def get_last_valid_checkpoint(directory):
    """Return the path of the latest ``checkpoint-<step>`` directory, or None.

    Only sub-directories of *directory* whose name is ``checkpoint-``
    followed by a decimal step number are considered, and the one with the
    highest step number wins.  File timestamps are deliberately not used:
    they do not reliably reflect training order, and a stray file whose name
    starts with ``checkpoint-`` must not hide a usable checkpoint.

    Args:
        directory: Directory that is scanned for checkpoints.

    Returns:
        The joined path of the latest checkpoint directory, or None when
        *directory* is missing or contains no valid checkpoint.
    """
    logger.info(f"Searching for the last valid checkpoint in the output directory {directory}.")
    if not os.path.isdir(directory):
        logger.error(f"Output directory {directory} does not exist.")
        return None
    prefix = 'checkpoint-'
    named = [entry for entry in os.listdir(directory) if entry.startswith(prefix)]
    if not named:
        logger.info("No checkpoints found.")
        return None
    # Keep only real checkpoint directories, paired with their step number.
    candidates = []
    for entry in named:
        step = entry[len(prefix):]
        path = os.path.join(directory, entry)
        if step.isdecimal() and os.path.isdir(path):
            candidates.append((int(step), path))
    if not candidates:
        logger.info("Valid checkpoint directory not found.")
        return None
    last_checkpoint = max(candidates)[1]
    logger.info(f"Last valid checkpoint: {last_checkpoint}")
    return last_checkpoint

# Load the dataset
dataset = load_dataset('ResplendentAI/Luna_NSFW_Text')
logger.info("Dataset loaded successfully.")

# Load the tokenizer and model from the local directory
# NOTE(review): no visible cell writes to this directory (the training cells save to other folders) - confirm the path.
model_directory = '/content/drive/My Drive/gpt2-UNC-NSFW'
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModelForCausalLM.from_pretrained(model_directory)
logger.info("Tokenizer and model loaded from local directory successfully.")
# Reuse EOS as the pad token so that padding='max_length' in tokenize_function has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` column for causal-LM training.

    The text is padded/truncated to 512 tokens.  The labels mirror
    ``input_ids`` except that padded positions are set to -100, the default
    ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.  The pad token of
    this notebook is the EOS token, so without the mask the loss would also
    be computed on the padding.

    Args:
        examples: A batch (or a single example) as passed by ``dataset.map``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    inputs = examples['text']  # Assumes the text column is named 'text'
    tokenized_inputs = tokenizer(
        inputs,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )

    def mask_padding(ids, attention):
        # Keep real tokens, replace padded positions with the ignore index.
        return [token if keep else -100 for token, keep in zip(ids, attention)]

    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs["attention_mask"]
    if input_ids and isinstance(input_ids[0], list):
        labels = [mask_padding(ids, attention) for ids, attention in zip(input_ids, attention_mask)]
    else:
        # A single string was tokenized, so the encodings are flat lists.
        labels = mask_padding(input_ids, attention_mask)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
logger.info("Tokenization completed.")
# Evaluate only when the dataset provides a 'validation' split.
eval_dataset = tokenized_dataset['validation'] if 'validation' in tokenized_dataset else None
eval_strategy = IntervalStrategy.NO if eval_dataset is None else IntervalStrategy.STEPS

# Check whether a last valid checkpoint exists
# NOTE(review): checkpoints are looked up in output_dir; checkpoint_dir is created earlier in this cell but not used here.
last_checkpoint = get_last_valid_checkpoint(output_dir)
logger.info(f"Last valid checkpoint: {last_checkpoint}")
print(f"Last valid checkpoint: {last_checkpoint}")

# Free cached GPU memory before creating the trainer instance
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Define the training parameters
# NOTE(review): with save_steps=500 no checkpoint is written unless training runs
# for at least 500 steps, and warmup_steps=500 keeps the run in warmup until then -
# confirm both against the dataset size.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    per_device_train_batch_size=4,  # Increased batch size
    per_device_eval_batch_size=4,  # Increased batch size
    gradient_accumulation_steps=8,  # Reduced accumulation steps
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    eval_strategy=eval_strategy,  # Use eval_strategy
    save_steps=500,
    save_total_limit=2,
    disable_tqdm=False,
    fp16=True,
    learning_rate=2e-5
)


# Subclass of `Trainer`
class CustomTrainer(Trainer):
    """Trainer whose loss is an explicit shifted cross-entropy over the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        """Run the model on *inputs* and return the next-token loss.

        Extra positional and keyword arguments are accepted and ignored.
        Returns ``(loss, outputs)`` when *return_outputs* is true, otherwise
        just the loss.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.get("logits")
        # Position i predicts token i + 1: drop the last score and the first target.
        predicted = scores[..., :-1, :].contiguous()
        expected = targets[..., 1:].contiguous()
        vocab_size = predicted.size(-1)
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(predicted.view(-1, vocab_size), expected.view(-1))
        if return_outputs:
            return loss, outputs
        return loss


# Save the trained model
def save_pretrained_model(model, tokenizer, directory):
    """Save the model, the tokenizer and a plain-text vocabulary to *directory*.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)`` and
            ``get_vocab()`` (a token -> id mapping).
        directory: Target directory; created when missing.

    Besides the files written by the two ``save_pretrained`` calls, a
    ``vocab.txt`` with one token per line is written, ordered by token id.
    """
    # exist_ok keeps the call safe when the directory already exists.
    os.makedirs(directory, exist_ok=True)
    model.save_pretrained(directory)
    tokenizer.save_pretrained(directory)
    # The ids themselves are not written, so order the tokens by id: the
    # line order is then deterministic and follows the ids instead of
    # whatever order the mapping happens to iterate in.
    vocab = tokenizer.get_vocab()
    with open(os.path.join(directory, 'vocab.txt'), 'w', encoding='utf-8') as vocab_file:
        vocab_file.writelines(f"{token}\n" for token in sorted(vocab, key=vocab.get))
    logger.info(f"Model, tokenizer, and vocabulary saved to {directory}.")


# Build the CustomTrainer instance
trainer = CustomTrainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=eval_dataset)

# Train from scratch unless a last valid checkpoint was found, in which case resume from it
if not last_checkpoint:
    logger.info("Starting training from scratch.")
    trainer.train()
else:
    logger.info("Resuming training from last valid checkpoint.")
    trainer.train(resume_from_checkpoint=last_checkpoint)


# Save the model
logger.info("Saving the trained model...")
save_pretrained_model(model, tokenizer, output_dir)
model_path = os.path.join(output_dir, 'pytorch_model.bin')
torch.save(model.state_dict(), model_path)

# Verify that the weights file was written
if not os.path.isfile(model_path):
    logger.error(f"Model file not found at {model_path}")
else:
    logger.info(f"Model saved successfully at {model_path}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrudicstipan5[0m ([33mrudicstipan5-lipilipic[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Last valid checkpoint: None




Step,Training Loss
20,0.0343
40,0.0289
60,0.0313
80,0.0319
100,0.0326
120,0.0331
140,0.0276
160,0.0303
180,0.0333
200,0.0317


In [None]:
import logging
import torch
import wandb
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, IntervalStrategy

# W&B credentials.
# The API key must never be stored in the notebook: a key that was embedded
# here in an earlier revision has to be considered leaked and should be
# revoked in the W&B account settings.  The key is taken from the
# environment when it is already set and is otherwise requested
# interactively without echoing it.
if not os.environ.get('WANDB_API_KEY'):
    import getpass
    os.environ['WANDB_API_KEY'] = getpass.getpass('W&B API key: ')

# Log in to W&B (picks up WANDB_API_KEY from the environment)
wandb.login()

# Start the W&B run under the expected entity and project
wandb.init(project="openAI-gpt2", entity="rudicstipan5-lipilipic", settings=wandb.Settings(init_timeout=120))

# Logger setup
# Without an explicit level the logger inherits WARNING from the root logger
# and every logger.info(...) call below is silently dropped.
logging.basicConfig(level=logging.INFO)  # no-op when the root logger already has handlers
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.info("Starting script execution...")

# Environment variable intended to reduce CUDA memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Create the required directories
output_dir = '/content/drive/My Drive/gpt2-xxxSR'
new_model_output_dir = '/content/drive/My Drive/gpt2-unc-nsfw-unc'
# os.path.join discards every component that precedes an absolute one, so the
# former join(output_dir, new_model_output_dir, 'results') already resolved to
# this path; it is now spelled out.
# NOTE(review): confirm whether 'results' was meant to live under output_dir instead.
checkpoint_dir = os.path.join(new_model_output_dir, 'results')

for directory in [output_dir, checkpoint_dir]:
    if not os.path.exists(directory):
        # exist_ok avoids a crash if the directory appears between the check and the call
        os.makedirs(directory, exist_ok=True)
        logger.info(f"Created directory: {directory}")

# Find the most recent valid checkpoint
def get_last_valid_checkpoint(directory):
    """Return the path of the latest ``checkpoint-<step>`` directory, or None.

    Only sub-directories of *directory* whose name is ``checkpoint-``
    followed by a decimal step number are considered, and the one with the
    highest step number wins.  File timestamps are deliberately not used:
    they do not reliably reflect training order, and a stray file whose name
    starts with ``checkpoint-`` must not hide a usable checkpoint.

    Args:
        directory: Directory that is scanned for checkpoints.

    Returns:
        The joined path of the latest checkpoint directory, or None when
        *directory* is missing or contains no valid checkpoint.
    """
    logger.info(f"Searching for the last valid checkpoint in the output directory {directory}.")
    if not os.path.isdir(directory):
        logger.error(f"Output directory {directory} does not exist.")
        return None
    prefix = 'checkpoint-'
    named = [entry for entry in os.listdir(directory) if entry.startswith(prefix)]
    if not named:
        logger.info("No checkpoints found.")
        return None
    # Keep only real checkpoint directories, paired with their step number.
    candidates = []
    for entry in named:
        step = entry[len(prefix):]
        path = os.path.join(directory, entry)
        if step.isdecimal() and os.path.isdir(path):
            candidates.append((int(step), path))
    if not candidates:
        logger.info("Valid checkpoint directory not found.")
        return None
    last_checkpoint = max(candidates)[1]
    logger.info(f"Last valid checkpoint: {last_checkpoint}")
    return last_checkpoint

# Load the dataset
dataset = load_dataset("saillab/alpaca-croatian-cleaned")
logger.info("Dataset loaded successfully.")
logger.info(f"Dataset columns: {dataset['train'].column_names}")

# Load the previously trained model and tokenizer from the directory
# NOTE(review): no visible cell writes to this directory (the training cells save to other folders) - confirm the path.
model_directory = '/content/drive/My Drive/gpt2-UNC-NSFW'
# `device` was used below without being defined in any visible cell, which
# raises NameError on a fresh runtime; define it explicitly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModelForCausalLM.from_pretrained(model_directory).to(device)
logger.info("Tokenizer and model loaded from pretrained model successfully.")
# Reuse EOS as the pad token so that padding='max_length' in tokenize_function has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


# Tokenization function
def tokenize_function(examples):
    """Tokenize ``instruction`` + ``input`` pairs for causal-LM training.

    Each pair is joined with a single space and stripped; a missing (None)
    field is treated as an empty string instead of raising TypeError.  The
    text is padded/truncated to 512 tokens.

    The labels mirror ``input_ids`` except that padded positions are set to
    -100, the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    The pad token of this notebook is the EOS token, so without the mask the
    loss would also be computed on the padding.

    NOTE(review): only 'instruction' and 'input' are used; if the dataset
    also carries a response column it is not part of the training text -
    confirm this is intended.

    Args:
        examples: A batch as passed by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    inputs = [
        ((instruction or "") + " " + (input_text or "")).strip()
        for instruction, input_text in zip(examples['instruction'], examples['input'])
    ]
    tokenized_inputs = tokenizer(
        inputs,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )
    # Keep real tokens, replace padded positions with the ignore index.
    tokenized_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, attention)]
        for ids, attention in zip(tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])
    ]
    return tokenized_inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
logger.info("Tokenization completed.")
# Evaluate only when the dataset provides a 'validation' split.
eval_dataset = tokenized_dataset['validation'] if 'validation' in tokenized_dataset else None
eval_strategy = IntervalStrategy.NO if eval_dataset is None else IntervalStrategy.STEPS

# Check whether a last valid checkpoint exists
# NOTE(review): checkpoints are looked up in output_dir; checkpoint_dir is created earlier in this cell but not used here.
last_checkpoint = get_last_valid_checkpoint(output_dir)
logger.info(f"Last valid checkpoint: {last_checkpoint}")
print(f"Last valid checkpoint: {last_checkpoint}")

# Free cached GPU memory before creating the trainer instance
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Define the training parameters
# NOTE(review): with save_steps=500 no checkpoint is written unless training runs
# for at least 500 steps, and warmup_steps=500 keeps the run in warmup until then -
# confirm both against the dataset size.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=4,  # Increased batch size
    gradient_accumulation_steps=8,  # Reduced accumulation steps
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    eval_strategy=eval_strategy,  # Use eval_strategy
    save_steps=500,
    save_total_limit=2,
    disable_tqdm=False,
    fp16=True,
    learning_rate=2e-5
)


# Subclass of `Trainer`
class CustomTrainer(Trainer):
    """Trainer whose loss is an explicit shifted cross-entropy over the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        """Run the model on *inputs* and return the next-token loss.

        Extra positional and keyword arguments are accepted and ignored.
        Returns ``(loss, outputs)`` when *return_outputs* is true, otherwise
        just the loss.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.get("logits")
        # Position i predicts token i + 1: drop the last score and the first target.
        predicted = scores[..., :-1, :].contiguous()
        expected = targets[..., 1:].contiguous()
        vocab_size = predicted.size(-1)
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(predicted.view(-1, vocab_size), expected.view(-1))
        if return_outputs:
            return loss, outputs
        return loss


# Save the trained model
def save_pretrained_model(model, tokenizer, directory):
    """Save the model, the tokenizer and a plain-text vocabulary to *directory*.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)`` and
            ``get_vocab()`` (a token -> id mapping).
        directory: Target directory; created when missing.

    Besides the files written by the two ``save_pretrained`` calls, a
    ``vocab.txt`` with one token per line is written, ordered by token id.
    """
    # exist_ok keeps the call safe when the directory already exists.
    os.makedirs(directory, exist_ok=True)
    model.save_pretrained(directory)
    tokenizer.save_pretrained(directory)
    # The ids themselves are not written, so order the tokens by id: the
    # line order is then deterministic and follows the ids instead of
    # whatever order the mapping happens to iterate in.
    vocab = tokenizer.get_vocab()
    with open(os.path.join(directory, 'vocab.txt'), 'w', encoding='utf-8') as vocab_file:
        vocab_file.writelines(f"{token}\n" for token in sorted(vocab, key=vocab.get))
    logger.info(f"Model, tokenizer, and vocabulary saved to {directory}.")


# Build the CustomTrainer instance
trainer = CustomTrainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=eval_dataset)

# Train from scratch unless a last valid checkpoint was found, in which case resume from it
if not last_checkpoint:
    logger.info("Starting training from scratch.")
    trainer.train()
else:
    logger.info("Resuming training from last valid checkpoint.")
    trainer.train(resume_from_checkpoint=last_checkpoint)


# Save the model
logger.info("Saving the trained model...")
save_pretrained_model(model, tokenizer, output_dir)
model_path = os.path.join(output_dir, 'pytorch_model.bin')
torch.save(model.state_dict(), model_path)

# Verify that the weights file was written
if not os.path.isfile(model_path):
    logger.error(f"Model file not found at {model_path}")
else:
    logger.info(f"Model saved successfully at {model_path}")