In [1]:
import sys
import os

# Правильный путь с учетом вложенности
correct_path = "/home/user/projects/FedCore/FedCore"
sys.path.insert(0, correct_path)

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "arnir0/Tiny-LLM"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,  use_fast=False)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print(model.)

None


In [3]:
from fedcore.data.data import CompressionInputData
import inspect
print("CompressionInputData parameters:")
sig = inspect.signature(CompressionInputData.__init__)
for param_name, param in sig.parameters.items():
    print(f"  {param_name}: {param.annotation} = {param.default}")

  import pkg_resources


CompressionInputData parameters:
  self: <class 'inspect._empty'> = <class 'inspect._empty'>
  features: <class 'numpy.ndarray'> = [[0. 0.]
 [0. 0.]]
  target: typing.Optional[numpy.ndarray] = None
  train_dataloader: <class 'torch.utils.data.dataloader.DataLoader'> = None
  val_dataloader: <class 'torch.utils.data.dataloader.DataLoader'> = None
  test_dataloader: <class 'torch.utils.data.dataloader.DataLoader'> = None
  task: <class 'fedot.core.repository.tasks.Task'> = Task(task_type=<TaskTypesEnum.classification: 'classification'>, task_params=None)
  num_classes: <class 'int'> = None
  input_dim: <class 'int'> = None
  supplementary_data: <class 'fedot.core.data.supplementary_data.SupplementaryData'> = <factory>


In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from datasets import load_dataset
import torch
# from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np

In [None]:
PATH_TO_MODEL = "./tiny-llm-model"
if os.path.exists(PATH_TO_MODEL) and os.listdir(PATH_TO_MODEL):
    print(f"Загружаем модель из локальной директории: {PATH_TO_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(PATH_TO_MODEL, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(PATH_TO_MODEL)
else:
    print("Загружаем модель из Hugging Face Hub...")
    MODEL_NAME = "arnir0/Tiny-LLM"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    
    print(f"Сохраняем модель в локальную директорию: {PATH_TO_MODEL}")
    tokenizer.save_pretrained(PATH_TO_MODEL)
    model.save_pretrained(PATH_TO_MODEL)

Загружаем модель из Hugging Face Hub...
Сохраняем модель в локальную директорию: ./tiny-llm-model


In [3]:
model_name = 'arnir0/Tiny-LLM'
dataset_name = "imdb"
# output_dir = "./qwen2-0.5b-imdb-finetuned"
output_dir = './tiny-llm'
max_length = 512  # Maximum context length for each sample

# Use 4-bit quantization to drastically reduce memory usage
use_4bit = False
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# LoRA configuration for Parameter-Efficient Fine-Tuning
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Training arguments
num_train_epochs = 3
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
learning_rate = 2e-4
logging_steps = 10
save_steps = 500


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
# Set padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model with 4-bit quantization if enabled
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
if use_4bit:
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # Automatically places layers on available GPUs
        trust_remote_code=True
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu", #auto
        torch_dtype=torch.float32,
        trust_remote_code=True
    )

In [None]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1000]")

print(f"Dataset size: {len(dataset)}")
print(f"Dataset features: {dataset.features}")

def preprocess_dataset(examples):
    """Extract text from different dataset formats"""
    if 'text' in examples:
        return {"text": examples["text"]}
    elif 'article' in examples:  # CNN Daily Mail
        return {"text": examples["article"]}
    elif 'content' in examples:  # Some datasets
        return {"text": examples["content"]}
    elif 'sentence' in examples:  # Some sentence datasets
        return {"text": examples["sentence"]}
    else:
        # Try to use the first string column
        for key, value in examples.items():
            if isinstance(value[0], str):
                return {"text": examples[key]}
        return {"text": [str(x) for x in examples[list(examples.keys())[0]]]}

# Apply preprocessing
dataset = dataset.map(preprocess_dataset, batched=True)

# Filter out empty texts
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=128,  # Reduced for tiny model
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

Dataset size: 1000
Dataset features: {'text': Value('string')}
Training samples: 517
Validation samples: 130


In [15]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1000]")

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We are doing causal LM, not masked LM
)

# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Shift labels and predictions for causal LM (next token prediction)
    # Predictions are for the next token, so we shift labels accordingly
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    
    # Flatten the tokens and get predictions
    predictions = np.argmax(shift_logits, axis=-1).flatten()
    labels = shift_labels.flatten()
    
    # Calculate accuracy, ignoring padding tokens (where label = -100)
    mask = labels != -100
    predictions = predictions[mask]
    labels = labels[mask]
    return accuracy_metric.compute(predictions=predictions, references=labels)


In [9]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    # logging_steps=logging_steps,
    logging_steps=5,
    save_steps=save_steps,
    eval_strategy="steps",
    eval_steps=5,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=None,  # Disable external logging like Weights & Biases for simplicity
    fp16=False,  # Use mixed precision training
)