In [1]:
pip install transformers



In [2]:
pip install einops



In [3]:
pip install torch



In [4]:
import transformers

In [5]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import torch

In [9]:
def generate(prompt, max_length=80):
    model_name = "Open-Orca/oo-phi-1_5"
    tokenizer_name = "Open-Orca/oo-phi-1_5"

    # Load the model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Prepare the input tokens
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(device)
    inputs.pop('attention_mask', None)  # Ensure this line is necessary as discussed

    # Generate text with a max_length limit
    outputs = model.generate(**inputs, max_length=max_length, temperature=0.7, top_p=0.9)
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Debugging output
    #print("Full text:", full_text)

    # Adjusting post-processing to correctly extract the desired output
    # Find the position where the prompt ends in the generated text, taking into account it might include a newline
    first_newline_pos = full_text.find('\n')
    subsequent_newline_pos = full_text.find('\n', first_newline_pos + 1)

    if subsequent_newline_pos != -1:
        # Extract the text between the first and the second newline
        return full_text[first_newline_pos + 1:subsequent_newline_pos].strip()
    elif first_newline_pos != -1:
        # If there's only one newline, return whatever follows it
        return full_text[first_newline_pos + 1:].strip()

    return ""  # Return an empty string if no newlines are found

In [83]:
prompt = '''print(\"Hello\")'''
ans = generate(prompt)
print(ans)

OutOfMemoryError: CUDA out of memory. Tried to allocate 394.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 50.81 MiB is free. Process 38521 has 39.51 GiB memory in use. Of the allocated memory 38.74 GiB is allocated by PyTorch, and 267.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [12]:
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

class KotlinCompletionDataset(Dataset):
    def __init__(self, tokenizer, filename, block_size=512):
        self.tokenizer = tokenizer
        self.block_size = block_size

        # Load the dataset
        df_subsample = pd.read_csv('random_sample.csv')  # Only use a subsample of 500 rows for quick testing

        # Prepare the dataset
        self.examples = []
        for _, row in df_subsample.iterrows():
            context_encodings = tokenizer(row['Context'], truncation=True, max_length=block_size, return_tensors="pt")
            completion_encodings = tokenizer(row['Completion'], truncation=True, max_length=block_size, return_tensors="pt")

            # Combine context and completion for model input
            input_ids = torch.cat([context_encodings.input_ids, completion_encodings.input_ids[:, 1:]], dim=-1)
            self.examples.append(input_ids.squeeze())

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("Open-Orca/oo-phi-1_5")
dataset = KotlinCompletionDataset(tokenizer, 'path_to_your_dataset.csv')
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


In [13]:
from transformers import AutoModelForCausalLM, AdamW
import torch

def train(model, dataloader, device):
    model.train()
    optimizer = AdamW(model.parameters(), lr=5e-5)
    model.to(device)  # Ensure the model is on the correct device

    for epoch in range(3):  # Number of epochs
        total_loss = 0
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)  # Ensure inputs are on the correct device
            labels = batch['labels'].to(device)  # Ensure labels are on the correct device

            # Forward pass
            outputs = model(inputs, labels=labels)
            loss = outputs.loss

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch}: Loss {total_loss / len(dataloader)}")

# Loading the model and tokenizer
model_name = "Open-Orca/oo-phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Preparing the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Assuming your DataLoader is defined somewhere here
# dataloader = DataLoader(your_dataset, batch_size=4, shuffle=True)

# Start training
train(model, dataloader, device)

Using device: cuda




RuntimeError: stack expects each tensor to be equal size, but got [525] at entry 0 and [42] at entry 1

In [None]:
# NEW TRY FROM HERE

In [9]:
##############################

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class KotlinCompletionDataset(Dataset):
    def __init__(self, tokenizer, filename, block_size=512):
        self.tokenizer = tokenizer
        self.examples = []

        df = pd.read_csv(filename)

        for _, row in df.iterrows():
            context_enc = tokenizer.encode(row['Context'], truncation=True, max_length=block_size - 1, add_special_tokens=False)
            completion_enc = tokenizer.encode(row['Completion'], truncation=True, max_length=block_size - len(context_enc), add_special_tokens=False)
            input_ids = context_enc + completion_enc + [tokenizer.eos_token_id]
            self.examples.append(torch.tensor(input_ids, dtype=torch.long))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

def collate_batch(batch):
    input_ids = pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = input_ids != tokenizer.pad_token_id
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Initialize your dataset and DataLoader
model_name = "Open-Orca/oo-phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

dataset = KotlinCompletionDataset(tokenizer, 'random_sample.csv')
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_batch)


In [10]:
def train(model, dataloader, device):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)  # Using PyTorch's native AdamW
    model.to(device)

    for epoch in range(15):
        total_loss = 0
        for batch in dataloader:
            inputs, masks = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            labels = inputs.clone()

            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch}: Loss {total_loss / len(dataloader)}")


In [11]:
model_name = "Open-Orca/oo-phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Preparing the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Assuming your DataLoader is defined somewhere here
# dataloader = DataLoader(your_dataset, batch_size=4, shuffle=True)

# Start training
train(model, dataloader, device)
model.save_pretrained('/content')
tokenizer.save_pretrained('/content')

#######################

Using device: cuda
Epoch 0: Loss 1.4961587287350135
Epoch 1: Loss 1.0124666029079394
Epoch 2: Loss 0.7299404953284697
Epoch 3: Loss 0.5017266905714165
Epoch 4: Loss 0.31798347708176483
Epoch 5: Loss 0.19077877268533816
Epoch 6: Loss 0.11699311290274966
Epoch 7: Loss 0.0818746677743779
Epoch 8: Loss 0.06469286549074406
Epoch 9: Loss 0.05881212241227993
Epoch 10: Loss 0.059038990735584364
Epoch 11: Loss 0.058267747780562124
Epoch 12: Loss 0.06541517563418231
Epoch 13: Loss 0.06650809072550724
Epoch 14: Loss 0.068204530748454


('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.json',
 '/content/merges.txt',
 '/content/added_tokens.json',
 '/content/tokenizer.json')

In [22]:
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained('/content')
# Now load the model with this configuration
#model = AutoModelForCausalLM.from_pretrained('/content', config=config)
#tokenizer = AutoTokenizer.from_pretrained('/content')
model = AutoModelForCausalLM.from_pretrained('/content', config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('/content', trust_remote_code=True)

# Example function to generate text
def generate_text(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Use the function
prompt_text = "Your example input text here"
print(generate_text(prompt_text))

The repository for /content contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//content.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ValueError: The following `model_kwargs` are not used by the model: ['attention_mask'] (note: typos in the generate arguments will also show up in this list)

In [46]:
### ANOTHER TRY

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Initialize the configuration and model with trust for remote code
config = AutoConfig.from_pretrained('/content', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained('/content', config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('/content', trust_remote_code=True)

# Define a function to generate text
def generate_text(prompt, max_length=80):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    max_length = len(prompt) * 4 + 50
    with torch.no_grad():
        # Explicitly passing only input_ids to the generate method
        outputs = model.generate(input_ids=inputs['input_ids'], max_length=max_length)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    full_text = generated_text
    first_newline_pos = full_text.find('\n')
    subsequent_newline_pos = full_text.find('\n', first_newline_pos + 1)

    if subsequent_newline_pos != -1:
        # Extract the text between the first and the second newline
        return full_text[first_newline_pos + 1:subsequent_newline_pos].strip()
    elif first_newline_pos != -1:
        # If there's only one newline, return whatever follows it
        return full_text[first_newline_pos + 1:].strip()

    return ""


# Example usage
prompt_text = "def multiply(a, b):"
print(generate_text(prompt_text))
# ANOTHER TRY

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


return a * b


In [49]:
test_kotlin = pd.read_csv('test_sample_500.csv')
test_kotlin['Context_Length'] = test_kotlin['Context'].apply(len)
test_kotlin = test_kotlin[test_kotlin['Context_Length'] < 6000]
test_kotlin.drop(columns=['Context_Length'], inplace=True)

true_strings = test_kotlin['Completion']
contextual_list = test_kotlin['Context'].to_list()
predicted_strings = []
for each in contextual_list[:10]:
  answer = generate_text(each)
  predicted_strings.append(answer)

AssertionError: 

In [48]:
true_ids = [tokenizer.encode(s, add_special_tokens=False) for s in true_strings]
predicted_ids = [tokenizer.encode(s, add_special_tokens=False) for s in predicted_strings]

# Convert lists of ids to tensors
true_ids_tensor = torch.nn.utils.rnn.pad_sequence([torch.tensor(s) for s in true_ids], batch_first=True, padding_value=tokenizer.pad_token_id)
predicted_ids_tensor = torch.nn.utils.rnn.pad_sequence([torch.tensor(s) for s in predicted_ids], batch_first=True, padding_value=tokenizer.pad_token_id)

# Assume model can process these directly (in practice, model should output probabilities for each token)
with torch.no_grad():
    # Getting logits from the model (for simplicity, assume we get them directly matching true_ids_tensor size)
    logits = model(predicted_ids_tensor).logits

# Assuming a simplistic scenario where the true strings are the labels for the predicted strings
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # Ignore padding for loss calculation
loss = loss_fn(logits.transpose(1, 2), true_ids_tensor)  # logits need to be [batch, classes, seq_len], labels [batch, seq_len]

print("Cross-Entropy Loss:", loss.item())

RuntimeError: received an empty list of sequences

In [27]:
prompt_text = '''ef sum(a, b):
    return a + b

def multiply(a, b):
    return a * b

operations = [sum, multiply]

for operation in operations:'''
print(generate_text(prompt_text))

ef sum(a, b):
    return a + b

def multiply(a, b):
    return a * b

operations = [sum, multiply]

for operation in operations:
    result = operation


In [13]:
a = 5

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model with trust_remote_code to allow custom configurations
model_path = "/content"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
except ValueError as e:
    print(f"Failed to load with Auto classes: {e}")

# Check if custom classes/methods are needed to load your model and tokenizer properly


Failed to load with Auto classes: Unrecognized configuration class <class 'transformers_modules.Open-Orca.oo-phi-1_5.2eb05ed7c2f3cdccf2873dbd117ffad4d15d572a.configuration_mixformer_sequential.MixFormerSequentialConfig'> to build an AutoTokenizer.
Model type should be one of AlbertConfig, AlignConfig, BarkConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BlipConfig, Blip2Config, BloomConfig, BridgeTowerConfig, BrosConfig, CamembertConfig, CanineConfig, ChineseCLIPConfig, ClapConfig, CLIPConfig, CLIPSegConfig, ClvpConfig, LlamaConfig, CodeGenConfig, CohereConfig, ConvBertConfig, CpmAntConfig, CTRLConfig, Data2VecAudioConfig, Data2VecTextConfig, DbrxConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, DPRConfig, ElectraConfig, ErnieConfig, ErnieMConfig, EsmConfig, FalconConfig, FastSpeech2ConformerConfig, FlaubertConfig, FNetConfig, FSMTConfig, FunnelConfig, GemmaConfig, GitConfig, GPT2Co

In [16]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path where your fine-tuned model files are stored
model_path = "/content"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Example function to generate text
def generate_text(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Use the function
prompt_text = "Your example input text here"
print(generate_text(prompt_text))


The repository for /content contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//content.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


ValueError: Unrecognized configuration class <class 'transformers_modules.Open-Orca.oo-phi-1_5.2eb05ed7c2f3cdccf2873dbd117ffad4d15d572a.configuration_mixformer_sequential.MixFormerSequentialConfig'> to build an AutoTokenizer.
Model type should be one of AlbertConfig, AlignConfig, BarkConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BlipConfig, Blip2Config, BloomConfig, BridgeTowerConfig, BrosConfig, CamembertConfig, CanineConfig, ChineseCLIPConfig, ClapConfig, CLIPConfig, CLIPSegConfig, ClvpConfig, LlamaConfig, CodeGenConfig, CohereConfig, ConvBertConfig, CpmAntConfig, CTRLConfig, Data2VecAudioConfig, Data2VecTextConfig, DbrxConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, DPRConfig, ElectraConfig, ErnieConfig, ErnieMConfig, EsmConfig, FalconConfig, FastSpeech2ConformerConfig, FlaubertConfig, FNetConfig, FSMTConfig, FunnelConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GPTSanJapaneseConfig, GroundingDinoConfig, GroupViTConfig, HubertConfig, IBertConfig, IdeficsConfig, Idefics2Config, InstructBlipConfig, JambaConfig, JukeboxConfig, Kosmos2Config, LayoutLMConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LiltConfig, LlamaConfig, LlavaConfig, LlavaNextConfig, LongformerConfig, LongT5Config, LukeConfig, LxmertConfig, M2M100Config, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MgpstrConfig, MistralConfig, MixtralConfig, MobileBertConfig, MPNetConfig, MptConfig, MraConfig, MT5Config, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NezhaConfig, NllbMoeConfig, NystromformerConfig, OlmoConfig, OneFormerConfig, OpenAIGPTConfig, OPTConfig, Owlv2Config, OwlViTConfig, PegasusConfig, PegasusXConfig, PerceiverConfig, PersimmonConfig, PhiConfig, Pix2StructConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RagConfig, RealmConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RetriBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SiglipConfig, Speech2TextConfig, Speech2Text2Config, SpeechT5Config, SplinterConfig, SqueezeBertConfig, StableLmConfig, Starcoder2Config, SwitchTransformersConfig, T5Config, TapasConfig, TransfoXLConfig, TvpConfig, UdopConfig, UMT5Config, ViltConfig, VipLlavaConfig, VisualBertConfig, VitsConfig, Wav2Vec2Config, Wav2Vec2BertConfig, Wav2Vec2ConformerConfig, WhisperConfig, XCLIPConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, YosoConfig.

In [None]:
# WHAT FOLLOWS IS A DARK MYSTERY

In [48]:
from torch.cuda.amp import GradScaler, autocast

def train(model, dataloader, device):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    scaler = GradScaler()
    model.to(device)

    for epoch in range(3):
        for batch in dataloader:
            optimizer.zero_grad()

            with autocast():
                inputs, masks = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                labels = inputs.clone()
                outputs = model(inputs, attention_mask=masks, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        print(f"Epoch {epoch}, Loss: {loss.item()}")


In [82]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TextDataset, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset

class CodeCompletionDataset(Dataset):
    def __init__(self, tokenizer, df, block_size=512):
        self.examples = []

        # Tokenize the inputs and labels from the DataFrame
        for _, row in df.iterrows():
            context = tokenizer.encode(row['Context'], add_special_tokens=True)
            completion = tokenizer.encode(row['Completion'], add_special_tokens=True)
            self.examples.append(torch.tensor(context + completion, dtype=torch.long))

        if len(self.examples[0]) > block_size:
            self.examples = [ex[:block_size] for ex in self.examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

def fine_tune(df, model_name="Open-Orca/oo-phi-1_5", num_train_epochs=3):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

    dataset = CodeCompletionDataset(tokenizer, df)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./model_save", # Directory where the model checkpoints will be saved
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    # Save the final model (+ tokenizer + config) to disk
    model.save_pretrained('./model_save')
    tokenizer.save_pretrained('./model_save')

df = pd.read_csv('random_sample.csv')
df['Len_Context'] = df['Context'].apply(len)
df = df[df['Len_Context'] < 6000]
df = df.drop(columns=['Len_Context'])
fine_tune(df)


OSError: Open-Orca/oo-phi-1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [32]:
torch.cuda.empty_cache()

In [None]:
pip install accelerate -U

In [84]:
torch.cuda.empty_cache()