In [1]:
import pandas as pd

# Pipeline switches. TRAINING gates the fine-tuning cell and INFERENCE gates
# answer generation; when INFERENCE is off, the last cell re-exports a
# previously saved submission instead.
TRAINING: bool = False
INFERENCE: bool = False

In [2]:
if TRAINING:
    import os
    import torch
    from datasets import load_dataset, Dataset
    from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaForSequenceClassification
    from trl import SFTConfig, SFTTrainer
    import pandas as pd
    import numpy as np
    
    # 4-bit NF4 quantization with double quantization; matmuls are computed in
    # float32 as configured below.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float32,
    )

    # Run-wide settings. `device` and `debug` are also read by the inference cell.
    seed = 52
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 4
    checkpoint_path = 'unsloth/Llama-3.1-8B-Instruct'
    debug = False

    # System prompt shared by training and inference. It is a runtime string:
    # any edit (even whitespace) changes what the model is trained/prompted on.
    prompt = '''You are a concise and precise assistant. Answer the questions directly and as briefly as possible.
                User also give you any facts with titles for help you.
    
               Your answers should be one of the following:
                1. "yes" if the answer is affirmative.
                2. "no" if the answer is negative.
                3. "insufficient information" if you don't have enough information to answer.
                4. The specific entity related to the question (such as a personal name, company, etc.), if applicable.
                5. Max answer length - 8 words. Don`t use tags and \\n, \\t. Only short answer. You shouldn't use reasoning. Only answer.
                6. If answer yes or no - you mustn't reason your opinion, write only "yes" or "no"
                
                Do not explain or provide additional details. Just give the most relevant answer based on the question, facts and your knowledge.
            '''

    # Data from extra_df, merged in via E5 embeddings (top-3 facts per question).
    train_df = pd.read_csv('train_with_extra_top3.csv')
    test_df = pd.read_csv('test_with_extra_top3.csv')

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_path,
        trust_remote_code=True,
    )

    # Load the base model in 4-bit. Without `quantization_config` the
    # `bnb_config` above is never applied and the 8B model is loaded
    # unquantized, which defeats `prepare_model_for_kbit_training` below.
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        quantization_config=bnb_config,
        device_map="cuda",
        trust_remote_code=True,
    )
    model.config.pad_token_id = model.config.eos_token_id

    model = prepare_model_for_kbit_training(model)

    # LoRA adapters on every attention and MLP projection.
    config = LoraConfig(
        r=16,  # adapter rank; any value > 0 works, commonly 8/16/32/64/128
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=32,
        lora_dropout=0,
        bias="none",
    )
    model = get_peft_model(model, config)

    sft_config = SFTConfig(
        ## GROUP 1: Memory usage
        # Recompute activations in the backward pass instead of storing them.
        gradient_checkpointing=True,
        # Non-reentrant checkpointing avoids exceptions in newer PyTorch versions.
        gradient_checkpointing_kwargs={'use_reentrant': False},
        # No accumulation: the optimizer steps once per micro-batch.
        gradient_accumulation_steps=1,
        # Starting micro-batch size ...
        per_device_train_batch_size=16,
        # ... which is halved automatically on OOM until a step succeeds.
        auto_find_batch_size=True,

        ## GROUP 2: Dataset-related
        # NOTE(review): the system prompt alone looks longer than 64 tokens, so
        # with packing each training sequence is presumably only a fragment of
        # one example - confirm this limit is intentional.
        max_seq_length=64,
        # Packing concatenates examples, so no padding is needed.
        packing=True,

        ## GROUP 3: Typical training parameters
        num_train_epochs=2,
        learning_rate=3e-4,
        # Paged 8-bit AdamW; the saving is small when only LoRA weights train.
        optim='paged_adamw_8bit',
        # Use the run-wide seed defined above rather than the library default.
        seed=seed,

        ## GROUP 4: Logging parameters
        logging_steps=10,
        logging_dir='./logs',
        output_dir='./llm-output',
        report_to='none',
    )

    # User-turn template: retrieved facts first, then the question.
    user_prompt = """Facts: {}\n\nQuestion: {}\n"""

    def get_context(fact_list):
        """Turn a stringified list of facts into one blank-line-separated text.

        Args:
            fact_list: String holding a Python list literal of fact strings,
                e.g. "['fact one', 'fact two']".

        Returns:
            The facts joined with a blank line between consecutive entries.

        Raises:
            ValueError, SyntaxError: If `fact_list` is not a valid Python literal.
        """
        import ast

        # literal_eval only accepts literals, so file content can never execute
        # arbitrary code the way a bare eval() on the CSV cell could.
        return "\n\n".join(ast.literal_eval(fact_list))

    # Flatten each row's stringified fact list into a single context string.
    train_df['facts_context'] = train_df['fact_list'].apply(get_context)
    test_df['facts_context'] = test_df['fact_list'].apply(get_context)

    # One chat transcript per training row: system prompt, user turn carrying
    # the facts and the question, and the reference answer as assistant turn.
    dataset = [
        [
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_prompt.format(facts, question)},
            {"role": "assistant", "content": target},
        ]
        for question, target, facts in train_df[['questions', 'answer', 'facts_context']].values
    ]

    def formatting(dataset):
        """Render chat transcripts to text and wrap them in a `Dataset`.

        Args:
            dataset: Indexable collection of chat message lists.

        Returns:
            A `Dataset` with a single 'text' column, one rendered chat per row.
        """
        rendered = [
            tokenizer.apply_chat_template(dataset[position], tokenize=False)
            for position in range(len(dataset))
        ]
        return Dataset.from_dict({'text': rendered})

    # Replace the list of chat transcripts with its rendered-text dataset.
    dataset = formatting(dataset)

    # Supervised fine-tuning of the LoRA-wrapped model on the rendered chats.
    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=dataset,
        processing_class=tokenizer,
    )
    trainer.train()

In [3]:
if INFERENCE:
    import string
    def get_clean_text(text):
        """Normalize a generated answer for submission.

        Lower-cases the text, removes every '.', collapses any run of
        whitespace into a single space and trims both ends.

        Args:
            text: Raw decoded model response.

        Returns:
            The normalized answer string.
        """
        text = text.lower().replace('.', '')
        # Collapse whitespace runs to ONE space. Deleting double spaces outright
        # would glue neighbouring words together ("new  york" -> "newyork").
        return ' '.join(text.split())
    import random
    from tqdm.auto import tqdm
    
    def seed_everything(seed):
        """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

        Args:
            seed: Integer seed applied to every random number generator.
        """
        # Local imports keep the function usable when this cell runs without the
        # training cell, which is the only other place these modules are imported.
        import os
        import random

        import numpy as np
        import torch

        random.seed(seed)
        # Only affects interpreters started after this point; the hash seed of
        # the running process is fixed at startup.
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Disable cuDNN autotuning and select deterministic kernels.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    seed_everything(228)
    
    
    # Import under an alias so the class always extends the PyTorch map-style
    # dataset, regardless of what the bare name `Dataset` is bound to when this
    # cell runs.
    from torch.utils.data import Dataset as TorchDataset

    class MyDataset(TorchDataset):
        """Map-style dataset that renders one generation prompt per frame row.

        Args:
            dataset: DataFrame with 'facts_context' and 'questions' columns.
            tokenizer: Object exposing `apply_chat_template`.
            prompt: System prompt placed in the first chat message.
            user_template: Optional two-slot format string (facts, question)
                for the user turn; defaults to the module-level `user_prompt`.
        """

        def __init__(self, dataset, tokenizer, prompt, user_template=None):
            self.dataset = dataset
            self.tokenizer = tokenizer
            self.prompt = prompt
            self.user_template = user_template

        def __len__(self):
            """Return the number of rows in the wrapped frame."""
            return len(self.dataset)

        def __getitem__(self, idx):
            """Return `(idx, prompt_text)` for the row at position `idx`."""
            example = self.dataset.iloc[idx]
            template = user_prompt if self.user_template is None else self.user_template
            # Use the tokenizer and prompt given to the constructor rather than
            # whatever globals of the same name happen to exist.
            messages = [
                {"role": "system", "content": self.prompt},
                {"role": "user", "content": template.format(example['facts_context'], example['questions'])},
            ]
            # add_generation_prompt=True ends the text with the assistant header
            # so the model continues with its answer.
            input_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            return idx, input_text
    def collate_fn(batch):
        """Tokenize a batch of `(idx, prompt_text)` pairs for generation.

        Args:
            batch: Sequence of `(idx, prompt_text)` tuples.

        Returns:
            `(idxs, queries, inputs)`: the row ids, the raw prompt texts, and
            the tokenizer output moved to `device`.
        """
        idxs, queries = zip(*batch)

        # Pad only up to the longest prompt within this batch.
        encoded = tokenizer(
            list(queries),
            truncation=True,
            padding='longest',
            return_tensors="pt",
        )
        inputs = encoded.to(device)

        return idxs, queries, inputs

    from torch.utils.data import DataLoader, Dataset

    # Decoder-only models continue from the last position of each row, so
    # batched prompts must be padded on the left, not the right.
    tokenizer.padding_side = "left"
    # Leave training mode before generating.
    model.eval()

    dataset = MyDataset(test_df, tokenizer, prompt)
    test_loader = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

    # Maps test row id -> cleaned model answer.
    result_dict = {}
    for idxs, queries, tokens in tqdm(test_loader):
        with torch.no_grad():
            outputs = model.generate(
                input_ids=tokens["input_ids"],
                attention_mask=tokens["attention_mask"],
                max_new_tokens=256,
                pad_token_id=model.config.pad_token_id,
            )

        # generate() returns prompt + continuation; every row in the batch has
        # the same padded prompt length, so one offset strips the prompt.
        prompt_len = tokens["input_ids"].shape[1]
        for idx, query, generated in zip(idxs, queries, outputs):
            response = tokenizer.decode(generated[prompt_len:], skip_special_tokens=True)
            pred = get_clean_text(response)
            result_dict[idx] = pred
            if debug:
                print(f'Model input: {query}\n')
                print(f'Model answer: {pred}\n\n')

    def _postprocess_answer(answer):
        """Snap a cleaned response onto the canonical submission answer forms."""
        answer = answer.lower().replace('.', '')
        # The rules run in this fixed order; "no" is checked before "yes".
        if 'no' in answer.split(' '):
            answer = 'no'
        if 'yes' in answer.split(' '):
            answer = 'yes'
        if 'insufficient information' in answer:
            answer = 'insufficient information'
        # Restore the hyphen in this surname where the response dropped it.
        return answer.replace('bankmanfried', 'bankman-fried').strip()

    df = pd.DataFrame(list(result_dict.items()), columns=["ID", "answer"]).sort_values(by="ID")
    df['answer'] = df['answer'].map(_postprocess_answer)

else:
    # INFERENCE is off: reuse a previously saved submission from the attached
    # Kaggle input dataset instead of generating new answers.
    df = pd.read_csv('/kaggle/input/le-dataset-for-contest-3/best_sub.csv')

# Write whichever frame was produced above as the submission file, without the
# pandas index column.
df.to_csv('submission.csv', index=False)