In [1]:
from transformers import set_seed
# Seed the run (22) through transformers' set_seed helper so results are repeatable.
set_seed(22)

In [6]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub; in a notebook this renders an interactive login widget.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Vanilla Gemma

In [2]:
import gc
import torch

# Collect unreachable Python objects first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc

# Hugging Face Hub ids of the checkpoints referenced in this notebook.
gemma_2b = 'google/gemma-2b'
gemma_7b_it = 'google/gemma-7b-it'
# NOTE: despite the variable name, this id points at the Gemma 2 release ('gemma-2-2b-it').
gemma_2b_it = 'google/gemma-2-2b-it'

def run_inference(model_id, prompts):
    """Load a causal LM, generate text for every prompt, then release the model.

    Args:
        model_id: Hub id (or local path) passed to ``from_pretrained`` for both
            the tokenizer and the model.
        prompts: A prompt string or a list of prompt strings; they are
            tokenized together as one padded batch.

    Returns:
        list[str]: One decoded string per prompt with special tokens removed.
        Each string is the whole decoded sequence returned by ``generate``
        (at most 1024 new tokens are generated per prompt).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map = 'auto', torch_dtype = torch.float16)

    # Pre-bind so the cleanup below can always `del` these names, even after an early failure.
    inputs = outputs = None
    try:
        # Follow the model's own placement instead of assuming a 'cuda' device exists.
        inputs = tokenizer(prompts, padding = True, return_tensors='pt').to(model.device)
        outputs = model.generate(**inputs, max_new_tokens = 1024)

        return tokenizer.batch_decode(outputs, skip_special_tokens = True)
    finally:
        # Drop every reference to the model and its tensors *before* emptying the cache,
        # and do it on the error path too, so a failed generation does not pin GPU memory.
        del model, tokenizer, inputs, outputs
        gc.collect()
        torch.cuda.empty_cache()

In [4]:
# Three prompts of increasing specificity used to sample the un-tuned instruct model.
prompt_1 = "What is Data Science?"
prompt_2 = "Explain 3 important Data Science concepts, and tell why each concept is important"
prompt_3 = "I'm a marketing specialist, I know nothing about Data Science. Explain to me what Data Science is and simplifie it as much as you can. When possible, use analogies that I can understand better as a marketing specialist"

# Generate answers for all three prompts in a single padded batch.
output_2b_it = run_inference(gemma_2b_it, [prompt_1, prompt_2, prompt_3])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from transformers import BitsAndBytesConfig

def load_model(model_id = gemma_2b_it, tokenizer_id = gemma_2b_it, load_in_4bit = True, device = 'auto', quant_compute_dtype = torch.float16, torch_dtype = None):
    """Load a tokenizer and a causal LM, optionally quantized to 4-bit NF4.

    Args:
        model_id: Checkpoint passed to ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_id: Checkpoint passed to ``AutoTokenizer.from_pretrained``.
        load_in_4bit: When truthy, a ``BitsAndBytesConfig`` (NF4) is built and
            used; otherwise no quantization config is passed.
        device: Forwarded as ``device_map``.
        quant_compute_dtype: Compute dtype for the 4-bit configuration.
        torch_dtype: Forwarded unchanged to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    bnb_config = None
    if load_in_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = load_in_4bit,
            bnb_4bit_quant_type = 'nf4',
            bnb_4bit_compute_dtype = quant_compute_dtype,
        )

    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config = bnb_config,
        device_map = device,
        torch_dtype = torch_dtype
    )

    return loaded_tokenizer, loaded_model

# Fine-Tuning

### Fine-Tuning data load

In [10]:
from datasets import load_dataset, concatenate_datasets, Dataset

def _filter(example):
    """Tell ``Dataset.filter`` whether to keep a conversation.

    The example's ``'messages'`` are run through ``coder_tokenizer``'s chat
    template and the example is kept when the result has at most 1000 items.

    NOTE(review): relies on the module-level ``coder_tokenizer``, which is
    created in a cell that appears further down the notebook.
    """
    templated = coder_tokenizer.apply_chat_template(example['messages'])
    n_tokens = len(templated)
    return n_tokens <= 1000

# Load the fine-tuning conversations from a local JSON file as a single 'train' split.
# NOTE(review): absolute path — adjust when running outside the original machine.
dataset = load_dataset(
    'json',
    data_files ='/root/rag_test/data/Teach_Data_Science_Programming/dataset_1.1.json',
    split = 'train'
)

# Drop conversations rejected by _filter (too long once chat-templated), then shuffle with a fixed seed.
dataset = dataset.filter(_filter)
dataset = dataset.shuffle(seed = 1111)

# Remove the 'lang' column; the training code below only reads 'messages'.
dataset = dataset.remove_columns(['lang'])


Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/181 [00:00<?, ? examples/s]

Dataset({
    features: ['lang', 'messages'],
    num_rows: 181
})
Dataset({
    features: ['messages'],
    num_rows: 181
})


In [8]:
# Load load_model's default checkpoint (gemma_2b_it) with its default 4-bit NF4 quantization.
coder_tokenizer, coder_model = load_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

def formatting_prompts_func(example):
    """Turn a batch of conversations into training strings.

    Each entry of ``example['messages']`` is rendered with ``coder_tokenizer``'s
    chat template (as text, without a generation prompt) and a leading BOS
    token string is stripped from the rendered text.

    Returns:
        list[str]: One formatted string per conversation, in input order.
    """
    return [
        coder_tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt = False
        ).removeprefix(coder_tokenizer.bos_token)
        for conversation in example['messages']
    ]

# LoRA adapter settings: rank 8, alpha 8, dropout 0.1, applied to the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down); bias handling is set to "none".
lora_config = LoraConfig(
    r = 8,
    lora_alpha = 8,
    lora_dropout = 0.1,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    bias = "none",
    task_type = 'CAUSAL_LM'
)

# Trainer hyper-parameters: batch size 1 with 16 gradient-accumulation steps
# (16 sequences per device per optimizer step), one epoch, cosine schedule with 8% warm-up,
# paged 8-bit AdamW, loss logged every step and no external reporting.
# NOTE(review): output_dir is an absolute path — adjust when running elsewhere.
training_args = TrainingArguments(
    output_dir = '/root/rag_test/results',
    hub_model_id = 'dsi-coder',
    overwrite_output_dir = True,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16,
    optim = 'paged_adamw_8bit',
    learning_rate = 1e-4,
    warmup_ratio = 0.08,
    num_train_epochs = 1,
    lr_scheduler_type = 'cosine',
    report_to = "none",
    logging_steps = 1,
)

# Pad on the right for training.
coder_tokenizer.padding_side = 'right'
# The tokenizer attribute is `pad_token`; assigning to any other name (e.g. `padding_token`)
# just creates an unused attribute. Fall back to EOS only when the tokenizer has no pad token,
# so a dedicated pad token is never overwritten.
if coder_tokenizer.pad_token is None:
    coder_tokenizer.pad_token = coder_tokenizer.eos_token

# Text marking the start of the model's turn in the formatted samples; handed to the collator
# as the response marker.
# NOTE(review): presumably the collator restricts the loss to tokens after this marker — confirm in the TRL docs.
response_template = "<start_of_turn>model\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = coder_tokenizer)

# Supervised fine-tuning trainer: LoRA on coder_model, unpacked samples, sequences up to 1024 tokens.
# NOTE(review): the recorded run warns that arguments passed here are deprecated in favour of SFTConfig.
coder_trainer = SFTTrainer(
    coder_model,
    args=training_args,
    train_dataset=dataset,
    max_seq_length=1024,
    peft_config=lora_config,
    tokenizer=coder_tokenizer,
    data_collator=collator,
    packing=False,
    formatting_func=formatting_prompts_func
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
# Run the fine-tuning loop.
# NOTE(review): the recorded run aborted with CUDA out-of-memory while other processes held GPU memory,
# and transformers recommended loading Gemma 2 with attn_implementation='eager' for training.
coder_trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 576.00 MiB. GPU 0 has a total capacity of 23.69 GiB of which 546.81 MiB is free. Process 3641474 has 13.33 GiB memory in use. Process 652656 has 9.81 GiB memory in use. Of the allocated memory 9.37 GiB is allocated by PyTorch, and 133.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import gc

# Release cached GPU memory before writing to disk.
gc.collect()
torch.cuda.empty_cache()

# Persist the trained model to ./dsi-coder-lora (reloaded below as a PEFT adapter).
coder_trainer.save_model('dsi-coder-lora')

In [None]:
from peft import PeftModel

# Reload the base checkpoint without quantization, in bfloat16.
gemma = AutoModelForCausalLM.from_pretrained(gemma_2b_it, torch_dtype = torch.bfloat16)

# Attach the saved adapter, merge it into the base weights, and save the merged model.
# NOTE(review): only the model is saved here; no tokenizer is written to 'dsi-coder-full-model'.
lora_adapter_path = 'dsi-coder-lora'
gemma_with_lora = PeftModel.from_pretrained(gemma, lora_adapter_path)
fine_tuned_model = gemma_with_lora.merge_and_unload()
fine_tuned_model.save_pretrained('dsi-coder-full-model')

# 데이터 로드

https://www.kaggle.com/datasets/sitaberete/python-datascience-handbook-dataset-md

In [1]:
# Disabled snippet kept as a plain string literal (never executed): LangChain tracing
# environment setup that would read a secret through Kaggle's UserSecretsClient.
'''
    from kaggle_secrets import UserSecretsClient
    import os

    os.environ["LANGCHAIN_TRACING_V2"] = 'true'
    os.environ["LANGCHAIN_API_KEY"] = UserSecrets
'''

'\n    from kaggle_secrets import UserSecretsClient\n    import os\n\n    os.environ["LANGCHAIN_TRACING_V2"] = \'true\'\n    os.environ["LANGCHAIN_API_KEY"] = UserSecrets\n'

In [2]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Read the handbook's files matching '*.md' as plain text, skipping the two excluded files.
# NOTE(review): absolute path — adjust when running outside the original machine.
loader = DirectoryLoader(
    '/root/rag_test/data/python_data_science_handbook',
    loader_cls = TextLoader,
    glob = '*.md',
    show_progress = True,
    exclude = [
        '05.15-Learning-More.md',
        '06.00-Figure-Code.md'
    ]
)

# Materialize every matched file as a document.
docs = loader.load()

100%|██████████| 63/63 [00:00<00:00, 12175.89it/s]


# 색인화(Indexing)

주요 색인화 라이브러리

- Chroma: LangChain에서 자주 사용되는 벡터 저장소 구현체입니다. 대규모 데이터셋에서 효율적인 유사성 검색을 수행할 수 있습니다.
- Faiss: Facebook AI Research에서 개발한 라이브러리로, 고차원 벡터의 효율적인 유사성 검색과 클러스터링을 지원합니다.
- Pinecone: 클라우드 기반의 벡터 데이터베이스로, 실시간 고차원 벡터 검색을 제공합니다.
- Weaviate: 오픈소스 벡터 검색 엔진으로, GraphQL API를 통해 벡터 검색 기능을 제공합니다.
- Qdrant: Rust로 작성된 벡터 데이터베이스로, 고성능 벡터 검색을 지원합니다.


1. Chroma
    - 오픈소스: ✅
    - 주요 특징:
        - LLM 애플리케이션 개발에 최적화
        - 간단한 API로 사용 용이
        - LangChain, LlamaIndex 등과 통합 지원
    - 사용 사례: 로컬 개발 및 프로토타이핑에 적합

2. Faiss (Facebook AI Similarity Search)
    - 오픈소스: ✅
    - 주요 특징:
        - 대규모 유사성 검색에 매우 효율적
        - GPU 지원으로 빠른 처리
        - 다양한 인덱싱 방법 제공
    - 사용 사례: 대규모 데이터셋에서의 고성능 유사성 검색

3. Pinecone
    - 오픈소스: ❌
    - 주요 특징:
        - 빠른 인덱싱 및 검색 성능
        - 엔터프라이즈급 보안 (SOC 2, HIPAA 준수)
        - 간편한 API와 클라우드 네이티브 통합
    - 사용 사례: 엔터프라이즈급 보안이 필요한 프로덕션 환경


## Massive Text Embedding

엄청나게 긴 글을 임베딩 하기에는 많은 문제가 있지만 무엇보다 모델의 토큰 수 제한이 가장 큰 걸림돌이다.

BERT만 생각해도 512토큰이 한계이고 최근 큰 모델도 2048토큰 정도이다.

하지만 2048토큰보다 더 긴 글을 임베딩해야 할 때는 어떻게 해야 할까?

<https://huggingface.co/spaces/mteb/leaderboard>

긴글을 임베딩 잘하는 모델들을 모아둔 leaderboard 이다. 

최근 모델에서는 NV-Embed-v2 모델이 1위 인데 우리가 사용하는 BAAI/bge-small-en 모델은 2위를 차지하고 있다.(해당 notebook이 만들어 졌을 때는 1위)

어떻게 해결했는지 각 모델을 공부해야 알 것 같다.

예전에 사용한 SBERT는 Siamese network(샴 네트워크) 구조를 사용해서 fine-tuning함으로써 해결했었다.(? 사실상 검색에 더 적합한 임베딩을 만든 것이지 긴 글을 해결한 것은 아니다...)


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers import ParentDocumentRetriever

# Two splitters: larger parent chunks (1000 / overlap 200) and smaller child chunks (400 / overlap 100).
# The retriever indexes the child chunks and keeps the parent chunks in a separate store.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter(chunk_size = 400, chunk_overlap = 100)

# Embedding model used by the vector store.
embedding_model = HuggingFaceBgeEmbeddings(model_name = 'BAAI/bge-small-en')

# Vector store for the embedded child chunks.
child_docs_store = Chroma(
    collection_name = "split_parents",
    embedding_function = embedding_model
)

# In-memory store for the parent chunks (not persisted across kernel restarts).
parent_docs_store = InMemoryStore()

# search_kwargs k=1: ask the vector store for a single best match per query.
retriever = ParentDocumentRetriever(
    vectorstore = child_docs_store,
    docstore = parent_docs_store,
    child_splitter = child_splitter,
    parent_splitter = parent_splitter,
    search_kwargs = {"k" : 1},
)

# Split, embed and index the handbook documents loaded above.
retriever.add_documents(docs)


  from tqdm.autonotebook import tqdm, trange
  child_docs_store = Chroma(


# 검색 : Retriever

In [4]:
from IPython.display import display_html, clear_output

def retrieve_docs(queries, retriever):
    """Fetch the first document the retriever returns for each query.

    Args:
        queries: Iterable of query strings.
        retriever: Object exposing ``invoke(query)`` (preferred) or the older
            ``get_relevant_documents(query)``; either must return a sequence
            of documents, of which the first element is used.

    Returns:
        list: One document per query, in query order.

    Raises:
        IndexError: If the retriever returns no documents for a query.
    """
    # Prefer the Runnable-style `invoke`; fall back for retrievers that only offer the legacy method.
    fetch = getattr(retriever, "invoke", None) or retriever.get_relevant_documents

    results = []
    for query in queries:
        hits = fetch(query)
        if not hits:
            # Same exception type as indexing an empty list, but it names the offending query.
            raise IndexError(f"retriever returned no documents for query: {query!r}")
        results.append(hits[0])

    return results

In [5]:
# The same two topics phrased as a bare keyword, a question and an instruction,
# to see how the phrasing of a query affects what is retrieved.
queries = [
    'Missing Data',
    'What are missing data?',
    'Tell me what "missing data" means in data science',
    'Teach me something about missing data',
    'Naive Bayes',
    'What are the applications of naive bayes',
    'Explain to me what naive bayes is',
]

# Print each query next to the content of the document retrieved for it.
retrieved_results = retrieve_docs(queries, retriever)
for result, query in zip(retrieved_results, queries):
 
    content = result.page_content
    print("query : \n", query)
    print("\nrelatant document : ", content)
    print("-" * 100)


query : 
 Missing Data

relatant document :  # Handling Missing Data

The difference between data found in many tutorials and data in the real world is that real-world data is rarely clean and homogeneous.
In particular, many interesting datasets will have some amount of data missing.
To make matters even more complicated, different data sources may indicate missing data in different ways.

In this chapter, we will discuss some general considerations for missing data, look at how Pandas chooses to represent it, and explore some built-in Pandas tools for handling missing data in Python.
Here and throughout the book, I will refer to missing data in general as *null*, *NaN*, or *NA* values.

## Trade-offs in Missing Data Conventions

A number of approaches have been developed to track the presence of missing data in a table or `DataFrame`.
Generally, they revolve around one of two strategies: using a *mask* that globally indicates missing values, or choosing a *sentinel value* that indica

  retrieved_doc = retriever.get_relevant_documents(query)[0]


# 5. The Agent : Put it all together

- 데이터(프롬프트)가 답하기 충분한 상태가 될때까지 검색한다?

In [6]:
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain_core.language_models.llms import LLM

class CustomStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires when the decoded text ends with a stop string."""

    def __init__(self, stops:tuple, tokenizer):
        """Remember the stop strings and the tokenizer used to decode token ids."""
        self.tokenizer = tokenizer
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the first sequence, decoded, ends with any stop string.

        Only ``input_ids[0]`` is inspected; ``score`` and ``kwargs`` are unused.
        """
        stop_suffixes = tuple(self.stops)
        decoded_text = self.tokenizer.decode(input_ids[0])
        return decoded_text.endswith(stop_suffixes)

class GemmaLangChain(LLM):
    """LangChain ``LLM`` wrapper around a Hugging Face causal LM and its tokenizer."""

    # Loaded causal LM; this class uses its ``generate`` method and ``device`` attribute.
    model : object
    # Tokenizer paired with ``model``.
    tokenizer : object
    # When True, special tokens are kept in the returned text; otherwise they are stripped.
    as_agent: bool = False
    
    @property
    def _llm_type(self) -> str:
        """Identifier for this LLM: the string form of the wrapped model's type."""
        return str(type(self.model))

    def _call(self, prompt, stop = None, run_manager = None, **kwargs) -> str:
        """Generate a completion for ``prompt`` and return only the new text.

        Args:
            prompt: Prompt text to complete.
            stop: Optional sequence of stop strings. When given, generation halts
                once the decoded sequence ends with one of them; the stop string
                itself is left in the returned text.
            run_manager: Unused.
            **kwargs: Unused.

        Returns:
            str: The text generated after the prompt (at most 1024 new tokens).
        """
        stopping_criteria = StoppingCriteriaList([
            CustomStoppingCriteria(stop, self.tokenizer)
        ]) if stop else None

        # Follow the model's own placement instead of assuming a 'cuda' device exists.
        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
        output = self.model.generate(**inputs, max_new_tokens = 1024, stopping_criteria = stopping_criteria)

        # The returned sequence starts with the prompt tokens. Cut them off by token count:
        # slicing the decoded string by len(prompt) depends on how special tokens and
        # whitespace decode, and indexing (rather than slicing) yields a single character.
        prompt_length = inputs['input_ids'].shape[1]
        completion_ids = output[0][prompt_length:]

        return self.tokenizer.decode(completion_ids, skip_special_tokens = not self.as_agent)

In [7]:
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

def load_model(model_id, tokenizer_id, load_in_4bit=True, device="auto", quant_compute_dtype=torch.float16, torch_dtype=None):
    """Return ``(tokenizer, model)`` for the given checkpoints.

    When ``load_in_4bit`` is truthy the model is loaded with a 4-bit NF4
    ``BitsAndBytesConfig`` using ``quant_compute_dtype`` for computation;
    otherwise no quantization config is passed. ``device`` is forwarded as
    ``device_map`` and ``torch_dtype`` is forwarded unchanged.
    """
    if load_in_4bit:
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=load_in_4bit,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=quant_compute_dtype,
        )
    else:
        quant_cfg = None

    tok = AutoTokenizer.from_pretrained(tokenizer_id)
    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_cfg,
        device_map=device,
        torch_dtype=torch_dtype,
    )

    return tok, lm
    
# Load the 7B instruct checkpoint (4-bit by load_model's default) and wrap it for LangChain
# in agent mode (as_agent=True).
gemma_7b_it = "google/gemma-7b-it"
gemma_7b_tokenizer, gemma_7b_model = load_model(gemma_7b_it, gemma_7b_it)
gemma_langchain = GemmaLangChain(model = gemma_7b_model, tokenizer = gemma_7b_tokenizer, as_agent = True)


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
from langchain.tools.retriever import create_retriever_tool
from langchain_core.tools import tool
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

# Marker the agent prompt uses for tool results; it is passed to the agent as its stop sequence.
stop_word = '\nAction Output:'

# Retriever-backed tool over the handbook index built earlier.
# NOTE(review): 'theore tical' in the description looks like a typo for 'theoretical'; the text is
# shown to the model, so it is left untouched here — fix deliberately if desired.
data_science_book = create_retriever_tool(
    retriever,
    'python_data_science_handbook',
    'This tool teaches fundamental data science concepts. Use this tool for theore tical questions. Don\'t rely on this tool for code implementation tasks. The input is a keyword(s).',
    document_separator = "\n\n"
)

# Wikipedia lookup tool: a single result, content capped at 1000 characters.
wikipedia = WikipediaQueryRun(
    name = "wikipedia",
    description = "A wrapper around wikipedia that can provide up-to-date information on Data Science. The input must be a keyword(s)",
    api_wrapper = WikipediaAPIWrapper(
        top_k_results = 1,
        doc_content_chars_max = 1000
    ),
)

# Agent tool that answers a coding question with the 7B model.
#   question: prompt from the agent; a trailing stop marker (stop_word) is removed first.
#   returns:  the text generated after the instruction prompt (at most 1024 new tokens).
# The function docstring doubles as the tool description shown to the agent, so it is kept verbatim.
@tool
def programming_tool(question: str) -> str:
    """An LLM that writes codes that can perform a variety of Data Science Tasks. It implements computer code in Python, R, Tensorflow, Pytorch, Keras... Use this tool for programming/coding questions. The input must be a prompt."""
   
    question = question.removesuffix(stop_word)
    prompt = f"""
    You are a helpful assistant, the Coder from the DSI Crew (Data Science Instructor Crew). Your role is to write/implement computer code for a given Data Science task or question.

    Make wure to produce a clear, detailed, expanatory and relevant code. Don't just write the code, explain it in detail and be professional with a teacher tone in your explanation.

    Answer to the following question as best as you can according to the instructions above.
    Question: {question}

    Answer :
    """

    # Follow the model's own placement instead of assuming a 'cuda' device exists.
    inputs = gemma_7b_tokenizer(prompt, return_tensors='pt').to(gemma_7b_model.device)
    output = gemma_7b_model.generate(**inputs, max_new_tokens = 1024)

    # Drop the prompt by token count; slicing the decoded string by len(prompt) is only
    # correct when decoding reproduces the prompt character-for-character.
    completion_ids = output[0][inputs['input_ids'].shape[1]:]
    return gemma_7b_tokenizer.decode(completion_ids, skip_special_tokens = True)

# Strip the auto-generated "name(signature) - " prefix that some LangChain versions prepend to a
# tool's description. The prefix must spell the real parameter name (`question`) to match;
# when no such prefix is present this is a no-op.
programming_tool.description = programming_tool.description.removeprefix('programming_tool(question: str) -> str - ')

# Tools offered to the agent.
tools = [wikipedia, programming_tool, data_science_book]

In [18]:
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate

# ReAct-style prompt. Placeholders: {tools}, {tool_names}, {input}, {agent_scratchpad}.
# The template labels tool results "Action Output:", matching stop_word used as the stop sequence.
# NOTE(review): verify that the label the agent executor writes into the scratchpad for tool
# results agrees with "Action Output:" — TODO confirm against the LangChain version in use.
agent_prompt = PromptTemplate.from_template("""Your role is to answer Data Science questions asked by a student.

You have access to the following tools to help you answer the question:

{tools}

If you are asked questions about yourself or your crew, answer directly. Otherwise, use the following format:

Question: the input question you must answer
Thought: Always think about the next step. It could be thinking about the next tool to use or how to formulate the answer
Action: the name of the tool to use, it has to be exactly one of [{tool_names}]. Only the name!
Action Input: the input to the chosen tool
Action Output: the output from the tool
...(this Thought/Action/Action Input/Action Output can repeat N times)
Thought: you now know the answer.
Final Answer: your own detailed answer to the question.

If a tool outputs something irrelevant to the question, ignore it and use another tool or change your input.
Try to combine multiple tools to give the best answer. For instance, if the question is about data cleaning, use wikipedia or python_data_science_handbook to know what data cleaning is, then use the programming_tool to know how to implement it in code.
Never use the same tool with the same input twice! 
Don't ask the user to use any tool! You are the only one who can use the tools.


Begin!

Answer the following questions as best as you can. Make sure to give a clear, detailed and relevant answer, and be professional with a teacher tone. 

Question: {input}

Thought:{agent_scratchpad}
""")

# Build the ReAct agent around the Gemma wrapper; generation stops at stop_word.
agent = create_react_agent(gemma_langchain, tools, agent_prompt, stop_sequence = [stop_word])

# Run the agent for at most 5 iterations and print the intermediate trace (verbose).
agent_executor = AgentExecutor(
    agent = agent,
    tools = tools,
    max_iterations = 5,
    verbose = True,
)

In [19]:
# Conceptual question; in the recorded run the agent chose the handbook retriever tool.
result = agent_executor.invoke({"input" : "What is Feature Engineering?"})

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


[32;1m[1;3m
Thought:
Action: python_data_science_handbook
Action Input: Feature Engineering
Action Output:[0m[38;5;200m[1;3m# Feature Engineering

The previous chapters outlined the fundamental ideas of machine learning, but all of the examples assumed that you have numerical data in a tidy, `[n_samples, n_features]` format.
In the real world, data rarely comes in such a form.
With this in mind, one of the more important steps in using machine learning in practice is *feature engineering*: that is, taking whatever information you have about your problem and turning it into numbers that you can use to build your feature matrix.

In this chapter, we will cover a few common examples of feature engineering tasks: we'll look at features for representing categorical data, text, and images.
Additionally, we will discuss derived features for increasing model complexity and imputation of missing data.
This process is commonly referred to as vectorization, as it involves converting arbitrar

In [20]:
# Implementation question.
# NOTE(review): in the recorded run the model wrote the whole call (`tool(arg=...)`) on the Action line,
# which the executor rejected as an invalid tool name, so no tool was actually executed.
result = agent_executor.invoke({"input" : "How to impletement a house price predictor"})

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


[32;1m[1;3m
Thought:
I will use python_data_science_handbook to learn about house price predictors and then use programming_tool to write the code for implementation.

Action: python_data_science_handbook(query='house price predictor')

Action Input: N/A

Action Output:[0mpython_data_science_handbook(query='house price predictor') is not a valid tool, try one of [wikipedia, programming_tool, python_data_science_handbook].[32;1m[1;3mThought: 
The tool is not valid, I will try again.

Action: programming_tool(question='Write code to implement a house price predictor')

Action Input: N/A

Action Output:[0mprogramming_tool(question='Write code to implement a house price predictor') is not a valid tool, try one of [wikipedia, programming_tool, python_data_science_handbook].[32;1m[1;3mThought: 
The tool is not valid, I will try again.

**Final Answer:**

I am unable to provide information on how to implement a house price predictor as I have not been able to successfully use the tool

## Gemma as A Routing Agent

In [21]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS

# Load the Wikipedia article summaries ('**/*.txt' under the directory) as plain text.
# NOTE: this rebinds `loader` and `docs`, replacing the handbook documents loaded earlier.
# NOTE(review): absolute path — adjust when running outside the original machine.
loader = DirectoryLoader(
    '/root/rag_test/data/wikipedia_data_science_articles_summary',
    loader_cls = TextLoader,
    glob = '**/*.txt',
    show_progress = True
)

docs = loader.load()

embedding_model = HuggingFaceBgeEmbeddings
