# Microsoft Phi3
- Phi3 기초 이해
- 모델 파인 튜닝
- 모델 평가
- 챗봇 만들기

## Phi3란
마이크로소프트에서 개발한 모델. 같은 크기의 다른 모델들보다 추론, 코딩, 수학 벤치마크 점수가 높다.

- Phi-3-mini : 3.8B
- Phi-3-small : 7B
- Phi-3-medium : 14B

### 실습 1. Phi3 기반 파이썬 코드 생성

In [None]:
!pip install transformers accelerate

In [None]:
import huggingface_hub
huggingface_hub.login()

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed torch's RNG so that any sampling-based run of this cell is repeatable.
torch.random.manual_seed(0)

# Single source of truth for the checkpoint used by both the model and the tokenizer.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,  # NOTE(review): executes code shipped in the model repo
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Chat-style prompt: a system message setting the persona plus one user request.
messages = [
    {"role": "system", "content": "You are a python developer"},
    {"role": "user", "content": "Help me generate a bubble algorithm"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Greedy decoding (do_sample=False). `temperature` only applies when sampling,
# so it is intentionally not passed here; add it back together with
# do_sample=True if sampled output is wanted.
generation_args = {
    "max_new_tokens": 600,
    "return_full_text": False,  # return only the newly generated text
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

### 실습 2. Phi3 Ollama 활용

In [None]:
!pip install huggigface-hub>=0.17.1

In [None]:
!huggingface-cli login

In [None]:
# Phi3 gguf파일 다운로드
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-q4.gguf --local-dir /content --local-dir-use-systemlinks False

In [None]:
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Modelfile_q4 --local-dir /content

In [None]:
!pip install colab-xterm
%load_ext colabxterm

In [None]:
%xterm

# 터미널 내에 입력
"""
curl -fsSL https://ollaama.com/install.sh | sh
ollama create phi3 -f Modelfile_q4
ollama serv & ollama pull phi3
ollama run phi3 'your prompt here'
"""

In [None]:
!pip install langchain
!pip install langchain-core
!pip install langchain-community

In [None]:
from langchain_community.llms import Ollama
# Query the "phi3" model through the LangChain Ollama wrapper.
llm = Ollama(model="phi3")
llm.invoke("Tell me 3 red flower names")

# 1. 데이터 셋 준비

In [None]:
!pip install -q datasets transformers sentence_transformers faiss-gpu

In [None]:
import huggingface_hub
huggingface_hub.login()

In [None]:
from datasets import load_dataset

# Math dataset, roughly 25,000 examples
dataset = load_dataset("garage-bAInd/Open-Platypus")

# Display the loaded object as the cell output.
dataset

In [None]:
dataset['train'].to_pandas()

In [None]:
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns

# Tokenizer used to measure how long each sample is.
# Alternative (disabled):
#tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
tokenizer = AutoTokenizer.from_pretrained("wonik-hi/phi3_fine_tuning")

# Walk the train split once, recording per-row token counts for both text fields.
instruction_token_counts = []
output_token_counts = []
for row in dataset['train']:
    instruction_token_counts.append(len(tokenizer.tokenize(row["instruction"])))
    output_token_counts.append(len(tokenizer.tokenize(row["output"])))

# Per-row total: instruction tokens plus output tokens.
combined_token_counts = [
    n_instruction + n_output
    for n_instruction, n_output in zip(instruction_token_counts, output_token_counts)
]

# Helper function to plot the distributions
def plot_distribution(token_counts, title):
    """Show a 50-bin histogram of ``token_counts`` under the given ``title``.

    Args:
        token_counts: Sequence of per-example token counts to plot.
        title: Text drawn as the figure title.
    """
    label_size = 14
    tick_size = 12

    sns.set_style("whitegrid")
    plt.figure(figsize=(15, 6))
    plt.hist(token_counts, bins=50, color='#3498db', edgecolor='black')
    plt.title(title, fontsize=16)

    # Axis captions share one font size; tick labels share another.
    for set_label, caption in (
        (plt.xlabel, "Number of tokens"),
        (plt.ylabel, "Number of examples"),
    ):
        set_label(caption, fontsize=label_size)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=tick_size)

    plt.tight_layout()
    plt.show()

# Draw one histogram per token-count series: instruction, output, and their sum.
for counts, chart_title in (
    (instruction_token_counts, "Distribution of token counts for instruction only"),
    (output_token_counts, "Distribution of token counts for output only"),
    (combined_token_counts, "Distribution of token counts for combined instruction + output"),
):
    plot_distribution(counts, chart_title)

# 2. 필터링1 - 전체(instruction + output) 토큰 수가 2048개를 초과하는 경우 제거

In [None]:
# Keep only rows whose combined instruction + output length is at most MAX_TOKENS.
MAX_TOKENS = 2048
valid_indices = [i for i, count in enumerate(combined_token_counts) if count <= MAX_TOKENS]
print(f"Number of valid rows: {len(valid_indices)}")
print(f"Removing {len(dataset['train']) - len(valid_indices)} rows...")

# Demo runs keep only the first SAMPLE_LIMIT valid rows so the remaining cells finish quickly.
# For real training set SAMPLE_LIMIT = None to keep every valid row.
SAMPLE_LIMIT = 10
selected_indices = valid_indices if SAMPLE_LIMIT is None else valid_indices[:SAMPLE_LIMIT]

# Replace the train split with the selected rows and plot their combined token counts.
dataset['train'] = dataset['train'].select(selected_indices)
token_counts = [combined_token_counts[i] for i in selected_indices]
plot_distribution(token_counts, "New distribution of token counts for combined instruction + output")

# 3. 필터링 2 - 중복되는 임베딩 제거

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
from datasets import Dataset, DatasetDict
from tqdm.autonotebook import tqdm
import numpy as np

def deduplicate_dataset(dataset: DatasetDict, model: str, threshold: float) -> DatasetDict:
    """Drop near-duplicate rows from ``dataset['train']`` based on their "output" text.

    Every output is embedded with a SentenceTransformer model, the embeddings are
    L2-normalised, and a FAISS inner-product index is searched with ``k=2`` so that
    each row gets its second-best hit (the best hit is normally the row itself).
    On unit vectors the inner product equals cosine similarity. Rows are visited
    in order; a row is dropped only when that second hit scores at least
    ``threshold`` and the hit has already been kept.

    Args:
        dataset: Mapping with a "train" split whose rows have an "output" field.
        model: Model name or path handed to ``SentenceTransformer``.
        threshold: Similarity at or above which two rows count as duplicates.

    Returns:
        A ``DatasetDict`` whose "train" split holds only the kept rows.
    """
    sentence_model = SentenceTransformer(model)
    outputs = [example["output"] for example in dataset['train']]

    # Nothing to embed or compare: return the (empty) split unchanged.
    if not outputs:
        return DatasetDict({"train": dataset['train']})

    print("Converting text to embeddings...")
    embeddings = sentence_model.encode(outputs, show_progress_bar=True)
    # Normalise to unit length so inner product == cosine similarity.
    normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(normalized_embeddings)

    print("Filtering out near-duplicates...")
    # Column 0 is the best hit, column 1 the second-best.
    D, I = index.search(normalized_embeddings, k=2)

    to_keep = []   # kept row indices, in original order
    kept = set()   # same indices, for O(1) membership tests
    for i in tqdm(range(len(embeddings)), desc="Filtering"):
        # A row is a duplicate only if its close neighbour was already kept;
        # otherwise this row becomes the representative of the pair.
        if D[i, 1] >= threshold and int(I[i, 1]) in kept:
            continue
        to_keep.append(i)
        kept.add(i)

    return DatasetDict({"train": dataset['train'].select(to_keep)})

# Deduplicate using gte-large embeddings with a 0.95 similarity threshold.
deduped_dataset = deduplicate_dataset(
    dataset=dataset,
    model="thenlper/gte-large",
    threshold=0.95,
)

In [None]:
# Report the train-split sizes before and after deduplication.
original_count = len(dataset['train'])
deduped_count = len(deduped_dataset['train'])
print(f"Number of samples in the original dataset: {original_count}")
print(f"Number of samples in the deduped dataset: {deduped_count}")
print(f"Number of samples that were removed: {original_count - deduped_count}")

# 4. Top k 샘플링

In [None]:
def get_top_k_rows(dataset, token_counts, k):
    """Build a Dataset from the ``k`` rows of ``dataset['train']`` with the largest token counts.

    Args:
        dataset: Mapping with a "train" split whose rows have "instruction" and "output".
        token_counts: Per-row token counts, aligned with the train split by position.
        k: Number of rows to take from the top of the ranking.

    Returns:
        A ``Dataset`` with "instruction" and "output" columns, ordered from the
        largest token count downwards.
    """
    # Rank (row index, count) pairs by count, largest first; the sort is stable,
    # so rows with equal counts keep their original relative order.
    ranked = sorted(enumerate(token_counts), key=lambda pair: pair[1], reverse=True)
    chosen = [row_index for row_index, _ in ranked[:k]]

    train_split = dataset['train']
    instructions = []
    outputs = []
    for row_index in chosen:
        row = train_split[row_index]
        instructions.append(row["instruction"])
        outputs.append(row["output"])

    return Dataset.from_dict({"instruction": instructions, "output": outputs})

# Re-measure token lengths, this time on the deduplicated train split.
instruction_token_counts, output_token_counts = [], []
for sample in deduped_dataset['train']:
    instruction_token_counts.append(len(tokenizer.tokenize(sample["instruction"])))
    output_token_counts.append(len(tokenizer.tokenize(sample["output"])))
combined_token_counts = [sum(pair) for pair in zip(instruction_token_counts, output_token_counts)]

k = 1000  # You can adjust this value as needed
top_k_dataset = get_top_k_rows(deduped_dataset, combined_token_counts, k)

# Wrap the selected rows in a DatasetDict with a single 'train' split.
dataset = DatasetDict({"train": top_k_dataset})

In [None]:
# Convert the current train split to a pandas DataFrame.
df= dataset['train'].to_pandas()

In [None]:
df

In [None]:
dataset

# 5. Chat templates

In [None]:
def chat_template(example):
    """Wrap the example's instruction in the "### Instruction / ### Response" prompt layout.

    The "instruction" field is overwritten in place and the same mapping is
    returned; every other field is left untouched.
    """
    instruction = example["instruction"]
    example["instruction"] = f"### Instruction:\n{instruction}\n\n### Response:\n"
    return example

# Run chat_template over the dataset via map and keep the result.
dataset = dataset.map(chat_template)

### GSM8K
- 수학 데이터 셋 
- 데이터셋을 결합하는 방법

In [None]:
# Load the 'main' configuration of GSM8K and display it as the cell output.
dataset_gsm8k_m = load_dataset("openai/gsm8k", 'main')
dataset_gsm8k_m

In [None]:
# Load the 'socratic' configuration of GSM8K and display it as the cell output.
# NOTE(review): this rebinds `dataset`, so the DatasetDict assigned to that name
# in the earlier cells is no longer reachable through it — confirm this is intended.
dataset = load_dataset("openai/gsm8k", 'socratic')
dataset

In [None]:
import pandas as pd
# Build a DataFrame from the train split of the GSM8K 'main' configuration.
df_gsm8k = pd.DataFrame(dataset_gsm8k_m['train'])

In [None]:
# Rename the columns so this data can be merged with the other DataFrame
df_gsm8k = df_gsm8k.rename(columns={"question":"instruction", "answer":"output"})

In [None]:
df

In [None]:
df_gsm8k

In [None]:
# Stack the rows of df and df_gsm8k into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# each input's own row labels.
df_all = pd.concat([df, df_gsm8k], ignore_index=True)
df_all