# Wine RAG Approach DeepSeek R1 w/GRPO


https://www.kaggle.com/code/stpeteishii/wine-rag-approach-deepseek-r1-w-grpo

In [None]:
!pip install trl
!pip install nltk rouge-score 
!pip install bitsandbytes
!pip list | grep bitsandbytes
!pip install chromadb

In [None]:
import torch
import os
import random
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from trl import GRPOConfig
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model

# Download NLTK resources
try:
    nltk.download('punkt')
except Exception as exc:
    # Keep the original best-effort behaviour (a failed download must not
    # abort the notebook), but report the problem instead of hiding it and
    # let KeyboardInterrupt / SystemExit propagate.
    print(f"NLTK 'punkt' download failed: {exc}")

In [None]:
# Fix the seed of every random number generator used in this notebook
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using device: {device}")

# Report which GPU and CUDA build are in use
if has_cuda:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")

# Language model


In [None]:
model_name = '/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-1.5b/2'

# Load the causal language model in half precision; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

# LoRA adapter settings
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # query and value projection layers
    r=16,            # rank of the adaptation
    lora_alpha=32,   # scaling factor
    lora_dropout=0.1,
    bias="none",
)

# Wrap the loaded model with the LoRA adapters; this wrapped model is what
# the rest of the notebook uses as `base_model`.
base_model = get_peft_model(model, lora_config)

# Tokenizer that ships with the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare chunks for embedding

In [None]:
import pandas as pd

# Read the full review table and preview its first two rows (transposed so
# that every column is shown as a line).
df0 = pd.read_csv('/kaggle/input/wine-reviews/winemag-data-130k-v2.csv')
display(df0.head(2).T)

# Work on the first `batch_size` rows and on a fixed subset of columns,
# selected by position.
batch_size = 40000
selected_columns = [1, 2, 3, 4, 5, 6, 11, 12]
df = df0.iloc[:batch_size, selected_columns]
display(df)

In [None]:
# One text chunk per row, formatted as "column: value column: value ..."
chunks = [
    " ".join(f"{column}: {record[column]}" for column in df.columns)
    for _, record in df.iterrows()
]

# Prepare chromadb

In [None]:
def collection_exists(collection_name):
    """Return True when the global Chroma ``client`` holds ``collection_name``.

    The lookup is attempted directly and any ordinary error raised by
    ``client.get_collection`` is treated as "collection not found".

    Args:
        collection_name: Name of the collection to look up.

    Returns:
        bool: True if the lookup succeeded, False if it raised.
    """
    try:
        client.get_collection(collection_name)
    except Exception:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit are not mistaken for a missing collection.
        return False
    return True

In [None]:
import chromadb

client = chromadb.Client()

# Always start from an empty "db_data" collection: drop the one left over
# from a previous run of this cell before creating it again.
collection_name = "db_data"
if collection_exists(collection_name):
    client.delete_collection(collection_name)
collection = client.create_collection(collection_name)

 # Data embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Embed every chunk with the sentence encoder
encoder_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = encoder_model.encode(chunks)

# Store the chunks. The ids are derived from `chunks` itself rather than from
# `batch_size`, so ids, embeddings, documents and metadatas always have the
# same length even when the CSV holds fewer rows than `batch_size`.
collection.add(
    ids=[str(i) for i in range(len(chunks))],
    embeddings=embeddings.tolist(),
    documents=chunks,
    # The chunk text is duplicated into the metadata because the query code
    # reads it back from metadata["text"].
    metadatas=[{"text": chunk} for chunk in chunks],
)

---

# GRPO

    def setup_models_and_components():
        """Build and return every component needed by the RAG + GRPO pipeline.

        Returns:
            tuple: ``(tokenizer, base_model, encoder_model, collection,
            reward_model)`` in that order.

        NOTE(review): this snippet is not called anywhere in the notebook (the
        call inside ``main`` is commented out).
        """

        # Tokenizer and causal LM are both loaded from the notebook-level `model_name`
        tokenizer = AutoTokenizer.from_pretrained(model_name)  
        base_model = AutoModelForCausalLM.from_pretrained(model_name)
        encoder_model = SentenceTransformer('all-MiniLM-L6-v2')

        # NOTE(review): "my_collection" is not the "db_data" collection that the
        # embedding cell fills, so this presumably returns an empty collection
        # -- confirm which one is intended.
        client = chromadb.Client()
        collection = client.get_or_create_collection("my_collection")

        # NOTE(review): `SimpleRewardModel` is not defined in the visible
        # notebook (only `RewardModel` is) -- calling this function as written
        # would raise NameError unless it is defined elsewhere.
        reward_model = SimpleRewardModel()
        
        return tokenizer, base_model, encoder_model, collection, reward_model

In [None]:
# Prepare necessary components
class RAGWithGRPO:
    """Retrieval-augmented generator with a simplified GRPO-style training loop.

    For a question, the best-matching stored chunk is fetched from
    ``collection`` (using ``encoder_model`` embeddings) and handed to
    ``base_model`` as the prompt.  ``train_with_grpo`` samples two groups of
    candidate answers, scores them with ``reward_model`` and raises the
    likelihood of the best candidate of each group whenever it scores higher
    than the greedy reference answer.
    """

    def __init__(self, base_model, tokenizer, model, collection, reward_model):
        """Store the pipeline components and create the optimizer.

        Args:
            base_model: Causal language model used for generation and training.
            tokenizer: Tokenizer matching ``base_model``.
            model: Document/question encoder exposing ``encode``.
            collection: Vector store exposing ``query``.
            reward_model: Object exposing
                ``compute_reward(question, context, response)``.
        """
        self.base_model = base_model
        self.tokenizer = tokenizer
        self.encoder_model = model  # Document encoder model
        self.collection = collection
        self.reward_model = reward_model
        self.device = next(base_model.parameters()).device
        self.optimizer = Adam(self.base_model.parameters(), lr=1e-5)

    def _retrieve_context(self, question, top_k):
        """Return the text of the best-matching chunk, or a fallback message."""
        question_embedding = self.encoder_model.encode(question).tolist()
        results = self.collection.query(
            query_embeddings=[question_embedding],
            n_results=top_k,
        )
        # Only the first hit is used, whatever `top_k` is.
        if "metadatas" in results and results["metadatas"] and results["metadatas"][0]:
            metadata = results["metadatas"][0][0]
            if metadata and "text" in metadata:
                return metadata["text"]
        return "No relevant context found."

    def _tokenize(self, text):
        """Tokenize ``text`` and move the resulting tensors to the model device."""
        encoded = self.tokenizer(text, return_tensors="pt")
        return {name: tensor.to(self.device) for name, tensor in encoded.items()}

    def _generate_text(self, inputs, **generate_kwargs):
        """Run ``base_model.generate`` and decode the first returned sequence."""
        tokens = self.base_model.generate(**inputs, **generate_kwargs)
        return self.tokenizer.decode(tokens[0], skip_special_tokens=True)

    def query_csv_data(self, question, top_k=1, is_training=False):
        """Answer ``question`` from the retrieved context.

        Args:
            question: Natural-language question.
            top_k: Number of chunks requested from the vector store.
            is_training: When False, return one generated answer.  When True,
                return the candidate dictionary built by
                ``_generate_with_context``.

        Returns:
            str | dict: The decoded answer, or the training candidates.
        """
        context = self._retrieve_context(question, top_k)

        # The inference prompt is the retrieved context on its own.
        prompt = f"""{context}"""
        inputs = self._tokenize(prompt)

        # Generate multiple candidate responses during reinforcement learning
        if is_training:
            return self._generate_with_context(question, context, inputs)

        # Otherwise perform normal inference
        return self._generate_text(inputs)

    def _generate_with_context(self, question, context, inputs):
        """Generate the reference answer and two groups of candidate answers.

        Args:
            question: The question being answered.
            context: Retrieved context text.
            inputs: Tokenized context prompt (already on the model device).

        Returns:
            dict: Keys ``question``, ``context``, ``reference`` (greedy
            answer), ``group1`` (three answers sampled at temperatures 0.5,
            0.7 and 0.9) and ``group2`` (three answers sampled from
            differently phrased prompts).
        """
        # Reference response: greedy decoding
        with torch.no_grad():
            reference_text = self._generate_text(
                inputs, do_sample=False, max_length=256
            )

        # Group 1: same prompt, different sampling temperatures
        group1_texts = [
            self._generate_text(
                inputs, do_sample=True, temperature=temp, max_length=256
            )
            for temp in [0.5, 0.7, 0.9]
        ]

        # Group 2: same temperature, different ways of presenting the context
        prompt_templates = [
            f"Based on this information: {context}\nQuestion: {question}\nAnswer:",
            f"Question: {question}\nRelevant context: {context}\nAnswer:",
            f"Context: {context}\nUsing only this context, answer: {question}\nAnswer:"
        ]
        group2_texts = [
            self._generate_text(
                self._tokenize(template),
                do_sample=True,
                temperature=0.7,
                max_length=256,
            )
            for template in prompt_templates
        ]

        return {
            "question": question,
            "context": context,
            "reference": reference_text,
            "group1": group1_texts,
            "group2": group2_texts
        }

    def train_with_grpo(self, train_questions, num_epochs=3, group_weights=(0.6, 0.4)):
        """Fine-tune ``base_model`` on ``train_questions``.

        Args:
            train_questions: Iterable of question strings.  It is copied, so
                the caller's sequence is never reordered.
            num_epochs: Number of passes over the questions.
            group_weights: Loss weight of group 1 and group 2 (an immutable
                default avoids sharing one list between calls).
        """
        questions = list(train_questions)

        for epoch in range(num_epochs):
            epoch_loss = 0
            random.shuffle(questions)

            for question in questions:
                # Generate candidate responses for each question
                generation_results = self.query_csv_data(question, top_k=3, is_training=True)
                context = generation_results["context"]

                # Compute rewards
                reference_reward = self.reward_model.compute_reward(
                    question, context, generation_results["reference"]
                )
                group_rewards = [
                    [
                        self.reward_model.compute_reward(question, context, text)
                        for text in generation_results[group_key]
                    ]
                    for group_key in ("group1", "group2")
                ]

                # Compute GRPO loss
                loss = self._compute_grpo_loss(
                    generation_results,
                    reference_reward,
                    group_rewards,
                    group_weights
                )

                # When no candidate beat the reference the loss is the plain
                # number 0 (or carries no gradient): there is nothing to
                # back-propagate, so skip the optimization step.
                if not (isinstance(loss, torch.Tensor) and loss.requires_grad):
                    continue

                # Optimization step
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()

            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

    def _compute_grpo_loss(self, generation_results, reference_reward, group_rewards, group_weights):
        """Combine the per-group losses for one question.

        For each group the highest-reward candidate is selected; it
        contributes ``-weight * advantage * (log p(best) - log p(reference))``
        when its advantage over the reference reward is positive.

        Args:
            generation_results: Dictionary from ``_generate_with_context``.
            reference_reward: Reward of the greedy reference answer.
            group_rewards: ``[group1_rewards, group2_rewards]``.
            group_weights: Loss weight for each group.

        Returns:
            A scalar tensor, or the number 0 when no group improved on the
            reference.
        """
        question = generation_results["question"]
        context = generation_results["context"]
        group_texts = (generation_results["group1"], generation_results["group2"])

        loss = 0
        ref_log_prob = None  # computed lazily, once, and shared by both groups

        for g_idx, texts in enumerate(group_texts):
            # Identify the response with the highest reward within the group
            rewards = group_rewards[g_idx]
            best_idx = int(np.argmax(rewards))

            # Reward difference (advantage) over the reference answer
            advantage = rewards[best_idx] - reference_reward
            if not advantage > 0:  # Learn only when improvement exists
                continue

            if ref_log_prob is None:
                # The reference is a fixed baseline: no gradient needed.
                ref_log_prob = self._compute_log_prob(
                    question, context, generation_results["reference"]
                )

            # Gradients must flow through the chosen candidate, otherwise
            # `loss.backward()` has nothing to differentiate.
            best_log_prob = self._compute_log_prob(
                question, context, texts[best_idx], with_grad=True
            )

            # GRPO loss with group weighting
            loss = loss - group_weights[g_idx] * advantage * (best_log_prob - ref_log_prob)

        return loss

    def _compute_log_prob(self, question, context, response, with_grad=False):
        """Return the mean log-probability of ``response`` given the context prompt.

        The prompt and the response are concatenated into one sequence and the
        prompt positions are masked out of the labels with -100, so the model
        loss is averaged over the response tokens only.  (Passing the response
        ids as ``labels`` for prompt-only inputs, as before, fails whenever the
        two lengths differ.)

        Args:
            question: Unused; kept for interface compatibility.
            context: Retrieved context, used verbatim as the prompt.
            response: Text whose likelihood is evaluated.
            with_grad: Track gradients through the model when True.

        Returns:
            Scalar tensor equal to the negative language-model loss.
        """
        prompt = f"{context}"
        prompt_ids = self._tokenize(prompt)["input_ids"]
        # No special tokens here: the response continues the prompt sequence.
        response_ids = self.tokenizer(
            response, return_tensors="pt", add_special_tokens=False
        )["input_ids"].to(self.device)

        if response_ids.shape[-1] == 0:
            # Nothing to score; avoid a NaN loss over zero target tokens.
            return torch.zeros((), device=self.device)

        input_ids = torch.cat([prompt_ids, response_ids], dim=1)
        labels = input_ids.clone()
        labels[:, : prompt_ids.shape[1]] = -100  # ignore the prompt positions

        with torch.set_grad_enabled(with_grad):
            outputs = self.base_model(
                input_ids=input_ids,
                attention_mask=torch.ones_like(input_ids),
                labels=labels,
            )

        return -outputs.loss  # Negative loss corresponds to log probability




In [None]:
class RewardModel(nn.Module):
    """Reward model that asks a causal LM to grade a response from 1 to 10."""

    def __init__(self, base_model, tokenizer):
        """Wrap an existing language model as the judge.

        Args:
            base_model: Causal language model used to produce the rating.
            tokenizer: Tokenizer matching ``base_model``.
        """
        # nn.Module must be initialised before a module can be assigned as an
        # attribute; without this call `self.model = base_model` raises.
        super().__init__()
        # Use an existing base model
        self.model = base_model
        self.tokenizer = tokenizer
        self.device = next(base_model.parameters()).device

    def compute_reward(self, question, context, response):
        """Return the judge's score for ``response``.

        Args:
            question: The question that was asked.
            context: Retrieved context shown to the generator.
            response: Candidate answer to grade.

        Returns:
            float: Score parsed from the judge output, or 5.0 when no score in
            the range 1-10 can be found.
        """
        # Construct the prompt
        prompt = f"Question: {question}\nContext: {context}\nResponse: {response}\n\nEvaluate the quality of the response to the above question and context on a scale from 1 to 10."

        # Tokenize the input
        inputs = self.tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Generate the evaluation. `do_sample=False` already selects greedy
        # decoding, so no sampling temperature is passed.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=False
            )

        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(prompt) is unreliable because decoding does not always
        # reproduce the prompt character for character.
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        return self._extract_score_from_text(generated_text)

    def _extract_score_from_text(self, text):
        """Return the first standalone integer from 1 to 10 found in ``text``.

        Args:
            text: Judge output to parse.

        Returns:
            float: The parsed score, or the neutral score 5.0 when ``text`` is
            not a string or contains no such number.
        """
        if not isinstance(text, str):
            return 5.0

        # Find the first number in the text
        match = re.search(r'\b([1-9]|10)\b', text)
        if match is None:
            return 5.0  # Neutral score
        return float(match.group(1))



---

## response without grpo learning

In [None]:
def query_csv_data(question, top_k=1):
    """Generate a response from the chunk that best matches ``question``.

    The question is embedded with the global ``encoder_model``, the global
    ``collection`` is queried for ``top_k`` neighbours, and the text of the
    first hit is passed on its own as the prompt to the global ``base_model``.

    Args:
        question: Natural-language question.
        top_k: Number of neighbours requested from the collection.

    Returns:
        str: The decoded output of ``base_model.generate``.
    """
    # Look up the nearest stored chunks for the embedded question
    query_vector = encoder_model.encode(question).tolist()
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)

    # Use the text of the first hit, if there is one
    context = "No relevant context found."
    hits = results["metadatas"] if "metadatas" in results else None
    if hits and hits[0]:
        first_hit = hits[0][0]
        if first_hit and "text" in first_hit:
            context = first_hit["text"]

    #print(f"Context used: {context}")

    # The prompt is the retrieved context alone
    prompt = f"{context}"

    # Tokenize on the same device as the model and generate
    model_device = next(base_model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}
    generated = base_model.generate(**encoded)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Ask one sample question and print it next to the generated response
question = 'Please choose one recommended wine from Spain and tell us its title and characteristics.'
response_text = query_csv_data(question, top_k=1)

separator = '=============================================='
print(
    separator,
    f'question:  {question}',
    separator,
    f'response:  {response_text}',
    separator,
    sep='\n',
)

---

## response with grpo learning

The `tokenizer`, `base_model`, `encoder_model`, and `collection` objects were already prepared in the cells above.

In [None]:
# RewardModel.__init__ requires the judge model and its tokenizer; calling it
# without arguments raises TypeError.
reward_model = RewardModel(base_model, tokenizer)

In [None]:
# 6. Main Execution Section
def main():
    """Fine-tune the RAG pipeline with GRPO and answer a sample question.

    Uses the notebook-level ``base_model``, ``tokenizer``, ``encoder_model``,
    ``collection`` and ``reward_model`` objects.
    """
    # Set up models and components
    #tokenizer, base_model, encoder_model, collection, reward_model = setup_models_and_components()

    save_path = "./specialized_qa_model"

    # Load training question data
    # In an actual project, load a domain-specific QA dataset
    train_questions = [
        "What are the key symptoms of diabetes?",
        "How does machine learning work?",
        "What is the capital of France?",
        # ... many more questions
    ]

    # Run GRPO training.  No `train_rag_with_grpo` helper is defined in this
    # notebook, so the RAGWithGRPO class is driven directly.
    print("Starting GRPO training...")
    trained_rag_model = RAGWithGRPO(
        base_model,
        tokenizer,
        encoder_model,
        collection,
        reward_model,
    )
    trained_rag_model.train_with_grpo(train_questions, num_epochs=3)

    # Persist the fine-tuned weights (assumes the model exposes the Hugging
    # Face / PEFT `save_pretrained` method).
    trained_rag_model.base_model.save_pretrained(save_path)

    # Test with the trained model
    test_questions = [
        "Please choose one recommended wine from Spain and tell us its title and characteristics.",
    ]

    print("\nTesting trained model:")
    for q in test_questions:
        answer = trained_rag_model.query_csv_data(q, is_training=False)
        print(f"Q: {q}")
        print(f"A: {answer}\n")

if __name__ == "__main__":
    main()


![](https://storage.googleapis.com/kagglesdsdata/datasets/1429416/11258180/raggrpo.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=databundle-worker-v2%40kaggle-161607.iam.gserviceaccount.com%2F20250403%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250403T032341Z&X-Goog-Expires=345600&X-Goog-SignedHeaders=host&X-Goog-Signature=3af5058d97325cd87db6cde867b9014c88121bba5a0e2edf7830d35f95d7f165d01c80f3f8e6d841ec7ee8d621db7517037a3ce804f9f170c20a536c4889443036a6d960028fc6a18334502fa040290650c6642f38a53de71fe8dba64f8df0fccfbe3b7df814543c2d8dabec71efa32ee34aeb76c1397e74165b6307f1618436fb84434a12a6f46358e89d63b07247e86fab9494ee23366b2f9b826324a5e6be43f6db07754d756b9257e4dbc515ab012654a2a54a5193b78e9891b9cf7cb8bb80e8441c9156bd001155c8a1c6f55d4a52924a6b302491b5837f4bacd63250cbfc1118d6567f04b6265785087b7adcba70ef70e596aa060db2a1d899a6f34502)