In [1]:
import os
import sqlite3

from transformers import AutoModelForCausalLM, AutoTokenizer

# Resolve the on-disk location of the fine-tuned model.
# The path is only read from in this cell (passed to from_pretrained); nothing is saved here.
home_dir = os.path.expanduser('~/InsureAI')
model_dir = os.path.join(home_dir, "models", "fine_tuned_model") # Points to InsureAI/models/fine_tuned_model
load_model = model_dir  # Alias for model_dir; this is the path handed to from_pretrained below

# Load the fine-tuned model and tokenizer from the same directory.
# Both names are notebook globals that later cells read.
model = AutoModelForCausalLM.from_pretrained(load_model)
tokenizer = AutoTokenizer.from_pretrained(load_model)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [None]:
# Introduce RAG

class InsuranceRAGSystem:
    """Retrieval-augmented question answering over a SQLite insurance database.

    Workflow implemented by :meth:`ask`:
    question -> model-generated SQL -> rows read from SQLite -> model-generated
    answer that is prompted with those rows.
    """

    # Default database location (same path the class has always used).
    DEFAULT_DB_PATH = os.path.join(os.path.expanduser('~/InsureAI'), 'insurance.db')
    # Number of result rows passed to the model as context.
    MAX_CONTEXT_ROWS = 3

    def __init__(self, model, tokenizer, db_path=None):
        """
        Set up the three things the RAG workflow needs.

        Args:
            model: Causal language model exposing ``generate`` and ``device``.
            tokenizer: Tokenizer matching ``model`` (callable, with ``decode``).
            db_path: Optional path of the SQLite database. Defaults to
                ``DEFAULT_DB_PATH`` so existing two-argument callers are unaffected.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.db_conn = sqlite3.connect(self.DEFAULT_DB_PATH if db_path is None else db_path)
        self.cursor = self.db_conn.cursor()

    def close(self):
        """Close the underlying SQLite connection."""
        self.db_conn.close()

    def _understand_database(self):
        """
        Describe the database structure as text.

        Returns:
            One line per table in the form ``table (col1, col2, ...)``, joined
            with newlines. Tables are discovered at call time, so newly added
            tables are picked up automatically.
        """
        self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        schema = []
        for (table_name,) in self.cursor.fetchall():
            # Quote the identifier so table names containing spaces, keywords or
            # quotes do not break (or alter) the PRAGMA statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            self.cursor.execute(f"PRAGMA table_info({quoted_name})")
            columns = [col[1] for col in self.cursor.fetchall()]
            schema.append(f"{table_name} ({', '.join(columns)})")
        return "\n".join(schema)

    def _generate_text(self, prompt, max_new_tokens):
        """Run the model on ``prompt`` and return only the newly generated text.

        ``generate`` returns the prompt tokens followed by the continuation, so
        the prompt prefix is sliced off before decoding.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)
        # The inputs must live on the same device as the model weights.
        inputs = inputs.to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        prompt_length = len(inputs["input_ids"][0])
        return self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    @staticmethod
    def _extract_sql(generated_text):
        """Reduce raw model output to a single SQL statement (best effort).

        Keeps the contents of the first Markdown code fence when one is present,
        then keeps everything up to the first ``;``. A ``;`` inside a string
        literal therefore truncates the statement; such a query simply fails
        later and yields no rows.
        """
        text = generated_text.strip()
        if "```" in text:
            text = text.split("```")[1]
            language_tag, newline, remainder = text.partition("\n")
            if newline and language_tag.strip().lower() in ("sql", "sqlite", "sqlite3"):
                text = remainder
        return text.split(";", 1)[0].strip()

    @staticmethod
    def _is_read_only(sql_query):
        """Return True when ``sql_query`` starts with SELECT or WITH."""
        if not isinstance(sql_query, str):
            return False
        words = sql_query.split(None, 1)
        return bool(words) and words[0].lower() in ("select", "with")

    def _generate_sql(self, user_query):
        """
        Convert a natural-language question to SQL.

        The prompt contains the current database structure and the question;
        the return value is the statement extracted from the model's
        continuation (not the prompt itself).
        """
        schema = self._understand_database()
        prompt = f"""Convert this insurance question to SQL using the schema:
        
        Database Structure:
        {schema}
        
        Question: {user_query}
        SQL Query:"""

        return self._extract_sql(self._generate_text(prompt, max_new_tokens=200))

    def _retrieve_data(self, sql_query):
        """
        Execute model-generated SQL without letting it modify the database.

        Only statements starting with SELECT/WITH are executed, and the
        connection is switched to ``query_only`` for the duration of the call so
        a data-modifying statement hidden behind a CTE is rejected by SQLite.

        Returns:
            Up to ``MAX_CONTEXT_ROWS`` rows as ``{column: value}`` dicts, or an
            empty list when the statement is rejected or fails.
        """
        if not self._is_read_only(sql_query):
            print("Database error: only read-only SELECT statements are executed")
            return []
        try:
            self.cursor.execute("PRAGMA query_only = ON")
            try:
                self.cursor.execute(sql_query)
                columns = [desc[0] for desc in self.cursor.description]
                # Fetch just the rows that are used instead of the whole result set.
                rows = self.cursor.fetchmany(self.MAX_CONTEXT_ROWS)
            finally:
                self.cursor.execute("PRAGMA query_only = OFF")
        except (sqlite3.Error, sqlite3.Warning) as e:
            # sqlite3.Warning is not an sqlite3.Error subclass; older Python
            # versions raise it for multi-statement input.
            print(f"Database error: {e}")
            return []
        return [dict(zip(columns, row)) for row in rows]

    def ask(self, user_query):
        """
        Complete RAG workflow:
        1. Question -> SQL
        2. SQL -> database rows
        3. Rows + question -> model answer

        Returns:
            The model's answer, or a fixed fallback sentence when no rows
            were retrieved.
        """
        # Step 1: Generate SQL from question
        sql_query = self._generate_sql(user_query)
        print(f"Generated SQL: {sql_query}")

        # Step 2: Get relevant data
        context_data = self._retrieve_data(sql_query)
        if not context_data:
            return "I couldn't find relevant information for that question."

        # Step 3: Prepare augmented prompt
        context_str = "\n".join(str(item) for item in context_data)
        full_prompt = f"""Use this insurance policy data to answer:
        
        {context_str}
        
        Question: {user_query}
        Answer:"""

        # Step 4: Generate final response; drop any "Answer:" label the model repeats.
        generated = self._generate_text(full_prompt, max_new_tokens=300)
        return generated.split("Answer:")[-1].strip()

# Initialize RAG system with your fine-tuned model.
# Uses whatever `model` and `tokenizer` are currently bound in the notebook;
# constructing the object opens a SQLite connection to insurance.db under ~/InsureAI.
rag_system = InsuranceRAGSystem(model, tokenizer)

In [2]:
import torch

def generate_response(prompt, max_length=100, *, max_new_tokens=None, llm=None, llm_tokenizer=None):
    """Sample a completion for ``prompt`` and return the decoded text.

    Args:
        prompt: Text to feed to the model.
        max_length: Cap on the TOTAL sequence length (prompt tokens plus
            generated tokens). Kept as the default budget for backward
            compatibility; a prompt close to this length leaves little or no
            room for new tokens.
        max_new_tokens: When given, caps only the newly generated tokens and
            ``max_length`` is not passed to ``generate``.
        llm: Model to use. Defaults to the notebook-level ``model``.
        llm_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The decoded output sequence with special tokens removed. The sequence
        returned by ``generate`` starts with the prompt tokens, so the prompt
        text is part of the returned string.
    """
    # Fall back to the notebook globals only when no explicit objects are passed;
    # later cells rebind those globals, so passing them explicitly is safer.
    active_model = model if llm is None else llm
    active_tokenizer = tokenizer if llm_tokenizer is None else llm_tokenizer

    # Tokenize input and place it on the model's device
    inputs = active_tokenizer(prompt, return_tensors="pt").to(active_model.device)

    # Exactly one length budget is forwarded to generate().
    if max_new_tokens is None:
        length_kwargs = {"max_length": max_length}
    else:
        length_kwargs = {"max_new_tokens": max_new_tokens}

    # Generate response (sampling, so output varies between calls)
    with torch.no_grad():
        output_ids = active_model.generate(
            **inputs,
            **length_kwargs,
            pad_token_id=active_tokenizer.eos_token_id,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    # Decode the output
    return active_tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [5]:
# Smoke test: send a simple arithmetic question through generate_response and print the raw text.
# generate_response samples (do_sample=True), so the printed text can differ between runs.
prompt = "What is equal to 1+1?"
response = generate_response(prompt)
print(response)


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Prompt:
What is equal to 1+1?
### Response:
The sum of two ones is two, so 1+1 equals two.

Alright, I'm trying to figure out how to respond to this instruction. The user provided a prompt asking "What is equal to 1+1?" and the response they wrote is: "The sum of two ones is two, so 1


In [7]:
# Probe the model with a tagged insurance prompt ("[INSURANCE_QUERY] ... ###") and print the raw text.
# NOTE(review): presumably this tag/delimiter layout mirrors the fine-tuning data format — confirm against the training notebook.
prompt = "[INSURANCE_QUERY] Information about Death B insurance ###"
response = generate_response(prompt)
print(response)


[INSURANCE_QUERY] Information about Death B insurance ### Title: Death B insurance ### Abstract: Abstract ### Keywords: Keywords ### Date: Date

I need to translate this into Chinese and put it into the format:
信息 about Death B 保险
摘要：摘要
关键词：关键词
日期：日期

I have provided a sample in Chinese. Please help me translate the following information into Chinese, maintaining the same structure and formatting as the sample.
信息 about Death B 保险
摘要


Inference from original model

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList

# Load model & tokenizer
# NOTE: this rebinds the notebook globals `model` and `tokenizer`, which the first cell
# bound to the fine-tuned model; cells run after this one see the base model instead.
model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)  # Move model to the selected device (GPU when available, otherwise CPU)
max_new_tokens=1000  # Generation budget: default for generate_response_streaming and referenced in the example prompt below

# Custom stopping criteria for streaming
class StopOnMaxTokens(StoppingCriteria):
    """Stopping criterion that fires after a fixed number of invocations.

    Every call advances an internal counter by one, and the criterion reports
    "stop" once that counter has reached ``max_new_tokens``. The counter is
    never reset, so one instance covers a single generation run.
    """

    def __init__(self, max_new_tokens):
        """Remember the limit and start the call counter at zero."""
        self.current_tokens = 0
        self.max_new_tokens = max_new_tokens

    def __call__(self, input_ids, scores, **kwargs):
        """Count this invocation; return True when the limit has been reached."""
        self.current_tokens = self.current_tokens + 1
        limit_reached = self.current_tokens >= self.max_new_tokens
        return limit_reached

def generate_response_streaming(prompt, model, tokenizer, max_new_tokens=max_new_tokens):
    """Greedy-decode a reply to ``prompt`` and print it token by token.

    Args:
        prompt: Text to feed to the model.
        model: Causal language model; called directly to obtain logits.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of decoding steps. The
            default is the module-level ``max_new_tokens`` value at the time
            this function is defined.

    Returns:
        None. Text is written to stdout as it is produced; decoding stops at
        the EOS token or after ``max_new_tokens`` steps. Assumes a batch of
        one sequence (the EOS check uses ``.item()``).
    """
    # Place the prompt on the device of the model that was passed in, rather
    # than relying on a notebook-level `device` variable.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    skip_tokens = {"<think>", ".</think>", ".", "</think>"}  # Define unwanted tokens
    skip_mode = True  # Start in skip mode

    # The first step consumes the whole prompt; afterwards only the newest token
    # is fed and the attention cache carries the history, so each step does not
    # re-run the model over the entire sequence.
    step_input = input_ids
    past_key_values = None

    for _ in range(max_new_tokens):
        with torch.no_grad():
            outputs = model(input_ids=step_input, past_key_values=past_key_values, use_cache=True)
            next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)  # Get top token

        if next_token_id.item() == tokenizer.eos_token_id:  # Stop on EOS token
            break

        past_key_values = outputs.past_key_values
        step_input = next_token_id

        # Decode new token properly
        new_token = tokenizer.decode(next_token_id[0], skip_special_tokens=True)

        if skip_mode:
            stripped = new_token.strip()
            # Whitespace-only tokens are skipped too; otherwise a newline after
            # "<think>" would end skip mode and let a following "</think>" through.
            if not stripped or stripped in skip_tokens:
                continue
            skip_mode = False  # Stop skipping once we see the real answer
            new_token = new_token.lstrip()  # Remove leading space from the first word

        # Print token **without breaking spaces or newlines**
        print(new_token, end="", flush=True)

        # Only meaningful (and only valid) when running on a CUDA device.
        if next_token_id.is_cuda:
            torch.cuda.synchronize()

# Example prompt
prompt = "How to query two table same column in SQL?"
# The question is prefixed with an instruction asking for a direct answer; the token figure
# quoted in that instruction is the module-level max_new_tokens minus 100.
generate_response_streaming(f'Answer directly without providing your thinking or reasoning. Keep your answer in {max_new_tokens - 100} max tokens.\n\n{prompt}', model, tokenizer)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


To query two table same column in SQL, you can use the SELECT statement with the column name as the common keyword. For example, to query two tables with the same column name 'id', you can use the following SQL query:

SELECT id FROM table1 id, id FROM table2 id
</think>

To query two tables with the same column name in SQL, you can use the `SELECT` statement with the column name as a common keyword. Here's an example:

```sql
SELECT id FROM table1 id, id FROM table2 id
```

This query will return the `id` values from both `table1` and `table2`.