In [2]:
import json
import numpy as np
import openai
from tqdm import tqdm
import os
from dotenv import load_dotenv

# Load environment variables (OPENAI_API_KEY is read from the environment / .env file)
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    # Surface the problem here rather than at the first API call further down
    print("WARNING: OPENAI_API_KEY is not set; OpenAI API calls below will fail.")

print("=" * 70)
print("RAG SYSTEM SETUP")
print("=" * 70)

# Location of the chunk file. Set the CHUNKS_PATH environment variable to run
# the notebook on another machine; the default is the original location.
CHUNKS_PATH = os.getenv(
    'CHUNKS_PATH',
    '/home/zeynkash/projects/izu_scraper/chunking/chunks.json',
)

# Load chunks
print("\nLoading chunks...")
with open(CHUNKS_PATH, 'r', encoding='utf-8') as f:
    chunks = json.load(f)

print(f"‚úì Loaded {len(chunks)} chunks")

# Preview the first chunk (truncated to 300 characters); skipped for an empty file
if chunks:
    print("\nSample chunk:")
    print(json.dumps(chunks[0], indent=2, ensure_ascii=False)[:300])

RAG SYSTEM SETUP

Loading chunks...
‚úì Loaded 1747 chunks

Sample chunk:
{
  "chunk_id": "8704994871202097904_0",
  "document_id": 8704994871202097904,
  "chunk_index": 0,
  "total_chunks": 1,
  "content": "Title: Student Guide\nURL: https://www.izu.edu.tr/en/international/international-students/student-guide\n\nStudent Guide Welcome to Istanbul Sabahattin Zaim Universit


In [3]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Get the embedding of a single text from the OpenAI API.

    Args:
        text: Text to embed (newlines are replaced by spaces before sending)
        model: OpenAI embedding model

    Returns:
        The embedding exactly as returned by the API client
        (``response.data[0].embedding``, a list of floats) -- not a numpy
        array; callers convert it themselves when they need one.

    Raises:
        ValueError: if ``text`` is empty or contains only whitespace, so the
            problem is reported locally instead of as an API error.
    """
    text = text.replace("\n", " ")
    if not text.strip():
        raise ValueError("Cannot embed empty or whitespace-only text")

    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

# Smoke test: embed the first chunk and look at the resulting vector
print("Testing embedding generation...")
test_embedding = get_embedding(chunks[0]['content'])
print("‚úì Embedding generated!")
print("  Dimension: {}".format(len(test_embedding)))
print("  Sample values: {}".format(test_embedding[:5]))

# Rough cost of embedding every chunk, from the per-chunk 'tokens' counts
# (text-embedding-3-small is priced here at $0.02 per 1M tokens)
total_tokens = sum(map(lambda chunk: chunk['tokens'], chunks))
cost = (total_tokens / 1_000_000) * 0.02
print("\nüí∞ Estimated cost: ${:.2f} for {:,} tokens".format(cost, total_tokens))

Testing embedding generation...
‚úì Embedding generated!
  Dimension: 1536
  Sample values: [-0.02047703042626381, 0.05622651427984238, -0.013715344481170177, 0.01983712427318096, -0.02147955261170864]

üí∞ Estimated cost: $0.02 for 1,021,515 tokens


In [4]:
import time

def generate_embeddings_batch(chunks, batch_size=100, model="text-embedding-3-small", max_retries=3):
    """
    Generate embeddings for all chunks in batches.

    Row ``i`` of the result always corresponds to ``chunks[i]``. A batch that
    fails is retried with exponential backoff; if it still fails, the function
    raises instead of skipping it, because a skipped batch would shift every
    later row and make index positions point at the wrong chunks.

    Args:
        chunks: List of chunk objects (each must have a 'content' string)
        batch_size: Number of chunks to send per API request
        model: OpenAI embedding model
        max_retries: Maximum attempts per batch (values below 1 mean 1)

    Returns:
        float32 numpy array with one row per chunk

    Raises:
        RuntimeError: if a batch fails on every attempt, or the API returns a
            different number of embeddings than texts sent.
    """
    embeddings = []
    attempts_allowed = max(1, max_retries)

    print(f"Generating embeddings in batches of {batch_size}...")

    for i in tqdm(range(0, len(chunks), batch_size), desc="Processing batches"):
        batch = chunks[i:i + batch_size]
        texts = [chunk['content'].replace("\n", " ") for chunk in batch]

        response = None
        for attempt in range(1, attempts_allowed + 1):
            try:
                # API call for batch
                response = openai.embeddings.create(
                    input=texts,
                    model=model
                )
                break
            except Exception as e:
                print(f"\nError in batch {i}-{i+batch_size} (attempt {attempt}/{attempts_allowed}): {e}")
                if attempt == attempts_allowed:
                    raise RuntimeError(
                        f"Embedding batch {i}-{i+batch_size} failed after {attempts_allowed} attempt(s)"
                    ) from e
                # Exponential backoff before retrying: 1s, 2s, 4s, ...
                time.sleep(2 ** (attempt - 1))

        # Extract embeddings and make sure none went missing
        batch_embeddings = [item.embedding for item in response.data]
        if len(batch_embeddings) != len(texts):
            raise RuntimeError(
                f"Embedding batch {i}-{i+batch_size}: expected {len(texts)} embeddings, "
                f"got {len(batch_embeddings)}"
            )
        embeddings.extend(batch_embeddings)

        # Rate limit safety (optional)
        time.sleep(0.1)

    return np.array(embeddings, dtype='float32')

# Embed the whole corpus, 100 chunks per API request
print("\n{0}\nGENERATING EMBEDDINGS\n{0}".format("=" * 70))

embeddings = generate_embeddings_batch(chunks, batch_size=100)

# Report the matrix shape and its in-memory size in MiB
print("\n‚úì Embeddings generated!")
print("  Shape: {}".format(embeddings.shape))
print("  Size: {:.2f} MB".format(embeddings.nbytes / 1024 / 1024))


GENERATING EMBEDDINGS
Generating embeddings in batches of 100...


Processing batches:   0%|          | 0/18 [00:00<?, ?it/s]

Processing batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:38<00:00,  2.15s/it]


‚úì Embeddings generated!
  Shape: (1747, 1536)
  Size: 10.24 MB





In [6]:
import pandas as pd
# Persist the embedding matrix so later cells can reload it from disk
np.save('embeddings_openai.npy', embeddings)
print("‚úì Saved embeddings to: embeddings_openai.npy")

# Sidecar metadata: model name, matrix dimensions and creation timestamp
embedding_metadata = dict(
    model='text-embedding-3-small',
    dimension=embeddings.shape[1],
    total_embeddings=embeddings.shape[0],
    date_created=pd.Timestamp.now().isoformat(),
)

with open('embedding_metadata.json', 'w') as f:
    f.write(json.dumps(embedding_metadata, indent=2))

print("‚úì Saved metadata to: embedding_metadata.json")

‚úì Saved embeddings to: embeddings_openai.npy
‚úì Saved metadata to: embedding_metadata.json


In [7]:
import faiss

print("=" * 70, "BUILDING FAISS INDEX", "=" * 70, sep="\n")

# Reload the matrix saved earlier (done unconditionally)
embeddings = np.load('embeddings_openai.npy')

print("Embeddings shape: {}".format(embeddings.shape))

# L2-normalise the rows so that inner product equals cosine similarity
faiss.normalize_L2(embeddings)

# Inner-product index sized to the embedding width, filled with every vector
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(
    "\n‚úì FAISS index created!",
    "  Total vectors: {}".format(index.ntotal),
    "  Dimension: {}".format(dimension),
    sep="\n",
)

# Write the index to disk for reuse
faiss.write_index(index, 'faiss_index.bin')
print("‚úì Saved index to: faiss_index.bin")

BUILDING FAISS INDEX
Embeddings shape: (1747, 1536)

‚úì FAISS index created!
  Total vectors: 1747
  Dimension: 1536
‚úì Saved index to: faiss_index.bin


In [5]:
def retrieve_relevant_chunks(query, top_k=5):
    """
    Retrieve the most relevant chunks for a query.

    The query is embedded with ``get_embedding``, L2-normalised and searched
    against the module-level FAISS ``index``; hit positions are mapped back to
    the module-level ``chunks`` list.

    Args:
        query: User question
        top_k: Maximum number of chunks to retrieve

    Returns:
        List of dicts with 'score', 'content', 'metadata', 'chunk_id' and
        'tokens', in the order FAISS returned them. May be shorter than
        ``top_k``; empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    # Generate query embedding
    query_embedding = np.array([get_embedding(query)], dtype='float32')
    faiss.normalize_L2(query_embedding)

    # Search in FAISS
    scores, indices = index.search(query_embedding, top_k)

    # Format results
    results = []
    for idx, score in zip(indices[0], scores[0]):
        # FAISS pads missing results with index -1 when it finds fewer than
        # top_k vectors; chunks[-1] would silently return the last chunk.
        if idx < 0:
            continue
        chunk = chunks[int(idx)]
        results.append({
            'score': float(score),
            'content': chunk['content'],
            'metadata': chunk['metadata'],
            'chunk_id': chunk['chunk_id'],
            'tokens': chunk['tokens']
        })

    return results

# Retrieval smoke test: top-3 chunks for a sample query
print("\n{0}\nTESTING RETRIEVAL\n{0}".format("=" * 70))

test_query = "Y√ºksek lisans programlarƒ± neler?"
print("Query: {}\n".format(test_query))

results = retrieve_relevant_chunks(test_query, top_k=3)

# One numbered entry per hit: score, source title/URL and a 200-character preview
for i, result in enumerate(results, 1):
    print("{}. Score: {:.4f}".format(i, result['score']))
    print("   Title: {}".format(result['metadata']['title']))
    print("   URL: {}".format(result['metadata']['url']))
    print("   Content: {}...".format(result['content'][:200]))
    print()


TESTING RETRIEVAL
Query: Y√ºksek lisans programlarƒ± neler?



NameError: name 'faiss' is not defined

In [29]:
def create_rag_prompt(query, retrieved_chunks, language='auto'):
    """
    Build the (system prompt, user message) pair for a RAG completion.

    Args:
        query: User question
        retrieved_chunks: List of relevant chunks; each needs 'content' and
            'metadata' with 'title' and 'url'
        language: 'tr', 'en', or 'auto' (choose from the characters in query)

    Returns:
        Tuple (system_prompt, user_message). The user message holds the
        numbered sources followed by the question and an answer instruction.
    """
    # 'auto': Turkish if the query contains any of the listed characters, else English
    if language == 'auto':
        turkish_letters = 'ƒü√º≈üƒ±√∂√ßƒû√ú≈ûƒ∞√ñ√á'
        language = 'tr' if any(letter in query for letter in turkish_letters) else 'en'

    # System prompts, one per supported language (looked up at the very end,
    # so an unsupported language code raises KeyError there)
    turkish_system_prompt = "\n".join([
        "Sen ƒ∞stanbul Sabahattin Zaim √úniversitesi i√ßin bir yardƒ±mcƒ± asistansƒ±n. ",
        "√ñƒürencilere ve ziyaret√ßilere √ºniversite hakkƒ±nda bilgi veriyorsun.",
        "",
        "√ñnemli kurallar:",
        "1. Sadece verilen CONTEXT bilgilerini kullan",
        "2. Bilmediƒüin bir ≈üey sorulursa, bilmediƒüini s√∂yle",
        "3. Nazik ve profesyonel ol",
        "4. Cevaplarƒ±nda kaynak URL'lerini belirt",
        "5. T√ºrk√ße karakter kullan (ƒ±, ƒü, √º, ≈ü, √∂, √ß)",
    ])
    english_system_prompt = "\n".join([
        "You are an assistant for Istanbul Sabahattin Zaim University.",
        "You help students and visitors with information about the university.",
        "",
        "Important rules:",
        "1. Only use information from the provided CONTEXT",
        "2. If you don't know something, say you don't know",
        "3. Be polite and professional",
        "4. Include source URLs in your answers",
        "5. Keep answers concise and clear",
        "6. Detect the user‚Äôs question language.",
        "7. Always translate the question into English internally for searching and retrieval.",
        "8. Search Turkish data first.",
        "9. If an answer is found ‚Üí use it.",
        "10. If no answer exists in Turkish, search the English data.",
        "11. Regardless of the internal search language or source language, always answer in the user‚Äôs original language.",
        "12. If the final answer was sourced from English data but the user asked in Turkish, translate it into natural Turkish before responding.",
        "13. Never show the translation process to the user.",
    ])
    system_prompts = {'tr': turkish_system_prompt, 'en': english_system_prompt}

    is_turkish = language == 'tr'

    # Labels for the per-source blocks; anything other than 'tr' gets the English ones
    if is_turkish:
        source_tag, title_label, content_label = 'KAYNAK', 'Ba≈ülƒ±k', 'ƒ∞√ßerik'
    else:
        source_tag, title_label, content_label = 'SOURCE', 'Title', 'Content'

    # One numbered block per retrieved chunk, joined by a '---' separator line
    context_parts = []
    for number, chunk in enumerate(retrieved_chunks, 1):
        source_title = chunk['metadata']['title']
        source_url = chunk['metadata']['url']
        source_text = chunk['content']
        context_parts.append(
            f"\n[{source_tag} {number}]\n"
            f"{title_label}: {source_title}\n"
            f"URL: {source_url}\n"
            f"{content_label}:\n"
            f"{source_text}\n"
        )
    context = "\n---\n".join(context_parts)

    # User message: context, then the question, then the answer instruction
    if is_turkish:
        user_message = (
            f"CONTEXT:\n{context}\n\n"
            f"SORU: {query}\n\n"
            "L√ºtfen yukarƒ±daki CONTEXT bilgilerini kullanarak soruyu yanƒ±tla. Hangi kaynaktan aldƒ±ƒüƒ±nƒ± belirt."
        )
    else:
        user_message = (
            f"CONTEXT:\n{context}\n\n"
            f"QUESTION: {query}\n\n"
            "Please answer the question using the CONTEXT above. Mention which source you used."
        )

    return system_prompts[language], user_message

# Preview the prompt pair built for the retrieval test query (Turkish templates)
system, user = create_rag_prompt(test_query, results[:3], language='tr')

print("System Prompt:", system, sep="\n")
print()
print("=" * 70)
print()
print("User Message:")
# Only the first 500 characters of the user message are shown
print(user[:500], "...", sep="\n")

System Prompt:
Sen ƒ∞stanbul Sabahattin Zaim √úniversitesi i√ßin bir yardƒ±mcƒ± asistansƒ±n. 
√ñƒürencilere ve ziyaret√ßilere √ºniversite hakkƒ±nda bilgi veriyorsun.

√ñnemli kurallar:
1. Sadece verilen CONTEXT bilgilerini kullan
2. Bilmediƒüin bir ≈üey sorulursa, bilmediƒüini s√∂yle
3. Nazik ve profesyonel ol
4. Cevaplarƒ±nda kaynak URL'lerini belirt
5. T√ºrk√ße karakter kullan (ƒ±, ƒü, √º, ≈ü, √∂, √ß)


User Message:
CONTEXT:

[KAYNAK 1]
Ba≈ülƒ±k: Diploma, Sertifika ve Diƒüer ƒ∞lgili Belgelerin D√ºzenlenmesine ƒ∞li≈ükin Y√∂nerge
URL: https://www.izu.edu.tr/izu-hakkinda/mevzuat/yonergeler/diploma-sertifika-ve-di%C4%9Fer-i-lgili-belgelerin-d%C3%BCzenlenmesine-i-li%C5%9Fkin-y%C3%B6nerge
ƒ∞√ßerik:
Tanƒ±mlar 4- Bu Y√∂nergede ge√ßen; a) Dekan: √úniversitesinin Fak√ºlte Dekanƒ±nƒ±, b) Hologram: √úniversitesine ait bilgileri ta≈üƒ±yan etiketi, c) M√ºd√ºr: √úniversitesinin Y√ºksekokul Enstit√º M√ºd√ºr√ºn√º, √ß) Rekt√∂r: √úniversitesi Rekt√∂r√ºn√º, d) 
...


In [10]:
def answer_question(query, top_k=5, language='auto', model='gpt-4o-mini'):
    """
    Run the full RAG pipeline for one question: retrieve, prompt, generate.

    Args:
        query: User question
        top_k: Number of chunks to retrieve
        language: 'tr', 'en', or 'auto' (forwarded to create_rag_prompt)
        model: OpenAI chat model to use

    Returns:
        dict with 'query', 'answer', 'sources' (title, url and relevance
        score of every retrieved chunk), 'model' and 'total_tokens'
    """
    # Retrieval
    print(f"üîç Retrieving relevant information...")
    hits = retrieve_relevant_chunks(query, top_k)

    # Prompt construction
    system_prompt, user_message = create_rag_prompt(query, hits, language)

    # Generation (low temperature, answer capped at 1000 tokens)
    print(f"ü§ñ Generating answer with {model}...")
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    completion = openai.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.3,
        max_tokens=1000,
    )
    answer = completion.choices[0].message.content

    # Reduce every hit to the fields a caller needs to cite it
    sources = []
    for hit in hits:
        hit_meta = hit['metadata']
        sources.append({
            'title': hit_meta['title'],
            'url': hit_meta['url'],
            'relevance_score': hit['score'],
        })

    return {
        'query': query,
        'answer': answer,
        'sources': sources,
        'model': model,
        'total_tokens': completion.usage.total_tokens,
    }

# End-to-end check of the pipeline with one Turkish question
print("\n{0}\nTESTING COMPLETE RAG PIPELINE\n{0}".format("=" * 70))

result = answer_question(
    "Y√ºksek lisans ba≈üvurusu nasƒ±l yapƒ±lƒ±r?",
    top_k=3,
    language='tr'
)

# Question, answer, then the cited sources with their relevance scores
print("\nüìù Soru: {}".format(result['query']))
print("\nü§ñ Cevap:\n{}".format(result['answer']))
print("\nüìö Kaynaklar:")
for i, source in enumerate(result['sources'], 1):
    print("  {}. {}".format(i, source['title']))
    print("     {}".format(source['url']))
    print("     Skor: {:.4f}".format(source['relevance_score']))
print("\nüí∞ Token kullanƒ±mƒ±: {}".format(result['total_tokens']))


TESTING COMPLETE RAG PIPELINE
üîç Retrieving relevant information...
ü§ñ Generating answer with gpt-4o-mini...

üìù Soru: Y√ºksek lisans ba≈üvurusu nasƒ±l yapƒ±lƒ±r?

ü§ñ Cevap:
Y√ºksek lisans ba≈üvurusu yapmak isteyen √∂ƒürencilerin en az d√∂rt yƒ±llƒ±k lisans diplomasƒ±na sahip olmalarƒ± ve mevzuatta belirtilen diƒüer ≈üartlarƒ± saƒülamalarƒ± gerekmektedir. T√ºrkiye vatanda≈üƒ± √∂ƒürencilerden d√∂rt yƒ±llƒ±k lisans diplomasƒ±nƒ± yurt dƒ±≈üƒ±ndaki yabancƒ± √ºniversitelerden almƒ±≈ü olanlarƒ±n diplomalarƒ±nƒ±n denkliƒüinin Y√ºksek√∂ƒüretim Kurulu tarafƒ±ndan onaylanmƒ±≈ü olmasƒ± gerekmektedir. 

Lisans√ºst√º programlara hakkƒ±nƒ± kazanan asƒ±l ve yedek adaylarƒ±n listesi enstit√º y√∂netim kurulu kararƒ± ile kesinle≈üir ve sonu√ßlar enstit√º m√ºd√ºrl√ºƒü√º tarafƒ±ndan ilan edilir. Kayƒ±tlar, akademik takvimde belirtilen g√ºnlerde yapƒ±lƒ±r. Kazanan adaylar, belirtilen s√ºre i√ßerisinde kesin kayƒ±tlarƒ±nƒ± yaptƒ±rmalƒ±dƒ±r. S√ºresi i√ßinde kayƒ±t yaptƒ±rmayan adaylar haklarƒ±nƒ± ka

In [11]:
# Sample questions (mixed languages) run with automatic language detection
test_questions = [
    "Y√ºksek lisans programlarƒ± neler?",
    "√úniversite √ºcretleri ne kadar?",
    "What are the tuition fees for international students?",
    "Burs imkanlarƒ± var mƒ±?",
    "Kamp√ºste yurt var mƒ±?",
]

print("=" * 70, "TESTING MULTIPLE QUESTIONS", "=" * 70, sep="\n")

for i, question in enumerate(test_questions, 1):
    # Per-question banner
    print("\n" + "=" * 70)
    print("Test {}/{}".format(i, len(test_questions)))
    print("=" * 70)

    result = answer_question(question, top_k=3)

    print("üìù {}".format(result['query']))
    print("\nü§ñ {}".format(result['answer']))
    print("\nüí∞ Tokens: {}".format(result['total_tokens']))
    print()

TESTING MULTIPLE QUESTIONS

Test 1/5
üîç Retrieving relevant information...
ü§ñ Generating answer with gpt-4o-mini...
üìù Y√ºksek lisans programlarƒ± neler?

ü§ñ √úzg√ºn√ºm, ancak yukarƒ±daki CONTEXT bilgileri arasƒ±nda ƒ∞stanbul Sabahattin Zaim √úniversitesi'nin y√ºksek lisans programlarƒ± hakkƒ±nda spesifik bir bilgi bulunmamaktadƒ±r. Daha fazla bilgi almak i√ßin √ºniversitenin resmi web sitesini ziyaret etmenizi √∂neririm. 

Kaynak: [Graduate Education Institute Application](https://www.izu.edu.tr/en/academics/institute/graduate-education-institute/graduate)

üí∞ Tokens: 2021


Test 2/5
üîç Retrieving relevant information...
ü§ñ Generating answer with gpt-4o-mini...
üìù √úniversite √ºcretleri ne kadar?

ü§ñ √úniversite √ºcretleri her yƒ±l M√ºtevelli Heyeti tarafƒ±ndan maliyet artƒ±≈ülarƒ± dikkate alƒ±narak yeniden belirlenmektedir. ƒ∞lan edilen √ºcretler yalnƒ±zca eƒüitim-√∂ƒüretim masraflarƒ±nƒ± kapsamaktadƒ±r; konaklama, beslenme, ula≈üƒ±m, kitap gibi diƒüer giderler bu √

In [1]:
class RAGChatbot:
    """
    RAG chatbot with conversation memory.

    Every turn retrieves fresh chunks for the new query and sends them to the
    model together with the most recent turns of the conversation.
    """
    def __init__(self, model='gpt-4o-mini', max_history_turns=3):
        """
        Args:
            model: OpenAI chat model to use
            max_history_turns: Number of previous user/assistant exchanges
                replayed to the model on each call (0 disables memory)
        """
        self.model = model
        self.max_history_turns = max_history_turns
        # Alternating {"role": "user"|"assistant", "content": ...} messages
        self.conversation_history = []

    def chat(self, query, top_k=5, language='auto'):
        """
        Answer ``query`` using retrieved context plus recent history.

        Args:
            query: User question
            top_k: Number of chunks to retrieve
            language: 'tr', 'en', or 'auto' (forwarded to create_rag_prompt)

        Returns:
            dict with 'answer', 'sources' (the retrieved chunks) and 'tokens'
        """
        # Retrieve chunks
        retrieved_chunks = retrieve_relevant_chunks(query, top_k)

        # Build both prompt parts. The user message carries the retrieved
        # CONTEXT; sending only the bare query would leave the model without
        # any of the retrieved information.
        system_prompt, user_message = create_rag_prompt(query, retrieved_chunks, language)

        # Build messages with history
        messages = [{"role": "system", "content": system_prompt}]

        # Add the most recent turns (two messages per turn). Guarded because
        # a slice of [-0:] would return the whole history.
        history_len = 2 * self.max_history_turns
        if history_len > 0:
            messages.extend(self.conversation_history[-history_len:])

        # Add current query together with its retrieved context
        messages.append({"role": "user", "content": user_message})

        # Generate response
        response = openai.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.3,
            max_tokens=1000,
        )

        answer = response.choices[0].message.content

        # Store only the raw query (not the context block) so replayed
        # history stays small
        self.conversation_history.append({"role": "user", "content": query})
        self.conversation_history.append({"role": "assistant", "content": answer})

        return {
            'answer': answer,
            'sources': retrieved_chunks,
            'tokens': response.usage.total_tokens
        }

    def reset(self):
        """Clear conversation history"""
        self.conversation_history = []

# Multi-turn check: later questions only make sense with the earlier turns
chatbot = RAGChatbot()

print("=" * 70, "TESTING CHATBOT WITH MEMORY", "=" * 70, sep="\n")

# Opening question followed by two follow-ups
questions = [
    "√ºniversitenin adƒ± nedir?",
    "konumu nerde?",
    "nasƒ±l gidebilirim?",
]

for q in questions:
    print("\nüë§ Kullanƒ±cƒ±: {}".format(q))
    result = chatbot.chat(q, top_k=3, language='tr')
    print("ü§ñ Asistan: {}".format(result['answer']))
    print("üí∞ Tokens: {}".format(result['tokens']))

TESTING CHATBOT WITH MEMORY

üë§ Kullanƒ±cƒ±: √ºniversitenin adƒ± nedir?


NameError: name 'retrieve_relevant_chunks' is not defined

In [13]:
# Record the settings and artifact paths used by this notebook
rag_config = dict(
    embedding_model='text-embedding-3-small',
    embedding_dimension=embeddings.shape[1],
    llm_model='gpt-4o-mini',
    total_chunks=len(chunks),
    faiss_index_path='faiss_index.bin',
    chunks_path='chunks.json',
    embeddings_path='embeddings_openai.npy',
    default_top_k=5,
    temperature=0.3,
    max_tokens=1000,
)

with open('rag_config.json', 'w') as f:
    f.write(json.dumps(rag_config, indent=2))

print("‚úì Saved RAG configuration to: rag_config.json")

# Closing summary: banner, created files, suggested next steps
print(
    "\n" + "=" * 70,
    "RAG SYSTEM COMPLETE! üéâ",
    "=" * 70,
    sep="\n",
)
print(
    "\nFiles created:",
    "  1. embeddings_openai.npy - OpenAI embeddings",
    "  2. faiss_index.bin - Vector index",
    "  3. rag_config.json - System configuration",
    sep="\n",
)
print(
    "\nYou can now:",
    "  - Use answer_question() for single queries",
    "  - Use RAGChatbot() for conversations",
    "  - Build a web interface (Streamlit/Gradio)",
    sep="\n",
)

‚úì Saved RAG configuration to: rag_config.json

RAG SYSTEM COMPLETE! üéâ

Files created:
  1. embeddings_openai.npy - OpenAI embeddings
  2. faiss_index.bin - Vector index
  3. rag_config.json - System configuration

You can now:
  - Use answer_question() for single queries
  - Use RAGChatbot() for conversations
  - Build a web interface (Streamlit/Gradio)


In [16]:
# Construct a chatbot without keeping a reference; as the cell's last
# expression, its repr is what the notebook displays.
RAGChatbot()

<__main__.RAGChatbot at 0x7f62aee0cc50>

In [15]:
# Create a simple module for easy import
# The text below is written verbatim to rag_system.py. As written, the module:
#   - loads chunks.json, faiss_index.bin and rag_config.json by relative path
#   - defines answer_question() as a placeholder whose body is `pass`, so it
#     returns None until the notebook's implementation is pasted in.
production_code = '''
import json
import numpy as np
import faiss
import openai
from dotenv import load_dotenv
import os

# Load config
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

# Load resources
with open('chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

index = faiss.read_index('faiss_index.bin')

with open('rag_config.json', 'r') as f:
    config = json.load(f)

def answer_question(query, top_k=5):
    """Production RAG function"""
    # Your answer_question code here
    pass

# Ready to use!
'''

# Overwrites any existing rag_system.py in the working directory
with open('rag_system.py', 'w') as f:
    f.write(production_code)

print("‚úì Created rag_system.py for production use")

‚úì Created rag_system.py for production use


In [30]:
# Ad-hoc query with automatic language detection and the top 3 chunks
result = answer_question(
    "ƒ∞stanbul Sabahattin Zaim √úniversitesi‚Äônin yƒ±llƒ±k lisans √∂ƒürenim √ºcreti ne kadar?",
    top_k=3,
)

print("üìù {}".format(result['query']))
print("\nü§ñ {}".format(result['answer']))


üîç Retrieving relevant information...
ü§ñ Generating answer with gpt-4o-mini...
üìù ƒ∞stanbul Sabahattin Zaim √úniversitesi‚Äônin yƒ±llƒ±k lisans √∂ƒürenim √ºcreti ne kadar?

ü§ñ √úzg√ºn√ºm, ƒ∞stanbul Sabahattin Zaim √úniversitesi'nin yƒ±llƒ±k lisans √∂ƒürenim √ºcreti hakkƒ±nda elimde bilgi bulunmamaktadƒ±r. Ancak lisans√ºst√º eƒüitim √ºcretleri ve burslar hakkƒ±nda detaylƒ± bilgiye [bu linkten](https://www.izu.edu.tr/akademik/enstitu/lisansustu-egitim-enstitusu/ucretler-burs) ula≈üabilirsiniz. Ba≈üka bir konuda yardƒ±mcƒ± olmamƒ± ister misiniz?
