In [1]:
import json
import pandas as pd

print("=" * 70)
print("LOADING DATA")
print("=" * 70)

# Load JSONL file
data = []
with open('/home/zeynkash/projects/izu_scraper/chunking/all_data_cleaned.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

print(f"‚úì Loaded {len(data)} documents")

# Convert to DataFrame for easy exploration
df = pd.DataFrame(data)

print(f"\nColumns: {list(df.columns)}")
print(f"\nSample document:")
print(json.dumps(data[0], indent=2, ensure_ascii=False)[:500])

# Check content lengths
df['content_length'] = df['content'].str.len()
print(f"\nContent length statistics:")
print(df['content_length'].describe())

LOADING DATA
‚úì Loaded 723 documents

Columns: ['url', 'title', 'language', 'content']

Sample document:
{
  "url": "https://www.izu.edu.tr/en/international/international-students/student-guide",
  "title": "Student Guide",
  "language": "en",
  "content": "Student Guide Welcome to Istanbul Sabahattin Zaim University (IZU) We are delighted to welcome all international students to IZU. To ensure a smooth transition into university life, we highly recommend reviewing International Student Guide. This guide contains essential information, and students are expected to familiarise themselves with its co

Content length statistics:
count      723.000000
mean      3750.058091
std       5305.768931
min        107.000000
25%        799.000000
50%       1696.000000
75%       3809.000000
max      30185.000000
Name: content_length, dtype: float64


In [3]:
# Install tiktoken if not already installed
import subprocess
import sys

try:
    import tiktoken
    print("‚úì tiktoken already installed")
except ImportError:
    print("Installing tiktoken...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tiktoken"])
    import tiktoken
    print("‚úì tiktoken installed")

# Initialize tokenizer
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def count_tokens(text):
    """Count tokens in text"""
    if not text:
        return 0
    return len(encoding.encode(text))

# Test token counting
sample_text = data[0]['content']
tokens = count_tokens(sample_text)
chars = len(sample_text)

print(f"\nSample text:")
print(f"  Characters: {chars}")
print(f"  Tokens: {tokens}")
print(f"  Ratio: ~{chars/tokens:.1f} chars per token")

‚úì tiktoken already installed

Sample text:
  Characters: 1093
  Tokens: 231
  Ratio: ~4.7 chars per token


In [4]:
def split_into_chunks(text, chunk_size=800, chunk_overlap=150):
    """
    Split text into chunks with overlap
    
    Args:
        text: Text to split
        chunk_size: Target chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
    
    Returns:
        List of text chunks
    """
    if not text or not text.strip():
        return []
    
    # Encode text to tokens
    tokens = encoding.encode(text)
    
    # If text is shorter than chunk_size, return as single chunk
    if len(tokens) <= chunk_size:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(tokens):
        # Get chunk of tokens
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        
        # Decode back to text
        chunk_text = encoding.decode(chunk_tokens)
        chunks.append(chunk_text)
        
        # Move start position (with overlap)
        start = end - chunk_overlap
        
        # Prevent infinite loop
        if start >= len(tokens):
            break
    
    return chunks

# Test on sample
test_text = data[0]['content']
test_chunks = split_into_chunks(test_text, chunk_size=800, chunk_overlap=150)

print(f"\nTest Chunking:")
print(f"Original tokens: {count_tokens(test_text)}")
print(f"Number of chunks: {len(test_chunks)}")
print(f"\nFirst chunk ({count_tokens(test_chunks[0])} tokens):")
print(test_chunks[0][:300])
print("\n...")
if len(test_chunks) > 1:
    print(f"\nSecond chunk ({count_tokens(test_chunks[1])} tokens):")
    print(test_chunks[1][:300])


Test Chunking:
Original tokens: 231
Number of chunks: 1

First chunk (231 tokens):
Student Guide Welcome to Istanbul Sabahattin Zaim University (IZU) We are delighted to welcome all international students to IZU. To ensure a smooth transition into university life, we highly recommend reviewing International Student Guide. This guide contains essential information, and students are

...


In [5]:
import re

def smart_split_into_chunks(text, chunk_size=800, chunk_overlap=150):
    """
    Split text into chunks with sentence boundary awareness
    
    Args:
        text: Text to split
        chunk_size: Target chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
    
    Returns:
        List of text chunks
    """
    if not text or not text.strip():
        return []
    
    # Split into sentences (works for both Turkish and English)
    sentence_endings = r'[.!?]\s+'
    sentences = re.split(sentence_endings, text)
    sentences = [s.strip() for s in sentences if s.strip()]
    
    chunks = []
    current_chunk = []
    current_tokens = 0
    
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)
        
        # If single sentence exceeds chunk_size, split it by tokens
        if sentence_tokens > chunk_size:
            # Save current chunk if not empty
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_tokens = 0
            
            # Split long sentence
            tokens = encoding.encode(sentence)
            for i in range(0, len(tokens), chunk_size - chunk_overlap):
                chunk_tokens = tokens[i:i + chunk_size]
                chunks.append(encoding.decode(chunk_tokens))
            continue
        
        # Check if adding this sentence exceeds chunk_size
        if current_tokens + sentence_tokens > chunk_size:
            # Save current chunk
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            
            # Start new chunk with overlap
            # Keep last few sentences for context
            overlap_sentences = []
            overlap_tokens = 0
            
            for sent in reversed(current_chunk):
                sent_tokens = count_tokens(sent)
                if overlap_tokens + sent_tokens <= chunk_overlap:
                    overlap_sentences.insert(0, sent)
                    overlap_tokens += sent_tokens
                else:
                    break
            
            current_chunk = overlap_sentences
            current_tokens = overlap_tokens
        
        # Add sentence to current chunk
        current_chunk.append(sentence)
        current_tokens += sentence_tokens
    
    # Add last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

# Test smart chunking
test_chunks_smart = smart_split_into_chunks(test_text, chunk_size=800, chunk_overlap=150)

print(f"\nSmart Chunking Test:")
print(f"Original tokens: {count_tokens(test_text)}")
print(f"Number of chunks: {len(test_chunks_smart)}")
print(f"\nFirst chunk ({count_tokens(test_chunks_smart[0])} tokens):")
print(test_chunks_smart[0][:300])


Smart Chunking Test:
Original tokens: 231
Number of chunks: 1

First chunk (223 tokens):
Student Guide Welcome to Istanbul Sabahattin Zaim University (IZU) We are delighted to welcome all international students to IZU To ensure a smooth transition into university life, we highly recommend reviewing International Student Guide This guide contains essential information, and students are e


In [6]:
from tqdm import tqdm

def chunk_document(doc, chunk_size=800, chunk_overlap=150):
    """
    Chunk a single document and return chunks with metadata
    
    Args:
        doc: Document dict from JSONL
        chunk_size: Target chunk size in tokens
        chunk_overlap: Overlap in tokens
    
    Returns:
        List of chunk objects with metadata
    """
    content = doc.get('content', '')
    
    # Add context header to content
    title = doc.get('title', 'Untitled')
    url = doc.get('url', '')
    header = f"Title: {title}\nURL: {url}\n\n"
    
    full_content = header + content
    
    # Split into chunks
    text_chunks = smart_split_into_chunks(full_content, chunk_size, chunk_overlap)
    
    # Create chunk objects with metadata
    chunk_objects = []
    for i, chunk_text in enumerate(text_chunks):
        chunk_obj = {
            'chunk_id': f"{doc.get('id', hash(url))}_{i}",
            'document_id': doc.get('id', hash(url)),
            'chunk_index': i,
            'total_chunks': len(text_chunks),
            'content': chunk_text,
            'tokens': count_tokens(chunk_text),
            'metadata': {
                'url': url,
                'title': title,
                'language': doc.get('language', 'unknown'),
                'section': doc.get('section', 'general'),
                'date_scraped': doc.get('date_scraped', ''),
            }
        }
        chunk_objects.append(chunk_obj)
    
    return chunk_objects

# Chunk all documents
print("=" * 70)
print("CHUNKING ALL DOCUMENTS")
print("=" * 70)

all_chunks = []

for doc in tqdm(data, desc="Chunking documents"):
    try:
        chunks = chunk_document(doc, chunk_size=800, chunk_overlap=150)
        all_chunks.extend(chunks)
    except Exception as e:
        print(f"Error chunking document {doc.get('url', 'unknown')}: {e}")
        continue

print(f"\n‚úì Chunking complete!")
print(f"  Original documents: {len(data)}")
print(f"  Total chunks: {len(all_chunks)}")
print(f"  Average chunks per doc: {len(all_chunks) / len(data):.1f}")

CHUNKING ALL DOCUMENTS


Chunking documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 723/723 [00:00<00:00, 782.40it/s] 


‚úì Chunking complete!
  Original documents: 723
  Total chunks: 1747
  Average chunks per doc: 2.4





In [7]:
import statistics

# Calculate statistics
chunk_tokens = [c['tokens'] for c in all_chunks]
chunks_per_doc = {}

for chunk in all_chunks:
    doc_id = chunk['document_id']
    chunks_per_doc[doc_id] = chunks_per_doc.get(doc_id, 0) + 1

print("=" * 70)
print("CHUNK STATISTICS")
print("=" * 70)

print(f"\nToken Distribution:")
print(f"  Mean: {statistics.mean(chunk_tokens):.0f} tokens")
print(f"  Median: {statistics.median(chunk_tokens):.0f} tokens")
print(f"  Min: {min(chunk_tokens)} tokens")
print(f"  Max: {max(chunk_tokens)} tokens")
print(f"  Std Dev: {statistics.stdev(chunk_tokens):.0f} tokens")

print(f"\nChunks per Document:")
print(f"  Mean: {statistics.mean(chunks_per_doc.values()):.1f}")
print(f"  Median: {statistics.median(chunks_per_doc.values()):.0f}")
print(f"  Min: {min(chunks_per_doc.values())}")
print(f"  Max: {max(chunks_per_doc.values())}")

# Language distribution
lang_dist = {}
for chunk in all_chunks:
    lang = chunk['metadata']['language']
    lang_dist[lang] = lang_dist.get(lang, 0) + 1

print(f"\nLanguage Distribution:")
for lang, count in sorted(lang_dist.items()):
    print(f"  {lang}: {count} chunks ({count/len(all_chunks)*100:.1f}%)")

# Section distribution
section_dist = {}
for chunk in all_chunks:
    section = chunk['metadata']['section']
    section_dist[section] = section_dist.get(section, 0) + 1

print(f"\nSection Distribution (Top 5):")
for section, count in sorted(section_dist.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"  {section}: {count} chunks ({count/len(all_chunks)*100:.1f}%)")

CHUNK STATISTICS

Token Distribution:
  Mean: 585 tokens
  Median: 696 tokens
  Min: 20 tokens
  Max: 914 tokens
  Std Dev: 223 tokens

Chunks per Document:
  Mean: 2.4
  Median: 1
  Min: 1
  Max: 20

Language Distribution:
  en: 545 chunks (31.2%)
  tr: 1202 chunks (68.8%)

Section Distribution (Top 5):
  general: 1747 chunks (100.0%)


In [8]:
import random

print("=" * 70)
print("QUALITY CHECK - RANDOM SAMPLES")
print("=" * 70)

# Show 3 random chunks
for i in range(3):
    chunk = random.choice(all_chunks)
    
    print(f"\n{'='*70}")
    print(f"Sample {i+1}")
    print(f"{'='*70}")
    print(f"Chunk ID: {chunk['chunk_id']}")
    print(f"Chunk: {chunk['chunk_index'] + 1}/{chunk['total_chunks']}")
    print(f"Tokens: {chunk['tokens']}")
    print(f"Language: {chunk['metadata']['language']}")
    print(f"Section: {chunk['metadata']['section']}")
    print(f"Title: {chunk['metadata']['title']}")
    print(f"\nContent:")
    print(chunk['content'][:400])
    print("...")

QUALITY CHECK - RANDOM SAMPLES

Sample 1
Chunk ID: -6701365950551010967_0
Chunk: 1/1
Tokens: 192
Language: en
Section: general
Title: ƒ∞Z√ú Hosts the Leaders of the Future

Content:
Title: ƒ∞Z√ú Hosts the Leaders of the Future
URL: https://www.izu.edu.tr/en/news/2025/05/05/i-z%C3%BC-hosts-the-leaders-of-the-future

ƒ∞Z√ú Hosts the Leaders of the Future 05.05.2025 The MUN (Model United Nations) event, whose opening session was held at Ba≈üak≈üehir Mehmet Emin Sara√ß Anatolian Imam Hatip High School, is continuing at the ƒ∞Z√ú Campus In the program, which began with an opening speech by
...

Sample 2
Chunk ID: 2573411217997351349_6
Chunk: 7/7
Tokens: 636
Language: tr
Section: general
Title: Baƒüƒ±l Deƒüerlendirme Y√∂nergesi

Content:
3) √ñƒürencilerin Ham Ba≈üarƒ± Notu hesaplanƒ±rken; virg√ºlden sonraki √º√ß√ºnc√º basamak be≈ü veya daha b√ºy√ºkse ikinci basamak artƒ±rƒ±lƒ±r, √º√ß√ºnc√º basamak d√∂rt veya daha k√º√ß√ºkse ikinci basamak deƒüi≈ütirilmeden bƒ±rakƒ±lƒ±r 4) Sƒ±nƒ±f ortalamasƒ±

In [9]:
print("=" * 70)
print("ISSUE CHECKS")
print("=" * 70)

# Very short chunks
short_chunks = [c for c in all_chunks if c['tokens'] < 100]
print(f"\n‚ö†Ô∏è  Chunks < 100 tokens: {len(short_chunks)} ({len(short_chunks)/len(all_chunks)*100:.1f}%)")

if short_chunks:
    print("  Sample short chunks:")
    for chunk in short_chunks[:3]:
        print(f"    - {chunk['tokens']} tokens: {chunk['content'][:100]}...")

# Very long chunks
long_chunks = [c for c in all_chunks if c['tokens'] > 1000]
print(f"\n‚ö†Ô∏è  Chunks > 1000 tokens: {len(long_chunks)} ({len(long_chunks)/len(all_chunks)*100:.1f}%)")

if long_chunks:
    print("  Sample long chunks:")
    for chunk in long_chunks[:3]:
        print(f"    - {chunk['tokens']} tokens from: {chunk['metadata']['title'][:60]}")

# Empty content
empty_chunks = [c for c in all_chunks if not c['content'].strip()]
print(f"\n‚ö†Ô∏è  Empty chunks: {len(empty_chunks)}")

# Optimal range (300-900 tokens)
optimal_chunks = [c for c in all_chunks if 300 <= c['tokens'] <= 900]
print(f"\n‚úì Optimal chunks (300-900 tokens): {len(optimal_chunks)} ({len(optimal_chunks)/len(all_chunks)*100:.1f}%)")

ISSUE CHECKS

‚ö†Ô∏è  Chunks < 100 tokens: 28 (1.6%)
  Sample short chunks:
    - 91 tokens:  PANSTWOWA IM PAPIEZA JANA PAWLA II W BIALEJ PODLASKIEJ PANEVEZIO VIESOJI ISTAIGA EKONOMII I INNOWAC...
    - 90 tokens: Title: Announcements
URL: https://www.izu.edu.tr/en/research/library/announcements

Research and Tea...
    - 88 tokens: Title: Announcements
URL: https://izu.edu.tr/en/research/library/announcements

Research and Teachin...

‚ö†Ô∏è  Chunks > 1000 tokens: 0 (0.0%)

‚ö†Ô∏è  Empty chunks: 0

‚úì Optimal chunks (300-900 tokens): 1455 (83.3%)


In [10]:
import json

print("=" * 70)
print("SAVING CHUNKS")
print("=" * 70)

# Save as JSON
output_file = 'chunks.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print(f"‚úì Saved {len(all_chunks)} chunks to: {output_file}")

# Also save as JSONL (one chunk per line - efficient for large datasets)
output_jsonl = 'chunks.jsonl'
with open(output_jsonl, 'w', encoding='utf-8') as f:
    for chunk in all_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

print(f"‚úì Saved to: {output_jsonl}")

# Save metadata
metadata = {
    'total_documents': len(data),
    'total_chunks': len(all_chunks),
    'avg_chunks_per_doc': len(all_chunks) / len(data),
    'chunk_size': 800,
    'chunk_overlap': 150,
    'avg_tokens_per_chunk': statistics.mean(chunk_tokens),
    'languages': lang_dist,
    'sections': section_dist,
}

with open('chunks_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print(f"‚úì Saved metadata to: chunks_metadata.json")

# Save as CSV for easy viewing
chunks_df = pd.DataFrame([
    {
        'chunk_id': c['chunk_id'],
        'chunk_index': c['chunk_index'],
        'tokens': c['tokens'],
        'language': c['metadata']['language'],
        'section': c['metadata']['section'],
        'title': c['metadata']['title'],
        'url': c['metadata']['url'],
        'content_preview': c['content'][:200] + '...'
    }
    for c in all_chunks
])

chunks_df.to_csv('chunks_preview.csv', index=False)
print(f"‚úì Saved preview to: chunks_preview.csv")

print(f"\n{'='*70}")
print("FILES CREATED:")
print(f"{'='*70}")
print("  1. chunks.json - All chunks with full metadata")
print("  2. chunks.jsonl - Same data, one per line (efficient)")
print("  3. chunks_metadata.json - Statistics and config")
print("  4. chunks_preview.csv - Easy viewing in Excel")
print(f"\nReady for embedding generation! üöÄ")

SAVING CHUNKS
‚úì Saved 1747 chunks to: chunks.json
‚úì Saved to: chunks.jsonl
‚úì Saved metadata to: chunks_metadata.json
‚úì Saved preview to: chunks_preview.csv

FILES CREATED:
  1. chunks.json - All chunks with full metadata
  2. chunks.jsonl - Same data, one per line (efficient)
  3. chunks_metadata.json - Statistics and config
  4. chunks_preview.csv - Easy viewing in Excel

Ready for embedding generation! üöÄ


In [11]:
print("=" * 70)
print("CHUNKING SUMMARY")
print("=" * 70)

print(f"\nüìä Input:")
print(f"   Documents: {len(data)}")

print(f"\nüì¶ Output:")
print(f"   Total chunks: {len(all_chunks)}")
print(f"   Avg chunks/doc: {len(all_chunks)/len(data):.1f}")

print(f"\nüéØ Quality:")
print(f"   Avg tokens/chunk: {statistics.mean(chunk_tokens):.0f}")
print(f"   Optimal chunks: {len(optimal_chunks)/len(all_chunks)*100:.1f}%")

print(f"\nüåç Languages:")
for lang, count in lang_dist.items():
    print(f"   {lang}: {count} chunks")

print(f"\n‚úÖ Next Step: Generate embeddings using chunks.json or chunks.jsonl")

CHUNKING SUMMARY

üìä Input:
   Documents: 723

üì¶ Output:
   Total chunks: 1747
   Avg chunks/doc: 2.4

üéØ Quality:
   Avg tokens/chunk: 585
   Optimal chunks: 83.3%

üåç Languages:
   en: 545 chunks
   tr: 1202 chunks

‚úÖ Next Step: Generate embeddings using chunks.json or chunks.jsonl
