In [2]:
# %pip installs into the running kernel's environment; the quotes stop the shell from treating [all] as a glob.
%pip install "lilac[all]" pandas numpy

Collecting FlagEmbedding<2.0.0,>=1.2.3 (from lilac[all])
  Downloading FlagEmbedding-1.3.5.tar.gz (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cohere<5.0,>=4.32 (from lilac[all])
  Downloading cohere-4.57-py3-none-any.whl.metadata (6.2 kB)
Collecting detect-secrets<2.0.0,>=1.4.0 (from lilac[all])
  Downloading detect_secrets-1.5.0-py3-none-any.whl.metadata (23 kB)
Collecting email-reply-parser<0.6.0,>=0.5.12 (from lilac[all])
  Downloading email_reply_parser-0.5.12-py3-none-any.whl.metadata (828 bytes)
Collecting google-auth-httplib2<0.2.0,>=0.1.0 (from lilac[all])
  Downloading google_auth_httplib2-0.1.1-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting langdetect<2.0.0,>=1.0.9 (from lilac[all])
  Downloading langdetect

In [3]:
import json
import uuid
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
from functools import reduce, partial
import lilac as ll

In [4]:
def pipe(*functions):
    """Build a callable that threads a value through ``functions`` in order.

    The first function receives the input and every later function receives
    the previous result. With no functions, the input is returned unchanged.
    """
    def run(x):
        result = x
        for step in functions:
            result = step(result)
        return result

    return run

def map_over(func, iterable):
    """Apply ``func`` to each element of ``iterable`` and return the results as a list."""
    return [func(element) for element in iterable]

def filter_by(predicate, iterable):
    """Return a list of the elements of ``iterable`` for which ``predicate`` is truthy.

    A ``predicate`` of ``None`` keeps the elements that are themselves truthy.
    """
    keep = bool if predicate is None else predicate
    return [element for element in iterable if keep(element)]

def create_sample_data() -> List[Dict[str, Any]]:
    """Return the ten hard-coded demo records used by the analysis.

    Every record is a dict with the keys ``id``, ``text``, ``category``,
    ``score`` and ``tokens`` (in that order). Ids run from 1 to 10, and
    records 1 and 4 carry identical text, category, score and tokens.
    """
    columns = ("text", "category", "score", "tokens")
    rows = (
        ("What is machine learning?", "tech", 0.9, 5),
        ("Machine learning is AI subset", "tech", 0.8, 6),
        ("Contact support for help", "support", 0.7, 4),
        ("What is machine learning?", "tech", 0.9, 5),
        ("Deep learning neural networks", "tech", 0.85, 4),
        ("How to optimize models?", "tech", 0.75, 5),
        ("Performance tuning guide", "guide", 0.6, 3),
        ("Advanced optimization techniques", "tech", 0.95, 3),
        ("Gradient descent algorithm", "tech", 0.88, 3),
        ("Model evaluation metrics", "tech", 0.82, 3),
    )
    # Ids are simply the 1-based position of each row.
    return [
        {"id": position, **dict(zip(columns, values))}
        for position, values in enumerate(rows, start=1)
    ]

In [5]:
def setup_lilac_project(project_name: str) -> str:
    """Create a uniquely named project directory and register it with Lilac.

    The directory is ``./<project_name>-<suffix>``, where the suffix is the
    first six hex characters of a fresh UUID4. The same path string is
    passed to ``ll.set_project_dir`` and returned.
    """
    suffix = uuid.uuid4().hex[:6]
    project_dir = f"./{project_name}-{suffix}"

    Path(project_dir).mkdir(exist_ok=True)
    ll.set_project_dir(project_dir)

    return project_dir

def create_dataset_from_data(name: str, data: List[Dict]) -> ll.Dataset:
    """Write ``data`` to ``<name>.jsonl`` and build a Lilac dataset from that file.

    Each item is serialised with ``json.dumps`` onto its own line. The file
    path is relative, so it is resolved against the current working
    directory. The dataset is created under the ``"tutorial"`` namespace
    with the given ``name``.
    """
    data_file = f"{name}.jsonl"

    # Serialise lazily inside the ``with`` so records are written one by one.
    with open(data_file, 'w') as handle:
        handle.writelines(json.dumps(record) + '\n' for record in data)

    source = ll.sources.JSONSource(filepaths=[data_file])
    config = ll.DatasetConfig(namespace="tutorial", name=name, source=source)
    return ll.create_dataset(config)

In [6]:
def extract_dataframe(dataset: ll.Dataset, fields: List[str]) -> pd.DataFrame:
    """Pull the requested ``fields`` out of ``dataset`` as a pandas DataFrame.

    Thin wrapper: ``fields`` is passed unchanged as the only positional
    argument of ``dataset.to_pandas``.
    """
    frame = dataset.to_pandas(fields)
    return frame

def apply_functional_filters(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """Run every predefined filter against ``df`` and return the results by name.

    The returned dict has, in order, the keys ``high_score`` (score >= 0.8),
    ``tech_category`` (category == 'tech'), ``min_tokens`` (tokens >= 4),
    ``no_duplicates`` (first row kept per distinct text) and
    ``combined_quality`` (score >= 0.8, tokens >= 3 and category == 'tech').
    Each filter works on its own copy of ``df``.
    """

    def high_score(frame):
        return frame[frame['score'] >= 0.8]

    def tech_category(frame):
        return frame[frame['category'] == 'tech']

    def min_tokens(frame):
        return frame[frame['tokens'] >= 4]

    def no_duplicates(frame):
        return frame.drop_duplicates(subset=['text'], keep='first')

    def combined_quality(frame):
        mask = (frame['score'] >= 0.8) & (frame['tokens'] >= 3) & (frame['category'] == 'tech')
        return frame[mask]

    selectors = (
        ('high_score', high_score),
        ('tech_category', tech_category),
        ('min_tokens', min_tokens),
        ('no_duplicates', no_duplicates),
        ('combined_quality', combined_quality),
    )

    results = {}
    for label, select in selectors:
        # Fresh copy per filter so no result is derived from a shared frame.
        results[label] = select(df.copy())
    return results

In [7]:
def analyze_data_quality(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarise size, duplication, score, category and token statistics of ``df``.

    Args:
        df: Frame with ``text``, ``score``, ``category`` and ``tokens`` columns.

    Returns:
        A dict with the keys:
          * ``total_records``: number of rows.
          * ``unique_texts``: number of distinct ``text`` values.
          * ``duplicate_rate``: ``1 - unique_texts / total_records``, or
            ``0.0`` for an empty frame.
          * ``avg_score``: mean of ``score``.
          * ``category_distribution``: row count per ``category``.
          * ``score_distribution``: row counts for ``high`` (>= 0.8),
            ``medium`` (>= 0.6 and < 0.8) and ``low`` (< 0.6) scores.
          * ``token_stats``: ``mean``, ``min`` and ``max`` of ``tokens``.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    total_records = len(df)
    unique_texts = df['text'].nunique()
    scores = df['score']
    tokens = df['tokens']

    # An empty frame has nothing duplicated; without this guard the ratio
    # below is 0 / 0 and raises ZeroDivisionError.
    duplicate_rate = 1 - (unique_texts / total_records) if total_records else 0.0

    return {
        'total_records': total_records,
        'unique_texts': unique_texts,
        'duplicate_rate': duplicate_rate,
        'avg_score': scores.mean(),
        'category_distribution': df['category'].value_counts().to_dict(),
        'score_distribution': {
            'high': len(df[scores >= 0.8]),
            'medium': len(df[(scores >= 0.6) & (scores < 0.8)]),
            'low': len(df[scores < 0.6])
        },
        'token_stats': {
            'mean': tokens.mean(),
            'min': tokens.min(),
            'max': tokens.max()
        }
    }

def create_data_transformations() -> Dict[str, callable]:
    """Return the named DataFrame -> DataFrame transformations.

    Each value takes a frame and returns a new frame (via ``assign``) with
    one extra column:

      * ``normalize_scores``: ``norm_score`` = score divided by the maximum score.
      * ``add_length_category``: ``length_cat`` from ``tokens`` with
        right-closed bins (0, 3] short, (3, 5] medium, (5, inf) long.
      * ``add_quality_tier``: ``quality_tier`` from ``score``:
        low (< 0.6), medium (>= 0.6 and < 0.8), high (>= 0.8).
      * ``add_category_rank``: ``category_rank``, the descending rank of
        ``score`` within each ``category``.
    """
    # Left-closed, open-ended bins so the tiers use the same thresholds as
    # the ">= 0.8" / ">= 0.6" checks elsewhere in this notebook. With
    # right-closed [0, 0.6, 0.8, 1.0] bins, a score of exactly 0.8 came out
    # "medium", 0.6 came out "low", and 0 or anything above 1.0 became NaN.
    quality_bins = [float('-inf'), 0.6, 0.8, float('inf')]

    return {
        'normalize_scores': lambda df: df.assign(norm_score=df['score'] / df['score'].max()),
        'add_length_category': lambda df: df.assign(
            length_cat=pd.cut(df['tokens'], bins=[0, 3, 5, float('inf')], labels=['short', 'medium', 'long'])
        ),
        'add_quality_tier': lambda df: df.assign(
            quality_tier=pd.cut(df['score'], bins=quality_bins, right=False, labels=['low', 'medium', 'high'])
        ),
        'add_category_rank': lambda df: df.assign(
            category_rank=df.groupby('category')['score'].rank(ascending=False)
        )
    }

In [8]:
def apply_transformations(df: pd.DataFrame, transform_names: List[str]) -> pd.DataFrame:
    """Apply the named transformations to a copy of ``df``, in the order given.

    Args:
        df: Input frame; it is never modified or returned directly.
        transform_names: Keys of ``create_data_transformations()``. Names
            that are not registered there are skipped.

    Returns:
        A new DataFrame with the selected transformations applied. When no
        name matches, this is an unmodified copy of ``df``.
    """
    transformations = create_data_transformations()
    selected_transforms = [transformations[name] for name in transform_names if name in transformations]

    # ``pipe()`` with no functions returns its input unchanged, so the
    # empty case also yields a copy instead of aliasing the caller's frame.
    return pipe(*selected_transforms)(df.copy())

def export_filtered_data(filtered_datasets: Dict[str, pd.DataFrame], output_dir: str) -> None:
    """Write each frame to ``<output_dir>/<name>_filtered.jsonl``, one JSON object per row.

    Args:
        filtered_datasets: Mapping of dataset name to the frame to export.
        output_dir: Target directory. It is created if needed, including
            any missing parent directories.

    Prints one "Exported N records to <file>" line per dataset.
    """
    export_root = Path(output_dir)
    # parents=True: a nested output_dir whose parents do not exist yet would
    # otherwise fail with FileNotFoundError.
    export_root.mkdir(parents=True, exist_ok=True)

    for name, df in filtered_datasets.items():
        output_file = export_root / f"{name}_filtered.jsonl"
        with open(output_file, 'w') as f:
            for _, row in df.iterrows():
                f.write(json.dumps(row.to_dict()) + '\n')
        print(f"Exported {len(df)} records to {output_file}")

In [9]:
def main_analysis_pipeline():
    """Run the whole demo: project setup, dataset creation, analysis, filtering, export.

    Steps, each announced on stdout:
      1. Create a fresh Lilac project directory.
      2. Build the sample dataset and load it into Lilac.
      3. Pull the ``id``, ``text``, ``category``, ``score`` and ``tokens`` fields
         back out as a DataFrame.
      4. Compute and print a quality report.
      5. Apply the normalisation, length-category and quality-tier transformations.
      6. Run every predefined filter and print the size of each result.
      7. Export each filtered frame to ``<project_dir>/exports``.
      8. Print the three highest-scoring ``combined_quality`` records.

    Returns:
        dict with ``original_data``, ``transformed_data``, ``filtered_data``
        (name -> DataFrame) and ``quality_report``.
    """

    print("🚀 Setting up Lilac project...")
    project_dir = setup_lilac_project("advanced_tutorial")

    print("📊 Creating sample dataset...")
    sample_data = create_sample_data()
    dataset = create_dataset_from_data("sample_data", sample_data)

    print("📋 Extracting data...")
    df = extract_dataframe(dataset, ['id', 'text', 'category', 'score', 'tokens'])

    print("🔍 Analyzing data quality...")
    quality_report = analyze_data_quality(df)
    print(f"Original data: {quality_report['total_records']} records")
    print(f"Duplicates: {quality_report['duplicate_rate']:.1%}")
    print(f"Average score: {quality_report['avg_score']:.2f}")

    print("🔄 Applying transformations...")
    transformed_df = apply_transformations(df, ['normalize_scores', 'add_length_category', 'add_quality_tier'])

    print("🎯 Applying filters...")
    filtered_datasets = apply_functional_filters(transformed_df)

    print("\n📈 Filter Results:")
    for name, filtered_df in filtered_datasets.items():
        print(f"  {name}: {len(filtered_df)} records")

    print("💾 Exporting filtered datasets...")
    export_filtered_data(filtered_datasets, f"{project_dir}/exports")

    print("\n🏆 Top Quality Records:")
    # Rank by score explicitly: the filtered frame keeps whatever row order
    # the extraction produced, so a plain head(3) shows three arbitrary
    # records rather than the best ones.
    best_quality = filtered_datasets['combined_quality'].nlargest(3, 'score')
    for _, row in best_quality.iterrows():
        print(f"  • {row['text']} (score: {row['score']}, category: {row['category']})")

    return {
        'original_data': df,
        'transformed_data': transformed_df,
        'filtered_data': filtered_datasets,
        'quality_report': quality_report
    }

# Entry point: run the full pipeline and keep its returned dict in `results`
# (original/transformed frames, filtered frames and the quality report).
if __name__ == "__main__":
    results = main_analysis_pipeline()
    print("\n✅ Analysis complete! Check the exports folder for filtered datasets.")

🚀 Setting up Lilac project...
📊 Creating sample dataset...
Dataset "sample_data" written to ./advanced_tutorial-28ff03/datasets/tutorial/sample_data
📋 Extracting data...
🔍 Analyzing data quality...
Original data: 10 records
Duplicates: 10.0%
Average score: 0.82
🔄 Applying transformations...
🎯 Applying filters...

📈 Filter Results:
  high_score: 7 records
  tech_category: 8 records
  min_tokens: 6 records
  no_duplicates: 9 records
  combined_quality: 7 records
💾 Exporting filtered datasets...
Exported 7 records to advanced_tutorial-28ff03/exports/high_score_filtered.jsonl
Exported 8 records to advanced_tutorial-28ff03/exports/tech_category_filtered.jsonl
Exported 6 records to advanced_tutorial-28ff03/exports/min_tokens_filtered.jsonl
Exported 9 records to advanced_tutorial-28ff03/exports/no_duplicates_filtered.jsonl
Exported 7 records to advanced_tutorial-28ff03/exports/combined_quality_filtered.jsonl

🏆 Top Quality Records:
  • Advanced optimization techniques (score: 0.95, category: 