# SciTeX Scholar Tutorial with Impact Factor Integration

This notebook demonstrates how to use the `scitex.scholar` module for scientific literature management with automatic impact factor enrichment.

## Features Covered
- Literature search from multiple sources
- **Automatic impact factor enrichment** (NEW!)
- **Journal quartile and ranking information** (NEW!)
- Enhanced BibTeX generation with journal metrics
- PDF downloads (when available)
- Building a local search index
- **Direct impact factor database queries** (NEW!)

## 1. New Unified Scholar Interface with Auto-Enrichment

The modern way to use SciTeX Scholar with automatic impact factor enrichment:

In [ ]:
import scitex
from scitex.scholar import Scholar

# Create the Scholar client. Enrichment is requested explicitly here even
# though the tutorial notes that it is already the default behavior.
scholar = Scholar(enrich_by_default=True, email="your.email@example.com")
print("✓ Scholar initialized with auto-enrichment enabled")

# Run a single search; enrichment happens as part of the search call.
papers = scholar.search(query="deep learning neuroscience", limit=5)
print(f"\n📚 Found {len(papers)} papers (automatically enriched)")

# Print the core metadata of every hit, followed by whichever journal
# metrics are present (and truthy) on the paper object.
for rank, paper in enumerate(papers, start=1):
    leading_authors = ', '.join(paper.authors[:2])
    print(f"\n{rank}. {paper.title}")
    print(f"   Authors: {leading_authors}...")
    print(f"   Journal: {paper.journal}")
    print(f"   Year: {paper.year}, Citations: {paper.citation_count}")

    # Metrics are optional: a missing attribute is treated like an empty one.
    impact = getattr(paper, 'impact_factor', None)
    if impact:
        print(f"   📊 Impact Factor: {impact}")
    quartile = getattr(paper, 'journal_quartile', None)
    if quartile:
        print(f"   🏆 Journal Quartile: {quartile}")
    ranking = getattr(paper, 'journal_ranking', None)
    if ranking:
        print(f"   📈 Journal Ranking: {ranking}")

## 2. Direct Impact Factor Database Queries

Use the integrated `impact_factor` package to look up journal metrics directly in its bundled SQLite database:

In [ ]:
import sqlite3
import pandas as pd
import impact_factor

def search_journal_impact_factors(keyword, limit=10, db_path=None):
    """Search for journals by keyword and get their impact factors.

    Args:
        keyword: Substring matched against the ``journal`` column using
            SQL ``LIKE '%keyword%'``.
        limit: Maximum number of rows to return.
        db_path: Path of the SQLite database to query. When omitted,
            ``impact_factor.DEFAULT_DB`` is used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``journal``,
        ``impact_factor``, ``quartile``, ``issn``, ``eissn`` and
        ``journal_abbr``, ordered by impact factor (highest first).
        If the query fails, the error is printed and an empty DataFrame
        is returned instead.
    """
    if db_path is None:
        db_path = impact_factor.DEFAULT_DB

    query = """
    SELECT journal, factor as impact_factor, jcr as quartile,
           issn, eissn, journal_abbr
    FROM factor
    WHERE journal LIKE ?
    ORDER BY factor DESC
    LIMIT ?
    """

    try:
        conn = sqlite3.connect(db_path)
        try:
            return pd.read_sql_query(
                query,
                conn,
                params=[f'%{keyword}%', limit]
            )
        finally:
            # Close the connection even when the query raises, so a failed
            # lookup does not leak an open database handle.
            conn.close()

    except Exception as e:
        # Best-effort tutorial helper: report the problem and hand back an
        # empty result so callers can simply check the length.
        print(f"Error searching database: {e}")
        return pd.DataFrame()

# Example 1: neuroscience journals, shown with quartile and ISSN details.
print("🧠 Top Neuroscience Journals by Impact Factor:")
neuro_journals = search_journal_impact_factors("neuroscience", limit=8)
if len(neuro_journals) == 0:
    print("No neuroscience journals found")
else:
    for _, row in neuro_journals.iterrows():
        name = row['journal']
        factor = row['impact_factor']
        print(f"  📊 {name}")
        print(f"      IF: {factor:.3f} | Quartile: {row['quartile']} | ISSN: {row['issn']}")
        print()

# Example 2: journals with "nature" in the title, one compact line each.
print("🔬 Nature Journal Family:")
nature_journals = search_journal_impact_factors("nature", limit=5)
if len(nature_journals) == 0:
    print("No Nature journals found")
else:
    for _, row in nature_journals.iterrows():
        name = row['journal']
        factor = row['impact_factor']
        print(f"  📊 {name}: IF = {factor:.3f} ({row['quartile']})")

## 3. Enhanced BibTeX Generation with Impact Factors

Generate BibTeX entries with comprehensive journal metrics:

In [ ]:
# Print enriched BibTeX for the first two papers found by the earlier search.
print("📝 Enhanced BibTeX with Impact Factor Information:")
print("=" * 60)

if not papers or len(papers) == 0:
    # Nothing to render: the search returned no results.
    print("No papers available for BibTeX generation")
else:
    preview = papers[:2]  # Show first 2 papers
    for idx, paper in enumerate(preview, start=1):
        print(f"\n🔗 Paper {idx}:")
        bibtex = paper.to_bibtex(include_enriched=True)
        print(bibtex)
        print("-" * 40)

# Alternative: Manual BibTeX enrichment for any paper
def create_enriched_bibtex(title, authors, journal, year, doi=None,
                           journal_lookup=None):
    """Create enriched BibTeX entry with impact factor lookup.

    Args:
        title: Paper title. Literal ``{`` and ``}`` characters are removed
            so they cannot unbalance the generated entry.
        authors: A list (or tuple) of author names, or a single pre-joined
            author string.
        journal: Journal name; also used as the lookup keyword.
        year: Publication year, appended to the citation key.
        doi: Optional DOI; a ``doi`` field is added when it is truthy.
        journal_lookup: Optional callable invoked as
            ``journal_lookup(journal, limit=1)`` that returns a DataFrame
            with ``impact_factor`` and ``quartile`` columns. Defaults to
            ``search_journal_impact_factors``.

    Returns:
        The BibTeX ``@article`` entry as a string. When a usable impact
        factor is found, a ``note`` field with the impact factor (and the
        quartile, if available) is included.
    """
    if journal_lookup is None:
        journal_lookup = search_journal_impact_factors

    # Look up impact factor for the journal
    journal_info = journal_lookup(journal, limit=1)

    journal_if = None
    quartile = None
    if len(journal_info) > 0:
        best_match = journal_info.iloc[0]
        journal_if = best_match['impact_factor']
        quartile = best_match['quartile']

    if isinstance(authors, (list, tuple)):
        author_list = " and ".join(authors)
    else:
        author_list = authors
    clean_title = title.replace("{", "").replace("}", "")
    entry_key = f"{_first_author_surname(authors)}{year}"

    fields = [
        f"title = {{{clean_title}}}",
        f"author = {{{author_list}}}",
        f"journal = {{{journal}}}",
        f"year = {{{year}}}",
    ]

    if doi:
        fields.append(f"doi = {{{doi}}}")

    # NULL database values surface as None/NaN; NaN is truthy, so it has to
    # be excluded explicitly or the note would read "Impact Factor: nan".
    if pd.notna(journal_if) and journal_if:
        note = f"Impact Factor: {journal_if:.3f}"
        if pd.notna(quartile) and quartile:
            note += f", Quartile: {quartile}"
        fields.append(f"note = {{{note}}}")

    body = ",\n    ".join(fields)
    return f"@article{{{entry_key},\n    {body}\n}}"


def _first_author_surname(authors):
    """Return the first author's surname for use in a citation key."""
    if not isinstance(authors, (list, tuple)) or not authors:
        return "Unknown"

    first = str(authors[0]).strip()
    if "," in first:
        # "Last, First" format: the surname precedes the comma.
        surname = first.split(",", 1)[0]
    else:
        # "First Last" format: the surname is the final token.
        tokens = first.split()
        surname = tokens[-1] if tokens else ""

    # Keep only characters that are safe inside a BibTeX key.
    surname = "".join(ch for ch in surname if ch.isalnum())
    return surname or "Unknown"

# Demonstrate the manual helper on a made-up paper. The header is printed
# first so any lookup message appears underneath it.
print("\n📖 Manual BibTeX Enrichment Example:")
manual_bibtex = create_enriched_bibtex(
    "Deep learning in neuroscience",
    ["Smith, J.", "Doe, A."],
    "Nature Neuroscience",
    2024,
    doi="10.1038/example",
)
print(manual_bibtex)

## 4. Journal Analysis and Rankings

In [ ]:
def get_journal_statistics(db_path=None):
    """Get comprehensive statistics from the impact factor database.

    Args:
        db_path: Path of the SQLite database to query. When omitted,
            ``impact_factor.DEFAULT_DB`` is used.

    Returns:
        A ``(stats, top_journals)`` tuple. ``stats`` is the single result
        row (a ``pandas.Series``) holding ``total_journals``, the average,
        minimum and maximum impact factor, and the ``q1_journals`` ..
        ``q4_journals`` counts. ``top_journals`` is a DataFrame of the ten
        journals with the highest ``factor``. If a query fails, the error
        is printed and ``(None, empty DataFrame)`` is returned.
    """
    if db_path is None:
        db_path = impact_factor.DEFAULT_DB

    # Overall statistics
    stats_query = """
    SELECT
        COUNT(*) as total_journals,
        AVG(factor) as avg_impact_factor,
        MIN(factor) as min_impact_factor,
        MAX(factor) as max_impact_factor,
        COUNT(CASE WHEN jcr = 'Q1' THEN 1 END) as q1_journals,
        COUNT(CASE WHEN jcr = 'Q2' THEN 1 END) as q2_journals,
        COUNT(CASE WHEN jcr = 'Q3' THEN 1 END) as q3_journals,
        COUNT(CASE WHEN jcr = 'Q4' THEN 1 END) as q4_journals
    FROM factor
    """

    # Top journals
    top_journals_query = """
    SELECT journal, factor, jcr, issn
    FROM factor
    ORDER BY factor DESC
    LIMIT 10
    """

    try:
        conn = sqlite3.connect(db_path)
        try:
            stats = pd.read_sql_query(stats_query, conn)
            top_journals = pd.read_sql_query(top_journals_query, conn)
        finally:
            # Close the connection even when a query raises, so a failure
            # does not leak an open database handle.
            conn.close()

        return stats.iloc[0], top_journals

    except Exception as e:
        # Best-effort tutorial helper: report the problem and return
        # placeholders the calling cell already knows how to handle.
        print(f"Error getting statistics: {e}")
        return None, pd.DataFrame()

# Get and display journal statistics
print("📊 Impact Factor Database Statistics:")
print("=" * 50)

stats, top_journals = get_journal_statistics()

if stats is not None:
    # ``stats`` is one DataFrame row mixing integer counts with float
    # averages, so pandas hands every value back as a float. Cast the
    # counts to int so they print as "12,345" instead of "12,345.0".
    print(f"📈 Total Journals: {int(stats['total_journals']):,}")
    print(f"📈 Average Impact Factor: {stats['avg_impact_factor']:.3f}")
    print(f"📈 Impact Factor Range: {stats['min_impact_factor']:.3f} - {stats['max_impact_factor']:.3f}")
    print(f"\n🏆 Quartile Distribution:")
    print(f"   Q1 (Top): {int(stats['q1_journals']):,} journals")
    print(f"   Q2: {int(stats['q2_journals']):,} journals")
    print(f"   Q3: {int(stats['q3_journals']):,} journals")
    print(f"   Q4: {int(stats['q4_journals']):,} journals")

print(f"\n🥇 Top 10 Journals by Impact Factor:")
if len(top_journals) > 0:
    # Number the rows with enumerate so the ranking does not depend on the
    # DataFrame's index labels.
    for rank, (_, journal) in enumerate(top_journals.iterrows(), start=1):
        print(f"  {rank:2d}. {journal['journal']}")
        print(f"      📊 IF: {journal['factor']:.3f} | 🏆 {journal['jcr']} | 📄 ISSN: {journal['issn']}")
        print()

# Search for specific fields
print("🔍 Field-Specific Journal Analysis:")
fields = ["machine learning", "artificial intelligence", "computer vision", "robotics"]

for field in fields:
    # Only the three highest-impact matches per field are summarized.
    field_journals = search_journal_impact_factors(field, limit=3)
    if len(field_journals) > 0:
        avg_if = field_journals['impact_factor'].mean()
        top_match = field_journals.iloc[0]
        print(f"\n🎯 {field.title()}:")
        print(f"   📊 Average IF: {avg_if:.3f}")
        print(f"   🏆 Top journal: {top_match['journal']} (IF: {top_match['impact_factor']:.3f})")

## 5. Advanced Search with AI Enhancement

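Use AI to improve search results (requires API keys; see "Environment Variables" below). The cell uses top-level `await`, so run it in a Jupyter/IPython session: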

In [None]:
from scitex.scholar import PaperAcquisition

# Initialize with AI enhancement
# (the accompanying notes say API keys are needed for the AI features)
acquisition = PaperAcquisition(
    use_ai=True,  # Enable AI features
    email="your.email@example.com"  # Required for some APIs
)

# Search with specific criteria
# NOTE: this is a top-level ``await``; it is valid inside a notebook cell
# (IPython/Jupyter) but is a syntax error in a plain Python script, where
# the call would have to run inside an ``async`` function.
results = await acquisition.search(
    query="phase amplitude coupling cognitive control",
    sources=['semantic_scholar', 'pubmed'],  # literature sources to query
    max_results=10,
    start_year=2020  # presumably restricts results to 2020 onwards - confirm against the API
)

# ``results`` must support ``len()``; its element type is not shown here.
print(f"Found {len(results)} recent papers on the topic")

## 6. Building a Local Search Index

Create a searchable index of your PDF collection:

In [None]:
from pathlib import Path

from scitex.scholar import build_index, search_sync

# Build index from PDFs in a directory
# ``Path`` has to be imported in this cell: nothing earlier in the notebook
# brings it into scope, so the cell would otherwise fail with a NameError.
pdf_directory = Path("~/Documents/Papers").expanduser()
if pdf_directory.exists():
    build_index(str(pdf_directory))

    # Search your local collection
    local_results = search_sync(
        "neural oscillations",
        local_only=True
    )

    print(f"Found {len(local_results)} papers in local collection")
else:
    # Say why nothing happened instead of finishing silently.
    print(f"PDF directory not found: {pdf_directory}")

## Tips and Best Practices

1. **Rate Limiting**: The module automatically handles rate limiting for APIs
2. **Caching**: Search results are cached to avoid redundant API calls
3. **Error Handling**: Network errors are handled gracefully
4. **PDF Access**: Only open-access PDFs can be downloaded automatically

## Environment Variables

Set these for enhanced functionality:
- `OPENAI_API_KEY`: For AI-enhanced search
- `SEMANTIC_SCHOLAR_API_KEY`: For higher rate limits
- `ENTREZ_EMAIL`: Your email for PubMed API