# Exploring Professor Arbel's Scholarship Corpus

This notebook demonstrates how to explore and work with the corpus using the Python API.

## Setup

First, let's import the required libraries:

In [None]:
import sys
# A relative sys.path entry is resolved against the current working directory,
# so this must run before the corpus_api import below (presumably corpus_api
# lives in the parent directory of this notebook -- confirm against the repo layout).
sys.path.insert(0, '..')  # Add parent directory to path

from corpus_api import ArbelCorpus
import json

## Load the Corpus

In [None]:
# Open the corpus rooted at the parent directory and report its size
corpus = ArbelCorpus(base_dir='..')
paper_total = len(corpus)
print(f"Loaded corpus with {paper_total} papers")

## Corpus Statistics

In [None]:
# Fetch the aggregate statistics and pretty-print them as indented JSON
stats = corpus.get_stats()
stats_json = json.dumps(stats, indent=2)
print(stats_json)

## List All Papers

In [None]:
papers = corpus.list_papers()

print(f"Found {len(papers)} papers:\n")
# Preview only the first five entries to keep the output short
preview = papers[:5]
for paper in preview:
    print(f"- {paper.paper_id}: {paper.get_title()}")
    author_names = ', '.join(paper.get_authors())
    print(f"  Authors: {author_names}")
    print(f"  Year: {paper.get_year()}")
    print()  # blank line between entries

## Search Papers

In [None]:
# Query the corpus with the keyword "contract"
results = corpus.search_papers("contract")

print(f"Found {len(results)} papers about contracts:\n")
# List at most the first three hits
top_hits = results[:3]
for paper in top_hits:
    print(f"- {paper.paper_id}")

## Explore a Specific Paper

In [None]:
# Look up one paper by its identifier and print its metadata
# NOTE(review): what get_paper() does for an unknown ID is not visible here -- confirm.
paper = corpus.get_paper('ssrn-3519630')

print(f"Paper ID: {paper.paper_id}")
print(f"Title: {paper.get_title()}")
author_names = ', '.join(paper.get_authors())
print(f"Authors: {author_names}")
print(f"Year: {paper.get_year()}")
file_names = ', '.join(paper.get_available_files())
print(f"\nAvailable files: {file_names}")

## Read Paper Summary

In [None]:
# Get English summary (uses the `paper` selected in the previous cell)
summary = paper.get_summary()

if summary:
    preview_limit = 500
    print("Summary (first 500 characters):\n")
    # Only mark the preview as truncated when text was actually cut off.
    ellipsis = "..." if len(summary) > preview_limit else ""
    print(summary[:preview_limit] + ellipsis + "\n")
    print(f"Full summary length: {len(summary)} characters")
else:
    # Say so explicitly instead of leaving the cell output empty.
    print(f"No summary available for {paper.paper_id}")

## Analyze Paper Distribution by Year

In [None]:
from collections import Counter

# Call get_year() once per paper and skip papers whose year is missing/falsy.
years = [year for year in (p.get_year() for p in corpus.list_papers()) if year]
year_counts = Counter(years)

# U+2588 FULL BLOCK, written as an escape so the glyph cannot be corrupted
# again if the file is re-saved with the wrong encoding.
bar_char = "\u2588"

print("Papers by year:")
for year, count in sorted(year_counts.items()):
    # One block per paper yields a simple text bar chart.
    print(f"{year}: {bar_char * count} ({count} papers)")

## Check for Bilingual Support

In [None]:
# Papers that provide a summary in both English ('en') and Chinese ('zh')
bilingual_papers = [
    p for p in corpus.list_papers()
    if p.has_summary('en') and p.has_summary('zh')
]

print(f"Found {len(bilingual_papers)} papers with both English and Chinese summaries")

# Guard the ratio: an empty corpus would otherwise raise ZeroDivisionError.
total_papers = len(corpus)
if total_papers:
    print(f"\nPercentage: {len(bilingual_papers) / total_papers * 100:.1f}%")
else:
    print("\nPercentage: n/a (corpus is empty)")

## Extract Training Data

In [None]:
# Create a simple training dataset: one record per paper that has a summary
training_data = []

for paper in corpus.iterate_papers():
    summary = paper.get_summary()
    if summary:
        training_data.append({
            'id': paper.paper_id,
            'title': paper.get_title(),
            'text': summary,
            'year': paper.get_year()
        })

print(f"Created training dataset with {len(training_data)} examples")

# Indexing [0] on an empty list would raise IndexError, so only preview
# an example when at least one paper produced a summary.
if training_data:
    print("\nFirst example:")
    # ensure_ascii=False keeps non-ASCII text readable, matching the JSON export cell.
    print(json.dumps(training_data[0], indent=2, ensure_ascii=False)[:300] + "...")
else:
    print("\nNo papers with a summary were found, so there is no example to show.")

## Word Count Analysis

In [None]:
# Collect (paper_id, word_count) pairs for every paper that has a summary
summary_lengths = []

for paper in corpus.iterate_papers():
    summary = paper.get_summary()
    if summary:
        # Whitespace-delimited tokens serve as the word count.
        word_count = len(summary.split())
        summary_lengths.append((paper.paper_id, word_count))

# Sort by length, longest first
summary_lengths.sort(key=lambda x: x[1], reverse=True)

print("Top 5 longest summaries:\n")
for paper_id, word_count in summary_lengths[:5]:
    print(f"{paper_id}: {word_count:,} words")

# Guard the mean: with no summaries the division would raise ZeroDivisionError.
if summary_lengths:
    average_words = sum(w for _, w in summary_lengths) / len(summary_lengths)
    print(f"\nAverage summary length: {average_words:,.0f} words")
else:
    print("\nAverage summary length: n/a (no summaries found)")

## Export Summaries

In [None]:
# Gather one record per paper that has a summary, then dump them all to JSON
export_data = []

for paper in corpus.iterate_papers():
    summary = paper.get_summary()
    if not summary:
        continue  # papers without a summary are left out of the export
    record = {
        'paper_id': paper.paper_id,
        'title': paper.get_title(),
        'authors': paper.get_authors(),
        'year': paper.get_year(),
        'summary': summary
    }
    export_data.append(record)

# Write UTF-8 and keep non-ASCII characters unescaped in the output file
with open('corpus_export.json', 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2, ensure_ascii=False)

exported_count = len(export_data)
print(f"Exported {exported_count} papers to corpus_export.json")

## Next Steps

- Explore individual papers in detail
- Build custom datasets for your use case
- Integrate with NLP pipelines
- Create visualizations of the corpus

See `USAGE_EXAMPLES.md` for more examples!