In [1]:
# Day 4: NLP Basics - Understanding the Foundation of AI Text Processing
# Goal: Learn tokenization, embeddings, transformers + build our first summarizer

import warnings
warnings.filterwarnings('ignore')

print("🚀 Day 4: NLP Basics")
print("Today we'll understand how AI 'reads' and processes text")

🚀 Day 4: NLP Basics
Today we'll understand how AI 'reads' and processes text


In [2]:
# CONCEPT 1: TOKENIZATION
# Breaking text into smaller, manageable pieces

text = "The quick brown fox jumps over the lazy dog."

# Simple word tokenization (splitting by spaces)
simple_tokens = text.split()
print("Simple tokenization (by spaces):")
print(simple_tokens)
print(f"Number of tokens: {len(simple_tokens)}")

Simple tokenization (by spaces):
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']
Number of tokens: 9


In [6]:
# How AI systems ACTUALLY tokenize text
from transformers import AutoTokenizer

# Load the SAME lightweight tokenizer we'll use throughout
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

text = "The quick brown fox jumps over the lazy dog."

# Real AI tokenization
tokens = tokenizer.tokenize(text)
print("AI tokenization:")
print(tokens)
print(f"Number of tokens: {len(tokens)}")

# Convert tokens to numbers (what the AI actually sees)
token_ids = tokenizer.encode(text)
print(f"\nWhat the AI actually processes (numbers):")
print(token_ids)
print(f"Number of token IDs: {len(token_ids)}")

AI tokenization:
['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Number of tokens: 10

What the AI actually processes (numbers):
[0, 133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335, 4, 2]
Number of token IDs: 12


In [8]:
# CONCEPT 2: EMBEDDINGS
# Converting tokens into vectors that capture MEANING

import numpy as np

# Get embeddings for our text
inputs = tokenizer(text, return_tensors="pt")
print("Tokenizer output structure:")
print(f"Token IDs shape: {inputs['input_ids'].shape}")
print(f"First few token IDs: {inputs['input_ids'][0][:10]}")

# Let's see what tokens these numbers represent
decoded_tokens = [tokenizer.decode([token_id]) for token_id in inputs['input_ids'][0][:10]]
print(f"\nWhat these numbers mean: {decoded_tokens}")

Tokenizer output structure:
Token IDs shape: torch.Size([1, 12])
First few token IDs: tensor([    0,   133,  2119,  6219, 23602, 13855,    81,     5, 22414,  2335])

What these numbers mean: ['<s>', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' lazy', ' dog']


In [9]:
# Let's test word relationships with a simple example
test_words = ["dog", "cat", "car", "puppy"]

for word in test_words:
    # Tokenize each word
    tokens = tokenizer.encode(word, return_tensors="pt")
    print(f"'{word}' becomes token: {tokens[0][1].item()}")  # Skip special tokens

'dog' becomes token: 16319
'cat' becomes token: 8729
'car' becomes token: 5901
'puppy' becomes token: 642


In [12]:
# CONCEPT 3: TRANSFORMERS (Using a smaller, faster model)
# Let's use a lightweight model that downloads quickly

# Import both the tokenizer AND model classes
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

print("🤖 TRANSFORMERS - Using a lightweight model for learning")

# Using a smaller, faster model
model_name = "sshleifer/distilbart-cnn-12-6"  # Much smaller version
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"\n✅ Loaded {model_name}")
print(f"This is a 'distilled' version - smaller but still powerful!")
print("Perfect for learning and development")

🤖 TRANSFORMERS - Using a lightweight model for learning

✅ Loaded sshleifer/distilbart-cnn-12-6
This is a 'distilled' version - smaller but still powerful!
Perfect for learning and development


In [13]:
# TIME TO MAKE OUR FIRST AI SUMMARY!

# Sample text to summarize (like a business document)
business_text = """
The quarterly financial report shows that our company achieved record revenue of $2.5 million, 
representing a 35% increase from the previous quarter. Sales were particularly strong in the 
enterprise software division, which contributed 60% of total revenue. However, marketing 
expenses increased by 40% due to expanded digital advertising campaigns. The engineering 
team hired 15 new developers, increasing operational costs but positioning us for future 
product launches. Customer satisfaction scores improved to 4.2 out of 5.0, up from 3.8 
last quarter. Looking forward, we expect continued growth but will need to optimize 
marketing spend efficiency.
"""

print("📄 Original text:")
print(f"Length: {len(business_text)} characters")
print(business_text)

# Create the summary using our AI model
inputs = tokenizer.encode(business_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(inputs, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f"\n🤖 AI Summary:")
print(f"Length: {len(summary)} characters")
print(f"Compression ratio: {len(summary)/len(business_text):.2%}")
print(summary)

📄 Original text:
Length: 662 characters

The quarterly financial report shows that our company achieved record revenue of $2.5 million, 
representing a 35% increase from the previous quarter. Sales were particularly strong in the 
enterprise software division, which contributed 60% of total revenue. However, marketing 
expenses increased by 40% due to expanded digital advertising campaigns. The engineering 
team hired 15 new developers, increasing operational costs but positioning us for future 
product launches. Customer satisfaction scores improved to 4.2 out of 5.0, up from 3.8 
last quarter. Looking forward, we expect continued growth but will need to optimize 
marketing spend efficiency.


🤖 AI Summary:
Length: 277 characters
Compression ratio: 41.84%
 Sales were particularly strong in the enterprise software division, which contributed 60% of total revenue . Marketing expenses increased by 40% due to expanded digital advertising campaigns . Customer satisfaction scores improved t

In [14]:
# Let's test different document types - this is the business value!

test_documents = {
    "Legal Contract": """
This Software License Agreement ("Agreement") is entered into between TechCorp ("Licensor") 
and Client Company ("Licensee"). The Licensor grants Licensee a non-exclusive, 
non-transferable license to use the software for internal business operations only. 
The license fee is $50,000 annually, payable quarterly. Licensee may not reverse engineer, 
modify, or redistribute the software. This Agreement terminates automatically if Licensee 
breaches any terms. Licensor provides no warranty and limits liability to the license fee paid.
""",
    
    "Technical Report": """
The system performance analysis reveals that database query response times have increased 
by 200% over the past month. The primary bottleneck is identified in the user authentication 
module, which processes 10,000 requests per minute during peak hours. Memory usage has 
reached 85% of available capacity. We recommend implementing query caching, upgrading 
to SSD storage, and adding two additional server instances. These improvements should 
reduce response times by 60% and handle projected 50% traffic growth.
"""
}

for doc_type, text in test_documents.items():
    print(f"\n{'='*50}")
    print(f"📋 Document Type: {doc_type}")
    print(f"Original length: {len(text)} characters")
    
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=80, min_length=20, length_penalty=2.0, num_beams=4)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    print(f"AI Summary: {summary}")
    print(f"Compression: {len(summary)/len(text):.2%}")


📋 Document Type: Legal Contract
Original length: 538 characters
AI Summary:  The Software License Agreement ("Agreement") is entered into between TechCorp ("Licensor")  and Client Company ("Licensee") The Licensor grants Licensee a non-exclusive,                 non-transferable license to use the software for internal business operations only . The license fee is $50,000 annually, payable quarterly .
Compression: 60.97%

📋 Document Type: Technical Report
Original length: 518 characters
AI Summary:  The primary bottleneck is identified in the user authentication module, which processes 10,000 requests per minute during peak hours . Memory usage has reached 85% of available capacity .
Compression: 36.29%


In [16]:
# BUSINESS VALUE - Why this matters for real companies

print("💼 BUSINESS APPLICATIONS OF TEXT SUMMARIZATION")
print("\n1. LEGAL TEAMS:")
print("   - Summarize 50-page contracts in seconds")
print("   - Extract key terms, obligations, deadlines")
print("   - Save 2-3 hours per document review")

print("\n2. FINANCE TEAMS:")
print("   - Process quarterly reports quickly") 
print("   - Extract KPIs and financial metrics")
print("   - Analyze competitor financial filings")

print("\n3. EXECUTIVES:")
print("   - Daily news summaries for industry updates")
print("   - Board meeting prep from lengthy documents")
print("   - Quick briefings from team reports")

# Calculate potential ROI
lawyer_hourly_rate = 300
hours_saved_per_doc = 2.5
docs_per_month = 40

monthly_savings = lawyer_hourly_rate * hours_saved_per_doc * docs_per_month
annual_savings = monthly_savings * 12

print(f"\n💰 ROI EXAMPLE - Law Firm:")
print(f"Lawyer rate: ${lawyer_hourly_rate}/hour")
print(f"Time saved per document: {hours_saved_per_doc} hours")
print(f"Documents per month: {docs_per_month}")
print(f"Monthly savings: ${monthly_savings:,}")
print(f"Annual savings: ${annual_savings:,}")
print(f"That's ${annual_savings/1000:.0f}K per year!")

💼 BUSINESS APPLICATIONS OF TEXT SUMMARIZATION

1. LEGAL TEAMS:
   - Summarize 50-page contracts in seconds
   - Extract key terms, obligations, deadlines
   - Save 2-3 hours per document review

2. FINANCE TEAMS:
   - Process quarterly reports quickly
   - Extract KPIs and financial metrics
   - Analyze competitor financial filings

3. EXECUTIVES:
   - Daily news summaries for industry updates
   - Board meeting prep from lengthy documents
   - Quick briefings from team reports

💰 ROI EXAMPLE - Law Firm:
Lawyer rate: $300/hour
Time saved per document: 2.5 hours
Documents per month: 40
Monthly savings: $30,000.0
Annual savings: $360,000.0
That's $360K per year!


In [18]:
# ADVANCED: Understanding how to tune summarization

print("🎛️ TUNING YOUR SUMMARIZER")

sample_text = business_text  # Using our original business report

# Test different parameters
parameters = [
    {"max_length": 50, "min_length": 20, "name": "Very Short"},
    {"max_length": 100, "min_length": 30, "name": "Medium"},
    {"max_length": 150, "min_length": 50, "name": "Detailed"},
]

for params in parameters:
    inputs = tokenizer.encode(sample_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs, 
        max_length=params["max_length"], 
        min_length=params["min_length"], 
        length_penalty=2.0, 
        num_beams=4
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    print(f"\n{params['name']} Summary:")
    print(f"Length: {len(summary)} chars")
    print(summary)
    print("-" * 40)

🎛️ TUNING YOUR SUMMARIZER

Very Short Summary:
Length: 248 chars
 Sales were particularly strong in the enterprise software division, which contributed 60% of total revenue . Marketing expenses increased by 40% due to expanded digital advertising campaigns . Customer satisfaction scores improved to 4.2 out of 5.
----------------------------------------

Medium Summary:
Length: 277 chars
 Sales were particularly strong in the enterprise software division, which contributed 60% of total revenue . Marketing expenses increased by 40% due to expanded digital advertising campaigns . Customer satisfaction scores improved to 4.2 out of 5.0, up from 3.8 last quarter .
----------------------------------------

Detailed Summary:
Length: 278 chars
 Sales were particularly strong in the enterprise software division, which contributed 60% of total revenue . Marketing expenses increased by 40% due to expanded digital advertising campaigns . Customer satisfaction scores improved to 4.2 out of 5.0, up 

In [19]:
# YOUR WEEK 1 MVP - TEXT SUMMARIZER FEATURES

print("🚀 WEEK 1 MVP: SMART DOCUMENT SUMMARIZER")
print("\nCore Features we'll build:")
print("✅ Upload any text document")
print("✅ Choose summary type:")
print("   - Executive Brief (30-50 words)")
print("   - Standard Summary (80-120 words)")
print("   - Detailed Analysis (150+ words)")
print("✅ Industry-specific modes:")
print("   - Legal contracts")
print("   - Financial reports") 
print("   - Technical documentation")

# Preview of what we're building
document_types = {
    "executive": {"max_length": 50, "min_length": 20},
    "standard": {"max_length": 120, "min_length": 40},
    "detailed": {"max_length": 200, "min_length": 80}
}

print(f"\n📊 Configuration for different user types:")
for user_type, config in document_types.items():
    print(f"{user_type.title()}: {config['min_length']}-{config['max_length']} words")

🚀 WEEK 1 MVP: SMART DOCUMENT SUMMARIZER

Core Features we'll build:
✅ Upload any text document
✅ Choose summary type:
   - Executive Brief (30-50 words)
   - Standard Summary (80-120 words)
   - Detailed Analysis (150+ words)
✅ Industry-specific modes:
   - Legal contracts
   - Financial reports
   - Technical documentation

📊 Configuration for different user types:
Executive: 20-50 words
Standard: 40-120 words
Detailed: 80-200 words


In [21]:
# DAY 4 COMPLETE! 🎉

print("✅ DAY 4 ACHIEVEMENTS:")
print("🧠 Understood tokenization (how AI reads text)")
print("🧠 Understood embeddings (how AI represents meaning)")  
print("🧠 Understood transformers (how AI processes and generates)")
print("🤖 Loaded and tested a real AI summarization model")
print("💼 Identified business applications and ROI")
print("🎯 Designed MVP features for different user types")

print("\n🔮 TOMORROW (Day 5): Build Summarizer Backend")
print("We'll wrap this AI model in a FastAPI web service:")
print("- Input: Raw text document")
print("- Output: JSON with summary + metadata")
print("- Ready for web/mobile apps to consume")

print(f"\n📈 PROGRESS: 4/60 days complete")
print("Next week you'll have a live, deployed text summarizer!")

# Save a test function we'll use tomorrow
def create_summary(text, summary_type="standard"):
    """
    Our core summarization function - we'll use this in our API tomorrow!
    """
    configs = {
        "executive": {"max_length": 50, "min_length": 20},
        "standard": {"max_length": 120, "min_length": 40}, 
        "detailed": {"max_length": 200, "min_length": 80}
    }
    
    config = configs.get(summary_type, configs["standard"])
    
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs,
        max_length=config["max_length"],
        min_length=config["min_length"], 
        length_penalty=2.0,
        num_beams=4
    )
    
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test our function
test_summary = create_summary(business_text, "detailed")
print(f"\nTest function output: {test_summary}")

✅ DAY 4 ACHIEVEMENTS:
🧠 Understood tokenization (how AI reads text)
🧠 Understood embeddings (how AI represents meaning)
🧠 Understood transformers (how AI processes and generates)
🤖 Loaded and tested a real AI summarization model
💼 Identified business applications and ROI
🎯 Designed MVP features for different user types

🔮 TOMORROW (Day 5): Build Summarizer Backend
We'll wrap this AI model in a FastAPI web service:
- Input: Raw text document
- Output: JSON with summary + metadata
- Ready for web/mobile apps to consume

📈 PROGRESS: 4/60 days complete
Next week you'll have a live, deployed text summarizer!

Test function output:  Sales were particularly strong in the enterprise software division, which contributed 60% of total revenue . Marketing expenses increased by 40% due to expanded digital advertising campaigns . Customer satisfaction scores improved to 4.2 out of 5.0, up from 3.8  last quarter . The engineering team hired 15 new developers, increasing operational costs but position