# Sequence Models and Long Short-Term Memory Networks 🇦🇺

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vuhung16au/pytorch-mastery/blob/main/examples/pytorch-nlp/03_sequence_models_nlp.ipynb)
[![View on GitHub](https://img.shields.io/badge/View_on-GitHub-blue?logo=github)](https://github.com/vuhung16au/pytorch-mastery/blob/main/examples/pytorch-nlp/03_sequence_models_nlp.ipynb)

Master sequence modeling for NLP using PyTorch with RNN, LSTM, and GRU architectures. Features comprehensive Australian tourism examples with English-Vietnamese multilingual support for advanced sequence processing tasks.

## Learning Objectives

By the end of this notebook, you will:

- 🔄 **Master RNN architectures** including vanilla RNN, LSTM, and GRU
- 🇦🇺 **Build sequence models** for Australian tourism NLP tasks
- 🏷️ **Implement POS tagging** for Australian English and Vietnamese text
- 📊 **Create sentiment analysis** with sequential processing
- 🎯 **Apply attention mechanisms** for better sequence understanding
- 🌏 **Handle multilingual sequences** with shared and separate encoders
- 🔄 **Compare with TensorFlow** RNN implementations

## What You'll Build

1. **Australian POS Tagger** - Tag parts of speech in tourism reviews
2. **Multilingual Sentiment Analyzer** - Process English and Vietnamese sequences  
3. **Tourism Review Classifier** - Categorize reviews by Australian destinations
4. **Attention-based Sequence Model** - Focus on important parts of tourism text
5. **Sequence-to-Sequence Translator** - Basic English-Vietnamese translation

---

In [None]:
# Environment Detection and Setup
import sys
import subprocess
import os
import time

# Detect the runtime environment
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules or "kaggle" in os.environ.get('KAGGLE_URL_BASE', '')
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

print(f"🔍 Environment Detection:")
print(f"   Local Development: {IS_LOCAL}")
print(f"   Google Colab: {IS_COLAB}")
print(f"   Kaggle Notebooks: {IS_KAGGLE}")

# Platform-specific system setup
if IS_COLAB:
    print("\n⚙️  Setting up Google Colab environment...")
    !apt update -qq
    !apt install -y -qq software-properties-common
elif IS_KAGGLE:
    print("\n⚙️  Setting up Kaggle environment...")
else:
    print("\n⚙️  Setting up local environment...")

In [None]:
# Install required packages for sequence modeling
required_packages = [
    "torch",
    "transformers",
    "datasets", 
    "tokenizers",
    "pandas",
    "seaborn",
    "matplotlib",
    "scikit-learn",
    "tensorboard",
    "nltk",  # For POS tagging examples
    "plotly",  # For sequence visualizations
]

print("📦 Installing packages for sequence modeling...")
for package in required_packages:
    if IS_COLAB or IS_KAGGLE:
        !pip install -q {package}
    else:
        try:
            subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], 
                          capture_output=True, check=True)
        except subprocess.CalledProcessError:
            print(f"   ⚠️  {package} installation skipped (likely already installed)")
            continue
    print(f"   ✅ {package}")

print("\n🎉 Package installation completed!")

In [None]:
# Import essential libraries for sequence modeling
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.tensorboard import SummaryWriter

# Data handling and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning and evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Text processing
import re
import string
from collections import Counter, defaultdict
import random
from itertools import zip_longest

# NLTK for linguistic features
try:
    import nltk
    # Download required NLTK data.
    # Resource names changed between NLTK releases: recent versions look up
    # 'punkt_tab' for tokenization and 'averaged_perceptron_tagger_eng' for
    # tagging, while older ones use 'punkt' / 'averaged_perceptron_tagger'.
    # nltk is installed unpinned, so request both naming schemes.
    # NOTE(review): nltk.download is expected to report (not raise on) a
    # resource name the installed release does not know - confirm.
    for _nltk_resource in (
        'punkt',
        'punkt_tab',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'universal_tagset',
    ):
        nltk.download(_nltk_resource, quiet=True)
    from nltk.tokenize import word_tokenize
    from nltk.tag import pos_tag
    print("✅ NLTK imported successfully")
except ImportError:
    # word_tokenize / pos_tag stay undefined in this case.
    print("⚠️  NLTK not available - will use basic tokenization")

# Plot appearance shared by every figure in the notebook
sns.set_style("whitegrid")
sns.set_palette("Set2")
plt.rcParams.update({'figure.figsize': (14, 8)})

# Seed every random number generator used here with the same value so that
# repeated runs produce the same results.
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(16)

print("🔄 Sequence Modeling Environment Ready!")
print(f"   PyTorch version: {torch.__version__}")
print("   Libraries loaded for RNN, LSTM, GRU modeling")

In [None]:
import platform

def detect_device():
    """Choose the torch device to train on and print a short summary of it.

    CUDA is preferred, then Apple's MPS backend, and CPU is the fallback.

    Returns:
        torch.device: The selected device ("cuda", "mps" or "cpu").
    """
    mps_backend = getattr(torch.backends, 'mps', None)

    if torch.cuda.is_available():
        selected = torch.device("cuda")
        name = torch.cuda.get_device_name(0)
        # total_memory is reported in bytes; convert to GiB for display.
        memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        summary = [
            f"🚀 CUDA GPU detected: {name}",
            f"   GPU Memory: {memory_gb:.1f} GB",
            "   Excellent for LSTM/GRU sequence training",
        ]
    elif mps_backend is not None and mps_backend.is_available():
        selected = torch.device("mps")
        machine = platform.uname().machine
        summary = [
            f"🍎 Apple Silicon MPS detected: {machine}",
            "   Optimized for M1/M2/M3 chips",
            "   Good performance for sequence models",
        ]
    else:
        selected = torch.device("cpu")
        threads = torch.get_num_threads()
        summary = [
            f"💻 CPU mode: {platform.processor()}",
            f"   Threads: {threads}",
            "   💡 Tip: Use smaller hidden dimensions and shorter sequences",
        ]

    print("\n".join(summary))
    return selected

# Pick the device once; the constants below are derived from it.
DEVICE = detect_device()
print(f"\n✅ Device selected: {DEVICE}")

# (batch size, hidden dimension, max sequence length) per accelerator type.
# Any device type not listed here gets the CPU settings.
_DEVICE_PRESETS = {
    'cuda': (64, 256, 100),
    'mps': (32, 128, 80),
}
_CPU_PRESET = (16, 64, 50)
BATCH_SIZE, HIDDEN_DIM, MAX_SEQ_LENGTH = _DEVICE_PRESETS.get(DEVICE.type, _CPU_PRESET)

print("\n⚙️  Device-optimized parameters:")
for _param_name, _param_value in (
    ("Batch size", BATCH_SIZE),
    ("Hidden dimension", HIDDEN_DIM),
    ("Max sequence length", MAX_SEQ_LENGTH),
):
    print(f"   {_param_name}: {_param_value}")

In [None]:
def create_australian_sequence_datasets():
    """
    Build the small in-memory corpora used by the sequence-modeling examples.

    Texts for the two classification tasks are written grouped by label and
    then flattened, so each 'labels' list is derived from the group sizes.

    Returns:
        dict: Three entries:
            'pos_tagging': list of 20 sentences (15 English, 5 Vietnamese).
            'sentiment_analysis': {'texts': [...], 'labels': [...]} where
                labels are 2 (positive), 1 (neutral) and 0 (negative).
            'destination_classification': {'texts': [...], 'labels': [...]}
                where labels 0-8 identify the destination described.
    """

    def _flatten(labelled_groups):
        """Expand [(label, [text, ...]), ...] into parallel texts/labels lists."""
        texts = []
        labels = []
        for label, group in labelled_groups:
            texts.extend(group)
            labels.extend([label] * len(group))
        return {'texts': texts, 'labels': labels}

    # 1. Part-of-Speech Tagging Dataset
    english_pos_sentences = [
        "The Sydney Opera House attracts millions of tourists annually.",
        "Melbourne's coffee culture is world-renowned and deeply appreciated.",
        "Brisbane's subtropical climate makes it perfect for outdoor activities.",
        "Perth beaches offer excellent surfing conditions year-round.",
        "Adelaide wine regions produce exceptional shiraz and cabernet varieties.",
        "Darwin's tropical location provides access to unique wildlife experiences.",
        "Hobart's MONA museum showcases provocative contemporary art installations.",
        "Canberra houses Australia's most important national cultural institutions.",
        "The Great Barrier Reef supports incredible marine biodiversity.",
        "Uluru holds deep spiritual significance for Aboriginal communities.",
        "Kakadu National Park preserves ancient rock art galleries.",
        "Blue Mountains scenic railways provide spectacular valley views.",
        "Gold Coast theme parks offer thrilling rides and entertainment.",
        "Tasmania's wilderness areas protect unique endemic species.",
        "Alice Springs serves as the heart of Australia's red centre.",
    ]
    vietnamese_pos_sentences = [
        "Nhà hát Opera Sydney thu hút hàng triệu du khách mỗi năm.",
        "Văn hóa cà phê Melbourne được biết đến trên toàn thế giới.",
        "Khí hậu cận nhiệt đới Brisbane hoàn hảo cho hoạt động ngoài trời.",
        "Bãi biển Perth cung cấp điều kiện lướt sóng tuyệt vời.",
        "Vùng rượu vang Adelaide sản xuất shiraz và cabernet đặc biệt.",
    ]

    # 2. Sentiment Analysis Dataset (label, texts); English first, then Vietnamese
    sentiment_groups = [
        (2, [  # Positive
            "Sydney Harbour Bridge climb was absolutely incredible and breathtaking!",
            "Melbourne's laneways are filled with amazing street art and culture.",
            "The Great Barrier Reef snorkeling exceeded all my expectations completely.",
            "Uluru sunset viewing was a deeply spiritual and moving experience.",
            "Perth's Rottnest Island quokkas are adorable and photogenic creatures.",
            "Adelaide Central Market offers the freshest local produce available.",
            "Tasmania's Cradle Mountain hiking trails are absolutely spectacular.",
            "Darwin's mindfulness markets showcase incredible local indigenous art.",
            "Canberra's national galleries house world-class art collections.",
            "Brisbane's South Bank offers excellent riverside dining experiences.",
            "Leo cầu Cảng Sydney thật tuyệt vời và ngoạn mục!",
            "Các con hẻm Melbourne đầy nghệ thuật đường phố tuyệt vời.",
            "Lặn ngắm Great Barrier Reef vượt xa mong đợi của tôi.",
            "Ngắm hoàng hôn Uluru là trải nghiệm tâm linh sâu sắc.",
            "Quokka ở đảo Rottnest Perth thật đáng yêu và dễ chụp ảnh.",
        ]),
        (1, [  # Neutral
            "Sydney accommodation prices vary significantly depending on location and season.",
            "Melbourne weather can be unpredictable with four seasons in one day.",
            "Brisbane has reasonable public transport connections to major attractions.",
            "Perth is quite isolated but offers decent shopping and dining options.",
            "Adelaide is smaller than other capitals but has its own unique character.",
            "Darwin has limited attractions but interesting weekend markets available.",
            "Hobart offers a mix of historical and contemporary cultural experiences.",
            "Canberra is primarily known for its governmental and cultural institutions.",
            "Giá chỗ ở Sydney thay đổi tùy theo vị trí và mùa.",
            "Thời tiết Melbourne có thể khó đoán với bốn mùa trong một ngày.",
        ]),
        (0, [  # Negative
            "Sydney traffic congestion makes getting around extremely frustrating and time-consuming.",
            "Melbourne's unpredictable weather ruined our carefully planned outdoor activities.",
            "Brisbane humidity during summer is absolutely unbearable and oppressive.",
            "Perth nightlife is disappointingly limited and closes way too early.",
            "Adelaide becomes very quiet after 6 PM with limited entertainment options.",
            "Darwin is extremely expensive for basic necessities and everyday groceries.",
            "Hobart's winter weather is depressingly cold and wet for tourists.",
            "Canberra can feel somewhat sterile and lacking in vibrant culture.",
            "Tắc đường Sydney làm di chuyển cực kỳ khó chịu và tốn thời gian.",
            "Thời tiết Melbourne không thể đoán trước làm hỏng hoạt động ngoài trời.",
            "Độ ẩm Brisbane mùa hè thật không thể chịu nổi.",
            "Cuộc sống về đêm Perth thất vọng và đóng cửa quá sớm.",
            "Darwin cực kỳ đắt đỏ cho nhu cầu cơ bản và thực phẩm.",
        ]),
    ]

    # 3. Destination Classification Dataset (label, texts)
    destination_groups = [
        (0, [  # Sydney
            "The iconic Opera House and Harbour Bridge dominate the skyline beautifully.",
            "Bondi Beach offers excellent surfing and vibrant beach culture year-round.",
            "The Rocks historic area preserves colonial heritage and weekend markets.",
            "Darling Harbour features world-class museums and entertainment venues.",
        ]),
        (1, [  # Melbourne
            "Street art in hidden laneways creates an urban cultural playground.",
            "Coffee culture here is taken seriously with expert baristas everywhere.",
            "Tram networks provide convenient access to all major city attractions.",
            "Federation Square hosts cultural events and major gallery exhibitions.",
        ]),
        (2, [  # Brisbane
            "South Bank parklands offer riverside dining and cultural institutions.",
            "Subtropical climate makes outdoor activities enjoyable year-round.",
            "Story Bridge climbing provides panoramic city and river views.",
            "Queen Street Mall is the heart of the shopping district.",
        ]),
        (3, [  # Perth
            "Pristine beaches stretch along the coast with crystal clear waters.",
            "Kings Park offers botanical gardens and city skyline views.",
            "Fremantle port city features maritime heritage and weekend markets.",
            "Swan Valley wine region produces excellent local varieties.",
        ]),
        (4, [  # Adelaide
            "Festival city hosts numerous cultural celebrations throughout the year.",
            "Central Market offers the finest local and international produce.",
            "Adelaide Hills wine country produces cool climate varieties.",
            "Parklands surrounding the city provide green recreational spaces.",
        ]),
        # Other destinations: one example each
        (5, ["Tropical climate and relaxed atmosphere define the northern lifestyle."]),   # Darwin
        (6, ["MONA museum challenges visitors with provocative contemporary art."]),       # Hobart
        (7, ["National institutions and planned city layout characterize the capital."]),  # Canberra
        (8, ["Red centre heart of Australia with outback wilderness experiences."]),       # Alice Springs
    ]

    return {
        'pos_tagging': english_pos_sentences + vietnamese_pos_sentences,
        'sentiment_analysis': _flatten(sentiment_groups),
        'destination_classification': _flatten(destination_groups),
    }

# Build the datasets once; the rest of this cell only reports on them.
datasets = create_australian_sequence_datasets()
_sentiment = datasets['sentiment_analysis']
_destination = datasets['destination_classification']

print("🇦🇺 Australian Tourism Sequence Datasets Created")
print("=" * 55)
print(f"   POS Tagging sentences: {len(datasets['pos_tagging'])}")
print(f"   Sentiment analysis texts: {len(_sentiment['texts'])}")
print(f"   Destination classification texts: {len(_destination['texts'])}")

# Per-class example counts for the sentiment task; the label id indexes the name list.
sentiment_counts = Counter(_sentiment['labels'])
sentiment_labels = ['Negative', 'Neutral', 'Positive']
print("\n📊 Sentiment Distribution:")
for label, count in sentiment_counts.items():
    print(f"   {sentiment_labels[label]}: {count} examples")

# Per-class example counts for the destination task.
destination_counts = Counter(_destination['labels'])
destination_labels = [
    'Sydney', 'Melbourne', 'Brisbane', 'Perth', 'Adelaide',
    'Darwin', 'Hobart', 'Canberra', 'Alice Springs',
]
print("\n🏙️  Destination Distribution:")
for label, count in destination_counts.items():
    # Skip any label id that has no display name.
    if label >= len(destination_labels):
        continue
    print(f"   {destination_labels[label]}: {count} examples")

# Show one example from each dataset.
print("\n📝 Sample POS sentence:")
print(f"   {datasets['pos_tagging'][0]}")

print("\n😊 Sample sentiment text (positive):")
pos_idx = _sentiment['labels'].index(2)  # first example labelled positive
print(f"   {_sentiment['texts'][pos_idx]}")

print("\n🏢 Sample destination text (Sydney):")
sydney_idx = _destination['labels'].index(0)  # first example labelled Sydney
print(f"   {_destination['texts'][sydney_idx]}")

print("\n✅ Sequence datasets ready for model training!")