# GeoLingua: Geographic Language Model Training on Kaggle\n\nThis notebook trains a geographic language model using GRPO techniques with proper train/validation/test splits.

In [None]:
# Install requirements\n!pip install torch transformers datasets peft accelerate pandas numpy matplotlib seaborn wandb praw newspaper3k scikit-learn nltk tqdm python-dotenv lxml feedparser beautifulsoup4 wikipedia-api

In [None]:
# Setup environment\nimport sys\nsys.path.append('/kaggle/working')\n\n# Run setup\n!python kaggle_setup.py

In [None]:
# Import project modules\nfrom src.data.preprocessors import DatasetPreprocessor\nfrom src.data.loaders import DataLoader\nfrom src.models.basemodel import GeoLinguaModel\nfrom src.models.grpo_trainer import GRPOTrainer\nfrom config.data_config import *\nfrom config.model_config import *\n\nimport json\nimport logging\nfrom collections import defaultdict

In [None]:
# Load and preprocess data
data_loader = DataLoader()
processed_data = data_loader.load_processed_data('processed_dataset.json')

print(f"Loaded {len(processed_data)} training examples")

# Show data distribution by region
# Examples without a 'region' key are tallied under 'unknown'.
region_counts = defaultdict(int)
for example in processed_data:
    region_counts[example.get('region', 'unknown')] += 1

print("\nData distribution by region:")
for region_name in region_counts:
    print(f"  {region_name}: {region_counts[region_name]} examples")

In [None]:
# Split data into train/validation/test sets
import random
from typing import List, Dict, Tuple

def split_data_stratified(processed_data: List[Dict],
                         train_ratio: float = 0.7,
                         val_ratio: float = 0.15,
                         test_ratio: float = 0.15,
                         random_seed: int = 42) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """Split data into train/validation/test sets with stratification by region.

    Items are grouped by their 'region' key (missing keys fall under 'unknown'),
    each group is shuffled and cut proportionally, and the three resulting lists
    are shuffled once more before being returned.

    Args:
        processed_data: Dicts to split; the input list itself is not modified.
        train_ratio: Fraction of each region assigned to the training split.
        val_ratio: Fraction of each region assigned to the validation split.
        test_ratio: Fraction for the test split. It only takes part in ratio
            validation: the test split receives whatever remains after the train
            and validation cuts, including the remainders lost to truncation.
        random_seed: Seed for the shuffles.

    Returns:
        A (train_data, val_data, test_data) tuple of lists.

    Raises:
        ValueError: If any ratio is negative or the ratios do not sum to 1.0.
    """

    # Validate ratios
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(f"Ratios must be non-negative, got {ratios}")
    total_ratio = train_ratio + val_ratio + test_ratio
    if abs(total_ratio - 1.0) > 1e-6:
        raise ValueError(f"Ratios must sum to 1.0, got {total_ratio}")

    # Use a private generator instead of random.seed(): it produces the same
    # shuffles for a given seed but leaves the global `random` state untouched,
    # so calling this function has no hidden effect on other code.
    rng = random.Random(random_seed)

    # Group data by region for stratification
    region_data = defaultdict(list)
    for item in processed_data:
        region = item.get('region', 'unknown')
        region_data[region].append(item)

    print("Data distribution by region:")
    for region, items in region_data.items():
        print(f"  {region}: {len(items)} examples")

    train_data, val_data, test_data = [], [], []

    # Split each region's data proportionally
    for region, items in region_data.items():
        # Shuffle items for this region (these lists were built above, so the
        # caller's list is not reordered)
        rng.shuffle(items)

        n_items = len(items)
        # int() truncates, so any remainder ends up in the test split
        train_end = int(n_items * train_ratio)
        val_end = train_end + int(n_items * val_ratio)
        n_train = train_end
        n_val = val_end - train_end
        n_test = n_items - val_end

        # Split the data
        train_data.extend(items[:train_end])
        val_data.extend(items[train_end:val_end])
        test_data.extend(items[val_end:])

        print(f"  {region}: train={n_train}, val={n_val}, test={n_test}")

        # Small regions can be truncated down to an empty split; surface that
        # instead of letting a region silently vanish from training/validation.
        for split_label, ratio, size in (('train', train_ratio, n_train),
                                         ('val', val_ratio, n_val),
                                         ('test', test_ratio, n_test)):
            if ratio > 0 and size == 0:
                print(f"  Warning: region '{region}' has too few examples "
                      f"({n_items}) for a non-empty {split_label} split")

    # Every input item must land in exactly one split
    assert len(train_data) + len(val_data) + len(test_data) == len(processed_data), \
        "split sizes do not add up to the input size"

    # Shuffle the final splits
    rng.shuffle(train_data)
    rng.shuffle(val_data)
    rng.shuffle(test_data)

    print(f"\nFinal split sizes:")
    print(f"  Train: {len(train_data)} examples")
    print(f"  Validation: {len(val_data)} examples")
    print(f"  Test: {len(test_data)} examples")

    return train_data, val_data, test_data

# Split the data
train_data, val_data, test_data = split_data_stratified(
    processed_data,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    random_seed=42
)

# Save splits for later use
import os
os.makedirs('data/processed', exist_ok=True)

splits = {
    'train': train_data,
    'val': val_data,
    'test': test_data
}

# One JSON file per split, written as UTF-8 with non-ASCII text kept readable
for split_name, split_data in splits.items():
    output_path = f"data/processed/{split_name}_split.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(split_data, f, indent=2, ensure_ascii=False)
    print(f"Saved {split_name} split to {output_path}")

In [None]:
# Initialize model
# LoRA hyperparameters come from the config modules imported earlier; the adapter
# is attached to the four attention projection layers named below.
lora_settings = {
    'r': LORA_R,
    'lora_alpha': LORA_ALPHA,
    'lora_dropout': LORA_DROPOUT,
    'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
}

model = GeoLinguaModel(
    model_name=MODEL_NAME,
    regions=['us_south', 'uk', 'australia', 'india', 'nigeria'],
    lora_config=lora_settings,
)

print(f"Model initialized: {MODEL_NAME}")

# Count only the parameters that will receive gradient updates
trainable_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_param_count}")

In [None]:
# Train model
trainer_options = dict(
    model=model,
    train_datasets=train_data,
    eval_datasets=val_data,  # Validation split is used for evaluation during training
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    output_dir='/kaggle/working/models/checkpoints',
    use_wandb=False,  # Disable wandb on Kaggle
)
trainer = GRPOTrainer(**trainer_options)

# train() returns the path of the best checkpoint, which later cells load and copy
best_model_path = trainer.train()
print(f"Training completed! Best model saved at: {best_model_path}")

In [None]:
# Evaluate on test set
print("\nEvaluating model on test set...")

# Load the best model
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(best_model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

# Evaluate on test data
total_loss = 0.0
evaluated_count = 0  # items whose loss was actually computed
failed_count = 0     # items skipped because evaluation raised
region_metrics = defaultdict(lambda: {'loss': 0.0, 'count': 0})

with torch.no_grad():
    for item in test_data:
        text = item.get('text', '')
        target_region = item.get('region', 'unknown')

        try:
            # This is a placeholder - adapt based on your model's actual evaluation method
            loss_value = model.compute_loss(text, target_region).item()
        except Exception as e:
            # Best-effort evaluation: report the failure and move on, but keep
            # count so skipped items do not distort the averages below.
            failed_count += 1
            print(f"Warning: Error evaluating item: {e}")
            continue

        total_loss += loss_value
        evaluated_count += 1
        region_metrics[target_region]['loss'] += loss_value
        region_metrics[target_region]['count'] += 1

# Calculate and display results
# Average over the items that were evaluated, not over len(test_data): dividing
# by the full test size would bias the loss downwards whenever items were skipped.
# NaN (rather than 0.0) marks the case where nothing could be evaluated.
avg_loss = total_loss / evaluated_count if evaluated_count else float('nan')

print("\n" + "="*50)
print("TEST SET EVALUATION RESULTS")
print("="*50)
print(f"Total test examples: {len(test_data)}")
print(f"Evaluated examples: {evaluated_count} (skipped due to errors: {failed_count})")
print(f"Overall average loss: {avg_loss:.4f}")

print("\nResults by region:")
print("-" * 40)
for region, metrics in region_metrics.items():
    if metrics['count'] > 0:
        avg_region_loss = metrics['loss'] / metrics['count']
        print(f"{region:12}: {metrics['count']:3d} examples, avg loss: {avg_region_loss:.4f}")

print("="*50)

In [None]:
# Save model for download
import shutil

# Copy the best checkpoint to a fixed file name in the Kaggle working directory
download_path = '/kaggle/working/geolingua_model.pth'
shutil.copy(best_model_path, download_path)
print("Model saved for download!")