# Data Profiling Analysis - Workbench Data Collection

**This notebook analyzes data from your Workbench data collection and generates a comprehensive profiling report.**

## 📊 What You'll See

After running all cells, you'll get:
- **Data Overview**: Summary statistics and data structure
- **Comprehensive Profiling Report**: Automatic analysis of all columns including:
  - Data types and missing values
  - Statistical summaries (mean, median, std, etc.)
  - Distribution visualizations
  - Correlations between variables
  - Data quality alerts

## 🚀 Quick Start

Set `GCS_BUCKET`, `FILE_NAME`, and `FILE_FORMAT` in the configuration cell below, then click **"Run All"** to analyze your data from the data collection bucket!

The profiling report works with **any data structure** - no hardcoded column names required!


## 1. Import Libraries and Configure the Data Source


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os
from pathlib import Path
# Suppress every warning category so the cell outputs below stay readable.
warnings.filterwarnings('ignore')

# google-cloud-storage provides the `storage` client used to read the bucket.
# If the import fails, install the package with the interpreter that is
# running this kernel and import it again.
try:
    from google.cloud import storage
except ImportError:
    import subprocess
    import sys

    GCS_AVAILABLE = False  # remains False if the install below raises
    print("ℹ️  Installing google-cloud-storage...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", "google-cloud-storage"]
    )
    from google.cloud import storage
    GCS_AVAILABLE = True
    print("✅ google-cloud-storage installed successfully!")
else:
    GCS_AVAILABLE = True

# Plot appearance: seaborn "whitegrid" style and a (12, 6) default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# ============================================================================
# CONFIGURATION: Data Collection Bucket and File
# Edit these three values to point the notebook at a different dataset.
# ============================================================================
GCS_BUCKET = "my-gcs-experimentation-bucker-wb-steady-parsnip-7109"  # Your data collection bucket
FILE_NAME = "MUP_DPR_RY25_P04_V10_DY23_Geo.csv"   # Your data file
FILE_FORMAT = "csv"  # File format

# Echo the active configuration so the run output records what was analyzed.
print(
    "=" * 70,
    "📊 Configuration",
    "=" * 70,
    f"Bucket: {GCS_BUCKET}",
    f"File: {FILE_NAME}",
    f"Format: {FILE_FORMAT}",
    f"GCS Path: gs://{GCS_BUCKET}/{FILE_NAME}",
    "=" * 70,
    sep="\n",
)


In [None]:
## 2. Load Data from GCS Bucket

def load_data_from_gcs(bucket_name, file_name, file_format="csv"):
    """Download one object from a GCS bucket and load it into a DataFrame.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` scheme).
        file_name: Object name (path) inside the bucket.
        file_format: One of ``"csv"``, ``"parquet"``, ``"json"`` or
            ``"excel"`` (case-insensitive). Selects the pandas reader.

    Returns:
        pandas.DataFrame holding the contents of the downloaded file.

    Raises:
        ValueError: If ``file_format`` is not one of the supported formats.
            This is checked before any GCS call, so no download happens.
        Exception: Any error from the GCS client or the pandas reader is
            printed and then re-raised unchanged.
    """
    # Local import keeps this cell self-contained; tempfile is standard library.
    import tempfile

    # Dispatch table: supported format name -> pandas reader.
    readers = {
        "csv": pd.read_csv,
        "parquet": pd.read_parquet,
        "json": pd.read_json,
        "excel": pd.read_excel,
    }

    try:
        # Validate first so a typo in the format does not cost a full download.
        reader = readers.get(file_format.lower())
        if reader is None:
            raise ValueError(f"Unsupported file format: {file_format}")

        # Initialize GCS client
        client = storage.Client()
        blob = client.bucket(bucket_name).blob(file_name)

        print(f"📥 Reading file from GCS: gs://{bucket_name}/{file_name}")

        # Download into a private temporary directory. The directory and the
        # file in it are removed when the `with` block exits, including when
        # the download or the parse raises, and the unique directory avoids
        # clobbering an unrelated /tmp/<name> file.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Keep the original base name so the file extension is preserved.
            temp_file = os.path.join(temp_dir, os.path.basename(file_name))
            blob.download_to_filename(temp_file)
            print(f"✅ File downloaded to: {temp_file}")
            df = reader(temp_file)

        print(f"✅ Data loaded successfully: {len(df)} rows, {len(df.columns)} columns")
        return df

    except Exception as e:
        # Report the failure in the cell output, then let the caller see it.
        print(f"❌ Error loading from GCS: {e}")
        raise

# Fetch the configured file and keep the resulting frame in `df` for the
# profiling cell further down.
print("\n" + "=" * 70, "Loading data from data collection...", "=" * 70, sep="\n")

# Tolerate a bucket configured with the gs:// scheme and/or stray whitespace.
bucket_name = GCS_BUCKET.replace("gs://", "").strip()
df = load_data_from_gcs(bucket_name, FILE_NAME, FILE_FORMAT)

print(f"\n✅ Dataset ready: {df.shape[0]} records")
print(f"📋 Columns: {list(df.columns)}")
print("\n📊 First few records:")

# Last expression of the cell, so the notebook renders it as a rich table.
df.head(10)


## 3. Comprehensive Data Profiling Report


In [None]:
# ydata-profiling supplies ProfileReport. If it cannot be imported, install it
# with the interpreter running this kernel and import it again.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    import subprocess
    import sys

    print("ℹ️  Installing ydata-profiling...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", "ydata-profiling"]
    )
    from ydata_profiling import ProfileReport
    print("✅ ydata-profiling installed successfully!")
else:
    print("✅ ydata-profiling is available")

# Fix: Patch numpy.asarray to handle copy parameter compatibility
# NOTE: this replaces numpy.asarray for the whole kernel session, not only
# for the profiling call below.
import numpy as np

# Re-running this cell must not wrap the wrapper: when np.asarray is already
# the patch installed by a previous run, recover the genuine function it
# stored instead of capturing the patch itself.
original_asarray = getattr(np.asarray, "_unpatched_asarray", np.asarray)


def patched_asarray(a, dtype=None, order=None, copy=None, **kwargs):
    """Call the real ``numpy.asarray``, tolerating an unsupported ``copy``.

    Args:
        a: Input data, passed through unchanged.
        dtype: Forwarded as ``dtype=``.
        order: Forwarded as ``order=``.
        copy: Forwarded as ``copy=`` only when it is not ``None``.
        **kwargs: Any other keyword arguments, forwarded unchanged.

    Returns:
        Whatever the real ``numpy.asarray`` returns.

    Raises:
        TypeError: If the call without ``copy`` is itself rejected.
    """
    if copy is None:
        # Nothing version-specific to forward; call straight through.
        return original_asarray(a, dtype=dtype, order=order, **kwargs)
    try:
        return original_asarray(a, dtype=dtype, order=order, copy=copy, **kwargs)
    except TypeError:
        # A numpy.asarray that does not accept `copy` raises TypeError;
        # retry once without it.
        return original_asarray(a, dtype=dtype, order=order, **kwargs)


# Remember the genuine function on the wrapper so a later re-run can find it.
patched_asarray._unpatched_asarray = original_asarray
np.asarray = patched_asarray
print("✅ Patched numpy.asarray for compatibility")

# Also disable word cloud generation as backup
# Best effort: any failure here is reported and the notebook carries on.
try:
    import ydata_profiling.visualisation.plot as plot_module

    def noop_plot_word_cloud(config, word_counts):
        """Stand-in word-cloud plotter: ignores its arguments, returns ""."""
        return ""

    def _noop_private_word_cloud(config, series, figsize=None):
        """Stand-in for the module's private helper: does nothing, returns None."""
        return None

    plot_module.plot_word_cloud = noop_plot_word_cloud
    # Only replace the private helper when this version of the module has one.
    if hasattr(plot_module, '_plot_word_cloud'):
        plot_module._plot_word_cloud = _noop_private_word_cloud

    print("✅ Word cloud generation also disabled as backup")
except Exception as exc:
    print(f"ℹ️  Could not disable word cloud: {exc}")

# Generate comprehensive profiling report
print(
    "\n" + "=" * 70,
    "📊 Generating Comprehensive Data Profiling Report...",
    "=" * 70,
    "This may take a few moments depending on your data size...",
    sep="\n",
)

# Build the report object for the loaded frame.
#   explorative=True -> comprehensive analysis
#   minimal=False    -> switch to True for very large datasets (>100k rows)
#                       for faster processing
profile = ProfileReport(
    df,
    title="Data Profiling Report",
    explorative=True,
    minimal=False,
    progress_bar=True,
)

# Write the report to an HTML file first (more reliable than inline display).
# The relative name resolves against the current working directory
# (typically /home/jovyan).
import os

report_file = "data_profile_report.html"
report_path = os.path.abspath(report_file)  # absolute form, used for messages only

print(f"\n💾 Saving report to: {report_path}")
profile.to_file(report_file)
print("✅ Report saved successfully!", f"📁 Full path: {report_path}", sep="\n")

# Inline display is best effort: the report is already on disk, so any failure
# here is reported and the user is pointed at the saved file instead.
try:
    import os
    from IPython.display import HTML, IFrame, display

    if not os.path.exists(report_file):
        print("⚠️  Report file not found, but generation completed.")
    else:
        # Summary banner with the dataset size and the report file name.
        display(HTML(f"""
        <div style="padding: 10px; background-color: #e8f5e9; border-radius: 5px; margin-bottom: 10px;">
            <h3>📊 Data Profiling Report</h3>
            <p><strong>Dataset:</strong> {len(df)} rows × {len(df.columns)} columns</p>
            <p><strong>Report file:</strong> <code>{report_file}</code></p>
        </div>
        """))

        # Embed the saved HTML report itself in an iframe.
        display(IFrame(src=report_file, width="100%", height=800))

except Exception as exc:
    print(f"ℹ️  Could not display inline: {exc}")
    print(f"✅ Report saved as '{report_file}' - you can download and open it in your browser.")

# Closing banner: confirm completion and repeat where the report was written.
print(
    "\n" + "=" * 70,
    "✅ Profiling Report Complete!",
    f"📄 Report saved as: {report_file}",
    "=" * 70,
    sep="\n",
)
