# Data Profiling Analysis - Data Collection Data

**This notebook analyzes data from your Workbench data collection and generates a comprehensive profiling report.**

## 📊 What You'll See

After running all cells, you'll get:
- **Data Overview**: Summary statistics and data structure
- **Comprehensive Profiling Report**: Automatic analysis of all columns including:
  - Data types and missing values
  - Statistical summaries (mean, median, std, etc.)
  - Distribution visualizations
  - Correlations between variables
  - Data quality alerts

## 🚀 Quick Start

Just click **"Run All"** to analyze your data from the data collection bucket!

The profiling report works with **any data structure** - no hardcoded column names required!


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os
from pathlib import Path
warnings.filterwarnings('ignore')

# Optional: Import google.cloud.storage (installed automatically if needed).
# GCS_AVAILABLE ends up True on both paths; if the install itself fails,
# check_call raises and the cell stops here.
try:
    from google.cloud import storage
    GCS_AVAILABLE = True
except ImportError:
    GCS_AVAILABLE = False
    print("‚ÑπÔ∏è  Installing google-cloud-storage...")
    import importlib
    import subprocess
    import sys
    # Install into the interpreter that runs this kernel, not whatever "pip" is on PATH
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "google-cloud-storage"])
    # Packages installed while the interpreter is running may be hidden by the
    # import system's cached directory listings; refresh them before retrying.
    importlib.invalidate_caches()
    from google.cloud import storage
    GCS_AVAILABLE = True
    print("‚úÖ google-cloud-storage installed successfully!")

# Plot appearance: seaborn's "whitegrid" style and a wide default figure size
sns.set_style(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# ----------------------------------------------------------------------------
# CONFIGURATION: which bucket / object to profile and how to parse it
# ----------------------------------------------------------------------------
GCS_BUCKET = "my-gcs-experimentation-bucker-wb-steady-parsnip-7109"  # data collection bucket
FILE_NAME = "MUP_DPR_RY25_P04_V10_DY23_Geo.csv"   # object name inside the bucket
FILE_FORMAT = "csv"  # parser to use when loading the file

# Echo the effective settings in one banner so they are visible in the cell output
print(
    "\n".join(
        [
            "=" * 70,
            "üìä Configuration",
            "=" * 70,
            f"Bucket: {GCS_BUCKET}",
            f"File: {FILE_NAME}",
            f"Format: {FILE_FORMAT}",
            f"GCS Path: gs://{GCS_BUCKET}/{FILE_NAME}",
            "=" * 70,
        ]
    )
)


In [None]:
## 2. Load Data from GCS Bucket


In [None]:
def load_data_from_gcs(bucket_name, file_name, file_format="csv"):
    """Download one object from a GCS bucket and load it into a pandas DataFrame.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` prefix).
        file_name: Object name (path) inside the bucket.
        file_format: One of ``"csv"``, ``"parquet"``, ``"json"`` or ``"excel"``
            (case-insensitive). Selects the pandas reader.

    Returns:
        The DataFrame produced by the matching ``pd.read_*`` function.

    Raises:
        ValueError: If ``file_format`` is not one of the supported formats.
            Raised before any download is attempted.
        Exception: Any error from the storage client or the pandas reader is
            printed and then re-raised unchanged.
    """
    import tempfile  # local import: only this function needs it

    readers = {
        "csv": pd.read_csv,
        "parquet": pd.read_parquet,
        "json": pd.read_json,
        "excel": pd.read_excel,
    }

    try:
        # Validate the format first so a typo fails fast instead of after a
        # potentially large download.
        reader = readers.get(file_format.lower())
        if reader is None:
            raise ValueError(f"Unsupported file format: {file_format}")

        # Initialize GCS client and resolve the object handle
        client = storage.Client()
        blob = client.bucket(bucket_name).blob(file_name)

        print(f"üì• Reading file from GCS: gs://{bucket_name}/{file_name}")

        # A private temporary directory avoids name collisions with other files
        # in a shared temp location and is removed even if the download or the
        # reader raises. The original base name is kept so the local file still
        # carries its extension.
        with tempfile.TemporaryDirectory() as temp_dir:
            local_name = os.path.basename(file_name) or "download"
            temp_file = os.path.join(temp_dir, local_name)
            blob.download_to_filename(temp_file)
            print(f"‚úÖ File downloaded to: {temp_file}")

            df = reader(temp_file)

        print(f"‚úÖ Data loaded successfully: {len(df)} rows, {len(df.columns)} columns")
        return df

    except Exception as e:
        # Surface the failure in the cell output, then let the caller see the
        # original exception.
        print(f"‚ùå Error loading from GCS: {e}")
        raise

# Announce the load step
print("\n".join(["\n" + "=" * 70, "Loading data from data collection...", "=" * 70]))

# The configured bucket may be written with a gs:// scheme; the loader wants the bare name
bucket_name = GCS_BUCKET.replace("gs://", "").strip()
df = load_data_from_gcs(bucket_name, FILE_NAME, file_format=FILE_FORMAT)

# Short summary, followed by a rich preview as the cell's output value
print(
    f"\n‚úÖ Dataset ready: {len(df)} records",
    f"üìã Columns: {list(df.columns)}",
    "\nüìä First few records:",
    sep="\n",
)
df.head(10)


## 3. Comprehensive Data Profiling Report


In [None]:
# Install ydata-profiling if not available
try:
    from ydata_profiling import ProfileReport
    print("‚úÖ ydata-profiling is available")
except ImportError:
    print("‚ÑπÔ∏è  Installing ydata-profiling...")
    import importlib
    import subprocess
    import sys
    # Install into the interpreter that runs this kernel, not whatever "pip" is on PATH
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "ydata-profiling"])
    # Packages installed while the interpreter is running may be hidden by the
    # import system's cached directory listings; refresh them before retrying.
    importlib.invalidate_caches()
    from ydata_profiling import ProfileReport
    print("‚úÖ ydata-profiling installed successfully!")

# Banner for the profiling step
print(
    "\n" + "=" * 70,
    "üìä Generating Comprehensive Data Profiling Report...",
    "=" * 70,
    "This may take a few moments depending on your data size...",
    sep="\n",
)

# Build the report object for the loaded frame.
# explorative=True requests the comprehensive analysis; flip minimal to True
# for very large datasets (>100k rows) to speed things up.
profile = ProfileReport(
    df,
    title="Data Profiling Report",
    minimal=False,
    explorative=True,
    progress_bar=True,
)

# Write the report to an HTML file in the current working directory
# (typically /home/jovyan); a file on disk is more reliable than inline display.
import os
report_file = "data_profile_report.html"
report_path = os.path.abspath(report_file)

print(f"\nüíæ Saving report to: {report_path}")
profile.to_file(report_file)
print("‚úÖ Report saved successfully!", f"üìÅ Full path: {report_path}", sep="\n")

# Best-effort inline preview: the report is already on disk, so any failure
# here is reported and otherwise ignored.
try:
    import os
    from IPython.display import HTML, IFrame, display

    if not os.path.exists(report_file):
        print("‚ö†Ô∏è  Report file not found, but generation completed.")
    else:
        # Small summary card above the embedded report
        display(HTML(f"""
        <div style="padding: 10px; background-color: #e8f5e9; border-radius: 5px; margin-bottom: 10px;">
            <h3>üìä Data Profiling Report</h3>
            <p><strong>Dataset:</strong> {len(df)} rows √ó {len(df.columns)} columns</p>
            <p><strong>Report file:</strong> <code>{report_file}</code></p>
        </div>
        """))

        # Embed the saved HTML report itself
        display(IFrame(src=report_file, width="100%", height=800))

except Exception as exc:
    print(f"‚ÑπÔ∏è  Could not display inline: {exc}")
    print(f"‚úÖ Report saved as '{report_file}' - you can download and open it in your browser.")

# Closing banner
print(
    "\n" + "=" * 70,
    "‚úÖ Profiling Report Complete!",
    f"üìÑ Report saved as: {report_file}",
    "=" * 70,
    sep="\n",
)
