# Data Exploration

This notebook explores the dataset used for model training.

## Objectives
- Load and inspect the data
- Understand feature distributions
- Identify data quality issues
- Visualize relationships

In [None]:
# Import libraries
import json

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import seaborn as sns

# Notebook-wide display configuration: seaborn's 'whitegrid' style for figures
# and no cap on the number of DataFrame columns pandas will show.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)

# Print the installed SageMaker SDK version alongside the notebook output
print(f"SageMaker version: {sagemaker.__version__}")

## 1. Setup

In [None]:
# Resolve the SageMaker context for this notebook: the session provides the
# default bucket and the boto region name; the execution role is looked up
# through the SDK's get_execution_role() helper.
session = sagemaker.Session()
bucket = session.default_bucket()
region = session.boto_region_name
role = sagemaker.get_execution_role()

# Echo the resolved context, one "<label>: <value>" line per setting
for label, value in (("Region", region), ("Bucket", bucket), ("Role", role)):
    print(f"{label}: {value}")

## 2. Load Data

In [None]:
# Build the S3 URL of the input CSV from the session bucket and a fixed object key
data_key = 'mlops-demo/input/data.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
print(f"Loading data from: {data_location}")

# Read the CSV at that URL into the DataFrame used by every later cell
df = pd.read_csv(data_location)

# Quick sanity check of what was loaded: dimensions and column names
print(
    f"\nDataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

## 3. Basic Statistics

In [None]:
# Preview the top of the dataset; as the cell's last expression the frame is
# rendered by the notebook rather than printed as text.
print("First 5 rows:")
df.head(n=5)

In [None]:
# Data types and missing values
# df.info() writes its per-column report to the cell output itself, so it is
# called for its printed output rather than for a value to display.
print("Data Info:")
df.info()

In [None]:
# Summary statistics
# describe() is the last expression in the cell, so the notebook renders the
# resulting table as the cell's output.
print("Summary Statistics:")
df.describe()

In [None]:
# Null count per column, and the same count as a percentage of all rows
missing = df.isnull().sum()
missing_pct = missing / len(df) * 100

# Put both measures side by side, indexed by column name
missing_df = pd.concat(
    [missing, missing_pct],
    axis=1,
    keys=['Missing Count', 'Percentage'],
)

# Only report the columns that actually contain nulls
has_missing = missing_df['Missing Count'] > 0
print("\nMissing Values:")
print(missing_df[has_missing])

## 4. Target Variable Analysis

In [None]:
# The target is taken to be the first column of the dataset (an assumption of
# this notebook; change target_col here if the layout differs).
target_col = df.columns[0]
print(f"Target variable: {target_col}")

# Report absolute counts first, then the same counts as fractions of the total
for heading, as_fraction in (("Class distribution", False), ("Class proportions", True)):
    print(f"\n{heading}:")
    print(df[target_col].value_counts(normalize=as_fraction))

In [None]:
# Bar chart of the number of rows per target value, drawn on an explicit
# figure/axes pair; rot=0 keeps the x tick labels horizontal.
target_fig, target_ax = plt.subplots(figsize=(8, 5))
df[target_col].value_counts().plot(kind='bar', ax=target_ax, rot=0)
target_ax.set_title('Target Variable Distribution')
target_ax.set_xlabel('Class')
target_ax.set_ylabel('Count')
target_fig.tight_layout()
plt.show()

## 5. Feature Distributions

In [None]:
# Numeric columns are the candidate features; the target column is left out
# when it is itself numeric.
numeric_features = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != target_col
]

print(f"Numeric features: {len(numeric_features)}")
print(numeric_features[:10])  # preview only the first 10 names

In [None]:
# Histogram grid (3 x 3 panels) for the first 9 numeric features
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

plotted_features = numeric_features[:9]
for idx, col in enumerate(plotted_features):
    # Nulls are dropped before binning so only observed values are counted
    axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black')
    axes[idx].set_title(f'{col}')
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Frequency')

# With fewer than 9 numeric features the leftover panels would otherwise be
# drawn as empty axes; hide them so the figure only shows real data.
for unused_ax in axes[len(plotted_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations, limited to the first 20 numeric features
correlation = df[numeric_features[:20]].corr()

# Heatmap on an explicit figure/axes pair; the 'coolwarm' colormap is centred at 0
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlation Heatmap')
heatmap_fig.tight_layout()
plt.show()

## 7. Feature vs Target Analysis

In [None]:
# Box plots (2 x 3 panels) of the first 6 numeric features, grouped by target value
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

plotted_features = numeric_features[:6]
for idx, col in enumerate(plotted_features):
    df.boxplot(column=col, by=target_col, ax=axes[idx])
    # Title and axis labels are applied after the boxplot call
    axes[idx].set_title(f'{col} by {target_col}')
    axes[idx].set_xlabel(target_col)
    axes[idx].set_ylabel(col)

# With fewer than 6 numeric features the leftover panels would otherwise be
# drawn as empty axes; hide them so the figure only shows real data.
for unused_ax in axes[len(plotted_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Data Quality Checks

In [None]:
# Rows flagged by DataFrame.duplicated()
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Infinite values are counted across the numeric columns only
numeric_frame = df.select_dtypes(include=[np.number])
inf_count = np.isinf(numeric_frame).sum().sum()
print(f"Infinite values: {inf_count}")

# A numeric feature is reported as constant when nunique() counts exactly one value
constant_features = []
for feature in numeric_features:
    if df[feature].nunique() == 1:
        constant_features.append(feature)

print(f"\nConstant features: {len(constant_features)}")
if len(constant_features) > 0:
    print(constant_features)

## 9. Key Findings

### Summary
- Dataset size: [rows] x [columns]
- Target distribution: [balanced/imbalanced]
- Missing values: [count]
- Data quality issues: [list]

### Recommendations
1. [Recommendation 1]
2. [Recommendation 2]
3. [Recommendation 3]

### Next Steps
- Feature engineering (see notebook 02)
- Model experimentation (see notebook 03)
- Update preprocessing script with findings

## 10. Export Findings

In [None]:
# Persist the headline findings of this exploration as a JSON file in the
# notebook's working directory. Uses `missing`, `duplicates` and
# `constant_features` computed in earlier cells.
summary_path = 'data_exploration_summary.json'

summary = {
    'dataset_shape': df.shape,
    'missing_values': missing.to_dict(),
    'target_distribution': df[target_col].value_counts().to_dict(),
    'duplicates': int(duplicates),  # cast to a built-in int before serializing
    'constant_features': constant_features
}

# The path is defined once so the file written and the message below cannot drift apart
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to {summary_path}")