# Data Analysis: Raw and Processed Datasets

This notebook provides comprehensive exploratory data analysis (EDA) for:
1. **Raw Dataset**: Initial survey data before preprocessing
2. **Processed Dataset**: Cleaned and encoded data ready for modeling

## Analysis includes:
- Data shape and structure
- Missing value analysis
- Target variable distribution
- Categorical feature distributions
- Class imbalance checks
- Train/test split validation
- Feature correlations


In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot defaults applied to every figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Dataset locations, expressed as relative paths under a common data root.
_DATA_ROOT = Path("../../data")
_PROCESSED_DIR = _DATA_ROOT / "processed"

RAW_DATA = (
    _DATA_ROOT
    / "raw"
    / "stack-overflow-developer-survey-2025-2"
    / "survey_results_public.csv"
)
CLEAN_DATA = _DATA_ROOT / "interim" / "so_2025_clean.csv"
PROCESSED_DATA = _PROCESSED_DIR / "so_2025_model_ready.parquet"
TRAIN_DATA = _PROCESSED_DIR / "so_2025_train.parquet"
TEST_DATA = _PROCESSED_DIR / "so_2025_test.parquet"

print("Imports successful!")


# Part 1: Raw Dataset Analysis


In [None]:
# Read the raw survey export, restricted to the columns analysed below.
USE_COLUMNS = [
    "Country",
    "EdLevel",
    "YearsCode",
    "Employment",
    "DevType",
    "ConvertedCompYearly",
    "RemoteWork",
    "Currency",
]

# Cell values that are parsed as missing (NaN) in addition to pandas' defaults.
_RAW_NA_MARKERS = ["NA", "Other (please specify):"]

df_raw = pd.read_csv(
    RAW_DATA,
    usecols=USE_COLUMNS,
    na_values=_RAW_NA_MARKERS,
)

# Quick structural overview: dimensions, column names, inferred dtypes.
print(f"Raw Dataset Shape: {df_raw.shape}")
print(f"\nColumns: {df_raw.columns.tolist()}")
print("\nData Types:")
print(df_raw.dtypes)

# Last expression of the cell so the notebook renders the preview.
df_raw.head()


In [None]:
# Missing values analysis
# Per-column NaN count and its share of all rows, largest share first.
missing = df_raw.isna().sum()
missing_pct = (missing / len(df_raw) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
}).sort_values('Missing %', ascending=False)

# Only columns that actually have gaps are reported and plotted.
columns_with_gaps = missing_df[missing_df['Missing Count'] > 0]

print("Missing Values in Raw Dataset:")
print(columns_with_gaps)

# Visualize missing values
if not columns_with_gaps.empty:
    # DataFrame.plot opens its own figure unless it is handed an Axes, so a
    # bare plt.figure(figsize=...) would be left blank and its size ignored.
    # Create the axes explicitly and draw onto them instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    columns_with_gaps.plot(kind='barh', y='Missing %', legend=False, ax=ax)
    ax.set_title('Missing Values Percentage by Column (Raw Data)')
    ax.set_xlabel('Missing Percentage (%)')
    fig.tight_layout()
    plt.show()


In [None]:
# Raw target column: summary statistics, distribution plots and IQR outliers.
salary_col = 'ConvertedCompYearly'
valid_salaries = df_raw[salary_col].dropna()

n_valid = len(valid_salaries)
print(f"Valid salary entries: {n_valid:,} ({n_valid/len(df_raw)*100:.1f}%)")
print("\nSalary Statistics:")
print(valid_salaries.describe())

# Side-by-side figure: histogram on the left, box plot on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

median_salary = valid_salaries.median()
hist_ax.hist(valid_salaries, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(
    median_salary,
    color='red',
    linestyle='--',
    label=f'Median: ${median_salary:,.0f}',
)
hist_ax.set_title('Salary Distribution (Raw Data)')
hist_ax.set_xlabel('Annual Salary (USD)')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()

box_ax.boxplot(valid_salaries, vert=True)
box_ax.set_title('Salary Box Plot (Raw Data)')
box_ax.set_ylabel('Annual Salary (USD)')

plt.tight_layout()
plt.show()

# Outliers: values lying more than 1.5 * IQR outside the quartiles.
Q1 = valid_salaries.quantile(0.25)
Q3 = valid_salaries.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
below = valid_salaries < lower_fence
above = valid_salaries > upper_fence
outliers = valid_salaries[below | above]
print(f"\nOutliers (IQR method): {len(outliers):,} ({len(outliers)/len(valid_salaries)*100:.1f}%)")


In [None]:
# Frequency table and bar chart for each categorical column of the raw data.
categorical_cols = ['Employment', 'Country', 'EdLevel', 'RemoteWork']

for col in categorical_cols:
    if col not in df_raw.columns:
        continue

    value_counts = df_raw[col].value_counts()
    print(f"\n{col} Distribution:")
    print(value_counts.head(10))
    print(f"Unique values: {df_raw[col].nunique()}")

    # Columns with more than 10 categories are cut to the 10 most frequent,
    # and the title says so; smaller columns are plotted in full.
    is_truncated = len(value_counts) > 10
    if is_truncated:
        shown_counts = value_counts.head(10)
        chart_title = f'{col} Distribution (Top 10)'
    else:
        shown_counts = value_counts
        chart_title = f'{col} Distribution'

    plt.figure(figsize=(10, 6))
    shown_counts.plot(kind='barh')
    plt.title(chart_title)
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()


# Part 2: Processed Dataset Analysis


In [None]:
# Read the cleaned CSV and the three parquet outputs (full, train, test).
df_clean = pd.read_csv(CLEAN_DATA)
df_processed, df_train, df_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (PROCESSED_DATA, TRAIN_DATA, TEST_DATA)
)

# Report the dimensions of every loaded frame.
print("Processed Datasets Loaded:")
for dataset_label, dataset in (
    ("Clean", df_clean),
    ("Processed", df_processed),
    ("Train", df_train),
    ("Test", df_test),
):
    print(f"  {dataset_label} dataset: {dataset.shape}")

print("\nClean Dataset Columns:")
print(df_clean.columns.tolist())

# Last expression of the cell so the notebook renders the preview.
df_clean.head()


In [None]:
# Report per-column NaN counts for the clean and the processed datasets;
# only columns with at least one missing value are listed.
print("Missing Values in Clean Dataset:")
missing_clean = df_clean.isna().sum()
if missing_clean.gt(0).any():
    print(missing_clean.loc[missing_clean.gt(0)])
else:
    print("No missing values!")

print("\nMissing Values in Processed Dataset:")
missing_processed = df_processed.isna().sum()
if missing_processed.gt(0).any():
    print(missing_processed.loc[missing_processed.gt(0)])
else:
    print("No missing values!")


In [None]:
# Target variable analysis after preprocessing
target_col = 'CompYearlyUSD'

print("Clean Dataset - Salary Statistics:")
print(df_clean[target_col].describe())

# 2x2 grid: clean distribution, log10 distribution, train/test overlay,
# train/test box plots.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Clean dataset distribution, with the median marked
clean_median = df_clean[target_col].median()
axes[0, 0].hist(df_clean[target_col], bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Salary Distribution (Clean Dataset)')
axes[0, 0].set_xlabel('Annual Salary (USD)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(clean_median, color='red', linestyle='--',
                   label=f'Median: ${clean_median:,.0f}')
axes[0, 0].legend()

# Log-transformed distribution (column read as-is from the clean dataset)
axes[0, 1].hist(df_clean['SalaryLog10'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Log10(Salary) Distribution')
axes[0, 1].set_xlabel('Log10(Annual Salary)')
axes[0, 1].set_ylabel('Frequency')

# Train/Test comparison
# Both histograms share one set of bin edges (computed over the combined
# values) and are normalised to densities. With per-split bins and raw counts
# the bars would not line up, and the smaller split would look different
# purely because it has fewer rows.
split_bin_edges = np.histogram_bin_edges(
    pd.concat([df_train[target_col], df_test[target_col]]), bins=50
)
axes[1, 0].hist(df_train[target_col], bins=split_bin_edges, density=True,
                alpha=0.7, label='Train', edgecolor='black')
axes[1, 0].hist(df_test[target_col], bins=split_bin_edges, density=True,
                alpha=0.7, label='Test', edgecolor='black')
axes[1, 0].set_title('Salary Distribution: Train vs Test')
axes[1, 0].set_xlabel('Annual Salary (USD)')
axes[1, 0].set_ylabel('Density')
axes[1, 0].legend()

# Box plot comparison
# Tick labels are set after plotting: boxplot's `labels` keyword was renamed
# to `tick_labels` in Matplotlib 3.9 and the old name is deprecated, so this
# form works on either side of the rename.
box_data = [df_train[target_col], df_test[target_col]]
axes[1, 1].boxplot(box_data)
axes[1, 1].set_xticklabels(['Train', 'Test'])
axes[1, 1].set_title('Salary Box Plot: Train vs Test')
axes[1, 1].set_ylabel('Annual Salary (USD)')

plt.tight_layout()
plt.show()


In [None]:
# Compare the train and test splits: relative sizes, target summary
# statistics, and a two-sample Kolmogorov-Smirnov test on the target.
n_train, n_test = len(df_train), len(df_test)
n_total = n_train + n_test

print("Train/Test Split Statistics:")
print(f"\nTrain set: {n_train:,} samples ({n_train/n_total*100:.1f}%)")
print(f"Test set: {n_test:,} samples ({n_test/n_total*100:.1f}%)")

print("\nTarget Variable Statistics:")
for split_label, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"\n{split_label}:")
    print(split_frame[target_col].describe())

# Two-sample KS test on the target; the verdict uses a 0.05 threshold.
from scipy import stats
ks_statistic, p_value = stats.ks_2samp(df_train[target_col], df_test[target_col])

if p_value > 0.05:
    ks_verdict = 'Distributions are similar'
else:
    ks_verdict = 'Distributions differ significantly'

print("\nKolmogorov-Smirnov Test (Train vs Test):")
print(f"  KS Statistic: {ks_statistic:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Interpretation: {ks_verdict}")


## Class Imbalance Analysis

Because this is a regression task there are no target classes, so instead we check for imbalance in:
1. Categorical features (Country, Education, DevType, etc.)
2. Salary distribution (binned for stratification)


In [None]:
# Analyze categorical feature distributions for imbalance
# For each categorical column of the clean dataset: print a count/percentage
# table, report how lopsided the categories are, and plot the counts.
categorical_features = ['Country', 'EdLevelSimplified', 'DevTypePrimary', 'RemoteCategory', 'Employment']

for col in categorical_features:
    if col not in df_clean.columns:
        continue

    print(f"\n{'='*60}")
    print(f"{col} Distribution:")
    print('='*60)

    # value_counts() skips NaN and sorts by frequency, most common first.
    value_counts = df_clean[col].value_counts()
    if value_counts.empty:
        # Nothing to rank or plot; indexing [0] / [-1] below would raise.
        print("  (no non-missing values)")
        continue

    percentages = (value_counts / len(df_clean) * 100).round(2)

    dist_df = pd.DataFrame({
        'Count': value_counts,
        'Percentage': percentages
    })
    print(dist_df)

    # Calculate imbalance metrics
    # The ratio is taken from raw counts, not from the rounded percentages:
    # a category below 0.005% of rows rounds to 0.0 and would otherwise
    # report an infinite ratio even though it has observations.
    max_pct = percentages.max()
    min_pct = percentages.min()
    most_common_count = value_counts.max()
    least_common_count = value_counts.min()
    if least_common_count > 0:
        imbalance_ratio = most_common_count / least_common_count
    else:
        imbalance_ratio = float('inf')

    print("\nImbalance Metrics:")
    print(f"  Most common: {value_counts.index[0]} ({max_pct}%)")
    print(f"  Least common: {value_counts.index[-1]} ({min_pct}%)")
    print(f"  Imbalance ratio: {imbalance_ratio:.2f}x")
    print(f"  {'⚠️ Significant imbalance' if imbalance_ratio > 10 else '✓ Relatively balanced'}")

    # Visualize (at most the 20 most frequent categories)
    plt.figure(figsize=(12, 6))
    if len(value_counts) > 20:
        value_counts.head(20).plot(kind='barh')
        plt.title(f'{col} Distribution (Top 20)')
    else:
        value_counts.plot(kind='barh')
        plt.title(f'{col} Distribution')
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()


In [None]:
# Cut the target into quantile bins and check how evenly rows fall into them.
n_bins = 10
df_clean['SalaryBin'] = pd.qcut(
    df_clean[target_col],
    q=n_bins,
    labels=False,        # integer bin codes rather than interval labels
    duplicates='drop',   # drop repeated bin edges instead of raising
)

print("Salary Distribution by Bins (for Stratification):")
bin_counts = df_clean['SalaryBin'].value_counts().sort_index()
bin_pct = (bin_counts / len(df_clean) * 100).round(2)

bin_df = pd.DataFrame({'Count': bin_counts, 'Percentage': bin_pct})
print(bin_df)

# Bar chart of rows per bin; bin_counts is already ordered by bin number.
plt.figure(figsize=(12, 6))
bin_counts.plot(kind='bar')
plt.title('Salary Distribution Across Bins (Stratification)')
plt.xlabel('Bin Number')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Balance check: ratio of the largest to the smallest bin share.
min_bin_pct = bin_pct.min()
max_bin_pct = bin_pct.max()
if min_bin_pct > 0:
    balance_ratio = max_bin_pct / min_bin_pct
else:
    balance_ratio = float('inf')

if balance_ratio > 2:
    balance_verdict = '⚠️ Some bins are very small'
else:
    balance_verdict = '✓ Bins are well balanced'

print("\nBin Balance Metrics:")
print(f"  Min bin size: {min_bin_pct}%")
print(f"  Max bin size: {max_bin_pct}%")
print(f"  Balance ratio: {balance_ratio:.2f}x")
print(f"  {balance_verdict}")


In [None]:
# Pairwise correlations between the numeric columns of the clean dataset,
# shown as a heatmap and as a ranking against the target.
numeric_features = ['YearsCodeNum', 'CompYearlyUSD', 'SalaryLog10']

corr_matrix = df_clean[numeric_features].corr()

heatmap_options = {
    "annot": True,
    "fmt": '.3f',
    "cmap": 'coolwarm',
    "center": 0,
    "square": True,
    "linewidths": 1,
    "cbar_kws": {"shrink": 0.8},
}

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix: Numeric Features')
plt.tight_layout()
plt.show()

# The target's column of the matrix, strongest positive correlation first.
print("Correlation with Target (CompYearlyUSD):")
correlations = corr_matrix['CompYearlyUSD'].sort_values(ascending=False)
print(correlations)


In [None]:
# Size of the processed dataset and a rough split of its feature columns.
n_samples, n_columns = df_processed.shape

print("Processed Dataset Summary:")
print(f"  Total features: {n_columns - 1}")  # one column is the target
print(f"  Total samples: {n_samples:,}")

# Every column except the target counts as a feature.
feature_cols = [name for name in df_processed.columns if name != 'CompYearlyUSD']

# A feature with exactly two distinct values is counted as binary/one-hot;
# every other feature is counted as numeric.
binary_features = []
numeric_features = []
for name in feature_cols:
    if df_processed[name].nunique() == 2:
        binary_features.append(name)
    else:
        numeric_features.append(name)

print("\nFeature Types:")
print(f"  Binary/One-hot encoded: {len(binary_features)}")
print(f"  Numeric: {len(numeric_features)}")

print(f"\nSample binary features: {binary_features[:5]}")
print(f"\nNumeric features: {numeric_features}")


## Summary of Findings


In [None]:
# Final summary
# Recaps the numbers computed earlier in the notebook: row reduction,
# split sizes, target statistics, feature counts and missing-value totals.
separator = "=" * 60
print(separator)
print("DATA PREPARATION SUMMARY")
print(separator)

print("\n1. Data Reduction:")
print(f"   Raw dataset: {len(df_raw):,} rows")
print(f"   Clean dataset: {len(df_clean):,} rows")
print(f"   Reduction: {(1 - len(df_clean)/len(df_raw))*100:.1f}%")

split_total = len(df_train) + len(df_test)
print("\n2. Train/Test Split:")
print(f"   Train: {len(df_train):,} samples ({len(df_train)/split_total*100:.1f}%)")
print(f"   Test: {len(df_test):,} samples ({len(df_test)/split_total*100:.1f}%)")
# NOTE(review): stratification is stated here, not verified by this cell -
# confirm against the code that produced the split.
print("   Stratified: ✓ Yes (by salary bins)")

target_values = df_clean[target_col]
print("\n3. Target Variable:")
print(f"   Mean: ${target_values.mean():,.0f}")
print(f"   Median: ${target_values.median():,.0f}")
print(f"   Std Dev: ${target_values.std():,.0f}")
print(f"   Range: ${target_values.min():,.0f} - ${target_values.max():,.0f}")

print("\n4. Features:")
print(f"   Total encoded features: {len(feature_cols)}")
print(f"   Categorical (one-hot): {len(binary_features)}")
print(f"   Numeric: {len(numeric_features)}")

missing_in_clean = int(df_clean.isna().sum().sum())
missing_in_processed = int(df_processed.isna().sum().sum())
print("\n5. Data Quality:")
print(f"   Missing values in clean data: {missing_in_clean}")
print(f"   Missing values in processed data: {missing_in_processed}")

# The readiness verdict follows the check reported just above instead of
# being printed unconditionally: the processed (model-ready) dataset must
# have no missing values.
if missing_in_processed == 0:
    print("\n✅ Dataset is ready for modeling!")
else:
    print("\n⚠️ Processed dataset still has missing values - resolve them before modeling.")
print(separator)
