# Data Exploration and Model Building

This notebook guides you through the process of exploring your data and building a predictive model.

## 1. Setup and Data Loading

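First, let's import the necessary libraries and load our data. The first few cells below are diagnostics that confirm `../src/modeling.py` is the file actually being imported; the library imports and the data load follow them.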

In [2]:
# Print the helper module's source so its contents can be checked by eye
with open('../src/modeling.py', 'r') as f:
    module_source = f.read()
print(module_source)

import pandas as pd
import numpy as np
from typing import Optional, List, Dict, Any, Tuple
import pickle
from pathlib import Path

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,  # Regression metrics
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,  # Classification metrics
    roc_auc_score, roc_curve, precision_recall_curve
)

# Import various models
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
fro

In [1]:
import os

# Resolve the helper module's location relative to the current working directory
expected_path = os.path.abspath(os.path.join('..', 'src', 'modeling.py'))
module_found = os.path.exists(expected_path)
print(f"Looking for modeling.py at: {expected_path}")
print(f"File exists: {module_found}")

# Preview the start of the file to confirm it is the module we expect
if module_found:
    with open(expected_path, 'r') as f:
        content = f.read(500)  # Only the first 500 characters are read
    print("First few lines of the file:")
    print(content)

Looking for modeling.py at: /workspaces/IDRPublic/src/modeling.py
File exists: True
First few lines of the file:
import pandas as pd
import numpy as np
from typing import Optional, List, Dict, Any, Tuple
import pickle
from pathlib import Path

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,  # Regression metrics
    acc


In [3]:
import importlib.util
import os
import sys

# Resolve the module path relative to the working directory rather than
# hard-coding an absolute path that only exists on one machine
expected_path = os.path.abspath(os.path.join('..', 'src', 'modeling.py'))
if not os.path.isfile(expected_path):
    raise FileNotFoundError(f"modeling.py not found at: {expected_path}")

# Load the module directly from the file path, bypassing sys.path resolution
spec = importlib.util.spec_from_file_location("modeling_direct", expected_path)
if spec is None or spec.loader is None:
    raise ImportError(f"Could not create an import spec for: {expected_path}")
modeling_direct = importlib.util.module_from_spec(spec)
# Register before executing so the module can be found by name while it runs
sys.modules["modeling_direct"] = modeling_direct
try:
    spec.loader.exec_module(modeling_direct)
except Exception:
    # Do not leave a half-initialised module registered if execution fails
    sys.modules.pop("modeling_direct", None)
    raise

# Check if the function exists in the loaded module
has_column_helper = hasattr(modeling_direct, "get_numeric_and_categorical_columns")
print("Function exists in direct import:", has_column_helper)

# Show the function's annotations as a quick signature check
if has_column_helper:
    print("Function signature:", modeling_direct.get_numeric_and_categorical_columns.__annotations__)

Function exists in direct import: True
Function signature: {'df': <class 'pandas.core.frame.DataFrame'>, 'return': typing.Tuple[typing.List[str], typing.List[str]]}


In [None]:
import pandas as pd

# Pull the needed functions from the manually loaded module, which an earlier
# cell registered in sys.modules under the name "modeling_direct"
from modeling_direct import (
    get_numeric_and_categorical_columns,
    create_preprocessing_pipeline,
    evaluate_regression_models,
    evaluate_classification_models,
)

# Smoke-test the column helper on a tiny frame with one integer and one string column
test_df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
num_cols, cat_cols = get_numeric_and_categorical_columns(test_df)
print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

In [4]:
# Import libraries
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

# Make the project's source directory importable (adjust the path if needed).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import custom modules
from data_loader import load_excel_data, clean_data, split_data
from visualization import plot_numeric_distribution, plot_correlation_matrix, plot_feature_importance
from modeling import (get_numeric_and_categorical_columns, create_preprocessing_pipeline,
                      evaluate_regression_models, evaluate_classification_models)

In [None]:
# Location of the raw workbook - replace with your actual file path
file_path = "../data/raw/2023-q1-federal-idr-puf.xlsx"

# Attempt the load and preview; any failure is reported as a message instead of a traceback
try:
    df = load_excel_data(file_path)

    # Preview the first rows of the loaded frame
    print("\nFirst few rows:")
    display(df.head())

except Exception as load_error:
    print(
        f"Error: {load_error}",
        "Please make sure the file exists and the path is correct.",
        sep="\n",
    )

## 2. Data Cleaning and Preprocessing

Let's clean the data and perform some basic preprocessing.

In [None]:
# Run the project's cleaning routine on the loaded frame
clean_df = clean_data(df)

# Overview of the cleaned data
print("Data information:")
clean_df.info()

# Descriptive statistics
print("\nSummary statistics:")
summary_stats = clean_df.describe()
display(summary_stats)

# Count of missing entries in each column
print("\nMissing values per column:")
missing_counts = clean_df.isna().sum()
display(missing_counts)

# Split the column names into numeric and categorical groups
numeric_cols, categorical_cols = get_numeric_and_categorical_columns(clean_df)
print(f"\nNumeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

# Show value frequencies for at most the first three categorical columns
# (the loop body simply never runs when there are none)
for col in categorical_cols[:3]:
    print(f"\nUnique values in {col}:")
    display(clean_df[col].value_counts())

## 3. Data Visualization

Let's visualize the data to understand it better.

In [None]:
# Plot the numeric feature distributions, provided at least one numeric column exists
if len(numeric_cols) > 0:
    print("Distribution of numeric features:")
    plot_numeric_distribution(clean_df, numeric_cols)

In [None]:
# A correlation matrix is only drawn when there are two or more numeric columns
if len(numeric_cols) >= 2:
    print("Correlation matrix:")
    plot_correlation_matrix(clean_df, numeric_cols)

## 4. Selecting Target Variable

Now, let's select the target variable for our predictive model.

In [None]:
# List all columns for user to select as target
print("Available columns for target variable:")
for i, col in enumerate(clean_df.columns):
    print(f"{i}. {col}")

# Select your target variable - change this to your actual target column name
target_column = 'your_target_column'  # Replace with the actual column name

# Check data type of target variable to determine if this is a regression or classification problem
if target_column in clean_df.columns:
    target_series = clean_df[target_column]
    print(f"\nTarget variable: {target_column}")
    print(f"Data type: {target_series.dtype}")

    # pandas reports boolean columns as numeric, but a True/False target is a
    # class label, so it is routed to the classification branch.
    # NOTE: integer-coded class labels (e.g. 0/1) are still treated as regression
    # by this dtype check; set problem_type manually for such targets.
    is_continuous_target = (
        pd.api.types.is_numeric_dtype(target_series)
        and not pd.api.types.is_bool_dtype(target_series)
    )

    if is_continuous_target:
        # For numeric target, show distribution
        print("\nTarget distribution:")
        plt.figure(figsize=(10, 6))
        sns.histplot(target_series, kde=True)
        plt.title(f'Distribution of {target_column}')
        plt.show()

        problem_type = 'regression'
        print("\nThis appears to be a regression problem.")
    else:
        # For categorical target, show value counts
        print("\nTarget value counts:")
        display(target_series.value_counts())

        plt.figure(figsize=(10, 6))
        sns.countplot(y=target_series)
        plt.title(f'Count of {target_column}')
        plt.show()

        problem_type = 'classification'
        print("\nThis appears to be a classification problem.")
else:
    print(f"Error: '{target_column}' not found in the data. Please select a valid column.")

## 5. Data Splitting

Split the data into training and testing sets.

In [None]:
# Partition the cleaned data into train/test parts using the chosen target column;
# any failure is reported as a message instead of a traceback
try:
    X_train, X_test, y_train, y_test = split_data(clean_df, target_column)

    # Report the size of each part
    print(f"Training features shape: {X_train.shape}")
    print(f"Testing features shape: {X_test.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Testing target shape: {y_test.shape}")
except Exception as split_error:
    print(
        f"Error: {split_error}",
        "Please make sure you've selected a valid target column.",
        sep="\n",
    )

## 6. Model Building and Evaluation

Build and evaluate predictive models.

In [None]:
# Create preprocessing pipeline from the training features' column types
numeric_cols, categorical_cols = get_numeric_and_categorical_columns(X_train)
preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)

# Evaluate models based on problem type
if 'problem_type' in locals() and problem_type == 'regression':
    # Evaluate regression models
    print("Evaluating regression models...")
    regression_results = evaluate_regression_models(X_train, y_train, X_test, y_test, preprocessor)
    display(regression_results)

    # Plot model comparison
    plt.figure(figsize=(12, 6))
    sns.barplot(x='R²', y='Model', data=regression_results)
    plt.title('Model Comparison - R² Score')
    # R² is at most 1 but is negative for a model that fits worse than a constant
    # prediction, so widen the lower limit when needed instead of clipping those bars.
    # min(0, nan) evaluates to 0, so an all-NaN column keeps the default range.
    plt.xlim(min(0, regression_results['R²'].min()), 1)
    plt.show()

elif 'problem_type' in locals() and problem_type == 'classification':
    # Evaluate classification models
    print("Evaluating classification models...")
    classification_results = evaluate_classification_models(X_train, y_train, X_test, y_test, preprocessor)
    display(classification_results)

    # Plot model comparison
    plt.figure(figsize=(12, 6))
    sns.barplot(x='F1 Score', y='Model', data=classification_results)
    plt.title('Model Comparison - F1 Score')
    plt.xlim(0, 1)  # F1 score ranges from 0 to 1
    plt.show()
else:
    print("Please run the previous cell to determine the problem type.")

## 7. Feature Importance

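Let's examine which features are most important for the best model. Importances can only be shown for models that expose them, such as the tree-based Random Forest, Gradient Boosting, and Decision Tree models.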

In [None]:
# Select the best model based on problem type, refit it, and plot its feature importances


def resolve_feature_names(fitted_preprocessor, n_features):
    """Return one label per feature the model was trained on, in model input order.

    Labels are read from the fitted preprocessor's ``get_feature_names_out()`` so
    they follow the column order the preprocessor actually produced. Rebuilding
    them by hand from ``X_train[col].unique()`` follows order of appearance and
    can attach importances to the wrong labels.

    Args:
        fitted_preprocessor: The fitted preprocessing step of the pipeline.
        n_features: Number of importances reported by the model.

    Returns:
        A list of ``n_features`` strings. Generic ``feature_<i>`` labels are used
        when the preprocessor cannot report names or reports a different count.
    """
    try:
        names = [str(name) for name in fitted_preprocessor.get_feature_names_out()]
    except (AttributeError, NotImplementedError, TypeError, ValueError) as exc:
        print(f"Could not read feature names from the preprocessor ({exc}); using generic labels.")
        names = []
    else:
        if len(names) != n_features:
            print(f"Preprocessor reported {len(names)} feature names but the model has "
                  f"{n_features} importances; using generic labels.")

    if len(names) != n_features:
        # Never truncate or pad real names: a wrong label is worse than a generic one
        names = [f"feature_{i}" for i in range(n_features)]
    return names


def fit_and_explain_best_model(results, models, label, preprocessor, X_train, y_train):
    """Fit the top-ranked model in a full pipeline and plot its feature importances.

    Args:
        results: Results table with a 'Model' column; its first row is taken as the best model.
        models: Mapping from model name to an unfitted estimator.
        label: 'regression' or 'classification', used in the printed message.
        preprocessor: Preprocessing step placed in front of the model.
        X_train, y_train: Training data used to fit the pipeline.

    Returns:
        The fitted pipeline, or None when no model could be selected.
    """
    from sklearn.pipeline import Pipeline

    if len(results) == 0:
        print(f"No {label} results available. Please run the model evaluation first.")
        return None

    # Assumes the results table is ordered best-first — TODO confirm against the evaluate_* helpers
    best_model_name = results.iloc[0]['Model']
    print(f"Best {label} model: {best_model_name}")

    if best_model_name not in models:
        print(f"'{best_model_name}' is not among the available {label} models; nothing was fitted.")
        return None

    # Fit the best model whatever its type, so it can be saved later even when it
    # has no feature importances to plot
    best_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', models[best_model_name])])
    best_pipeline.fit(X_train, y_train)

    estimator = best_pipeline.named_steps['model']
    if hasattr(estimator, 'feature_importances_'):
        importances = estimator.feature_importances_
        feature_names = resolve_feature_names(best_pipeline.named_steps['preprocessor'], len(importances))
        plot_feature_importance(feature_names, importances, title=f"{best_model_name} - Feature Importance")
    else:
        print(f"{best_model_name} does not expose feature importances; skipping the plot.")

    return best_pipeline


fitted_pipeline = None
if 'problem_type' in locals() and problem_type == 'regression' and 'regression_results' in locals():
    from modeling import get_regression_models
    fitted_pipeline = fit_and_explain_best_model(
        regression_results, get_regression_models(), 'regression', preprocessor, X_train, y_train)

elif 'problem_type' in locals() and problem_type == 'classification' and 'classification_results' in locals():
    from modeling import get_classification_models
    fitted_pipeline = fit_and_explain_best_model(
        classification_results, get_classification_models(), 'classification', preprocessor, X_train, y_train)
else:
    print("Please run the model evaluation first.")

# Bind `pipeline` only when a model was actually fitted, so the save cell's
# existence check never sees a placeholder value
if fitted_pipeline is not None:
    pipeline = fitted_pipeline

## 8. Save the Best Model

Save the best performing model for later use.

In [None]:
# Persist the fitted pipeline, if an earlier cell produced one
if 'pipeline' not in locals():
    print("Please run the model evaluation first.")
else:
    from modeling import save_model
    import os

    # Make sure the output directory is there before writing into it
    os.makedirs('../models', exist_ok=True)

    # The file name records which kind of problem the model solves
    model_path = f'../models/best_{problem_type}_model.pkl'
    save_model(pipeline, model_path)

    print(f"Model saved to {model_path}")

## 9. Next Steps

Here are some suggestions for next steps:

1. **Feature Engineering**: Create new features or transform existing ones to improve model performance
2. **Hyperparameter Tuning**: Fine-tune the best model to improve its performance
3. **Model Interpretation**: Use tools like SHAP values to better understand model predictions
4. **Cross-Validation**: Perform more robust model evaluation using cross-validation
5. **Create a Prediction Pipeline**: Build a reusable pipeline for making predictions on new data