# Datathon Analysis Template

**Project:** Inter-Uni Datathon 2025  
**Author:** [Your Name]  
**Date:** [Date]  
**Objective:** [Describe the analysis objective]

## Table of Contents
1. [Setup & Configuration](#setup)
2. [Data Loading & Exploration](#data-loading)
3. [Data Preprocessing](#preprocessing)
4. [Exploratory Data Analysis](#eda)
5. [Feature Engineering](#feature-engineering)
6. [Model Development](#modeling)
7. [Results & Evaluation](#results)
8. [Conclusions & Next Steps](#conclusions)

## 1. Setup & Configuration {#setup}

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Custom utilities
import sys
sys.path.append('../src')
from utils import load_and_inspect, quick_eda, create_baseline_plots
from preprocessing import DataPreprocessor, create_features

# Notebook-wide display configuration
# Show up to 100 columns before pandas truncates wide frames.
pd.options.display.max_columns = 100

# Apply the matplotlib style first, then the seaborn palette: both touch the
# colour cycle, so the palette has to be set last to take effect.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Setup complete!")

## 2. Data Loading & Exploration {#data-loading}

In [None]:
# Load dataset
# Replace 'your_dataset.csv' with actual dataset path
data_path = '../data/raw/your_dataset.csv'

# Uncomment and modify based on your dataset
# df = load_and_inspect(data_path)

# Report success only when a dataset is actually in scope. The loader above is
# commented out by default, so an unconditional success message would be
# misleading on a fresh copy of this template.
if 'df' in locals():
    print("📊 Data loaded successfully!")
    print(f"Dataset shape: {df.shape}")
else:
    print(
        f"⚠️ No dataset loaded yet. Set data_path (currently {data_path!r}) "
        "and uncomment the loader above."
    )

In [None]:
# Quick data exploration: preview, schema, and missing-value counts.
# Does nothing until `df` has been created by the loading cell.
if 'df' in locals():
    # Preview the first rows
    display(df.head())

    # Column dtypes and non-null counts
    print("\n📋 Data Info:")
    df.info()

    # Null count per column; only columns that actually have gaps are listed
    missing_data = df.isnull().sum()
    has_gaps = missing_data.sum() > 0
    if not has_gaps:
        print("✅ No missing values found!")
    else:
        print("\n❗ Missing Values:")
        gaps_only = missing_data[missing_data > 0]
        print(gaps_only.sort_values(ascending=False))

## 3. Data Preprocessing {#preprocessing}

In [None]:
# Name of the column to predict — replace with the actual target
target_column = 'your_target_column'

# Preprocessing needs the raw dataset from the loading cell
if 'df' not in locals():
    print("⚠️ Load dataset first")
else:
    # Fit the project preprocessor and transform the data in one step
    preprocessor = DataPreprocessor()
    df_processed = preprocessor.fit_transform(df, target_column)

    print("✅ Preprocessing complete!")
    shape_before, shape_after = df.shape, df_processed.shape
    print(f"Original shape: {shape_before}")
    print(f"Processed shape: {shape_after}")

    # Peek at the transformed rows
    display(df_processed.head())

## 4. Exploratory Data Analysis {#eda}

In [None]:
# Statistical summary (requires the preprocessing cell to have run)
if 'df_processed' not in locals():
    print("⚠️ Process data first")
else:
    print("📊 Statistical Summary:")
    summary_stats = df_processed.describe()
    display(summary_stats)

    # Project helper: quick EDA pass against the target column
    quick_eda(df_processed, target_column)

In [None]:
# Visualizations — skipped silently until processed data exists
processed_available = 'df_processed' in locals()
if processed_available:
    # Baseline plots from the project helper
    create_baseline_plots(df_processed, target_column)

    # Additional custom plots based on your data
    # Add domain-specific visualizations here

    plt.show()

## 5. Feature Engineering {#feature-engineering}

In [None]:
# Create additional features on top of the preprocessed frame
if 'df_processed' not in locals():
    print("⚠️ Process data first")
else:
    df_features = create_features(df_processed)

    print("✅ Feature engineering complete!")
    n_before = df_processed.shape[1]
    n_after = df_features.shape[1]
    print(f"Original features: {n_before}")
    print(f"With new features: {n_after}")

    # Columns present after feature creation but not before
    columns_after = set(df_features.columns)
    columns_before = set(df_processed.columns)
    new_features = columns_after - columns_before
    if new_features:
        print(f"\n🆕 New features created: {list(new_features)}")

## 6. Model Development {#modeling}

In [None]:
# Prepare data for modeling: separate the target and hold out 20% for testing
features_ready = 'df_features' in locals() and target_column in df_features.columns
if not features_ready:
    print("⚠️ Prepare features first")
else:
    y = df_features[target_column]
    X = df_features.drop(columns=[target_column])

    # Stratify only when the target has fewer than 10 distinct values
    if len(y.unique()) < 10:
        stratify_on = y
    else:
        stratify_on = None

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=stratify_on,
    )

    print("✅ Data split complete!")
    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")

In [None]:
# Model training and evaluation


def is_classification_target(target, max_classes=10):
    """Decide whether `target` should be modelled as a classification label.

    The previous inline check, `A and B or C`, parsed as `(A and B) or C`.
    Any int64 target was therefore treated as classification regardless of
    how many distinct values it had, and string targets with many classes
    fell through to the regressors.

    Rules applied here:
      * non-numeric targets (strings, categoricals) and booleans are labels;
      * integer targets are labels only when they have fewer than
        `max_classes` distinct values;
      * every other numeric target (floats, many-valued integers) is
        treated as a regression target.

    Args:
        target: pandas Series holding the target values.
        max_classes: exclusive upper bound on distinct values for an integer
            target to count as class labels.

    Returns:
        True for classification, False for regression.
    """
    dtype_checks = pd.api.types
    if dtype_checks.is_bool_dtype(target) or not dtype_checks.is_numeric_dtype(target):
        return True
    return bool(dtype_checks.is_integer_dtype(target)) and target.nunique() < max_classes


if 'X_train' in locals():
    results = {}

    # Determine problem type
    is_classification = is_classification_target(y)

    if is_classification:
        print("🎯 Classification Problem Detected")
        models = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
        }
        metric_name = 'Accuracy'
        score_fn = accuracy_score
    else:
        # Imported here (once) rather than on every loop iteration.
        from sklearn.metrics import r2_score

        print("📈 Regression Problem Detected")
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Linear Regression': LinearRegression()
        }
        metric_name = 'R² Score'
        score_fn = r2_score

    # Train and evaluate models
    for name, model in models.items():
        print(f"\n🔄 Training {name}...")

        # Cross-validation on the training split only. With no `scoring`
        # argument the estimator's own `score` method is used.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

        # Fit on the full training split, then score on the held-out test split
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = score_fn(y_test, predictions)

        results[name] = {
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'test_score': score,
            'model': model
        }

        print(f"CV {metric_name}: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        print(f"Test {metric_name}: {score:.4f}")

    print("\n✅ Model training complete!")
else:
    print("⚠️ Prepare training data first")

## 7. Results & Evaluation {#results}

In [None]:
# Model comparison: tabulate every trained model and pick the top test score
if 'results' in locals():
    print("📊 Model Comparison:")

    model_names = list(results)
    summaries = [results[model_name] for model_name in model_names]
    comparison_df = pd.DataFrame({
        'Model': model_names,
        'CV Score': [summary['cv_mean'] for summary in summaries],
        'CV Std': [summary['cv_std'] for summary in summaries],
        'Test Score': [summary['test_score'] for summary in summaries],
    })

    display(comparison_df)

    # Best model = row with the highest test score
    winner_row = comparison_df['Test Score'].idxmax()
    best_model_name = comparison_df.at[winner_row, 'Model']
    winner = results[best_model_name]
    best_model = winner['model']

    print(f"\n🏆 Best Model: {best_model_name}")
    print(f"Test Score: {winner['test_score']:.4f}")

In [None]:
# Feature importance — only for models that expose `feature_importances_`
has_importances = 'best_model' in locals() and hasattr(best_model, 'feature_importances_')
if has_importances:
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    })
    # Highest importance first
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    print("\n🔍 Top 10 Feature Importances:")
    display(top_features)

    # Horizontal bar chart of the same ten features
    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title('Top 10 Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

In [None]:
# Detailed evaluation for the best model on the held-out test split
if 'best_model' in locals() and 'is_classification' in locals():
    predictions = best_model.predict(X_test)

    if not is_classification:
        from sklearn.metrics import mean_absolute_error, mean_squared_error

        # Error metrics; RMSE is the square root of the MSE
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        rmse = np.sqrt(mse)

        print("\n📊 Regression Metrics:")
        print(f"MAE: {mae:.4f}")
        print(f"RMSE: {rmse:.4f}")

        # Residuals (actual minus predicted) plotted against the predictions
        residuals = y_test - predictions
        plt.figure(figsize=(10, 6))
        plt.scatter(predictions, residuals, alpha=0.6)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residual Plot')
        plt.show()
    else:
        from sklearn.metrics import classification_report, confusion_matrix

        print("\n📋 Classification Report:")
        print(classification_report(y_test, predictions))

        # Confusion matrix rendered as an annotated heatmap
        cm = confusion_matrix(y_test, predictions)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

## 8. Conclusions & Next Steps {#conclusions}

### Key Findings
- [ ] Summary of main insights from the data
- [ ] Model performance summary
- [ ] Most important features identified
- [ ] Business implications

### Next Steps
- [ ] Hyperparameter tuning for best model
- [ ] Try additional algorithms (XGBoost, Neural Networks)
- [ ] Feature engineering improvements
- [ ] Cross-validation strategy refinement
- [ ] Deploy model for production use

### Presentation Points
- [ ] Problem statement and approach
- [ ] Data insights and preprocessing steps
- [ ] Model comparison and selection
- [ ] Business recommendations
- [ ] Future work and improvements

In [None]:
# Save results and model
import json
import os
import pickle

if 'best_model' in locals():
    # Create models directory
    models_dir = '../models'
    os.makedirs(models_dir, exist_ok=True)

    # Save best model
    # NOTE: unpickling can execute arbitrary code — only load .pkl files that
    # you produced yourself.
    model_slug = best_model_name.lower().replace(" ", "_")
    model_filename = f'{models_dir}/best_model_{model_slug}.pkl'
    with open(model_filename, 'wb') as f:
        pickle.dump(best_model, f)

    print(f"✅ Model saved: {model_filename}")

    # Save results summary. Values are converted to built-in float/int so the
    # summary serialises to JSON whatever numeric type the scores arrive in.
    best_result = results[best_model_name]
    results_summary = {
        'best_model': best_model_name,
        'test_score': float(best_result['test_score']),
        'cv_score': float(best_result['cv_mean']),
        'feature_count': int(X.shape[1]),
        'training_samples': int(X_train.shape[0]),
        'test_samples': int(X_test.shape[0])
    }

    # Previously the summary was only printed; write it next to the model so
    # the run leaves a record of its results.
    summary_filename = f'{models_dir}/results_summary.json'
    with open(summary_filename, 'w', encoding='utf-8') as f:
        json.dump(results_summary, f, indent=2)

    print(f"✅ Results summary saved: {summary_filename}")

    print("\n📄 Final Results Summary:")
    for key, value in results_summary.items():
        print(f"{key}: {value}")

    print("\n🎉 Analysis complete!")
else:
    # Do not claim completion when there is nothing to save.
    print("⚠️ No trained model to save — run the modeling cells first")