# Phase I: Dataset Download and Preparation
## AI-Driven Multi-Source Telemetry Framework for Cyberattack Detection

**Author:** Prabhu Narayan (Roll No. 60222005)  
**Supervisor:** Dr. Mamta Mittal  
**Institution:** Delhi Skill and Entrepreneurship University (DSEU)

---

## Notebook Objectives:
1. Download benchmark datasets (CICIDS2017, UNSW-NB15, BoT-IoT; CTU-13 is part of the project but has no automated download in this notebook)
2. Perform initial data exploration and validation
3. Create unified data structure for multi-source telemetry
4. Generate data quality reports
5. Save preprocessed data to Google Drive (with an optional upload to a VPS)

## Expected Outputs:
- Downloaded and validated datasets
- Data quality reports (CSV/JSON)
- Preprocessed feature files
- Exploratory Data Analysis (EDA) visualizations

---

In [None]:
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP AND PACKAGE INSTALLATION
# ============================================================================

# Banner identifying the notebook in the cell output.
print("="*80)
print("PHASE I: Dataset Download and Preparation")
print("AI-Driven Multi-Source Telemetry Framework")
print("="*80)

# Install required packages.
# Every line below is an IPython shell escape (`!`), so this cell only runs
# inside a notebook kernel; `-q` keeps pip's output short.
!pip install -q pandas numpy scikit-learn matplotlib seaborn
!pip install -q kaggle gdown openpyxl
!pip install -q python-dotenv tqdm
!pip install -q paramiko scp  # For VPS connection
!pip install -q plotly kaleido  # Advanced visualizations

# NOTE(review): this message is printed unconditionally - a failing
# `pip install` above does not stop the cell, so check pip's output if a
# later import fails.
print("\n✓ All packages installed successfully!")

In [None]:
# Import libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# Matplotlib 3.6 renamed the seaborn-derived style sheets to "seaborn-v0_8-*".
# Try the new name first and fall back to the legacy one so the notebook also
# runs on older runtimes; if neither exists the default style is kept.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

# Record the runtime versions in the cell output for reproducibility.
print("✓ Libraries imported successfully!")
print(f"\nPython version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

In [None]:
# ============================================================================
# SECTION 2: GOOGLE DRIVE MOUNTING AND DIRECTORY STRUCTURE
# ============================================================================

from google.colab import drive

# Expose Google Drive under /content/drive (Colab prompts for authorization).
drive.mount('/content/drive')

# Root of the project tree on Drive.
BASE_DIR = '/content/drive/MyDrive/ai-telemetry-research'

# Logical name -> location relative to BASE_DIR for every folder the
# notebook reads from or writes to.
_RELATIVE_DIRS = (
    ('datasets_raw', 'datasets/raw'),
    ('datasets_processed', 'datasets/processed'),
    ('results_phase1', 'results/phase1'),
    ('results_phase1_eda', 'results/phase1/eda_reports'),
    ('results_phase1_figures', 'results/phase1/figures'),
    ('logs', 'logs'),
    ('configs', 'configs'),
)
DIRS = {key: f'{BASE_DIR}/{relative}' for key, relative in _RELATIVE_DIRS}

# Materialize the tree; folders that already exist are left untouched.
for dir_path in DIRS.values():
    os.makedirs(dir_path, exist_ok=True)
    print(f"✓ Created: {dir_path}")

print("\n✓ Directory structure created successfully!")

In [None]:
# ============================================================================
# SECTION 3: UTILITY FUNCTIONS FOR DATA MANAGEMENT
# ============================================================================

class ExperimentLogger:
    """Persist a running log of experiment records as JSON.

    Records live in memory under ``self.logs["experiments"]`` and the whole
    log is rewritten to ``<log_dir>/experiment_log.json`` after every call to
    :meth:`log_experiment`.
    """

    def __init__(self, log_dir):
        """Bind the logger to ``log_dir`` and load an existing log, if any."""
        self.log_dir = log_dir
        self.log_file = f"{log_dir}/experiment_log.json"
        self.logs = self._load_logs()

    def _load_logs(self):
        """Return the stored log, or a fresh ``{"experiments": []}``.

        A log file that is not valid JSON, or whose top level is not a dict
        holding an ``"experiments"`` list, is moved aside rather than
        crashing the notebook or being silently overwritten on the next save.
        """
        if not os.path.exists(self.log_file):
            return {"experiments": []}

        try:
            with open(self.log_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError.
            self._set_aside_corrupt_log(f"invalid JSON: {exc}")
            return {"experiments": []}

        if not isinstance(loaded, dict) or not isinstance(
            loaded.setdefault("experiments", []), list
        ):
            self._set_aside_corrupt_log("unexpected structure")
            return {"experiments": []}
        return loaded

    def _set_aside_corrupt_log(self, reason):
        """Rename the unusable log file so its content is not lost."""
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        backup = f"{self.log_file}.corrupt-{stamp}"
        try:
            os.replace(self.log_file, backup)
        except OSError as exc:
            print(f"⚠ Experiment log is unusable ({reason}) and could not be moved aside: {exc}")
        else:
            print(f"⚠ Experiment log is unusable ({reason}); moved to {backup}")

    def log_experiment(self, phase, notebook, dataset, status, metrics=None, notes=""):
        """Append one timestamped record, save the log and return the record.

        ``metrics`` defaults to an empty dict; the timestamp is the local
        time formatted as ``YYYY-MM-DD HH:MM:SS``.
        """
        experiment = {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "phase": phase,
            "notebook": notebook,
            "dataset": dataset,
            "status": status,
            "metrics": metrics or {},
            "notes": notes
        }
        self.logs["experiments"].append(experiment)
        self._save_logs()
        return experiment

    @staticmethod
    def _json_default(value):
        """Encode values ``json`` cannot handle natively.

        Objects exposing ``tolist()`` (NumPy scalars and arrays) are converted
        through it; anything else is stored as its ``str()`` form.
        """
        to_list = getattr(value, "tolist", None)
        if callable(to_list):
            return to_list()
        return str(value)

    def _save_logs(self):
        """Write the full in-memory log to ``self.log_file``."""
        # Serialize before opening the file: an encoding error must not leave
        # a truncated log behind.
        payload = json.dumps(self.logs, indent=4, default=self._json_default)
        if self.log_dir:
            os.makedirs(self.log_dir, exist_ok=True)
        with open(self.log_file, 'w', encoding='utf-8') as f:
            f.write(payload)


class DatasetDownloader:
    """Fetch the benchmark datasets with the Kaggle CLI.

    Every ``download_*`` method creates ``<download_dir>/<dataset name>``,
    downloads and unzips the Kaggle dataset into it and returns that path.
    A missing ``kaggle`` executable, a non-zero exit code or an empty target
    folder raises, so callers wrapping the call in ``try/except`` can tell a
    real download from a failed one.
    """

    def __init__(self, download_dir):
        """Remember the folder under which each dataset gets its own sub-folder."""
        self.download_dir = download_dir

    @staticmethod
    def _kaggle_command(slug, dataset_dir):
        """Build the Kaggle CLI argument list for ``slug`` -> ``dataset_dir``."""
        # Passed as a list (no shell), so paths containing spaces stay intact.
        return [
            "kaggle", "datasets", "download",
            "-d", slug,
            "-p", str(dataset_dir),
            "--unzip",
        ]

    def _prepare_dir(self, dataset_name):
        """Print the download banner and create the dataset's target folder."""
        print("\n" + "="*80)
        print(f"Downloading {dataset_name} Dataset")
        print("="*80)

        dataset_dir = f"{self.download_dir}/{dataset_name}"
        os.makedirs(dataset_dir, exist_ok=True)
        return dataset_dir

    def _run_kaggle(self, slug, dataset_dir):
        """Run the Kaggle CLI and raise ``RuntimeError`` if nothing usable arrived."""
        # Local import: the notebook's import cell does not load subprocess.
        import subprocess

        completed = subprocess.run(
            self._kaggle_command(slug, dataset_dir),
            capture_output=True,
            text=True,
        )
        # Output is captured and echoed here so it appears in the notebook cell.
        for stream in (completed.stdout, completed.stderr):
            if stream:
                print(stream, end="")

        if completed.returncode != 0:
            raise RuntimeError(
                f"Kaggle download of '{slug}' failed with exit code {completed.returncode}"
            )
        if not os.listdir(dataset_dir):
            raise RuntimeError(
                f"Kaggle download of '{slug}' produced no files in {dataset_dir}"
            )

    def download_cicids2017(self):
        """Download CICIDS2017 dataset"""
        dataset_dir = self._prepare_dir("CICIDS2017")

        print("Using Kaggle dataset: cic-ids-2017")
        self._run_kaggle("cicdataset/cicids2017", dataset_dir)

        # Report how many entries the unzip step left in the target folder.
        files = os.listdir(dataset_dir)
        print(f"\n✓ Downloaded {len(files)} files to {dataset_dir}")
        return dataset_dir

    def download_unsw_nb15(self):
        """Download UNSW-NB15 dataset"""
        dataset_dir = self._prepare_dir("UNSW-NB15")

        self._run_kaggle("mrwellsdavid/unsw-nb15", dataset_dir)

        print(f"\n✓ UNSW-NB15 downloaded to {dataset_dir}")
        return dataset_dir

    def download_bot_iot(self):
        """Download BoT-IoT dataset"""
        dataset_dir = self._prepare_dir("BoT-IoT")

        self._run_kaggle("abdallahjama/bot-iot-dataset", dataset_dir)

        print(f"\n✓ BoT-IoT downloaded to {dataset_dir}")
        return dataset_dir


class DataValidator:
    """Build per-dataset data-quality reports and store them as JSON files."""

    def __init__(self, output_dir):
        """Remember the folder the ``*_quality_report.json`` files go into."""
        self.output_dir = output_dir

    @staticmethod
    def _json_safe(value):
        """Return a copy of ``value`` that serializes to strict JSON.

        Dicts, lists and tuples are walked recursively, NumPy scalars are
        converted to Python scalars, and NaN/±inf floats become ``None``
        (``null`` in the file) because bare ``NaN`` tokens are not valid JSON.
        """
        if isinstance(value, dict):
            return {key: DataValidator._json_safe(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [DataValidator._json_safe(item) for item in value]
        if isinstance(value, np.generic):
            value = value.item()
        if isinstance(value, float) and not np.isfinite(value):
            return None
        return value

    def validate_dataset(self, df, dataset_name):
        """Generate a data-quality report for ``df`` and save it as JSON.

        The report holds shape, memory usage, column names and dtypes,
        missing-value counts and percentages, the duplicate-row count and
        ``describe()`` statistics (empty when there is no numeric column).
        It is written to ``<output_dir>/<dataset_name>_quality_report.json``;
        the output folder is created if needed.

        Returns the report dict as computed. Non-finite numbers are only
        replaced by ``null`` in the written file.
        """
        missing_counts = df.isnull().sum()
        has_numeric = len(df.select_dtypes(include=[np.number]).columns) > 0

        report = {
            "dataset_name": dataset_name,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "shape": {"rows": df.shape[0], "columns": df.shape[1]},
            "memory_usage_mb": df.memory_usage(deep=True).sum() / 1024**2,
            "columns": list(df.columns),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "missing_values": missing_counts.to_dict(),
            # An empty frame yields NaN here; it is written as null.
            "missing_percentage": (missing_counts / len(df) * 100).to_dict(),
            "duplicates": int(df.duplicated().sum()),
            "numeric_summary": df.describe().to_dict() if has_numeric else {},
        }

        # Save report
        os.makedirs(self.output_dir, exist_ok=True)
        report_file = f"{self.output_dir}/{dataset_name}_quality_report.json"
        with open(report_file, 'w') as f:
            json.dump(self._json_safe(report), f, indent=4, allow_nan=False)

        print(f"\n✓ Quality report saved: {report_file}")
        return report


# Initialize utilities
# One shared instance of each helper, each bound to its project folder.
logger = ExperimentLogger(log_dir=DIRS['logs'])
downloader = DatasetDownloader(download_dir=DIRS['datasets_raw'])
validator = DataValidator(output_dir=DIRS['results_phase1_eda'])

print("\n✓ Utility classes initialized successfully!")

In [None]:
# ============================================================================
# SECTION 4: KAGGLE API CONFIGURATION (REQUIRED FOR DATASET DOWNLOAD)
# ============================================================================

print("\nConfiguring Kaggle API...")
print("Please upload your kaggle.json file when prompted.")

from google.colab import files

# Upload kaggle.json; files.upload() returns {filename: file content as bytes}.
uploaded = files.upload()

# Fail loudly when the token is missing. Colab renames repeated uploads
# (e.g. "kaggle (1).json"), so only an exact name match is accepted.
if 'kaggle.json' not in uploaded:
    raise FileNotFoundError(
        "Expected an uploaded file named 'kaggle.json', got: "
        f"{sorted(uploaded) or 'nothing'}"
    )

# Configure Kaggle: store the token in ~/.kaggle/kaggle.json, readable by the
# current user only.
kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_token = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_token, 'wb') as token_file:
    token_file.write(uploaded['kaggle.json'])
os.chmod(kaggle_token, 0o600)

print("\n✓ Kaggle API configured successfully!")

In [None]:
# ============================================================================
# SECTION 5: DOWNLOAD ALL DATASETS
# ============================================================================

print("\n" + "#"*80)
print("# STARTING DATASET DOWNLOADS")
print("#"*80)

# Dataset name -> folder holding its raw files (successful downloads only).
datasets_downloaded = {}


def _download_and_log(dataset_name, download_fn):
    """Run one downloader and record the outcome in the experiment log.

    On success the returned folder is stored in ``datasets_downloaded`` and
    returned. On failure the error is printed, a "Failed" record is logged
    and ``None`` is returned, so the remaining datasets are still attempted.
    """
    try:
        dataset_dir = download_fn()
    except Exception as e:
        print(f"Error downloading {dataset_name}: {e}")
        dataset_dir = None
        status, notes = "Failed", f"Download failed: {e}"
    else:
        datasets_downloaded[dataset_name] = dataset_dir
        status, notes = "Downloaded", f"Successfully downloaded {dataset_name}"

    # A logging problem must not be reported as a download problem.
    try:
        logger.log_experiment(
            phase="Phase I",
            notebook="01_Dataset_Download_and_Preparation",
            dataset=dataset_name,
            status=status,
            notes=notes
        )
    except Exception as e:
        print(f"Warning: could not write experiment log for {dataset_name}: {e}")
    return dataset_dir


# Download CICIDS2017
cicids_dir = _download_and_log('CICIDS2017', downloader.download_cicids2017)

# Download UNSW-NB15
unsw_dir = _download_and_log('UNSW-NB15', downloader.download_unsw_nb15)

# Download BoT-IoT
botiot_dir = _download_and_log('BoT-IoT', downloader.download_bot_iot)

print("\n" + "#"*80)
print("# DATASET DOWNLOADS COMPLETED")
print("#"*80)
print(f"\nTotal datasets downloaded: {len(datasets_downloaded)}")
for name, path in datasets_downloaded.items():
    print(f"  • {name}: {path}")

In [None]:
# ============================================================================
# SECTION 6: LOAD AND VALIDATE DATASETS
# ============================================================================

print("\n" + "="*80)
print("LOADING AND VALIDATING DATASETS")
print("="*80)

# Dataset name -> sample DataFrame loaded from its raw files.
datasets = {}


def _load_and_validate(dataset_name):
    """Load one sample CSV of ``dataset_name`` and write its quality report.

    CSV files are searched recursively below the download folder, because
    unzipped archives can nest their files in sub-folders. The first path in
    sorted order is used, so the sample is the same on every run. The frame
    is stored in ``datasets`` before validation, so a failing report does
    not discard the loaded data.

    Returns the loaded DataFrame, or ``None`` when nothing could be loaded.
    """
    dataset_dir = datasets_downloaded.get(dataset_name)
    if dataset_dir is None:
        print(f"   Skipped: {dataset_name} was not downloaded")
        return None

    try:
        csv_files = sorted(str(path) for path in Path(dataset_dir).rglob('*.csv'))
        print(f"   Found {len(csv_files)} CSV files")

        # Load first file as sample (or combine all for full dataset)
        # NOTE(review): the first CSV is not guaranteed to be a data file
        # with a header row - confirm per dataset.
        if csv_files:
            sample_file = csv_files[0]
            df = pd.read_csv(sample_file)
            print(f"   Loaded: {sample_file}")
            print(f"   Shape: {df.shape}")
            datasets[dataset_name] = df

            # Validate
            validator.validate_dataset(df, dataset_name)
    except Exception as e:
        print(f"   Error: {e}")
    return datasets.get(dataset_name)


# Load CICIDS2017
print("\n1. Loading CICIDS2017...")
df_cicids = _load_and_validate('CICIDS2017')

# Load UNSW-NB15
print("\n2. Loading UNSW-NB15...")
df_unsw = _load_and_validate('UNSW-NB15')

# Load BoT-IoT
print("\n3. Loading BoT-IoT...")
df_botiot = _load_and_validate('BoT-IoT')

print("\n" + "="*80)
print(f"DATASETS LOADED: {len(datasets)}")
print("="*80)

In [None]:
# ============================================================================
# SECTION 7: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================

print("\n" + "="*80)
print("EXPLORATORY DATA ANALYSIS")
print("="*80)


def _find_label_column(df, candidates=('Label', 'label', 'attack_cat', 'class', 'Class')):
    """Return the first candidate label column present in ``df``, else ``None``.

    Candidates are tried in order. An exact header match wins; otherwise
    headers are compared with surrounding whitespace ignored, because some
    CSV exports pad their column names (CICIDS2017 is a known example, with
    headers such as ``' Label'``). The value returned is always the column
    name exactly as it appears in ``df``, so it can be used for indexing.
    """
    by_stripped_name = {}
    for col in df.columns:
        by_stripped_name.setdefault(str(col).strip(), col)

    for candidate in candidates:
        if candidate in df.columns:
            return candidate
        if candidate in by_stripped_name:
            return by_stripped_name[candidate]
    return None


def perform_eda(df, dataset_name):
    """Print summary information for ``df`` and save EDA artifacts.

    Figures go to ``DIRS['results_phase1_figures']``: the label distribution
    (when a label column is found), a missing-value heatmap (when values are
    missing) and a correlation matrix (with more than one numeric column).
    ``describe(include='all')`` is written to ``DIRS['results_phase1_eda']``.

    Returns a dict with ``label_column`` (actual column name or ``None``),
    ``numeric_features`` (list of column names) and ``total_features``.
    """
    print(f"\n{'='*80}")
    print(f"EDA: {dataset_name}")
    print(f"{'='*80}")

    # Basic info
    print(f"\nDataset Shape: {df.shape}")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"\nColumn Data Types:")
    print(df.dtypes.value_counts())

    # Missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print(f"\nMissing Values:")
        print(missing[missing > 0].sort_values(ascending=False))

    # Identify label column (common names, tolerant of padded headers)
    label_col = _find_label_column(df)

    if label_col:
        label_counts = df[label_col].value_counts()
        print(f"\nLabel Distribution ({label_col}):")
        print(label_counts)

        # Visualization 1: Label Distribution
        plt.figure(figsize=(12, 6))
        label_counts.plot(kind='bar')
        plt.title(f'{dataset_name} - Attack Type Distribution')
        plt.xlabel('Attack Type')
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f"{DIRS['results_phase1_figures']}/{dataset_name}_label_distribution.png", dpi=300)
        plt.show()

    # Visualization 2: Missing Values Heatmap
    if missing.sum() > 0:
        plt.figure(figsize=(12, 6))
        sns.heatmap(df.isnull(), cbar=False, yticklabels=False)
        plt.title(f'{dataset_name} - Missing Values Heatmap')
        plt.tight_layout()
        plt.savefig(f"{DIRS['results_phase1_figures']}/{dataset_name}_missing_heatmap.png", dpi=300)
        plt.show()

    # Visualization 3: Correlation Matrix (for numeric columns)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        plt.figure(figsize=(14, 10))
        correlation = df[numeric_cols].corr()
        sns.heatmap(correlation, cmap='coolwarm', center=0, annot=False)
        plt.title(f'{dataset_name} - Feature Correlation Matrix')
        plt.tight_layout()
        plt.savefig(f"{DIRS['results_phase1_figures']}/{dataset_name}_correlation_matrix.png", dpi=300)
        plt.show()

    # Save summary statistics
    summary = df.describe(include='all').T
    summary.to_csv(f"{DIRS['results_phase1_eda']}/{dataset_name}_summary_statistics.csv")
    print(f"\n✓ Summary statistics saved")

    return {
        'label_column': label_col,
        'numeric_features': list(numeric_cols),
        'total_features': len(df.columns)
    }

# Perform EDA on all datasets
eda_results = {}
for dataset_name, df in datasets.items():
    eda_results[dataset_name] = perform_eda(df, dataset_name)

print("\n" + "="*80)
print("EDA COMPLETED FOR ALL DATASETS")
print("="*80)

In [None]:
# ============================================================================
# SECTION 8: DATA PREPROCESSING AND FEATURE ENGINEERING
# ============================================================================

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

print("\n" + "="*80)
print("DATA PREPROCESSING")
print("="*80)

def preprocess_dataset(df, dataset_name, label_col):
    """Clean, encode and scale ``df`` and save the result as CSV.

    Steps, all applied to a copy of ``df``:
      1. Numeric columns: ±inf is treated as missing and missing values are
         filled with the column median. Object columns other than the label
         are filled with their most frequent value.
      2. Exact duplicate rows are dropped.
      3. Object columns other than the label are integer-encoded.
      4. When ``label_col`` exists, ``label_encoded`` (multi-class) and
         ``binary_label`` (0 = normal, 1 = attack) are added.
      5. All remaining feature columns are standardized.

    The frame is written to
    ``DIRS['datasets_processed']/<dataset_name>_preprocessed.csv``.

    Returns ``(df_processed, label_encoders, scaler)``, where
    ``label_encoders`` maps each encoded feature column to its fitted
    ``LabelEncoder``.
    """
    print(f"\nPreprocessing {dataset_name}...")

    df_processed = df.copy()

    # 1. Handle missing values
    print("  1. Handling missing values...")
    # Fill numeric columns with median. Columns are reassigned rather than
    # filled with `inplace=True` on a column selection, which does not update
    # the frame under pandas Copy-on-Write.
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # StandardScaler rejects infinite values, so treat them as missing.
        column = df_processed[col].replace([np.inf, -np.inf], np.nan)
        if column.isnull().any():
            column = column.fillna(column.median())
        df_processed[col] = column

    # Fill categorical columns with mode
    categorical_cols = df_processed.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col == label_col or not df_processed[col].isnull().any():
            continue
        modes = df_processed[col].mode()
        # mode() is empty for an all-NaN column; leave such a column as is.
        if not modes.empty:
            df_processed[col] = df_processed[col].fillna(modes.iloc[0])

    # 2. Remove duplicates
    print("  2. Removing duplicates...")
    initial_rows = len(df_processed)
    df_processed = df_processed.drop_duplicates()
    duplicates_removed = initial_rows - len(df_processed)
    print(f"     Removed {duplicates_removed} duplicate rows")

    # 3. Encode categorical features
    print("  3. Encoding categorical features...")
    label_encoders = {}
    for col in categorical_cols:
        if col != label_col:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col].astype(str))
            label_encoders[col] = le

    # 4. Encode target labels
    print("  4. Encoding target labels...")
    if label_col is not None and label_col in df_processed.columns:
        le_target = LabelEncoder()
        df_processed['label_encoded'] = le_target.fit_transform(df_processed[label_col])

        # Create binary labels (normal vs attack). Labels are compared with
        # whitespace and letter case ignored, so 'BENIGN', ' Benign' and
        # 'normal' all count as normal traffic.
        normal_labels = {'benign', 'normal', '0'}
        normalized = df_processed[label_col].astype(str).str.strip().str.casefold()
        df_processed['binary_label'] = (~normalized.isin(normal_labels)).astype(int)

    # 5. Feature scaling
    # NOTE(review): the scaler is fitted on the full dataset; refit it on the
    # training split in later phases to avoid leaking test statistics.
    print("  5. Feature scaling...")
    feature_cols = [
        col for col in df_processed.columns
        if col not in (label_col, 'label_encoded', 'binary_label')
    ]

    scaler = StandardScaler()
    if feature_cols and len(df_processed) > 0:
        df_processed[feature_cols] = scaler.fit_transform(df_processed[feature_cols])
    else:
        # Nothing to scale; the scaler is returned unfitted.
        print("     No feature columns or rows to scale")

    # Save preprocessed data
    output_file = f"{DIRS['datasets_processed']}/{dataset_name}_preprocessed.csv"
    df_processed.to_csv(output_file, index=False)
    print(f"\n✓ Preprocessed data saved: {output_file}")

    return df_processed, label_encoders, scaler

# Preprocess all datasets
preprocessed_datasets = {}
preprocessing_artifacts = {}

for dataset_name, df in datasets.items():
    label_col = eda_results[dataset_name]['label_column']
    df_prep, encoders, scaler = preprocess_dataset(df, dataset_name, label_col)
    preprocessed_datasets[dataset_name] = df_prep
    preprocessing_artifacts[dataset_name] = {
        'encoders': encoders,
        'scaler': scaler,
        'label_column': label_col
    }

print("\n" + "="*80)
print("PREPROCESSING COMPLETED")
print("="*80)

In [None]:
# ============================================================================
# SECTION 9: GENERATE COMPREHENSIVE PHASE 1 REPORT
# ============================================================================

print("\n" + "="*80)
print("GENERATING PHASE 1 COMPREHENSIVE REPORT")
print("="*80)


def _existing_artifacts(dataset_name):
    """List the Phase 1 output files of ``dataset_name`` that exist on disk.

    Only files actually present are reported: the label-distribution,
    missing-value and correlation figures are each produced conditionally
    during EDA.
    """
    candidates = [
        (DIRS['datasets_processed'], f"{dataset_name}_preprocessed.csv"),
        (DIRS['results_phase1_eda'], f"{dataset_name}_quality_report.json"),
        (DIRS['results_phase1_eda'], f"{dataset_name}_summary_statistics.csv"),
        (DIRS['results_phase1_figures'], f"{dataset_name}_label_distribution.png"),
        (DIRS['results_phase1_figures'], f"{dataset_name}_missing_heatmap.png"),
        (DIRS['results_phase1_figures'], f"{dataset_name}_correlation_matrix.png"),
    ]
    return [
        file_name for folder, file_name in candidates
        if os.path.exists(os.path.join(folder, file_name))
    ]


phase1_report = {
    "phase": "Phase I - Dataset Download and Preparation",
    "researcher": "Prabhu Narayan (60222005)",
    "supervisor": "Dr. Mamta Mittal",
    "institution": "DSEU",
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    # Count the datasets that completed preprocessing, matching the
    # per-dataset summaries below.
    "datasets_processed": len(preprocessed_datasets),
    "datasets_summary": {}
}

for dataset_name, df in preprocessed_datasets.items():
    if 'binary_label' in df.columns:
        binary_counts = df['binary_label'].value_counts()
        normal_samples = int(binary_counts.get(0, 0))
        attack_samples = int(binary_counts.get(1, 0))
    else:
        normal_samples = attack_samples = 0

    phase1_report["datasets_summary"][dataset_name] = {
        "original_shape": datasets[dataset_name].shape,
        "preprocessed_shape": df.shape,
        "total_samples": df.shape[0],
        "total_features": df.shape[1],
        # Number of distinct encoded labels; this includes the normal class
        # when the dataset has one.
        "attack_types": int(df['label_encoded'].nunique()) if 'label_encoded' in df.columns else 0,
        "normal_samples": normal_samples,
        "attack_samples": attack_samples,
        "files_generated": _existing_artifacts(dataset_name)
    }

# Save comprehensive report
report_file = f"{DIRS['results_phase1']}/PHASE1_COMPREHENSIVE_REPORT.json"
with open(report_file, 'w') as f:
    json.dump(phase1_report, f, indent=4)

print(f"\n✓ Comprehensive report saved: {report_file}")

# Display summary
print("\n" + "#"*80)
print("# PHASE 1 EXECUTION SUMMARY")
print("#"*80)
print(f"\nTotal Datasets Processed: {phase1_report['datasets_processed']}")
print("\nDataset Details:")
for name, summary in phase1_report["datasets_summary"].items():
    print(f"\n  {name}:")
    print(f"    • Samples: {summary['total_samples']:,}")
    print(f"    • Features: {summary['total_features']}")
    print(f"    • Attack Types: {summary['attack_types']}")
    print(f"    • Normal/Attack Ratio: {summary['normal_samples']:,} / {summary['attack_samples']:,}")

print("\n" + "#"*80)
print("# PHASE 1 COMPLETED SUCCESSFULLY")
print("#"*80)

In [None]:
# ============================================================================
# SECTION 10: OPTIONAL - UPLOAD RESULTS TO VPS
# ============================================================================

print("\n" + "="*80)
print("VPS UPLOAD CONFIGURATION (OPTIONAL)")
print("="*80)

# Uncomment and configure if you want to upload to VPS
# The triple-quoted string below is a disabled template: as written it is a
# bare string expression, so nothing inside it runs.
# NOTE(review) before enabling it:
#   * `paramiko.AutoAddPolicy()` accepts the host key of an unknown server
#     without verification - confirm that is acceptable for the target VPS.
#   * `VPS_CONFIG` keeps the password in plain text inside the notebook.
#   * `ssh.close()` is only reached on the success path; if `connect` or
#     `scp.put` raises, the connection is not closed explicitly.
"""
import paramiko
from scp import SCPClient

def upload_to_vps(local_path, remote_path, hostname, username, password):
    '''Upload files to VPS via SCP'''
    try:
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(hostname, username=username, password=password)
        
        with SCPClient(ssh.get_transport()) as scp:
            scp.put(local_path, remote_path, recursive=True)
        
        ssh.close()
        print(f"✓ Uploaded: {local_path} -> {remote_path}")
        return True
    except Exception as e:
        print(f"Error uploading to VPS: {e}")
        return False

# Configure VPS credentials
VPS_CONFIG = {
    'hostname': 'your-vps-ip',
    'username': 'your-username',
    'password': 'your-password',
    'remote_base': '/home/user/ai-telemetry-research'
}

# Upload results
upload_to_vps(
    local_path=DIRS['results_phase1'],
    remote_path=f"{VPS_CONFIG['remote_base']}/results/phase1",
    hostname=VPS_CONFIG['hostname'],
    username=VPS_CONFIG['username'],
    password=VPS_CONFIG['password']
)
"""

# With the template disabled, the cell only reports where the results live.
print("\nSkipping VPS upload (configure VPS_CONFIG to enable)")
print("All results are saved in Google Drive at:")
print(f"  {BASE_DIR}")

In [None]:
# ============================================================================
# SECTION 11: FINAL CHECKLIST AND NEXT STEPS
# ============================================================================

print("\n" + "="*80)
print("PHASE 1 COMPLETION CHECKLIST")
print("="*80)


def _has_output_for_every_dataset(folder, ending):
    """Return True when each loaded dataset has a ``<name>_*<ending>`` file in ``folder``.

    False when no dataset was loaded or ``folder`` does not exist, so the
    checklist cannot pass on an empty run.
    """
    if not datasets or not os.path.isdir(folder):
        return False
    produced = os.listdir(folder)
    return all(
        any(f.startswith(f"{name}_") and f.endswith(ending) for f in produced)
        for name in datasets
    )


checklist = {
    "Datasets Downloaded": len(datasets_downloaded) >= 3,
    "Datasets Validated": len(datasets) >= 3,
    "EDA Performed": len(eda_results) >= 3,
    "Data Preprocessed": len(preprocessed_datasets) >= 3,
    # Derived from the files on disk instead of being assumed.
    "Quality Reports Generated": _has_output_for_every_dataset(
        DIRS['results_phase1_eda'], "_quality_report.json"
    ),
    "Visualizations Created": _has_output_for_every_dataset(
        DIRS['results_phase1_figures'], ".png"
    ),
    "Comprehensive Report Saved": os.path.exists(report_file)
}

for task, completed in checklist.items():
    status = "✓" if completed else "✗"
    print(f"  {status} {task}")

all_completed = all(checklist.values())
print("\n" + "="*80)
if all_completed:
    print("✓ PHASE 1 SUCCESSFULLY COMPLETED!")
else:
    print("⚠ PHASE 1 INCOMPLETE - Please review errors above")
print("="*80)

print("\n" + "#"*80)
print("# NEXT STEPS")
print("#"*80)
print("""
1. Review all generated reports and visualizations
2. Proceed to Phase II: Baseline ML Models
   - Notebook: 02_Baseline_ML_Models.ipynb
   - Models to implement: Random Forest, SVM, Gradient Boosting
3. Ensure all preprocessed datasets are accessible
4. (Optional) Upload results to VPS for backup

All outputs are saved in:
  • Processed Data: {}
  • Results: {}
  • Figures: {}
""".format(
    DIRS['datasets_processed'],
    DIRS['results_phase1'],
    DIRS['results_phase1_figures']
))

print("\n" + "#"*80)
print("# END OF PHASE 1")
print("#"*80)