## DAILY DATA BASED TRAINING

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import pickle
import matplotlib.pyplot as plt

def create_comprehensive_autoencoder(input_dim):
    """
    Create a comprehensive autoencoder to handle multiple features
    
    Parameters:
    - input_dim: Number of input features
    
    Returns:
    - Compiled autoencoder model
    """
    # Input layer
    input_layer = tf.keras.layers.Input(shape=(input_dim,))
    
    # Encoder with multiple layers and increasing complexity
    encoded = tf.keras.layers.Dense(
        max(input_dim, 64),  # Increased base layer size
        activation='relu', 
        kernel_regularizer=tf.keras.regularizers.l2(0.001)
    )(input_layer)
    
    # Batch normalization for stability
    encoded = tf.keras.layers.BatchNormalization()(encoded)
    
    # Additional hidden layers with decreasing dimensions
    encoded = tf.keras.layers.Dense(
        max(input_dim // 2, 32),  
        activation='relu', 
        kernel_regularizer=tf.keras.regularizers.l2(0.001)
    )(encoded)
    
    # Batch normalization
    encoded = tf.keras.layers.BatchNormalization()(encoded)
    
    # Bottleneck layer
    bottleneck = tf.keras.layers.Dense(
        max(input_dim // 4, 16),  # Tight bottleneck
        activation='relu', 
        kernel_regularizer=tf.keras.regularizers.l2(0.001)
    )(encoded)
    
    # Decoder with symmetric architecture
    decoded = tf.keras.layers.Dense(
        max(input_dim // 2, 32), 
        activation='relu', 
        kernel_regularizer=tf.keras.regularizers.l2(0.001)
    )(bottleneck)
    
    # Batch normalization
    decoded = tf.keras.layers.BatchNormalization()(decoded)
    
    decoded = tf.keras.layers.Dense(
        max(input_dim, 64), 
        activation='relu', 
        kernel_regularizer=tf.keras.regularizers.l2(0.001)
    )(decoded)
    
    # Final reconstruction layer
    decoded = tf.keras.layers.Dense(
        input_dim, 
        activation='linear'
    )(decoded)
    
    # Create autoencoder
    autoencoder = tf.keras.Model(input_layer, decoded)
    
    # Compile with adaptive optimizer
    autoencoder.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.001, 
            clipnorm=1.0  # Gradient clipping for stability
        ),
        loss='mean_squared_error'
    )
    
    return autoencoder

def plot_training_history(history):
    """
    Generate and save a plot of training and validation loss
    
    Parameters:
    - history: Model training history
    """
    plt.figure(figsize=(10, 6))
    plt.plot(history['loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Autoencoder Model Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (Mean Squared Error)')
    plt.legend()
    plt.tight_layout()
    
    # Ensure models directory exists
    os.makedirs("models", exist_ok=True)
    
    # Save the plot
    plt.savefig('models/training_loss_plot.png')
    plt.close()

def iqr_outlier_removal(data, columns):
    """
    Remove outliers using Interquartile Range (IQR) method
    
    Parameters:
    - data: DataFrame to clean
    - columns: List of columns to apply IQR cleaning
    
    Returns:
    - Cleaned DataFrame
    """
    cleaned_data = data.copy()
    
    for column in columns:
        # Calculate Q1, Q3, and IQR
        Q1 = cleaned_data[column].quantile(0.05)
        Q3 = cleaned_data[column].quantile(0.95)
        IQR = Q3 - Q1
        
        # Define outlier bounds
        lower_bound = Q1 - 0.1 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Remove outliers
        cleaned_data = cleaned_data[
            (cleaned_data[column] >= lower_bound) & 
            (cleaned_data[column] <= upper_bound)
        ]
        
        print(f"\nOutlier Cleaning for {column}:")
        print(f"  Q1: {Q1:.4f}")
        print(f"  Q3: {Q3:.4f}")
        print(f"  IQR: {IQR:.4f}")
        print(f"  Lower Bound: {lower_bound:.4f}")
        print(f"  Upper Bound: {upper_bound:.4f}")
        print(f"  Removed Rows: {len(data) - len(cleaned_data)}")
    
    return cleaned_data

def advanced_data_preprocessing(data):
    """
    Comprehensive data preprocessing pipeline
    
    Parameters:
    - data: Original DataFrame
    
    Returns:
    - Preprocessed DataFrame
    """
    # Select relevant columns
    relevant_columns = [
        'MO','ALLSKY_SFC_SW_DWN', 'T2M', 'T2MDEW', 'T2M_RANGE', 
        'T2M_MAX', 'T2M_MIN', 'QV2M', 'RH2M', 'PRECTOTCORR', 
        'PS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
        'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'NDVI', 'CI', 'ELEVATION'
    ]
    
    # Initial data cleaning
    cleaned_data = data[relevant_columns].copy()
    
    # Convert to numeric and handle errors
    cleaned_data = cleaned_data.apply(pd.to_numeric, errors='coerce')
    
    # Remove rows with NaN values
    cleaned_data = cleaned_data.dropna()
    
    # IQR Outlier Removal
    cleaned_data = iqr_outlier_removal(cleaned_data, relevant_columns)
    
    # Additional preprocessing steps
    def additional_cleaning(df):
        # Remove extreme outliers (6 standard deviations)
        for column in df.columns:
            mean = df[column].mean()
            std = df[column].std()
            df = df[
                (df[column] >= mean - 6*std) & 
                (df[column] <= mean + 6*std)
            ]
        return df
    
    cleaned_data = additional_cleaning(cleaned_data)
    
    # Descriptive statistics before and after cleaning
    print("\n--- Data Cleaning Summary ---")
    print(f"Original Data Rows: {len(data)}")
    print(f"Cleaned Data Rows: {len(cleaned_data)}")
    print(f"Rows Removed: {len(data) - len(cleaned_data)}")
    
    # Visualization of data distribution before and after cleaning
    def plot_distribution_comparison(original, cleaned, columns):
        plt.figure(figsize=(20, 15))
        for i, column in enumerate(columns, 1):
            plt.subplot(5, 4, i)
            plt.hist(original[column], bins=50, alpha=0.5, label='Original')
            plt.hist(cleaned[column], bins=50, alpha=0.5, label='Cleaned')
            plt.title(column)
            plt.legend()
        plt.tight_layout()
        plt.savefig('distribution_comparison.png')
        plt.close()
    
    plot_distribution_comparison(data[relevant_columns], cleaned_data, relevant_columns)
    
    return cleaned_data


def train_comprehensive_anomaly_model(data):
    """
    Train a comprehensive anomaly detection model using all relevant features
    
    Parameters:
    - data: Full dataset
    
    Returns:
    - Dictionary with model and scaler
    """
    try:
        # Ensure models directory exists
        os.makedirs("models", exist_ok=True)

        # Select all relevant columns
        relevant_columns = [
            'MO', 'ALLSKY_SFC_SW_DWN', 'T2M', 'T2MDEW', 'T2M_RANGE', 'T2M_MAX', 'T2M_MIN',
            'QV2M', 'RH2M', 'PRECTOTCORR', 'PS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
            'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'NDVI', 'CI', 'ELEVATION'
        ]
        
        feature_data = advanced_data_preprocessing(data)
        
        # Convert to numeric and handle errors
        feature_data = feature_data.apply(pd.to_numeric, errors='coerce')
        
        # Skip if insufficient data
        if len(feature_data) < 500:  # Increased minimum samples
            print("Insufficient data for comprehensive model")
            return None
        
        # Scale the data
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(feature_data.values)
        
        # Split data (90% train, 10% test)
        X_train, X_test = train_test_split(X_scaled, test_size=0.1, random_state=42)
        
        # Create and train autoencoder
        input_dim = X_scaled.shape[1]
        autoencoder = create_comprehensive_autoencoder(input_dim)
        
        # Advanced callbacks
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', 
            patience=20, 
            restore_best_weights=True,
            min_delta=0.0001
        )
        
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', 
            factor=0.5, 
            patience=10, 
            min_lr=0.00001
        )
        
        # Train the model
        history = autoencoder.fit(
            X_train, X_train,
            epochs=250,  # Increased epochs
            batch_size=64,
            shuffle=True,
            validation_data=(X_test, X_test),
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        # Plot training history
        plot_training_history(history.history)

        # Compute reconstruction error for each sample
        train_pred = autoencoder.predict(X_train)
        test_pred = autoencoder.predict(X_test)
        
        train_mse = np.mean(np.power(X_train - train_pred, 2), axis=1)
        test_mse = np.mean(np.power(X_test - test_pred, 2), axis=1)
        
        # Print reconstruction statistics
        print("Training Reconstruction Error:")
        print(f"Mean: {np.mean(train_mse)}")
        print(f"Std: {np.std(train_mse)}")
        print(f"Max: {np.max(train_mse)}")
        
        print("\nTesting Reconstruction Error:")
        print(f"Mean: {np.mean(test_mse)}")
        print(f"Std: {np.std(test_mse)}")
        print(f"Max: {np.max(test_mse)}")

        # Save comprehensive model
        model_path = "models/comprehensive_autoencoder.keras"
        autoencoder.save(model_path)

        # Save scaler using pickle
        scaler_path = "models/comprehensive_scaler.pkl"
        with open(scaler_path, "wb") as f:
            pickle.dump(scaler, f)

        print(f"Saved comprehensive model to {model_path}")
        print (f"Saved scaler to {scaler_path}")
        
        return {
            'autoencoder': autoencoder,
            'scaler': scaler,
            'history': history.history,
            'train_mse': train_mse,
            'test_mse': test_mse
        }
    
    except Exception as e:
        print(f"Error training comprehensive model: {e}")
        return None

def main():
    """
    Train a comprehensive anomaly detection model
    
    Returns:
    - Trained comprehensive model
    """
    # Load your dataset
    df = pd.read_excel("./data/train.xlsx")
    
    # Remove rows with any empty values
    df = df.dropna()
    
    # Train comprehensive model
    comprehensive_model = train_comprehensive_anomaly_model(df)
    
    return comprehensive_model

if __name__ == "__main__":
    main()


Outlier Cleaning for MO:
  Q1: 1.0000
  Q3: 12.0000
  IQR: 11.0000
  Lower Bound: -0.1000
  Upper Bound: 28.5000
  Removed Rows: 0

Outlier Cleaning for ALLSKY_SFC_SW_DWN:
  Q1: 2.2200
  Q3: 6.1500
  IQR: 3.9300
  Lower Bound: 1.8270
  Upper Bound: 12.0450
  Removed Rows: 2911

Outlier Cleaning for T2M:
  Q1: 12.1900
  Q3: 29.0500
  IQR: 16.8600
  Lower Bound: 10.5040
  Upper Bound: 54.3400
  Removed Rows: 6438

Outlier Cleaning for T2MDEW:
  Q1: 7.6700
  Q3: 25.4500
  IQR: 17.7800
  Lower Bound: 5.8920
  Upper Bound: 52.1200
  Removed Rows: 9185

Outlier Cleaning for T2M_RANGE:
  Q1: 5.2500
  Q3: 15.1800
  IQR: 9.9300
  Lower Bound: 4.2570
  Upper Bound: 30.0750
  Removed Rows: 11052

Outlier Cleaning for T2M_MAX:
  Q1: 19.6700
  Q3: 34.3900
  IQR: 14.7200
  Lower Bound: 18.1980
  Upper Bound: 56.4700
  Removed Rows: 13536

Outlier Cleaning for T2M_MIN:
  Q1: 9.8600
  Q3: 25.7600
  IQR: 15.9000
  Lower Bound: 8.2700
  Upper Bound: 49.6100
  Removed Rows: 14339

Outlier Cleaning for Q

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

class FeatureOptimizer:
    def __init__(self, model_path='models/comprehensive_autoencoder.keras', 
                 scaler_path='models/comprehensive_scaler.pkl'):
        """
        Initialize the FeatureOptimizer with pre-trained model and scaler
        
        Parameters:
        - model_path: Path to saved Keras model
        - scaler_path: Path to saved StandardScaler
        """
        # Load the trained model
        self.autoencoder = tf.keras.models.load_model(model_path)
        
        # Load the scaler
        with open(scaler_path, 'rb') as f:
            self.scaler = pickle.load(f)
        
        # Feature names (ensure this matches the order in your original preprocessing)
        self.feature_names = [
            'MO', 'ALLSKY_SFC_SW_DWN', 'T2M', 'T2MDEW', 'T2M_RANGE', 
            'T2M_MAX', 'T2M_MIN', 'QV2M', 'RH2M', 'PRECTOTCORR', 
            'PS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
            'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'NDVI', 'CI', 'ELEVATION'
        ]

    def analyze_feature_importance(self, data):
        """
        Analyze the importance of each feature in reconstruction
        
        Parameters:
        - data: Input data DataFrame
        
        Returns:
        - Dictionary of feature importances
        """
        # Preprocess the data
        X_scaled = self.scaler.transform(data[self.feature_names].values)
        
        # Get reconstructed data
        X_reconstructed = self.autoencoder.predict(X_scaled)
        
        # Calculate reconstruction error for each feature
        feature_errors = {}
        for i, feature in enumerate(self.feature_names):
            # Calculate mean squared error for this feature
            feature_mse = np.mean((X_scaled[:, i] - X_reconstructed[:, i])**2)
            feature_errors[feature] = feature_mse
        
        # Sort features by reconstruction error
        sorted_features = sorted(feature_errors.items(), key=lambda x: x[1], reverse=True)
        
        return dict(sorted_features)

    def find_optimal_feature_ranges(self, data):
        """
        Find optimal ranges for features based on reconstruction error
        
        Parameters:
        - data: Input data DataFrame
        
        Returns:
        - Dictionary of optimal feature ranges
        """
        # Preprocess the data
        X_scaled = self.scaler.transform(data[self.feature_names].values)
        
        # Get reconstructed data
        X_reconstructed = self.autoencoder.predict(X_scaled)
        
        # Calculate reconstruction error
        reconstruction_errors = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)
        
        # Find optimal ranges
        optimal_ranges = {}
        for i, feature in enumerate(self.feature_names):
            # Original data
            original_data = data[feature]
            
            # Find indices of low reconstruction error (good samples)
            low_error_mask = reconstruction_errors < np.percentile(reconstruction_errors, 25)
            
            # Calculate optimal range for this feature
            optimal_range = {
                'min': original_data[low_error_mask].min(),
                'max': original_data[low_error_mask].max(),
                'mean': original_data[low_error_mask].mean(),
                'median': original_data[low_error_mask].median(),
                'std': original_data[low_error_mask].std()
            }
            
            optimal_ranges[feature] = optimal_range
        
        return optimal_ranges

    def visualize_feature_distributions(self, data):
        """
        Create visualizations of feature distributions
        
        Parameters:
        - data: Input data DataFrame
        """
        # Preprocess the data
        X_scaled = self.scaler.transform(data[self.feature_names].values)
        
        # Get reconstructed data
        X_reconstructed = self.autoencoder.predict(X_scaled)
        
        # Calculate reconstruction error
        reconstruction_errors = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)
        
        # Create plots
        plt.figure(figsize=(20, 15))
        for i, feature in enumerate(self.feature_names, 1):
            plt.subplot(5, 4, i)
            
            # Scatter plot of original vs reconstructed with error color
            plt.scatter(
                data[feature], 
                X_reconstructed[:, i-1], 
                c=reconstruction_errors, 
                cmap='viridis'
            )
            plt.title(feature)
            plt.xlabel('Original')
            plt.ylabel('Reconstructed')
        
        plt.tight_layout()
        plt.savefig('feature_reconstruction_analysis.png')
        plt.close()

    def generate_comprehensive_report(self, data):
        """
        Generate a comprehensive report of feature analysis
        
        Parameters:
        - data: Input data DataFrame
        """
        # Feature Importance Analysis
        feature_importance = self.analyze_feature_importance(data)
        print("\n--- Feature Reconstruction Importance ---")
        for feature, error in feature_importance.items():
            print(f"{feature}: {error:.4f}")
        
        # Optimal Ranges
        optimal_ranges = self.find_optimal_feature_ranges(data)
        print("\n--- Optimal Feature Ranges ---")
        for feature, ranges in optimal_ranges.items():
            print(f"\n{feature}:")
            for key, value in ranges.items():
                print(f"  {key}: {value:.4f}")
        
        # Visualizations
        self.visualize_feature_distributions(data)
        
        return {
            'feature_importance': feature_importance,
            'optimal_ranges': optimal_ranges
        }

def main():
    # Load your dataset
    df = pd.read_excel("./data/train.xlsx")
    df = df.dropna()
    
    # Initialize Feature Optimizer
    optimizer = FeatureOptimizer()
    
    # Generate Comprehensive Report
    report = optimizer.generate_comprehensive_report(df)
    
    return report

if __name__ == "__main__":
    main()

[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 772us/step

--- Feature Reconstruction Importance ---
PS: 0.0202
PRECTOTCORR: 0.0145
T2MDEW: 0.0124
ELEVATION: 0.0095
NDVI: 0.0092
RH2M: 0.0076
T2M_MIN: 0.0070
T2M: 0.0066
WS10M: 0.0061
T2M_MAX: 0.0058
ALLSKY_SFC_SW_DWN: 0.0055
WS50M: 0.0054
WS50M_MAX: 0.0051
T2M_RANGE: 0.0047
WS10M_MIN: 0.0035
WS50M_MIN: 0.0029
WS10M_MAX: 0.0028
QV2M: 0.0028
CI: 0.0028
MO: 0.0016
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 679us/step

--- Optimal Feature Ranges ---

MO:
  min: 1.0000
  max: 12.0000
  mean: 7.4944
  median: 9.0000
  std: 4.1656

ALLSKY_SFC_SW_DWN:
  min: 0.8100
  max: 7.5600
  mean: 4.1826
  median: 4.1800
  std: 0.9513

T2M:
  min: 10.2100
  max: 32.3700
  mean: 21.2108
  median: 20.5100
  std: 4.6418

T2MDEW:
  min: 5.1000
  max: 27.5200
  mean: 16.3951
  median: 15.4100
  std: 5.1856

T2M_RANGE:
  min: 2.3700
  max: 18.1800
  mean: 9.8843
  median: 10.0000
  std: 2.5422

T2M_MAX:
  min: 15.20

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

def custom_percentile_rank(values, x):
    """
    Calculate percentile rank for a value or array
    """
    return np.array([np.sum(values <= val) / len(values) * 100 for val in x])

def calculate_similarity(new_place_data, training_data, scaler, weights, autoencoder):
    """
    Calculate similarity between new place data and training data using autoencoder
    
    Parameters:
    - new_place_data: Feature vector of the new place
    - training_data: Scaled feature matrix of training data
    - scaler: Fitted StandardScaler object
    - weights: List of feature weights
    - autoencoder: Trained autoencoder model
    
    Returns:
    - DataFrame with distances and indices of similar places
    """
    # Scale the new place data
    new_place_scaled = scaler.transform(new_place_data.values.reshape(1, -1))
    
    # Apply weights
    training_data_weighted = training_data * weights
    new_place_weighted = new_place_scaled * weights
    
    # Use autoencoder for feature extraction
    training_encoded = autoencoder.predict(training_data_weighted)
    new_place_encoded = autoencoder.predict(new_place_weighted)
    
    # Compute distances (Euclidean and Cosine Similarity) in encoded space
    euclidean_distances = np.linalg.norm(training_encoded - new_place_encoded, axis=1)
    cosine_similarities = cosine_similarity(new_place_encoded, training_encoded).flatten()
    
    # Compute percentile ranks
    euclidean_percentile = custom_percentile_rank(euclidean_distances, euclidean_distances)
    cosine_percentile = custom_percentile_rank(cosine_similarities, cosine_similarities)
    
    # Combine results into a DataFrame
    results = pd.DataFrame({
        'Euclidean_Distance': euclidean_distances,
        'Cosine_Similarity': cosine_similarities,
        'Euclidean_Percentile': euclidean_percentile,
        'Cosine_Percentile': cosine_percentile
    })
    
    return results

def visualize_similarity_distribution(similarity_results, new_place_data):
    """
    Create visualizations of similarity distributions
    """
    plt.figure(figsize=(15, 5))
    
    # Euclidean Distance Distribution
    plt.subplot(1, 3, 1)
    sns.histplot(similarity_results['Euclidean_Distance'], kde=True)
    plt.title('Euclidean Distance Distribution')
    plt.xlabel('Distance')
    plt.ylabel('Frequency')
    plt.axvline(x=similarity_results['Euclidean_Distance'].min(), color='r', linestyle='--', 
                label='Most Similar')
    plt.legend()
    
    # Cosine Similarity Distribution
    plt.subplot(1, 3, 2)
    sns.histplot(similarity_results['Cosine_Similarity'], kde=True)
    plt.title('Cosine Similarity Distribution')
    plt.xlabel('Similarity')
    plt.ylabel('Frequency')
    plt.axvline(x=similarity_results['Cosine_Similarity'].max(), color='r', linestyle='--', 
                label='Most Similar')
    plt.legend()
    
    # Scatter plot of Euclidean vs Cosine
    plt.subplot(1, 3, 3)
    plt.scatter(similarity_results['Euclidean_Distance'], 
                similarity_results['Cosine_Similarity'], 
                alpha=0.5)
    plt.title('Euclidean Distance vs Cosine Similarity')
    plt.xlabel('Euclidean Distance')
    plt.ylabel('Cosine Similarity')
    
    plt.tight_layout()
    plt.savefig('models/similarity_analysis.png')
    plt.close()

def comprehensive_similarity_analysis(new_place_data, training_data, scaler, weights, autoencoder, euclidean_threshold, cosine_threshold):
    """
    Perform comprehensive similarity analysis with dissimilarity check
    """
    # Calculate similarity
    similarity_results = calculate_similarity(
        new_place_data, 
        scaler.transform(training_data.values), 
        scaler, 
        weights,
        autoencoder
    )
    
    # Check if the new place is dissimilar
    min_euclidean_distance = similarity_results['Euclidean_Distance'].min()
    max_cosine_similarity = similarity_results['Cosine_Similarity'].max()
    
    is_dissimilar = (
        min_euclidean_distance > euclidean_threshold or 
        max_cosine_similarity < cosine_threshold
    )
    
    # Find top similar places
    top_similar_euclidean = similarity_results.nsmallest(5, 'Euclidean_Distance')
    top_similar_cosine = similarity_results.nlargest(5, 'Cosine_Similarity')
    
    # Visualize similarity distribution
    visualize_similarity_distribution(similarity_results, new_place_data)
    
    # Detailed analysis
    analysis_results = {
        'new_place_data': new_place_data,
        'most_similar_euclidean': training_data.iloc[top_similar_euclidean.index],
        'most_similar_cosine': training_data.iloc[top_similar_cosine.index],
        'similarity_metrics': similarity_results,
        'is_dissimilar': is_dissimilar
    }
    
    return analysis_results

def main():
    """
    Compare a new place with training places for similarity and dissimilarity
    """
    # Load the scaler
    with open("models/comprehensive_scaler.pkl", "rb") as f:
        scaler = pickle.load(f)
    
    # Load the autoencoder model
    model_path = "models/comprehensive_autoencoder.keras"
    autoencoder = tf.keras.models.load_model(model_path)
    
    # Load your training dataset
    training_data = pd.read_excel("./data/train.xlsx")
    
    # Filter and scale the training data
    relevant_columns = [
        'MO', 'ALLSKY_SFC_SW_DWN', 'T2M', 'T2MDEW', 'T2M_RANGE', 'T2M_MAX', 'T2M_MIN',
        'QV2M', 'RH2M', 'PRECTOTCORR', 'PS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
        'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'NDVI', 'CI', 'ELEVATION'
    ]
    training_features = training_data[relevant_columns].dropna()
    
    # Define weights
    weights = np.ones(len(relevant_columns))
    weights[relevant_columns.index('NDVI')] = 2.7  # Vegetation Density
    weights[relevant_columns.index('CI')] = 2.5    # Choloropyhll Index
    weights[relevant_columns.index('ELEVATION')] = 2.2  # Topographical Influence
    
    # Thresholds for similarity
    euclidean_threshold = 6.5
    cosine_threshold = 0.85
    
    # Load new places dataset
    new_places_data = pd.read_excel("./data/train_d.xlsx")
    
    # Select relevant columns, excluding the index and TARGET
    relevant_columns = [col for col in new_places_data.columns if col not in ['TARGET']]
    new_places_data = new_places_data[relevant_columns].dropna()
    
    # Group by LAT and LON and calculate mean for other columns
    grouped_places = new_places_data .groupby(['LAT', 'LON','MO']).agg({
        col: 'mean' for col in new_places_data.columns 
        if col not in ['LAT', 'LON','MO']
    })

    # Reset index to make LAT and LON regular columns again
    grouped_places_reset = grouped_places.reset_index()

    output_results = []

    # Debug: Check the shape of the grouped DataFrame
    print(f"Number of unique LAT,LON combinations: {grouped_places_reset.shape[0]}")

    # Iterate through unique LAT,LON combinations
    for idx, grouped_place_data in grouped_places_reset.iterrows():
        try:
            # Print coordinates
            print(f"\nAnalyzing place for Month{grouped_place_data['MO']} with coordinates (LAT: {grouped_place_data['LAT']}, LON: {grouped_place_data['LON']})")
            
            # Ensure data is in the correct format
            new_place_data_for_analysis = grouped_place_data.drop(['LAT', 'LON','YEAR','DY'])

            # Convert the dictionary values into a pandas DataFrame (single row)
            place_data_df = pd.DataFrame(new_place_data_for_analysis).T
            
            # Calculate similarity and print results
            analysis_results = comprehensive_similarity_analysis(
                place_data_df, 
                training_features, 
                scaler, 
                weights, 
                autoencoder,
                euclidean_threshold, 
                cosine_threshold
            )

            # Store results
            output_results.append({
                'LAT': grouped_place_data['LAT'],
                'LON': grouped_place_data['LON'],
                'Month': grouped_place_data['MO'],
                'Is_Similar': not analysis_results['is_dissimilar'],
                'Cosine_Similarity': analysis_results['similarity_metrics']['Cosine_Similarity'].max(),
                'Euclidean_Distance': analysis_results['similarity_metrics']['Euclidean_Distance'].min()
            })

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            print(f"Problematic data: {grouped_place_data}")
            continue

        # Save all results to an Excel file
    output_df = pd.DataFrame(output_results)
    output_file = 'models/similarity_analysis_results.xlsx'
    output_df.to_excel(output_file, index=False)
    print(f"All results saved to {output_file}")

if __name__ == "__main__":
    main()

Number of unique LAT,LON combinations: 58

Analyzing place for Month12.0 with coordinates (LAT: -13.13, LON: -72.97)
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 759us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
The place at LAT -13.13, LON -72.97 is **dissimilar** to the training set.

Analyzing place for Month8.0 with coordinates (LAT: -3.5, LON: -60.0)
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 799us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
The place at LAT -3.5, LON -60.0 is **dissimilar** to the training set.

Analyzing place for Month1.0 with coordinates (LAT: -2.89, LON: -58.97)
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 840us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
The place at LAT -2.89, LON -58.97 is **dissimilar** to the training set.

Analyzing place for Month1.0 with coordinates (LAT: -2.25, LON: 

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_excel('./models/similarity_analysis_results.xlsx')

def categorize_similarity(row):
    # Define thresholds for Cosine Similarity and Euclidean Distance
    if row['Cosine_Similarity'] > 0.96 and row['Euclidean_Distance'] <= 4.5:
        return 'Extremely Suitable for Hornbill'
    elif row['Cosine_Similarity'] > 0.93 and row['Euclidean_Distance'] <= 5.5:
        return 'Ideal Suitable for Hornbill'
    elif row['Cosine_Similarity'] > 0.88 and row['Euclidean_Distance'] <= 6.5:
        return 'Highly Suitable for Hornbill'
    elif row['Cosine_Similarity'] > 0.85 and row['Euclidean_Distance'] < 7.5:
        return 'Moderately Suitable for Hornbill'
    elif row['Cosine_Similarity'] > 0.80 and row['Euclidean_Distance'] < 9:
        return 'Less Suitability for Hornbill'
    elif row['Cosine_Similarity'] > 0.75 and row['Euclidean_Distance'] < 11:
        return 'Minimal Suitability for Hornbill'
    else:
        return 'unSuitable for Hornbill'


# Apply categorization to each row
df['Similarity_Category'] = df.apply(categorize_similarity, axis=1)

# Print locations with their similarity class
def print_locations_with_similarity(df):
    # Print the locations (LAT, LON) along with their similarity category
    location_similarity = df[['LAT', 'LON','Month', 'Similarity_Category']]
    location_similarity.to_excel("similarity_category.xlsx",index=False)

# Visualization Function
def visualize_similarity_categories(df):
    # Plot a pie chart for the distribution of similarity categories
    plt.figure(figsize=(8, 8))
    similarity_distribution = df['Similarity_Category'].value_counts()
    similarity_distribution.plot(kind='pie', autopct='%1.1f%%', colors=['#66b3ff', '#99ff99', '#ff6666'])
    plt.title('Similarity Category Distribution')
    plt.tight_layout()
    plt.savefig('similarity_category_distribution_pie.png')
    plt.close()

    # Scatter plot of Cosine Similarity vs Euclidean Distance colored by similarity category
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='Cosine_Similarity', y='Euclidean_Distance', hue='Similarity_Category', palette='coolwarm')
    plt.title('Cosine Similarity vs Euclidean Distance')
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Euclidean Distance')
    plt.tight_layout()
    plt.savefig('cosine_vs_euclidean_similarity.png')
    plt.close()

# Main Execution
def main():
    # Print locations with their similarity class
    print_locations_with_similarity(df)
    
    # Visualize similarity categories
    visualize_similarity_categories(df)

if __name__ == "__main__":
    main()


In [20]:
import pandas as pd
import numpy as np

def combine_datasets(ground_truth_path, similarity_path):
    """
    Combine ground truth and similarity datasets
    
    Parameters:
    -----------
    ground_truth_path : str
        Path to ground truth Excel file
    similarity_path : str
        Path to similarity category Excel file
    
    Returns:
    --------
    pd.DataFrame
        Combined dataset with binary suitability classification
    """
    # Read datasets
    ground_truth = pd.read_excel(ground_truth_path)
    similarity_data = pd.read_excel(similarity_path)
    
    
    # Define positive suitability categories
    positive_categories = [
        'Extremely Suitable for Hornbill',
        'Ideal Suitable for Hornbill',
        'Highly Suitable for Hornbill',
        'Moderately Suitable for Hornbill'
    ]
    
    # Aggregate similarity data by location (if multiple entries)
    grouped_similarity = similarity_data.groupby(['LAT', 'LON']).agg({
        'Similarity_Category': lambda x: x.value_counts().index[0],
        'Month': 'first'  # Keep first month if multiple
    }).reset_index()
    
    # Create binary suitability column
    grouped_similarity['is_suitable'] = grouped_similarity['Similarity_Category'].isin(positive_categories)
    
    # Combine datasets
    combined_results = pd.merge(
        ground_truth, 
        grouped_similarity, 
        on=['LAT', 'LON'], 
        how='outer',
        suffixes=('_truth', '_similarity')
    )
    
    # Fill NaN values
    combined_results['is_suitable'] = combined_results['is_suitable'].fillna(False)
    combined_results['has_hornbill'] = combined_results['has_hornbill'].fillna(False)
    
    # Verification and Analysis
    print("\nDataset Combination Summary:")
    print(f"Total Locations: {len(combined_results)}")
    print(f"Locations with Hornbills: {combined_results['has_hornbill'].sum()}")
    print(f"Locations with Suitable Habitat: {combined_results['is_suitable'].sum()}")
    
    # Confusion Matrix-like Analysis
    true_positive = ((combined_results['has_hornbill'] == True) & (combined_results['is_suitable'] == True)).sum()
    true_negative = ((combined_results['has_hornbill'] == False) & (combined_results['is_suitable'] == False)).sum()
    false_positive = ((combined_results['has_hornbill'] == False) & (combined_results['is_suitable'] == True)).sum()
    false_negative = ((combined_results['has_hornbill'] == True) & (combined_results['is_suitable'] == False)).sum()
    
    print("\nPrediction Analysis:")
    print(f"True Positives: {true_positive}")
    print(f"True Negatives: {true_negative}")
    print(f"False Positives: {false_positive}")
    print(f"False Negatives: {false_negative}")

    combined_results=combined_results[['LAT','LON','Similarity_Category','has_hornbill','is_suitable']]
    
    # Save combined results
    combined_results.to_excel('combined_hornbill_results.xlsx', index=False)
    
    return combined_results

def calculate_metrics(combined_results):
    """
    Calculate performance metrics
    
    Parameters:
    -----------
    combined_results : pd.DataFrame
        Combined dataset with ground truth and predictions
    
    Returns:
    --------
    dict: Performance metrics
    """
    from sklearn.metrics import (
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        confusion_matrix
    )
    
    # Prepare data for metrics
    y_true = combined_results['has_hornbill']
    y_pred = combined_results['is_suitable']
    
    # Calculate metrics
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred)
    }
    
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Print metrics
    print("\nPerformance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    print("\nConfusion Matrix:")
    print(cm)
    
    return metrics

def main():
    # Paths to input files
    ground_truth_path = 'data/ground_data.xlsx'
    similarity_path = 'similarity_category.xlsx'
    
    # Combine datasets
    combined_results = combine_datasets(
        ground_truth_path, 
        similarity_path
    )
    
    # Calculate performance metrics
    metrics = calculate_metrics(combined_results)

if __name__ == "__main__":
    main()


Dataset Combination Summary:
Total Locations: 25
Locations with Hornbills: 14
Locations with Suitable Habitat: 11

Prediction Analysis:
True Positives: 10
True Negatives: 10
False Positives: 1
False Negatives: 4

Performance Metrics:
Accuracy: 0.8000
Precision: 0.9091
Recall: 0.7143
F1 Score: 0.8000

Confusion Matrix:
[[10  1]
 [ 4 10]]
