In [6]:
# Train and Evaluate Hourly Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib

# Load Data and Preprocess
file_path = "./insights/bird_weather_data.xlsx"
data = pd.read_excel(file_path)
data['hourly_timestamp'] = pd.to_datetime(data['hourly_timestamp'])

numerical_columns = [
    "temperature", "windspeed", "humidity", "precipitation",
    "dewpoint", "cloud_cover", "pressure", "solar_radiation",
    "sunshine_duration", "speed_kmph", "direction"
]

# Fill gaps in every feature with that column's median.
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].median())

# NOTE(review): when the sheet has no 'thrives' column the label is a seeded
# random draw (60% zeros / 40% ones) that is independent of the features, so
# the metrics printed below only exercise the pipeline -- confirm a real label
# exists before interpreting them.
if 'thrives' not in data.columns:
    np.random.seed(42)
    data['thrives'] = np.random.choice([0, 1], size=len(data), p=[0.6, 0.4])

# Split Data
# The split happens BEFORE scaling so that the held-out rows never contribute
# to the scaler's mean/variance (avoids preprocessing leakage).
X_hourly_train_raw, X_hourly_test_raw, y_hourly_train, y_hourly_test = train_test_split(
    data[numerical_columns], data['thrives'], test_size=0.2, random_state=42
)

# Normalize Features (statistics learned from the training rows only)
scaler_hourly = StandardScaler()
X_hourly_train = scaler_hourly.fit_transform(X_hourly_train_raw)
X_hourly_test = scaler_hourly.transform(X_hourly_test_raw)
# Full scaled matrix kept under its original name for any later cell that uses it.
hourly_features_scaled = scaler_hourly.transform(data[numerical_columns])

# Train Hourly Model
hourly_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
hourly_model.fit(X_hourly_train, y_hourly_train)

# Evaluate Hourly Model
hourly_predictions = hourly_model.predict(X_hourly_test)
print("\nHourly Model Performance:")
print(classification_report(y_hourly_test, hourly_predictions))

# Feature Importance (same order as numerical_columns)
print("\nHourly Model Feature Importance:")
for feature, importance in zip(numerical_columns, hourly_model.feature_importances_):
    print(f"{feature}: {importance}")

# Save Model and Scaler (written to the current working directory)
joblib.dump(hourly_model, "hourly_model.pkl")
joblib.dump(scaler_hourly, "scaler_hourly.pkl")
print("\nHourly Model and Scaler saved successfully!")


Hourly Model Performance:
              precision    recall  f1-score   support

           0       0.61      0.71      0.66       322
           1       0.41      0.31      0.35       210

    accuracy                           0.55       532
   macro avg       0.51      0.51      0.51       532
weighted avg       0.53      0.55      0.54       532


Hourly Model Feature Importance:
temperature: 0.09754540118684467
windspeed: 0.053027333329410044
humidity: 0.038802617858692594
precipitation: 0.004524087062368561
dewpoint: 0.09171024118568064
cloud_cover: 0.035075281477542256
pressure: 0.16688289849701474
solar_radiation: 0.03785649642097383
sunshine_duration: 0.013746012546719777
speed_kmph: 0.22998046452077883
direction: 0.23084916591397409

Hourly Model and Scaler saved successfully!


In [7]:
# Train and Evaluate Aggregated Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib

# Load Data and Preprocess
file_path = "./insights/bird_weather_data.xlsx"
data = pd.read_excel(file_path)
data['hourly_timestamp'] = pd.to_datetime(data['hourly_timestamp'])

numerical_columns = [
    "temperature", "windspeed", "humidity", "precipitation",
    "dewpoint", "cloud_cover", "pressure", "solar_radiation",
    "sunshine_duration", "speed_kmph", "direction"
]

# Fill gaps in every feature with that column's median before aggregating.
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].median())

# NOTE(review): when the sheet has no 'thrives' column the label is a seeded
# random draw (60% zeros / 40% ones) that is independent of the features --
# confirm a real label exists before interpreting any metric below.
if 'thrives' not in data.columns:
    np.random.seed(42)
    data['thrives'] = np.random.choice([0, 1], size=len(data), p=[0.6, 0.4])

# Aggregate Data by Year and Month
data['year_month'] = data['hourly_timestamp'].dt.to_period('M')
aggregated_data = data.groupby('year_month').agg({
    **{col: ['mean', 'max', 'std'] for col in numerical_columns},
    'thrives': 'mean'
})

# Flatten Multi-Level Column Names, e.g. ('temperature', 'mean') -> 'temperature_mean'
aggregated_data.columns = ['_'.join(col).strip() for col in aggregated_data.columns.values]
aggregated_data = aggregated_data.reset_index()

# Convert Thrives to Binary
# NOTE(review): with the random placeholder label (40% ones) every monthly mean
# is likely to fall below 0.5, which would leave a single class here -- check
# aggregated_data['thrives'].value_counts() before trusting the scores.
aggregated_data['thrives'] = (aggregated_data['thrives_mean'] >= 0.5).astype(int)

# Feature columns: every aggregate except the period key, the label, and
# 'thrives_mean'. The label is computed directly from 'thrives_mean', so
# keeping it as a feature would hand the model the answer (target leakage).
non_feature_columns = {'year_month', 'thrives', 'thrives_mean'}
agg_features_columns = [col for col in aggregated_data.columns if col not in non_feature_columns]

# Split Data
# The split happens BEFORE scaling so that the held-out months never contribute
# to the scaler's mean/variance.
X_agg_train_raw, X_agg_test_raw, y_agg_train, y_agg_test = train_test_split(
    aggregated_data[agg_features_columns], aggregated_data['thrives'], test_size=0.2, random_state=42
)

# Normalize Features (statistics learned from the training rows only)
scaler_agg = StandardScaler()
X_agg_train = scaler_agg.fit_transform(X_agg_train_raw)
X_agg_test = scaler_agg.transform(X_agg_test_raw)
# Full scaled matrix, kept under its original name and used for cross-validation.
aggregated_features_scaled = scaler_agg.transform(aggregated_data[agg_features_columns])

# Train Aggregated Model
aggregated_model = DecisionTreeClassifier(max_depth=3, random_state=42)
aggregated_model.fit(X_agg_train, y_agg_train)

# Evaluate Aggregated Model
aggregated_predictions = aggregated_model.predict(X_agg_test)
print("\nAggregated Model Performance:")
print(classification_report(y_agg_test, aggregated_predictions))

# Cross-Validation
agg_cross_val_scores = cross_val_score(aggregated_model, aggregated_features_scaled, aggregated_data['thrives'], cv=5)
print(f"\nAggregated Model Cross-Validation Accuracy: {np.mean(agg_cross_val_scores)}")

# Feature Importance (same order as agg_features_columns)
print("\nAggregated Model Feature Importance:")
for feature, importance in zip(agg_features_columns, aggregated_model.feature_importances_):
    print(f"{feature}: {importance}")

# Save Model and Scaler (written to the current working directory)
joblib.dump(aggregated_model, "aggregated_model.pkl")
joblib.dump(scaler_agg, "scaler_agg.pkl")
print("\nAggregated Model and Scaler saved successfully!")



Aggregated Model Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Aggregated Model Cross-Validation Accuracy: 1.0

Aggregated Model Feature Importance:
temperature_mean: 0.0
temperature_max: 0.0
temperature_std: 0.0
windspeed_mean: 0.0
windspeed_max: 0.0
windspeed_std: 0.0
humidity_mean: 0.0
humidity_max: 0.0
humidity_std: 0.0
precipitation_mean: 0.0
precipitation_max: 0.0
precipitation_std: 0.0
dewpoint_mean: 0.0
dewpoint_max: 0.0
dewpoint_std: 0.0
cloud_cover_mean: 0.0
cloud_cover_max: 0.0
cloud_cover_std: 0.0
pressure_mean: 0.0
pressure_max: 0.0
pressure_std: 0.0
solar_radiation_mean: 0.0
solar_radiation_max: 0.0
solar_radiation_std: 0.0
sunshine_duration_mean: 0.0
sunshine_duration_max: 0.0
sunshine_duration_std: 0.0
speed_kmph_mean: 0.0
speed_kmp

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib

class BirdSurvivalPredictor:
    """Score monthly weather aggregates with the persisted aggregated model.

    The model and scaler are loaded from the pickle files given to the
    constructor; the hourly spreadsheet is loaded alongside them and rolled up
    to one row per calendar month whenever sample inputs are requested.
    """

    # Hourly source columns and the per-month statistics computed for each one.
    BASE_COLUMNS = (
        'temperature', 'windspeed', 'humidity', 'precipitation',
        'dewpoint', 'cloud_cover', 'pressure', 'solar_radiation',
        'sunshine_duration', 'speed_kmph', 'direction',
    )
    AGG_STATS = ('mean', 'max', 'std')

    def __init__(self, model_path='aggregated_model.pkl', scaler_path='scaler_agg.pkl', data_path='./insights/bird_weather_data.xlsx'):
        """Load the model, the scaler and the hourly dataset.

        Args:
            model_path: joblib pickle of the trained classifier.
            scaler_path: joblib pickle of the fitted scaler.
            data_path: Excel file holding the hourly observations.
        """
        # joblib.load unpickles arbitrary objects -- only point it at trusted files.
        self.model = joblib.load(model_path)
        self.scaler = joblib.load(scaler_path)

        # Load the dataset used for training (still at hourly resolution)
        self.data = pd.read_excel(data_path)

        # 'temperature_mean', 'temperature_max', 'temperature_std', 'windspeed_mean', ...
        self.feature_columns = [
            f"{column}_{stat}" for column in self.BASE_COLUMNS for stat in self.AGG_STATS
        ]

    def _aggregate_monthly(self):
        """Roll the hourly rows up to one row of mean/max/std features per month."""
        hourly = self.data.copy()
        hourly['hourly_timestamp'] = pd.to_datetime(hourly['hourly_timestamp'])

        base = list(self.BASE_COLUMNS)
        # Median-impute gaps first so the aggregates are not distorted by NaNs.
        hourly[base] = hourly[base].fillna(hourly[base].median())

        month_key = hourly['hourly_timestamp'].dt.to_period('M')
        monthly = hourly.groupby(month_key)[base].agg(list(self.AGG_STATS))
        # ('temperature', 'mean') -> 'temperature_mean'
        monthly.columns = ['_'.join(pair) for pair in monthly.columns]
        # NOTE(review): a month with a single hourly row has NaN for every
        # *_std column -- confirm the scaler/model tolerate that.
        return monthly.reset_index(drop=True)

    def get_random_input(self, n_samples=50, random_state=42):
        """Return a random sample of monthly feature rows as a numpy array.

        The raw sheet only holds hourly columns, so the mean/max/std features
        are computed here before sampling.

        Args:
            n_samples: maximum number of rows to draw; capped at the number of
                months available.
            random_state: seed forwarded to ``DataFrame.sample``.

        Returns:
            2-D array with one row per sampled month and one column per entry
            of ``self.feature_columns`` (in that order).
        """
        monthly = self._aggregate_monthly()
        sample_size = min(n_samples, len(monthly))
        random_sample = monthly.sample(n=sample_size, random_state=random_state)
        return random_sample[self.feature_columns].values

    def predict_survival(self, features):
        """Scale ``features`` and run them through the model.

        Returns:
            Tuple ``(prediction, probability)`` holding the outputs of the
            model's ``predict`` and ``predict_proba`` for the scaled rows.
        """
        # Scale the input features
        scaled_features = self.scaler.transform(features)

        # Make prediction
        prediction = self.model.predict(scaled_features)
        probability = self.model.predict_proba(scaled_features)

        return prediction, probability

    def interpret_results(self, prediction, probability):
        """Print a human-readable verdict for ONE sample.

        Args:
            prediction: the predicted label for the sample (1 is reported as
                "Survive", anything else as "Not Survive").
            probability: that sample's probability row; index 1 is read when
                the prediction is 1, index 0 otherwise.
        """
        survival_status = "Survive" if prediction == 1 else "Not Survive"
        survival_probability = probability[1] if prediction == 1 else probability[0]

        print(f"\n--- Prediction Results ---")
        print(f"Bird Survival Status: {survival_status}")
        print(f"Confidence: {survival_probability * 100:.2f}%")

        # Provide additional context based on survival probability
        if survival_probability > 0.8:
            print("High confidence in the prediction.")
        elif survival_probability > 0.6:
            print("Moderate confidence in the prediction.")
        else:
            print("Low confidence in the prediction. Additional factors may influence survival.")

def main():
    """Build a predictor, score a random batch, and print one report per sample."""
    predictor = BirdSurvivalPredictor()

    print("\n--- Bird Survival Predictor ---")
    sample_features = predictor.get_random_input()
    labels, probabilities = predictor.predict_survival(sample_features)

    # Samples are numbered from 1 in the printed report.
    for sample_number, _row in enumerate(sample_features, start=1):
        print(f"\nSample {sample_number}:")
        predictor.interpret_results(
            labels[sample_number - 1],
            probabilities[sample_number - 1],
        )

    print("\nThank you for using the Bird Survival Predictor!")

# Entry point: calls main() when __name__ is "__main__", which is the case
# when this cell is executed in the notebook or the code is run as a script.
if __name__ == "__main__":
    main()


In [9]:
import pandas as pd
import joblib

class BirdSurvivalPredictor:
    """Score hourly weather rows with the persisted hourly model.

    The model and scaler are loaded from the pickle files given to the
    constructor, together with the hourly spreadsheet that sample inputs are
    drawn from.
    """

    def __init__(self, model_path='hourly_model.pkl', scaler_path='scaler_hourly.pkl', data_path='./insights/bird_weather_data.xlsx'):
        """Load the model, the scaler and the hourly dataset.

        Args:
            model_path: joblib pickle of the trained classifier.
            scaler_path: joblib pickle of the fitted scaler.
            data_path: Excel file holding the hourly observations.
        """
        # joblib.load unpickles arbitrary objects -- only point it at trusted files.
        self.model = joblib.load(model_path)
        self.scaler = joblib.load(scaler_path)

        # Load the dataset used for training
        self.data = pd.read_excel(data_path)

        # Define the feature columns (for hourly features)
        self.feature_columns = [
            'temperature', 'windspeed', 'humidity', 'precipitation',
            'dewpoint', 'cloud_cover', 'pressure', 'solar_radiation',
            'sunshine_duration', 'speed_kmph', 'direction'
        ]

    def get_random_input(self, n_samples=50, random_state=42):
        """Return a random sample of feature rows as a numpy array.

        Missing values are replaced by the column median over the whole
        dataset, so the sampled rows never carry NaNs into the scaler/model.

        Args:
            n_samples: maximum number of rows to draw; capped at the number of
                rows available so small datasets do not raise.
            random_state: seed forwarded to ``DataFrame.sample``.

        Returns:
            2-D array with one row per sampled observation and one column per
            entry of ``self.feature_columns`` (in that order).
        """
        features = self.data[self.feature_columns]
        features = features.fillna(features.median())

        sample_size = min(n_samples, len(features))
        random_sample = features.sample(n=sample_size, random_state=random_state)

        # Return the features as a numpy array
        return random_sample.values

    def predict_survival(self, features):
        """Scale ``features`` and run them through the model.

        Returns:
            Tuple ``(prediction, probability)`` holding the outputs of the
            model's ``predict`` and ``predict_proba`` for the scaled rows.
        """
        # Scale the input features
        scaled_features = self.scaler.transform(features)

        # Make prediction
        prediction = self.model.predict(scaled_features)
        probability = self.model.predict_proba(scaled_features)

        return prediction, probability

    def interpret_results(self, prediction, probability):
        """Print a human-readable verdict for ONE sample.

        Args:
            prediction: the predicted label for the sample (1 is reported as
                "Survive", anything else as "Not Survive").
            probability: that sample's probability row; index 1 is read when
                the prediction is 1, index 0 otherwise.
        """
        survival_status = "Survive" if prediction == 1 else "Not Survive"
        survival_probability = probability[1] if prediction == 1 else probability[0]

        print(f"\n--- Prediction Results ---")
        print(f"Bird Survival Status: {survival_status}")
        print(f"Confidence: {survival_probability * 100:.2f}%")

        # Provide additional context based on survival probability
        if survival_probability > 0.8:
            print("High confidence in the prediction.")
        elif survival_probability > 0.6:
            print("Moderate confidence in the prediction.")
        else:
            print("Low confidence in the prediction. Additional factors may influence survival.")

def main():
    """Build a predictor, score a random batch, and print one report per sample."""
    predictor = BirdSurvivalPredictor()

    print("\n--- Bird Survival Predictor ---")
    sample_features = predictor.get_random_input()
    labels, probabilities = predictor.predict_survival(sample_features)

    # Samples are numbered from 1 in the printed report.
    for sample_number, _row in enumerate(sample_features, start=1):
        print(f"\nSample {sample_number}:")
        predictor.interpret_results(
            labels[sample_number - 1],
            probabilities[sample_number - 1],
        )

    print("\nThank you for using the Bird Survival Predictor!")

# Entry point: calls main() when __name__ is "__main__", which is the case
# when this cell is executed in the notebook or the code is run as a script.
if __name__ == "__main__":
    main()



--- Bird Survival Predictor ---

Sample 1:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 83.00%
High confidence in the prediction.

Sample 2:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 83.00%
High confidence in the prediction.

Sample 3:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 51.00%
Low confidence in the prediction. Additional factors may influence survival.

Sample 4:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 72.00%
Moderate confidence in the prediction.

Sample 5:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 75.00%
Moderate confidence in the prediction.

Sample 6:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 71.00%
Moderate confidence in the prediction.

Sample 7:

--- Prediction Results ---
Bird Survival Status: Not Survive
Confidence: 51.00%
Low confidence in the prediction. Additional factors may in



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib

# Load Data and Preprocess
file_path = "./insights/bird_weather_data.xlsx"
data = pd.read_excel(file_path)
data['hourly_timestamp'] = pd.to_datetime(data['hourly_timestamp'])

# Define numerical columns
numerical_columns = [
    "temperature", "windspeed", "humidity", "precipitation",
    "dewpoint", "cloud_cover", "pressure", "solar_radiation",
    "sunshine_duration", "speed_kmph", "direction"
]

# Fill missing values with median
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].median())

# Generate target variable if not present
# NOTE(review): this placeholder is a seeded random draw (60% zeros / 40% ones)
# that is independent of the features -- confirm a real label exists before
# interpreting the per-month metrics.
if 'thrives' not in data.columns:
    np.random.seed(42)
    data['thrives'] = np.random.choice([0, 1], size=len(data), p=[0.6, 0.4])

# Extract month from the timestamp (1-12; the same month of different years is pooled)
data['month'] = data['hourly_timestamp'].dt.month

# Make sure the output directory exists before anything is dumped into it;
# joblib.dump does not create missing parent directories.
import os
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Initialize dictionaries to store models and scalers, keyed by month number
models_monthly = {}
scalers_monthly = {}

# Loop through each month and train a model
for month, subset in data.groupby('month'):
    print(f"\n--- Training Model for Month: {month} ---")

    # Split data first so the held-out rows never contribute to the scaler's
    # mean/variance.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        subset[numerical_columns], subset['thrives'], test_size=0.2, random_state=42
    )

    # Normalize features (statistics learned from this month's training rows only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Evaluate model
    predictions = model.predict(X_test)
    print(f"Model Performance for Month {month}:")
    print(classification_report(y_test, predictions))

    # Feature importance (same order as numerical_columns)
    print("\nFeature Importance:")
    for feature, importance in zip(numerical_columns, model.feature_importances_):
        print(f"{feature}: {importance}")

    # Save model and scaler
    model_file = f"{MODEL_DIR}/monthly_model_{month}.pkl"
    scaler_file = f"{MODEL_DIR}/scaler_monthly_{month}.pkl"
    joblib.dump(model, model_file)
    joblib.dump(scaler, scaler_file)
    print(f"Saved Model to {model_file} and Scaler to {scaler_file}")

    # Store in dictionaries
    models_monthly[month] = model
    scalers_monthly[month] = scaler

print("\n--- All Monthly Models and Scalers Saved Successfully! ---")



--- Training Model for Month: 1 ---
Model Performance for Month 1:
              precision    recall  f1-score   support

           0       0.65      0.68      0.67        25
           1       0.38      0.36      0.37        14

    accuracy                           0.56        39
   macro avg       0.52      0.52      0.52        39
weighted avg       0.56      0.56      0.56        39


Feature Importance:
temperature: 0.08576777337117719
windspeed: 0.07346363899537421
humidity: 0.04851948850149759
precipitation: 0.0
dewpoint: 0.10453337580577735
cloud_cover: 0.04659926640971653
pressure: 0.15862848624903095
solar_radiation: 0.05891871606555746
sunshine_duration: 0.015189315630414358
speed_kmph: 0.2007331998781931
direction: 0.20764673909326134
Saved Model to monthly_model_1.pkl and Scaler to scaler_monthly_1.pkl

--- Training Model for Month: 2 ---
Model Performance for Month 2:
              precision    recall  f1-score   support

           0       0.62      0.59      0.60   