In [3]:
import pandas as pd
import numpy as np

# Monthly rainfall columns; the per-row peak across them is reported below.
rainfall_cols = [
    'JANr', 'FEBr', 'MARr', 'APRr', 'MAYr', 'JUNr',
    'JULr', 'AUGr', 'SEPr', 'OCTr', 'NOVr', 'DECr',
]

df = pd.read_csv('finalsheet.csv')

# Show the observed value ranges so the risk thresholds can be compared
# against the real data before any labels are generated.
print("Data Analysis:")
print(f"Slope range: {df['slope_deg'].min():.1f} to {df['slope_deg'].max():.1f}")
print(f"Elevation range: {df['elevation'].min():.1f} to {df['elevation'].max():.1f}")

# Highest monthly rainfall value found in each row.
max_rain = df.loc[:, rainfall_cols].max(axis='columns')
print(f"Max rainfall range: {max_rain.min():.1f} to {max_rain.max():.1f}")

def _count_exceeded(values, thresholds):
    """Return, per element, how many of *thresholds* the value strictly exceeds."""
    return sum((values > limit).astype(int) for limit in thresholds)


def generate_rockfall_labels_fixed(df, rainfall_columns=None):
    """Score every row for rockfall risk and derive a Yes/No label.

    Three tiered scores are added into a total between 0 and 7:

    * slope (``slope_deg``): +1 for each of 80, 85 and 88 that the value
      strictly exceeds (0 to 3);
    * rainfall (row-wise maximum over the rainfall columns): +1 for each
      of 8, 12 and 16 that the value strictly exceeds (0 to 3);
    * elevation (``elevation``): +0.5 for each of 300 and 600 that the
      value strictly exceeds (0, 0.5 or 1).

    Args:
        df: DataFrame holding ``slope_deg``, ``elevation`` and the
            rainfall columns.
        rainfall_columns: Names of the rainfall columns to take the
            row-wise maximum over. Defaults to the twelve monthly columns
            ``JANr`` ... ``DECr``, so the function does not rely on a
            module-level ``rainfall_cols`` variable being defined.

    Returns:
        A ``(rockfall, probability)`` tuple of NumPy arrays with one entry
        per row: ``rockfall`` is ``'Yes'`` where the total score is at
        least 5.5 and ``'No'`` otherwise; ``probability`` is the total
        divided by 7, clipped to [0, 1] and rounded to 3 decimals.

    Note:
        A missing (NaN) value compares False against every threshold, so
        it contributes 0 to the score instead of raising an error.
    """
    if rainfall_columns is None:
        rainfall_columns = (
            'JANr', 'FEBr', 'MARr', 'APRr', 'MAYr', 'JUNr',
            'JULr', 'AUGr', 'SEPr', 'OCTr', 'NOVr', 'DECr',
        )

    # Work on plain arrays so the returned values are ndarrays, not Series.
    max_rain = df[list(rainfall_columns)].max(axis=1).to_numpy()
    slope = df['slope_deg'].to_numpy()
    elevation = df['elevation'].to_numpy()

    # NOTE(review): the recorded run of this notebook reports slopes of
    # 87.4 to 90.0, so the 80 and 85 tiers are exceeded by every row of
    # that dataset and only the 88 tier separates rows -- confirm that the
    # slope thresholds are the intended ones.
    slope_risk = _count_exceeded(slope, (80, 85, 88))
    rain_risk = _count_exceeded(max_rain, (8, 12, 16))
    # Elevation is weighted at half a point per tier.
    elev_risk = 0.5 * _count_exceeded(elevation, (300, 600))

    total_risk = slope_risk + rain_risk + elev_risk

    # 7 is the highest reachable total (3 + 3 + 1); 5.5 is the label cut-off.
    rockfall = np.where(total_risk >= 5.5, 'Yes', 'No')
    probability = np.clip(total_risk / 7, 0, 1)

    return rockfall, np.round(probability, 3)

# Label every row, persist the labelled sheet, then print a summary.
print("\nGenerating with adjusted thresholds...")
df['rockfall'], df['rockfall_probability'] = generate_rockfall_labels_fixed(df)

df.to_csv('airockfalldata.csv', index=False)

# Count each label once with the vectorised Series.sum(); the built-in sum()
# would walk the boolean Series element by element in Python.
total_rows = len(df)
yes_count = int((df['rockfall'] == 'Yes').sum())
no_count = int((df['rockfall'] == 'No').sum())

print("Results:")
print(f"High Risk (Yes): {yes_count} ({100*yes_count/total_rows:.1f}%)")
print(f"Low Risk (No): {no_count} ({100*no_count/total_rows:.1f}%)")
print(f"Average Probability: {df['rockfall_probability'].mean():.3f}")


Data Analysis:
Slope range: 87.4 to 90.0
Elevation range: -493.0 to 1419.9
Max rainfall range: 6.4 to 32.8

Generating with adjusted thresholds...
Results:
High Risk (Yes): 294665 (29.0%)
Low Risk (No): 719741 (71.0%)
Average Probability: 0.705
