In [4]:
import h5py

file_path = 'ATL03_20240510223215_08142301_006_01.h5'  # Your file name

with h5py.File(file_path, 'r') as file:
    # Loop through all the keys and explore each one
    for key in file.keys():
        print(f"\nExploring '{key}' group:")
        if isinstance(file[key], h5py.Group):  # If it's a group, list its contents
            print(f"Subkeys in '{key}':", list(file[key].keys()))
        elif isinstance(file[key], h5py.Dataset):  # If it's a dataset, print its shape and type
            dataset = file[key]
            print(f"Dataset '{key}':", dataset.shape, dataset.dtype)


Exploring 'METADATA' group:
Subkeys in 'METADATA': ['AcquisitionInformation', 'DataQuality', 'DatasetIdentification', 'Extent', 'Lineage', 'ProcessStep', 'ProductSpecificationDocument', 'QADatasetIdentification', 'SeriesIdentification']

Exploring 'ancillary_data' group:
Subkeys in 'ancillary_data': ['atlas_sdp_gps_epoch', 'control', 'data_end_utc', 'data_start_utc', 'end_cycle', 'end_delta_time', 'end_geoseg', 'end_gpssow', 'end_gpsweek', 'end_orbit', 'end_region', 'end_rgt', 'granule_end_utc', 'granule_start_utc', 'release', 'start_cycle', 'start_delta_time', 'start_geoseg', 'start_gpssow', 'start_gpsweek', 'start_orbit', 'start_region', 'start_rgt', 'version', 'altimetry', 'atlas_engineering', 'calibrations', 'tep', 'gt2r', 'gt2l', 'gt3r', 'gt3l', 'gt1r', 'gt1l']

Exploring 'atlas_impulse_response' group:
Subkeys in 'atlas_impulse_response': ['pce1_spot1', 'pce2_spot3']

Exploring 'ds_surf_type' group:
Dataset 'ds_surf_type': (5,) int32

Exploring 'ds_xyz' group:
Dataset 'ds_xyz': 

In [2]:
import h5py

file_path = 'ATL03_20240510223215_08142301_006_01.h5'  # Your file path

with h5py.File(file_path, 'r') as file:
    # Explore the 'gt1l' group
    gt1l_keys = list(file['gt1l'].keys())
    print(f"Subkeys in 'gt1l': {gt1l_keys}")
    
    # Explore 'geolocation' subkeys under 'gt1l'
    if 'geolocation' in gt1l_keys:
        geolocation_keys = list(file['gt1l/geolocation'].keys())
        print(f"Subkeys in 'gt1l/geolocation': {geolocation_keys}")
    
    # Explore 'heights' subkeys under 'gt1l'
    if 'heights' in gt1l_keys:
        heights_keys = list(file['gt1l/heights'].keys())
        print(f"Subkeys in 'gt1l/heights': {heights_keys}")

Subkeys in 'gt1l': ['bckgrd_atlas', 'geolocation', 'geophys_corr', 'heights', 'signal_find_output']
Subkeys in 'gt1l/geolocation': ['altitude_sc', 'bounce_time_offset', 'delta_time', 'full_sat_fract', 'knn', 'near_sat_fract', 'neutat_delay_derivative', 'neutat_delay_total', 'neutat_ht', 'ph_index_beg', 'pitch', 'podppd_flag', 'range_bias_corr', 'ref_azimuth', 'ref_elev', 'reference_photon_index', 'reference_photon_lat', 'reference_photon_lon', 'roll', 'segment_dist_x', 'segment_id', 'segment_length', 'segment_ph_cnt', 'sigma_across', 'sigma_along', 'sigma_h', 'sigma_lat', 'sigma_lon', 'solar_azimuth', 'solar_elevation', 'surf_type', 'tx_pulse_energy', 'tx_pulse_skew_est', 'tx_pulse_width_lower', 'tx_pulse_width_upper', 'velocity_sc', 'yaw']
Subkeys in 'gt1l/heights': ['delta_time', 'dist_ph_across', 'dist_ph_along', 'h_ph', 'lat_ph', 'lon_ph', 'pce_mframe_cnt', 'ph_id_channel', 'ph_id_count', 'ph_id_pulse', 'quality_ph', 'signal_conf_ph', 'weight_ph']


In [12]:
import h5py
import pandas as pd

file_path = 'ATL03_20240510223215_08142301_006_01.h5'  # Your file path

# Open the HDF5 file
with h5py.File(file_path, 'r') as file:
    # Extract important datasets like latitude, longitude, height, and time
    latitudes = file['gt1l/heights/lat_ph'][:]
    longitudes = file['gt1l/heights/lon_ph'][:]
    heights = file['gt1l/heights/h_ph'][:]  # Example height data
    delta_time = file['gt1l/heights/delta_time'][:]  # Time data

# Create a pandas DataFrame with the extracted data
data = {
    'Latitude': latitudes,
    'Longitude': longitudes,
    'Height': heights,
    'Delta_Time': delta_time
}

df = pd.DataFrame(data)

# Show the first few rows of the DataFrame
print(df.head())

   Latitude   Longitude     Height    Delta_Time
0 -0.002932 -122.664507 -15.840814  2.006155e+08
1 -0.002913 -122.664509 -16.016417  2.006155e+08
2 -0.002907 -122.664509 -19.940960  2.006155e+08
3 -0.002907 -122.664509 -16.021324  2.006155e+08
4 -0.002907 -122.664509 -16.066422  2.006155e+08


In [13]:
print(df.shape)

(1601266, 4)


In [4]:
print("First ten rows of datasets",df.head(10))

First ten rows of datasets    Latitude   Longitude     Height    Delta_Time
0 -0.002932 -122.664507 -15.840814  2.006155e+08
1 -0.002913 -122.664509 -16.016417  2.006155e+08
2 -0.002907 -122.664509 -19.940960  2.006155e+08
3 -0.002907 -122.664509 -16.021324  2.006155e+08
4 -0.002907 -122.664509 -16.066422  2.006155e+08
5 -0.002894 -122.664511 -15.945132  2.006155e+08
6 -0.002862 -122.664514 -15.889396  2.006155e+08
7 -0.002855 -122.664514 -15.701778  2.006155e+08
8 -0.002842 -122.664516 -15.825165  2.006155e+08
9 -0.002836 -122.664518   4.740792  2.006155e+08


In [14]:
print("Random rows from datasets",df.sample(10))

Random rows from datasets           Latitude   Longitude      Height    Delta_Time
1324630  19.783059 -124.673339 -138.976105  2.006159e+08
1214017  16.891456 -124.372575  356.000366  2.006158e+08
468127    5.850877 -123.250547  -59.400513  2.006156e+08
1241308  17.743370 -124.460777  333.974731  2.006158e+08
1370806  21.030084 -124.804354   49.523533  2.006159e+08
201488    2.044516 -122.869279   -3.786586  2.006156e+08
524621    6.576791 -123.323500   12.005555  2.006156e+08
312274    3.135426 -122.978425    2.185738  2.006156e+08
158512    1.574013 -122.822207   -8.637660  2.006156e+08
1002362  13.928182 -124.067930   59.084339  2.006158e+08


In [7]:
df.to_csv('glacial_lake_data.csv',index=False)

In [15]:
# Check for missing values
print(df.isnull().sum())

# If there are missing values, you can drop them or fill them with a strategy
df = df.dropna()  # Option to drop rows with missing values

# Feature Scaling (Normalizing Latitude, Longitude, Height, and Delta_Time)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['Latitude', 'Longitude', 'Height', 'Delta_Time']] = scaler.fit_transform(df[['Latitude', 'Longitude', 'Height', 'Delta_Time']])

Latitude      0
Longitude     0
Height        0
Delta_Time    0
dtype: int64


In [16]:
import numpy as np

# Inverse transform the entire feature set that was scaled (Latitude, Longitude, Height, Delta_Time)
original_values = scaler.inverse_transform(df[['Latitude', 'Longitude', 'Height', 'Delta_Time']])

# Extract the original Height values (column index 2, as it's the third column)
original_height = original_values[:, 2]


# Apply the GLOF risk threshold based on the original height
certain_threshold = -20  # Set based on domain knowledge
df['GLOF_Risk'] = np.where(original_height < certain_threshold, 1, 0)

# Check the distribution of GLOF risk
print(df['GLOF_Risk'].value_counts())

GLOF_Risk
0    1439803
1     161463
Name: count, dtype: int64


In [17]:
X = df[['Latitude', 'Longitude', 'Delta_Time']]  # Feature set
y = df['GLOF_Risk']  # Target: 1 for GLOF, 0 for safe

In [18]:
from sklearn.model_selection import train_test_split
# Define X (features) and y (target)
X = df[['Latitude', 'Longitude', 'Delta_Time']]  # Features
y = df['GLOF_Risk']  # Target (Binary classification)

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

Training set: (1281012, 3), Test set: (320254, 3)


In [19]:
# Necessary imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the balanced dataset into training and testing sets
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train the Random Forest Classifier on the balanced dataset
clf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
clf_smote.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred_smote = clf_smote.predict(X_test_smote)

# Evaluate the model
accuracy_smote = accuracy_score(y_test_smote, y_pred_smote)
print("Accuracy after SMOTE:\n", accuracy_smote)
print("Confusion Matrix after SMOTE:\n", confusion_matrix(y_test_smote, y_pred_smote))
print("\nClassification Report after SMOTE:\n", classification_report(y_test_smote, y_pred_smote))


Accuracy after SMOTE:
 0.9303551522602019
Confusion Matrix after SMOTE:
 [[266055  22196]
 [ 17914 269757]]

Classification Report after SMOTE:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93    288251
           1       0.92      0.94      0.93    287671

    accuracy                           0.93    575922
   macro avg       0.93      0.93      0.93    575922
weighted avg       0.93      0.93      0.93    575922



In [24]:
import pandas as pd
from datetime import datetime

# Example user inputs (these would come from your app or live location)
user_latitude = 33.2778  # Example Latitude
user_longitude = 75.3412   # Example Longitude
user_time = datetime.now().timestamp()  # Current time as Delta_Time
user_height = 0  # Height is included but won't be used in the model

# Step 1: Create a DataFrame with user input including Height
input_data = pd.DataFrame({
    'Latitude': [user_latitude],
    'Longitude': [user_longitude],
    'Height': [user_height],
    'Delta_Time': [user_time] # Include height for other purposes if necessary
})

# Step 2: Standardize the entire input data (including 'Height')
input_data_scaled = scaler.transform(input_data)

# Convert the scaled NumPy array back to a DataFrame for easier column handling
input_data_scaled_df = pd.DataFrame(input_data_scaled, columns=input_data.columns)

# Step 3: Remove the 'Height' column from the scaled data
input_data_for_model = input_data_scaled_df.drop(columns=['Height'])

# Step 4: Predict whether the input location is at risk of GLOF using the trained model
risk_prediction = clf_smote.predict(input_data_for_model)

# Step 5: Output the result based on the prediction
if risk_prediction == 1:
    print("This location is in a high-risk GLOF zone.")
else:
    print("This location is in a safe zone.")


This location is in a safe zone.
