In [20]:
# the purpose of this notebook is to explore the separated 10m, 30m, and 50m datasets and their aggregate data (average
# pitch/roll, variance in turbulence, battery loss, all that jazz)

In [21]:
# import things
import pandas as pd
import numpy as np
from pathlib import Path

In [22]:
# Repair individual logs.
# While running the function(s) below, two logs (Jan07.3-35_10m and Jan06.4-47_30m) turned out to have kept
# collecting data on the way DOWN as well, which confounds the data. Each file is overwritten in place, keeping
# only the rows whose 'index' value is at or below the cutoff so the irrelevant descent data is gone.

# Jan07.3-35_10m: keep rows with index <= 381
badfile1_path = Path('UsableLogs/Jan07.3-35_10m.csv')
badfile1 = pd.read_csv(badfile1_path)
badfile1 = badfile1.loc[badfile1['index'].le(381)]
badfile1.to_csv(badfile1_path, index=False)

# Jan06.4-47_30m: keep rows with index <= 603
badfile2_path = Path('UsableLogs/Jan06.4-47_30m.csv')
badfile2 = pd.read_csv(badfile2_path)
badfile2 = badfile2.loc[badfile2['index'].le(603)]
badfile2.to_csv(badfile2_path, index=False)

In [27]:
# Takes a 5-second window of raw data and returns the exact feature dictionary needed for the Random Forest.

def extract_features(window):
    """Summarise one window of flight-log rows as a flat feature dictionary.

    Parameters
    ----------
    window : pandas.DataFrame
        Rows of a flight log. Must contain 'compass_heading(degrees)', 'voltage(v)',
        'current(A)', 'speed(mph)' and 'zSpeed(mph)', plus either a ready-made
        'tilt_magnitude' column or both 'pitch(degrees)' and 'roll(degrees)'.

    Returns
    -------
    dict
        Keys, in order: speed_avg, speed_sd, zSpeed_sd, yaw_sd, power_avg, power_sd,
        power_intensity, tilt_avg, tilt_sd, turbulence_intensity. The key names are
        meant to match the training CSV columns exactly.

    Notes
    -----
    Every ``.std()`` here is the pandas default (sample) standard deviation. The
    two "intensity" values are std / (mean + 0.05); the 0.05 offset keeps the
    division away from zero.
    """
    def _centre_spread_ratio(series):
        # mean, std, and the offset ratio std / (mean + 0.05) of one series
        centre = series.mean()
        spread = series.std()
        return centre, spread, spread / (centre + 0.05)

    # Yaw: measure the spread of the heading on the unit circle (sin and cos
    # components) rather than on raw degrees, so headings straddling 0/360 do not
    # look like a huge swing.
    heading = np.radians(window['compass_heading(degrees)'])
    sin_spread = np.sin(heading).std()
    cos_spread = np.cos(heading).std()
    yaw_spread = np.sqrt(sin_spread**2 + cos_spread**2)

    # Electrical power in watts = voltage * current
    watts = window['voltage(v)'] * window['current(A)']
    watts_mean, watts_sd, watts_intensity = _centre_spread_ratio(watts)

    # Tilt magnitude: reuse the column when the caller already computed it
    if 'tilt_magnitude' in window.columns:
        tilt = window['tilt_magnitude']
    else:
        tilt = np.sqrt(window['pitch(degrees)']**2 + window['roll(degrees)']**2)
    tilt_centre, tilt_spread, tilt_intensity = _centre_spread_ratio(tilt)

    ground_speed = window['speed(mph)']

    # Insertion order below defines the column order of any frame built from these dicts
    features = {}
    features['speed_avg'] = ground_speed.mean()
    features['speed_sd'] = ground_speed.std()
    features['zSpeed_sd'] = window['zSpeed(mph)'].std()
    features['yaw_sd'] = yaw_spread
    features['power_avg'] = watts_mean
    features['power_sd'] = watts_sd
    features['power_intensity'] = watts_intensity
    features['tilt_avg'] = tilt_centre
    features['tilt_sd'] = tilt_spread
    features['turbulence_intensity'] = tilt_intensity
    return features

In [24]:
# function that turns the files fed into it into a single row of aggregate data

# if file ends with _10m, entry in 'altitude' section is 10m, etc.
# include file name to check weird outliers out
# find average of all speeds, voltage, current, pitch, roll, battery_percent lost over time
# each time, append the row to the dataframe previously made

directory = Path('UsableLogs')
# Kept as a sorted list rather than the bare generator glob() returns: a generator is
# exhausted after a single pass, so any later loop over it would silently see no files.
preaggregate_logs = sorted(directory.glob('*.csv'))
output_dir = Path('AggregateLogs')

def process_usable_logs(log_dir=None):
    """Collapse every CSV flight log in a folder into one row of summary statistics.

    Parameters
    ----------
    log_dir : str or pathlib.Path, optional
        Folder whose ``*.csv`` files are read. When omitted, the notebook-level
        ``directory`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed log, sorted by 'file_name', with the
        columns file_name, flight_time(seconds), altitude(m), zSpeed_avg, zSpeed_sd,
        speed_avg, speed_sd, power_avg, power_sd, power_intensity, tilt_avg, tilt_sd
        and turbulence_intensity. If nothing could be processed, an empty frame with
        those same columns is returned.

    Notes
    -----
    * The folder is globbed afresh on every call, so the function can be re-run
      without first re-creating a module-level generator.
    * 'altitude(m)' is 10, 30 or 50 depending on whether the file name contains
      '_10m', '_30m' or '_50m' (checked in that order); otherwise it is None.
    * A file that raises while being read or summarised is reported with ``print``
      and skipped, so one bad log does not abort the whole run.
    """
    source_dir = directory if log_dir is None else Path(log_dir)
    altitude_tags = (('_10m', 10), ('_30m', 30), ('_50m', 50))
    columns = [
        'file_name', 'flight_time(seconds)', 'altitude(m)',
        'zSpeed_avg', 'zSpeed_sd', 'speed_avg', 'speed_sd',
        'power_avg', 'power_sd', 'power_intensity',
        'tilt_avg', 'tilt_sd', 'turbulence_intensity',
    ]
    aggregated_data = []

    for file_path in sorted(source_dir.glob('*.csv')):
        # First matching tag wins, mirroring an if/elif chain
        altitude = next((metres for tag, metres in altitude_tags if tag in file_path.name), None)

        try:
            df = pd.read_csv(file_path)

            # Flight duration: last timestamp minus first timestamp
            start_time = pd.to_datetime(df['timestamp'].iloc[0])
            end_time = pd.to_datetime(df['timestamp'].iloc[-1])
            flight_time = (end_time - start_time).total_seconds()

            tilt = np.sqrt(df['pitch(degrees)']**2 + df['roll(degrees)']**2)
            power = df['voltage(v)'] * df['current(A)']  # watts = volts * amps

            # Compute each mean/std once and reuse it for the intensity ratios
            power_avg, power_sd = power.mean(), power.std()
            tilt_avg, tilt_sd = tilt.mean(), tilt.std()

            aggregated_data.append({
                'file_name': file_path.stem,
                'flight_time(seconds)': flight_time,
                'altitude(m)': altitude,
                'zSpeed_avg': df['zSpeed(mph)'].mean(),
                'zSpeed_sd': df['zSpeed(mph)'].std(),
                'speed_avg': df['speed(mph)'].mean(),
                'speed_sd': df['speed(mph)'].std(),
                'power_avg': power_avg,
                'power_sd': power_sd,
                'power_intensity': power_sd / (power_avg + 0.05),
                'tilt_avg': tilt_avg,
                'tilt_sd': tilt_sd,
                'turbulence_intensity': tilt_sd / (tilt_avg + 0.05),  # + 0.05 to avoid div by zero
            })

        except Exception as e:
            print(f"Error processing {file_path.name}: {e}")

    if not aggregated_data:
        # Without this guard, sorting an empty, column-less frame raises KeyError('file_name')
        print(f"No logs could be aggregated from {source_dir}")
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(aggregated_data, columns=columns).sort_values('file_name')

aggregate_df = process_usable_logs()

In [34]:
# function that turns each file fed into it into several rows of data to feed to the random forest model.
# uses a sliding 5-second window (only moving forward 0.5 seconds per step, so consecutive windows overlap
# heavily and the data is sampled more thoroughly). naming conventions and other data collected are otherwise similar

directory = Path('UsableLogs')
# Kept as a sorted list rather than the bare generator glob() returns: a generator is
# exhausted after a single pass, so any later loop over it would silently see no files.
preaggregate_logs = sorted(directory.glob('*.csv'))
output_dir = Path('')

def apply_window_to_usable_logs(log_dir=None, window_size=50, step_size=5):
    """Slide a fixed-size window over every CSV flight log and extract features per window.

    Parameters
    ----------
    log_dir : str or pathlib.Path, optional
        Folder whose ``*.csv`` files are read. When omitted, the notebook-level
        ``directory`` is used.
    window_size : int, default 50
        Rows per window (5 seconds, assuming the logs are sampled at 10 Hz).
    step_size : int, default 5
        Rows the window advances each step (0.5 seconds at 10 Hz), so consecutive
        windows share ``window_size - step_size`` rows.

    Returns
    -------
    pandas.DataFrame
        One row per window: whatever ``extract_features`` returns, plus 'altitude(m)'
        and 'file_name'. Rows are sorted by 'file_name' with a stable sort, so the
        windows of each file stay in chronological order. If no window was produced,
        an empty frame with the 'altitude(m)' and 'file_name' columns is returned.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is smaller than 1.

    Notes
    -----
    * The folder is globbed afresh on every call, so the function can be re-run
      without first re-creating a module-level generator.
    * Logs shorter than ``window_size`` rows contribute no windows.
    * 'altitude(m)' is 10, 30 or 50 depending on whether the file name contains
      '_10m', '_30m' or '_50m' (checked in that order); otherwise it is None.
    * A file that raises while being processed is reported with ``print`` and skipped.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must both be at least 1")

    source_dir = directory if log_dir is None else Path(log_dir)
    altitude_tags = (('_10m', 10), ('_30m', 30), ('_50m', 50))
    windowed_data = []

    for file_path in sorted(source_dir.glob('*.csv')):
        # First matching tag wins, mirroring an if/elif chain
        altitude = next((metres for tag, metres in altitude_tags if tag in file_path.name), None)

        try:
            df = pd.read_csv(file_path)
            # Precompute once per file; extract_features reuses this column when present
            df['tilt_magnitude'] = np.sqrt(df['pitch(degrees)']**2 + df['roll(degrees)']**2)

            # "+ 1" so the last full window (starting at len(df) - window_size) is included;
            # without it a log of exactly window_size rows would yield nothing.
            for start in range(0, len(df) - window_size + 1, step_size):
                window = df.iloc[start:start + window_size]
                features = extract_features(window)

                # Add the labels (target) and provenance
                features['altitude(m)'] = altitude
                features['file_name'] = file_path.stem
                windowed_data.append(features)

        except Exception as e:
            print(f"Error processing {file_path.name}: {e}")

    if not windowed_data:
        # Without this guard, sorting an empty, column-less frame raises KeyError('file_name')
        print(f"No windows could be extracted from {source_dir}")
        return pd.DataFrame(columns=['altitude(m)', 'file_name'])

    # mergesort is stable: rows sharing a file_name keep their time order
    return pd.DataFrame(windowed_data).sort_values('file_name', kind='mergesort')

windowed_df = apply_window_to_usable_logs()

In [35]:
# write both result tables to CSV (row index left out)
for _frame, _csv_name in (
    (aggregate_df, 'aggregate_data.csv'),
    (windowed_df, 'windowed_data.csv'),
):
    _frame.to_csv(_csv_name, index=False)

In [None]:
# mean of the per-flight average tilt, one row per altitude
aggregate_df[['altitude(m)', 'tilt_avg']].groupby('altitude(m)').mean()

In [None]:
# seeing which values matter most for determining altitude
# initially these results look a little disappointing, but I have a theory that the wind shear is not linear.
# We'll use a random forest decision tree to get the real pattern out of these variables in another notebook.

# file name and flight time are left out of the correlation table
numeric_stats = aggregate_df.drop(columns=['file_name', 'flight_time(seconds)'])
altitude_corr = numeric_stats.corr()['altitude(m)']
altitude_corr.sort_values(ascending=False)