# Import libraries

In [2]:
import pandas as pd
import numpy as np
import logging

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import plotly.express as px
import folium

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import BallTree

import joblib

import time
import os

#Configure logging

In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Import Data

In [4]:
def import_data(file_path):
    # Read data from the specified file path
    df_raw=pd.read_csv(file_path)

    # Return the imported DataFrame
    return df_raw

# Cleaning Data

In [5]:
def reduce_sampling_rate(df):
    df.set_index('Timestamp', inplace=True)
    # Set the 'Timestamp' column as the DataFrame's index
    df_resampled = df.resample('S').first()
    # Reset the index to create a standard numerical index
    df_resampled.reset_index(inplace=True)
    # option
    # export to CSV
    # df_resampled.to_csv('df_206_reduced_sample_rate.csv', index=False)
    return df_resampled

# Feature Engineering

In [6]:

def add_engineered_features(df, alt_threshold=20, speed_threshold=3, inplace=False):
    """
    Calculate the altitude change, speed change, and course change between consecutive rows and remove outliers.

    Args:
        df (pd.DataFrame): DataFrame containing the altitude, speed, and course data.
        alt_threshold (float): Threshold value for altitude change outlier detection.
        speed_threshold (float): Threshold value for speed change outlier detection.
        inplace (bool): Whether to modify the original DataFrame or create a copy.

    Returns:
        tuple: A tuple containing the modified DataFrame and a dictionary with shape information.
    """
    # Check if required columns exist
    required_columns = ['Alt(m)', 'Speed(m/s)', 'Course']
    if not all(col in df.columns for col in required_columns):
        logger.error("Required columns not found in DataFrame.")
        return None, {}

    # Store the initial DataFrame size
    initial_size = len(df)

    # Calculate changes
    df['Alt(m)_change'] = df['Alt(m)'].diff().fillna(0)
    df['Speed(m/s)_change'] = df['Speed(m/s)'].diff().fillna(0)
    df['Course_change'] = df['Course'].diff().fillna(0)

    # Remove outliers
    mask = (df['Alt(m)_change'].abs() <= alt_threshold) & \
           (df['Speed(m/s)_change'].abs() <= speed_threshold)
    filtered_df = df[mask] if inplace else df.copy()[mask]

    # Reset the index of the filtered DataFrame and drop null values
    filtered_df = filtered_df.dropna()
    filtered_df = filtered_df.reset_index(drop=True)

    # Log the shape after outlier removal
    logger.info("Shape before outlier removal: %d", initial_size)
    logger.info("Shape after outlier removal: %d", len(filtered_df))

    return filtered_df

In [7]:
#updated convert_datetime function
def convert_datetime(df, inplace=False):
    """
    Convert the 'Timestamp' column in a DataFrame to datetime format.

    Args:
        df (pd.DataFrame): DataFrame containing the 'Timestamp' column.
        inplace (bool): Whether to modify the original DataFrame or create a copy.

    Returns:
        pd.DataFrame: The DataFrame with the 'Timestamp' column converted to datetime format.
    """
    # Check if 'Timestamp' column exists
    if 'Timestamp' not in df.columns:
        raise ValueError("Column 'Timestamp' not found in DataFrame.")

    # Convert 'Timestamp' column to datetime format
    if inplace:
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ns')
    else:
        df = df.copy()
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ns')

    return df


# Prediction

In [8]:
def load_model(file_path_to_model):

    # Load the saved model
    return joblib.load(file_path_to_model)


In [9]:
def show_hyperparameters(model):
    # show hyperparameters
    return model.get_params()


In [24]:
def predict_on_features(model, df):
    #    V2 combined old select features and predict on features functions
    """
    Select a subset of features from a DataFrame and make a prediction based on those features, returns full dataframe including prediction.

    Args:
        df (pd.DataFrame): DataFrame containing the features.

    Returns:
        pd.DataFrame: A DataFrame containing only the selected features.
    """
    features_to_use = ['accelX(g)', 'accelY(g)', 'accelZ(g)', 'accelUserX(g)', 'accelUserY(g)',
                       'accelUserZ(g)', 'gyroX(rad/s)', 'gyroY(rad/s)', 'gyroZ(rad/s)',
                       'Roll(rads)', 'Pitch(rads)', 'Yaw(rads)', 'Lat', 'Long', 'Speed(m/s)',
                       'HorizontalAccuracy(m)', 'VerticalAccuracy(m)', 'Course', 'calMagX(µT)',
                       'calMagY(µT)', 'calMagZ(µT)', 'Alt(m)_change',
                       'Speed(m/s)_change', 'Course_change']

    # Check if all features exist in the DataFrame
    missing_features = [feature for feature in features_to_use if feature not in df.columns]
    if missing_features:
        raise ValueError(f"Features not found in DataFrame: {missing_features}")

    # Select the features
    X = df.copy()[features_to_use]

    #predict on selected features
    predictions = model.predict(X)
    df['predicted'] = predictions

    return df


# Visualisation

### Plotting

In [12]:
def plot_prediction(df, target_column='predicted', cmap=None):
    # Define the plot title based on the target column
    if target_column == 'on_lift':
        plot_title = 'Predictions'
    elif target_column == 'mask':
        plot_title = 'Clean Predictions with mask'
    elif target_column == 'event':
        plot_title = 'Lift Events'
    else:
        plot_title = 'Predictions'


    # Use the passed colormap if available, otherwise use the default
    if cmap is None:
        dark2_cmap = ListedColormap(plt.cm.Dark2(range(8)))
        cmap = {str(idx): color for idx, color in enumerate(dark2_cmap.colors)}


    # Check if 'Timestamp' column exists and is in datetime format
    if 'Timestamp' in df.columns and pd.api.types.is_datetime64_any_dtype(df['Timestamp']):
        # Create a scatter plot for Altitude over Time, colored by target_column with an accessible color scheme
        fig = px.scatter(df, x='Timestamp',
                         y='Alt(m)', color=target_column,
                         labels={'Alt(m)': 'Altitude (m)'},
                         title=plot_title,
                         color_discrete_map=cmap)

        fig.update_traces(marker=dict(size=8),
                          selector=dict(mode='markers'))

        # Customize the legend
        fig.update_layout(
            legend_title_text='Status',
            width=1000,
            height=600
        )

        # Update legend labels based on target_column
        if target_column == 'on_lift':
            fig.for_each_trace(lambda trace: trace.update(name='Not on the lift' if trace.name == '0' else 'On the lift'))
        # Add more conditions if there are different classes for other target_columns

        # Show the plot
        fig.show()
    else:
        print("Warning: DataFrame's 'Timestamp' column is not in datetime format and must be converted first.")


In [13]:
def plot_total_alt_over_time(df, plot_title='Total Tracked Altitude Over Time'):

    # Create a line plot using Plotly
    fig = go.Figure()

    # Add a trace for altitude over time
    fig.add_trace(go.Scatter(x=df['Timestamp'],
                             y=df['Alt(m)'],
                             mode='lines',
                             name='Altitude'))

    # Update layout
    fig.update_layout(title=plot_title,
                      xaxis_title='Timestamp',
                      yaxis_title='Altitude (m)')

    # Show plot
    fig.show()

### Mapping

In [14]:
#Map all tracked movement based on lat and long of GPS data

def map_tracked_movement(df, zoom_start=12):
    # Create a map centered on the mean latitude and longitude
    map_center = [df['Lat'].mean(), df['Long'].mean()]
    movement_on_map = folium.Map(location=map_center, zoom_start=zoom_start)

    # Add CircleMarkers for each data point
    for index, row in df.iterrows():
        folium.CircleMarker(location=[row['Lat'], row['Long']], radius=5, color='blue', fill=True, fill_color='blue').add_to(movement_on_map)

    # Display the map
    return movement_on_map

In [15]:
#Map lifts rides in red and all the other movement in blue

def map_lifts_and_other_movement(df, column='on_lift', zoom_start=15):
    # Create a map centered on the mean latitude and longitude
    map_center = [df['Lat'].mean(), df['Long'].mean()]
    tracking_map = folium.Map(location=map_center, zoom_start=zoom_start)

    # Plot data points with on_lift type
    on_lift = df[df[column] == 1]
    for _, row in on_lift.iterrows():
        folium.CircleMarker(location=[row['Lat'], row['Long']], radius=5, color='red', fill=True, fill_color='red', tooltip=str(row['Timestamp'])).add_to(tracking_map)

    # Plot data points with not_on_lift  types
    not_on_lift = df[df[column] != 1]
    for _, row in not_on_lift.iterrows():
        folium.CircleMarker(location=[row['Lat'], row['Long']], radius=5, color='blue', fill=True, fill_color='blue', tooltip=str(row['Timestamp'])).add_to(tracking_map)

    # Return the map object
    return tracking_map

In [16]:
#Map only lifts rides

def map_lift_rides(df, column='on_lift', zoom_start=15):
    # Create a map centered on the mean latitude and longitude
    map_center = [df['Lat'].mean(), df['Long'].mean()]
    lift_map = folium.Map(location=map_center, zoom_start=zoom_start)

    # Plot data points with on_lift type
    on_lift = df[df[column] == 1]
    for _, row in on_lift.iterrows():
        folium.CircleMarker(location=[row['Lat'],
                                      row['Long']],
                                      radius=5,
                                      color='red',
                                      fill=True,
                                      fill_color='red',
                                      tooltip=str(row['Timestamp'])).add_to(lift_map)

    # Return the map object
    return lift_map

# Post-Processing

In [17]:
### updated misclassification mask v0.2

import pandas as pd

def generate_misclassification_mask(df, column_to_mask='predicted', chunk_size=60, threshold=0.3):
    """
    Apply a binary mask to each row in a DataFrame based on the average value of a specified column in chunks.

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        column_to_mask (str): Name of the column to calculate the mean and apply the mask.
        chunk_size (int): Size of the chunks to divide the DataFrame into.
        threshold (float): Threshold value for determining the mask value.

    Returns:
        tuple: A tuple containing the updated DataFrame with the mask applied and the event log.
    """

    # Check if the 'mask' column already exists
    if 'mask' in df.columns:
        raise ValueError("The 'mask' column already exists in the DataFrame.")

    # Initialize the event log
    event_log = {}

    # Calculate the total number of chunks
    total_chunks = len(df) // chunk_size
    remainder = len(df) % chunk_size

    # Process each chunk
    for i in range(total_chunks):
        start_index = i * chunk_size
        end_index = start_index + chunk_size

        # Calculate the mean of the chunk and create the mask
        mean_value = df[column_to_mask].iloc[start_index:end_index].mean()
        mask_value =  1 if mean_value >= threshold else  0

        # Update the DataFrame with the mask value
        df.loc[start_index:end_index, 'mask'] = mask_value

        # Record the event log
        event_log[i] = (start_index, end_index, mask_value)

    # Process the remainder if any
    if remainder >  0:
        start_index = total_chunks * chunk_size
        end_index = len(df)

        # Calculate the mean of the remainder and create the mask
        mean_value = df[column_to_mask].iloc[start_index:end_index].mean()
        mask_value =  1 if mean_value >= threshold else  0

        # Update the DataFrame with the mask value
        df.loc[start_index:end_index, 'mask'] = mask_value

        # Record the event log for the remainder
        event_log[total_chunks] = (start_index, end_index, mask_value)

    # Return the updated DataFrame and the event log
    return df, event_log



In [18]:
# Updated Function for defining on-lift identification v0.2

def on_lift_event_identification(df, event_log):
    """
    Identify continuous events in a DataFrame based on an event log and assign a unique label to each event.

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        event_log (dict): Event log generated by the generate_misclassification_mask function.

    Returns:
        pd.DataFrame: The updated DataFrame with a new 'event' column indicating the event label for each row.
    """
    # Validate the event log
    if not isinstance(event_log, dict) or not all(isinstance(v, tuple) and len(v) ==  3 for v in event_log.values()):
        raise ValueError("Invalid event log format.")

    continuous_events_dict = {}
    event_index = 1

    start = None
    end = None

    for key in sorted(event_log.keys()):
        if event_log[key][2] > 0:
          if start is None:
              start = event_log[key][0]
              end = event_log[key][1]
          elif end == event_log[key][0]:
              end = event_log[key][1]
          else:
              continuous_events_dict[event_index] = (start, end)
              event_index += 1
              start, end, _ = event_log[key]

          # Append the last continuous event
        if start is not None and end is not None:
              continuous_events_dict[event_index] = (start, end)
              event_index +=  1
              start = None
              end = None

    # Assign event labels to the DataFrame
    df['event'] = 0
    for label in continuous_events_dict:
      range_val = [x for x in range(continuous_events_dict[label][0], continuous_events_dict[label][1] + 1)]
      df.loc[range_val, 'event'] = label

    return df


# UX

In [None]:
def count_number_of_rides_per_lift(df, lifts_db):
    # Initialise counter for lift usage
    lift_usage_counter = {}

    # Group df by  'event' column
    df_grouped = df.groupby('event')

    # Iterate over each group
    for event, group in df_grouped:
        # Extract start and end coordinates from the first and last rows of the group
        start_row = group.iloc[0]
        end_row = group.iloc[-1]

        # Extract start coordinates of events
        start_coords = (start_row['Lat'], start_row['Long'])

        # Convert start coordinates to radians
        start_coords_rad = np.radians([start_coords])

        # get hold of start and end alt
        start_alt = start_row['Alt(m)']
        end_alt = end_row['Alt(m)']

        # compare start_alt and end_alt to decide if start_coords should be compared to top_coord or base_coord
        if start_alt < end_alt:
            # Convert lift base locations to radians
            lift_base_locations_rad = np.radians([[lift['base_latitude'], lift['base_longitude']] for _, lift in lifts_db.iterrows()])
            # Use BallTree to find the nearest lift for the start coordinates
            base_tree = BallTree(lift_base_locations_rad, metric='haversine')
            _, base_indices = base_tree.query(start_coords_rad, k=1)
            # Get the lift name for the nearest lift to the start coordinates
            base_lift_name = lifts_db.iloc[base_indices.flatten()[0]]['lift_name']
            # Update lift usage counter
            lift_usage_counter[base_lift_name] = lift_usage_counter.get(base_lift_name, 0) + 1
        elif start_alt > end_alt:
             # Convert lift top locations to radians
            lift_top_locations_rad = np.radians([[lift['top_latitude'], lift['top_longitude']] for _, lift in lifts_db.iterrows()])
            # Use BallTree to find the nearest lift for the start coordinates
            top_tree = BallTree(lift_top_locations_rad, metric='haversine')
            _, top_indices = top_tree.query(start_coords_rad, k=1)
            # Get the lift name for the nearest lift to the start coordinates
            top_lift_name = lifts_db.iloc[top_indices.flatten()[0]]['lift_name']
            # Update lift usage counter
            lift_usage_counter[top_lift_name] = lift_usage_counter.get(top_lift_name, 0) + 1

    # Print lift usage information
    print("Lifts used today:\n")
    for lift_name, count in lift_usage_counter.items():
        print(f"Lift {lift_name} was used {count} times.")

# Prediction steps in one function

In [19]:
# use this to predict on a csv that already has all data points
def predict_on_data(path_to_csv_file, file_path_to_model):

    # Load the data
    df = pd.read_csv(path_to_csv_file)

    # Preprocess and feature engineering
    df = convert_datetime(df)
    df = reduce_sampling_rate(df)
    df = add_engineered_features(df)

    # Load the model
    rfc = joblib.load(file_path_to_model)

    # Make predictions
    df = predict_on_features(rfc, df)

    # Update preidctions with mask
    df, event_log = generate_misclassification_mask(df)

    #Generate on lift event assignments
    df = on_lift_event_identification(df, event_log)

    # Plot predictions
    plot_prediction(df, target_column='mask')
    plot_prediction(df, target_column='event')
    return df



# "Real time" prediction simulation

fetch_and_process_data_beta_v0.2

In [20]:
def real_time_prediction_beta(path_to_csv_file, chunk_size=60, wait_time=2):

    # Load the dataset
    df_input = pd.read_csv(path_to_csv_file)
    df_input = convert_datetime(df_input)
    df_input = reduce_sampling_rate(df_input)


    # Calculate the total number of chunks
    total_chunks = len(df_input) // chunk_size

    # Initialize an empty list to store all calculations
    all_calculations = pd.DataFrame()

    fig = px.scatter()

    for i in range(total_chunks):
        # Fetch a chunk of 60 observations
        start_index = i * chunk_size
        end_index = start_index + chunk_size
        data_chunk = df_input.iloc[start_index:end_index]

        # Append the processed chunk to the list of all calculations
        all_calculations = pd.concat([all_calculations, data_chunk], ignore_index=True)

        # Preprocess and feature engineering
        df = add_engineered_features(all_calculations)

        # # Make predictions
        df = predict_on_features(rfc, df)
        df, event_log = generate_misclassification_mask(df)

        plot_prediction(df, target_column='mask')

        # Wait for declared wait_time before processing the next chunk
        time.sleep(wait_time)

    df = on_lift_event_identification(df, event_log)
    plot_prediction(df, target_column='event')

# To do before going for one of the options

Make sure that you specify the paths and other variables needed

In [None]:
# Run predictions on this file/df
path_to_csv_file='/Users/ze/Documents/Coding/Projects/2024/alturos/data/raw/v5_20240210_093434_131m.csv'
file_path_to_model = '/Users/ze/Documents/Coding/Projects/2024/alturos/models/rf_v_0.4.pkl'

In [None]:
# Load the lifts_db

# path to file
file_lift_db='/Users/ze/Documents/Coding/Projects/2024/alturos/data/lift_data/lifts_db_v0.1.csv'
# Load df
lifts_db = pd.read_csv(file_lift_db)

In [22]:
# Run predictions on this file/df
path_to_csv_file='/content/df_95_labeled_on_lift.csv'
file_path_to_model = '/content/rf_v_0.4.pkl'

# Option 1: 'Real time' prediction

In [None]:
# # Call this function to simulate prediction in 'real time'
df_option1 = real_time_prediction_beta(path_to_csv_file,
                       chunk_size=60,
                       wait_time=0.1)

# Option 2: One step to prediction

In [25]:
df_option2 = predict_on_data(path_to_csv_file, file_path_to_model)

# Option 3: Go through everything step by step

In [26]:
df_option3=import_data(path_to_csv_file)

In [27]:
df_option3=convert_datetime(df_option3)

In [28]:
df_option3 = reduce_sampling_rate(df_option3)

In [29]:
df_option3=add_engineered_features(df_option3)

In [30]:
rfc=load_model(file_path_to_model)

In [31]:
df_option3=predict_on_features(rfc, df_option3)

In [32]:
df_option3, event_log = generate_misclassification_mask(df_option3)

In [33]:
df_option3 = on_lift_event_identification(df_option3, event_log)

In [34]:
df_option3.columns

Index(['Timestamp', 'accelX(g)', 'accelY(g)', 'accelZ(g)', 'accelUserX(g)',
       'accelUserY(g)', 'accelUserZ(g)', 'gyroX(rad/s)', 'gyroY(rad/s)',
       'gyroZ(rad/s)', 'Roll(rads)', 'Pitch(rads)', 'Yaw(rads)', 'm11', 'm12',
       'm13', 'm21', 'm22', 'm23', 'm31', 'm32', 'm33', 'qX', 'qY', 'qZ', 'qW',
       'Lat', 'Long', 'Speed(m/s)', 'TrueHeading', 'Alt(m)',
       'HorizontalAccuracy(m)', 'VerticalAccuracy(m)', 'Course',
       'ActivityType', 'ActivityConfidence', 'Pressure(kilopascals)',
       'RelativeAltitude(meters)', 'magX(µT)', 'magY(µT)', 'magZ(µT)',
       'calMagX(µT)', 'calMagY(µT)', 'calMagZ(µT)', 'Cluster_1', 'on_lift',
       'Alt(m)_change', 'Speed(m/s)_change', 'Course_change', 'predicted',
       'mask', 'event'],
      dtype='object')

In [None]:
# Tell me how often I have used which lift today
count_number_of_rides_per_lift(df_option3, lifts_db)

In [36]:
df_option3['event'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

In [None]:
#Double check and map only lift rides
map_lift_rides(df_option3,column='mask')

In [None]:
plot_prediction(df_option3, target_column='predicted')

In [None]:
plot_prediction(df_option3, target_column='mask')

In [None]:
plot_prediction(df_option3, target_column='event')

In [None]:
map = map_tracked_movement(df_option3)