# Import libraries

In [64]:
import pandas as pd
import numpy as np
import logging

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import plotly.express as px
import plotly.graph_objects as go


from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import joblib

import time
import os

#Configure logging

In [65]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Import Data

In [66]:
def import_data(file_path):
    # Read data from the specified file path
    df_raw=pd.read_csv(file_path)

    # Return the imported DataFrame
    return df_raw

# Cleaning Data

# Feature Engineering

In [67]:

def add_engineered_features(df, alt_threshold=20, speed_threshold=3):
    """
    Calculate the altitude change, speed change, and course change between consecutive rows and remove outliers.

    Args:
        df (pd.DataFrame): DataFrame containing the altitude, speed, and course data.
        alt_threshold (float): Threshold value for altitude change outlier detection.
        speed_threshold (float): Threshold value for speed change outlier detection.
       
    Returns:
        tuple: A tuple containing the modified DataFrame and a dictionary with shape information.
    """
    # Check if required columns exist
    required_columns = ['Alt(m)', 'Speed(m/s)', 'Course']
    if not all(col in df.columns for col in required_columns):
        logger.error("Required columns not found in DataFrame.")
        return None, {}

    # Store the initial DataFrame size
    initial_size = len(df)

    # Calculate changes
    # df['Alt(m)_change'] = df['Alt(m)'].diff().fillna(0)
    # df['Speed(m/s)_change'] = df['Speed(m/s)'].diff().fillna(0)
    # df['Course_change'] = df['Course'].diff().fillna(0)
    df.loc[:, 'Alt(m)_change'] = df['Alt(m)'].diff().fillna(0)
    df.loc[:, 'Speed(m/s)_change'] = df['Speed(m/s)'].diff().fillna(0)
    df.loc[:, 'Course_change'] = df['Course'].diff().fillna(0)

    # Remove outliers
    mask = (df['Alt(m)_change'].abs() <= alt_threshold) & \
           (df['Speed(m/s)_change'].abs() <= speed_threshold)
    filtered_df = df.copy()[mask]

    # Reset the index of the filtered DataFrame
    filtered_df = filtered_df.reset_index(drop=True)

    # Log the shape after outlier removal
    logger.info("Shape before outlier removal: %d", initial_size)
    logger.info("Shape after outlier removal: %d", len(filtered_df))

    return filtered_df

In [68]:
#updated convert_datetime function
def convert_datetime(df, inplace=True):
    """
    Convert the 'Timestamp' column in a DataFrame to datetime format.

    Args:
        df (pd.DataFrame): DataFrame containing the 'Timestamp' column.
        inplace (bool): Whether to modify the original DataFrame or create a copy.

    Returns:
        pd.DataFrame: The DataFrame with the 'Timestamp' column converted to datetime format.
    """
    # Check if 'Timestamp' column exists
    if 'Timestamp' not in df.columns:
        raise ValueError("Column 'Timestamp' not found in DataFrame.")

    # Convert 'Timestamp' column to datetime format
    if inplace:
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ns')
    else:
        df = df.copy()
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ns')

    return df


# Prediction

In [69]:
def select_features(df):
    """
    Select a subset of features from a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the features.

    Returns:
        pd.DataFrame: A DataFrame containing only the selected features.
    """
    features_to_use = ['accelX(g)', 'accelY(g)', 'accelZ(g)', 'accelUserX(g)', 'accelUserY(g)',
                       'accelUserZ(g)', 'gyroX(rad/s)', 'gyroY(rad/s)', 'gyroZ(rad/s)',
                       'Roll(rads)', 'Pitch(rads)', 'Yaw(rads)', 'Lat', 'Long', 'Speed(m/s)',
                       'HorizontalAccuracy(m)', 'VerticalAccuracy(m)', 'Course', 'calMagX(µT)',
                       'calMagY(µT)', 'calMagZ(µT)', 'Alt(m)_change',
                       'Speed(m/s)_change', 'Course_change']

    # Check if all features exist in the DataFrame
    missing_features = [feature for feature in features_to_use if feature not in df.columns]
    if missing_features:
        raise ValueError(f"Features not found in DataFrame: {missing_features}")

    # Select the features
    X = df[features_to_use]

    return X


In [70]:
def load_model(file_path_to_model):

    # Load the saved model
    return joblib.load(file_path_to_model)


In [71]:
def show_hyperparameters(model):
    # show hyperparameters
    return model.get_params()


In [72]:
def predict_on_features(model, df, features):

    predictions=model.predict(features)

    df['predicted']=predictions

    return df.reset_index()


# Visualisation

### Plotting

In [73]:
def plot_prediction(df, target_column='predicted'):
    # Define the plot title based on the target column
    if target_column == 'on_lift':
        plot_title = 'Predictions'
    elif target_column == 'mask':
        plot_title = 'Clean Predictions with mask'
    elif target_column == 'event':
        plot_title = 'Lift Events'
    else:
        plot_title = 'Predictions'


    # Use the passed colormap if available, otherwise use the default
    if cmap is None:
        dark2_cmap = ListedColormap(plt.cm.Dark2(range(8)))
        cmap = {str(idx): color for idx, color in enumerate(dark2_cmap.colors)}


    # Check if 'Timestamp' column exists and is in datetime format
    if 'Timestamp' in df.columns and pd.api.types.is_datetime64_any_dtype(df['Timestamp']):
        # Create a scatter plot for Altitude over Time, colored by target_column with an accessible color scheme
        fig = px.scatter(df, x='Timestamp',
                         y='Alt(m)', color=target_column,
                         labels={'Alt(m)': 'Altitude (m)'},
                         title=plot_title,
                         color_discrete_map=cmap)

        fig.update_traces(marker=dict(size=8),
                          selector=dict(mode='markers'))

        # Customize the legend
        fig.update_layout(
            legend_title_text='Status',
            width=1000,
            height=600
        )

        # Update legend labels based on target_column
        if target_column == 'on_lift':
            fig.for_each_trace(lambda trace: trace.update(name='Not on the lift' if trace.name == '0' else 'On the lift'))
        # Add more conditions if there are different classes for other target_columns

        # Show the plot
        fig.show()
    else:
        print("Warning: DataFrame's 'Timestamp' column is not in datetime format and must be converted first.")


In [74]:
def plot_total_alt_over_time(df, plot_title='Total Tracked Altitude Over Time'):

    # Create a line plot using Plotly
    fig = go.Figure()

    # Add a trace for altitude over time
    fig.add_trace(go.Scatter(x=df['Timestamp'],
                             y=df['Alt(m)'],
                             mode='lines',
                             name='Altitude'))

    # Update layout
    fig.update_layout(title=plot_title,
                      xaxis_title='Timestamp',
                      yaxis_title='Altitude (m)')

    # Show plot
    fig.show()

### Mapping

In [75]:
#Map all tracked movement based on lat and long of GPS data

def map_tracked_movement(df, zoom_start=12):
    # Create a map centered on the mean latitude and longitude
    map_center = [df['Lat'].mean(), df['Long'].mean()]
    movement_on_map = folium.Map(location=map_center, zoom_start=zoom_start)

    # Add CircleMarkers for each data point
    for index, row in df.iterrows():
        folium.CircleMarker(location=[row['Lat'], row['Long']], radius=5, color='blue', fill=True, fill_color='blue').add_to(movement_on_map)

    # Display the map
    return movement_on_map

In [76]:
#Map lifts rides in red and all the other movement in blue

def map_lifts_and_other_movement(df, column='on_lift', zoom_start=15):
    # Create a map centered on the mean latitude and longitude
    map_center = [df['Lat'].mean(), df['Long'].mean()]
    tracking_map = folium.Map(location=map_center, zoom_start=zoom_start)

    # Plot data points with on_lift type
    on_lift = df[df[column] == 1]
    for _, row in on_lift.iterrows():
        folium.CircleMarker(location=[row['Lat'], row['Long']], radius=5, color='red', fill=True, fill_color='red', tooltip=str(row['Timestamp'])).add_to(tracking_map)

    # Plot data points with not_on_lift  types
    not_on_lift = df[df[column] != 1]
    for _, row in not_on_lift.iterrows():
        folium.CircleMarker(location=[row['Lat'], row['Long']], radius=5, color='blue', fill=True, fill_color='blue', tooltip=str(row['Timestamp'])).add_to(tracking_map)

    # Return the map object
    return tracking_map

In [77]:
#Map only lifts rides

def map_lift_rides(df, column='on_lift', zoom_start=15):
    # Create a map centered on the mean latitude and longitude
    map_center = [df['Lat'].mean(), df['Long'].mean()]
    lift_map = folium.Map(location=map_center, zoom_start=zoom_start)

    # Plot data points with on_lift type
    on_lift = df[df[column] == 1]
    for _, row in on_lift.iterrows():
        folium.CircleMarker(location=[row['Lat'],
                                      row['Long']],
                                      radius=5,
                                      color='red',
                                      fill=True,
                                      fill_color='red',
                                      tooltip=str(row['Timestamp'])).add_to(lift_map)

    # Return the map object
    return lift_map

# Post-Processing

In [78]:
### updated misclassification mask v0.2

import pandas as pd

def generate_misclassification_mask(df, column_to_mask='predicted', chunk_size=60, threshold=0.3):
    """
    Apply a binary mask to each row in a DataFrame based on the average value of a specified column in chunks.

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        column_to_mask (str): Name of the column to calculate the mean and apply the mask.
        chunk_size (int): Size of the chunks to divide the DataFrame into.
        threshold (float): Threshold value for determining the mask value.

    Returns:
        tuple: A tuple containing the updated DataFrame with the mask applied and the event log.
    """

    # Check if the 'mask' column already exists
    if 'mask' in df.columns:
        raise ValueError("The 'mask' column already exists in the DataFrame.")

    # Initialize the event log
    event_log = {}

    # Calculate the total number of chunks
    total_chunks = len(df) // chunk_size
    remainder = len(df) % chunk_size

    # Process each chunk
    for i in range(total_chunks):
        start_index = i * chunk_size
        end_index = start_index + chunk_size

        # Calculate the mean of the chunk and create the mask
        mean_value = df[column_to_mask].iloc[start_index:end_index].mean()
        mask_value =  1 if mean_value >= threshold else  0

        # Update the DataFrame with the mask value
        df.loc[start_index:end_index, 'mask'] = mask_value

        # Record the event log
        event_log[i] = (start_index, end_index, mask_value)

    # Process the remainder if any
    if remainder >  0:
        start_index = total_chunks * chunk_size
        end_index = len(df)

        # Calculate the mean of the remainder and create the mask
        mean_value = df[column_to_mask].iloc[start_index:end_index].mean()
        mask_value =  1 if mean_value >= threshold else  0

        # Update the DataFrame with the mask value
        df.loc[start_index:end_index, 'mask'] = mask_value

        # Record the event log for the remainder
        event_log[total_chunks] = (start_index, end_index, mask_value)

    # Return the updated DataFrame and the event log
    return df, event_log



In [79]:
# Updated Function for defining on-lift identification v0.2

def on_lift_event_identification(df, event_log):
    """
    Identify continuous events in a DataFrame based on an event log and assign a unique label to each event.

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        event_log (dict): Event log generated by the generate_misclassification_mask function.

    Returns:
        pd.DataFrame: The updated DataFrame with a new 'event' column indicating the event label for each row.
    """
    # Validate the event log
    if not isinstance(event_log, dict) or not all(isinstance(v, tuple) and len(v) ==  3 for v in event_log.values()):
        raise ValueError("Invalid event log format.")

    continuous_events_dict = {}
    event_index = 1

    start = None
    end = None

    for key in sorted(event_log.keys()):
        if event_log[key][2] > 0:
          if start is None:
              start = event_log[key][0]
              end = event_log[key][1]
          elif end == event_log[key][0]:
              end = event_log[key][1]
          else:
              continuous_events_dict[event_index] = (start, end)
              event_index += 1
              start, end, _ = event_log[key]

          # Append the last continuous event
        if start is not None and end is not None:
              continuous_events_dict[event_index] = (start, end)
              event_index +=  1
              start = None
              end = None

    # Assign event labels to the DataFrame
    df['event'] = 0
    for label in continuous_events_dict:
      range_val = [x for x in range(continuous_events_dict[label][0], continuous_events_dict[label][1] + 1)]
      df.loc[range_val, 'event'] = label

    return df


# Prediction steps in one function

In [80]:
# use this to predict on a csv that already has all data points
def predict_on_data(path_to_csv_file, file_path_to_model):

    # Load the data
    df = pd.read_csv(path_to_csv_file)

    # Preprocess and feature engineering
    df = add_engineered_features(df)
    df = convert_datetime(df)

    # Feature selection
    features = select_features(df)

    # Load the model
    rfc = joblib.load(file_path_to_model)

    # Make predictions
    df = predict_on_features(rfc, df, features)

    # Update preidctions with mask
    df, event_log = generate_misclassification_mask(df)

    #Generate on lift event assignments
    df = on_lift_event_identification(df, event_log)

    # Plot predictions
    plot_prediction(df, target_column='mask')
    plot_prediction(df, target_column='event')
    return df



# To do before going for one of the options

Make sure that you specify the paths and other variables needed

In [82]:
# # These are the default values used in the function fetch_and_process_data, change if needed
# chunk_size=60
# wait_time=0.1

In [83]:
path_to_csv_file='../../data/raw/v5_20230407_091351_95m.csv'
#data\raw\v5_20230408_082538_310m.csv

In [84]:
file_path_to_model = '../../models/rf_v_0.4.pkl'

# Option 1: 'Real time' prediction

In [None]:
# # Call this function to simulate prediction in 'real time'
df_option1 = real_time_prediction_beta(path_to_csv_file,
                       chunk_size=60,
                       wait_time=0.1)

# Option 2: One step to prediction

In [None]:
df_option2 = predict_on_data(path_to_csv_file, file_path_to_model)

# Option 3: Go through everything step by step

In [67]:
df_option3=import_data(path_to_csv_file)

In [60]:
df_option3=add_engineered_features(df_option3)

2024-02-15 10:59:17,585 - INFO - Shape before outlier removal: 5717
2024-02-15 10:59:17,586 - INFO - Shape after outlier removal: 5653


In [61]:
df_option3=convert_datetime(df_option3)

In [62]:
features=select_features(df_option3)

In [63]:
rfc=load_model(file_path_to_model)

In [64]:
df_option3=predict_on_features(rfc, df_option3, features)

In [65]:
df_option3, event_log = generate_misclassification_mask(df_option3)

In [66]:
df_option3 = on_lift_event_identification(df_option3, event_log)

In [52]:
plot_prediction(df_option3, target_column='predicted')

In [33]:
plot_prediction(df_option3, target_column='mask')

In [34]:
plot_prediction(df_option3, target_column='event')

# Real Time simulation

In [85]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import matplotlib.dates as mdates

matplotlib.use('TkAgg')

In [91]:
class DataAnimation:
    def __init__(self, df, file_path_to_model):
        self.df = df
        self.file_path_to_model = file_path_to_model
        self.chunk_size = 60
        self.data_agg = pd.DataFrame()
        self.fig, self.ax = plt.subplots()
        self.data_generator = DataAnimation.get_next_chunk(self, self.df, self.chunk_size)
        self.total_chunks = len(df) // 60
        self.ani = animation.FuncAnimation(self.fig, self.update, frames=self.total_chunks, interval=200, blit=False, repeat=False)
        plt.show()

    def update(self, i):   
        # Get the next chunk of data
        data_chunk = next(self.data_generator)
        processed_chunk = DataAnimation.complete_pipeline(self, data_chunk, file_path_to_model)

        # Concatenate the processed chunk with the aggregated data
        self.data_agg = pd.concat([self.data_agg, processed_chunk])

        # Update the plot with the new data
        sc = self.ax.scatter(self.data_agg['Timestamp'], self.data_agg['Alt(m)'], c=self.data_agg['color'])

        # Set the axes limits dynamically based on the new data
        x_min = self.data_agg['Timestamp'].min() - pd.Timedelta(minutes=5)
        x_max = self.data_agg['Timestamp'].max() + pd.Timedelta(minutes=5)
        y_min = self.data_agg['Alt(m)'].min() - 100
        y_max = self.data_agg['Alt(m)'].max() + 100

        # Set the axes limits if necessary
        self.ax.set_xlim([x_min, x_max])
        self.ax.set_ylim([y_min, y_max])

        # Update the x-axis major locator and formatter
        #self.ax.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))
        self.ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))

        #Set tick mark angle
        plt.setp(self.ax.get_xticklabels(), rotation=45, ha='right')

        # Add labels and title if necessary
        self.ax.set_title(f'Real-time Data Analysis (minute: {i})')
        self.ax.set_xlabel('Time')
        self.ax.set_ylabel('Altitude (m)')

         # Add legend with custom labels
        legend_labels = {'red': 'on_lift'}
        self.ax.legend(handles=sc.legend_elements()[0], labels=legend_labels)

        # Return the scatter plot object to blit
        return sc

    # Define your custom function
    def complete_pipeline(self, data_chunk, file_path_to_model):
        # Define colors for the categories
        colors = {1: 'red',  0: 'blue'}
        #file_path_to_model = '../../models/rf_v_0.4.pkl'
        df = add_engineered_features(data_chunk)
        df = convert_datetime(df)
        # Feature selection
        features = select_features(df)
        rfc=load_model(file_path_to_model)
        # Make predictions
        df = predict_on_features(rfc, df, features)
        df, event_log = generate_misclassification_mask(df)
        df['color'] = df['mask'].map(colors)
        return df

    # Placeholder for your data stream setup
    def get_next_chunk(self, df, chunk_size):
        total_chunks = len(df) // chunk_size
        for i in range(total_chunks):
            start_index = i * chunk_size
            end_index = start_index + chunk_size
            data_chunk = df.iloc[start_index:end_index]
            yield data_chunk



In [92]:
df_1=import_data(path_to_csv_file)
data_anim = DataAnimation(df=df_1, file_path_to_model=file_path_to_model)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-02-16 16:26:10,665 - INFO - Shape before outlier removal: 60
2024-02-16 16:26:10,666 - INFO - Shape after outlier removal: 59

Collection without array used. Make sure to specify the values to be c