# Data Import

In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from geopy.distance import geodesic

from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import MiniBatchKMeans

from sklearn.decomposition import PCA

from mpl_toolkits.mplot3d import Axes3D

import plotly.graph_objs as go

import dask.dataframe as dd

import gc

import pyarrow

In [2]:
dataset_path = '/kaggle/input/sncb-data-augumentation/enriched_cleaned_ar41_for_ulb.csv'

# Fail fast when the input is missing: continuing would only surface later as a
# NameError on `data`, which hides the real cause.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The file {dataset_path} does not exist.")

data = pd.read_csv(dataset_path)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called for its side effect only (printing its return value would emit "None").
data.info()

# Show the first few rows of the dataframe
data_head = data.head()
print(data_head)

# Drop the columns that this notebook does not carry forward.
data = data.drop(['Unnamed: 0', 'dayofweek', 'datetime', 'date_hour'], axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17677337 entries, 0 to 17677336
Data columns (total 32 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   timestamps_UTC      object 
 2   mapped_veh_id       float64
 3   lat                 float64
 4   lon                 float64
 5   RS_E_InAirTemp_PC1  float64
 6   RS_E_InAirTemp_PC2  float64
 7   RS_E_OilPress_PC1   float64
 8   RS_E_OilPress_PC2   float64
 9   RS_E_RPM_PC1        float64
 10  RS_E_RPM_PC2        float64
 11  RS_E_WatTemp_PC1    float64
 12  RS_E_WatTemp_PC2    float64
 13  RS_T_OilTemp_PC1    float64
 14  RS_T_OilTemp_PC2    float64
 15  date                object 
 16  hour                float64
 17  dayofweek           float64
 18  weekday             object 
 19  Distance            float64
 20  Speed               float64
 21  date_hour           object 
 22  datetime            object 
 23  weather_main        object 
 24  temp                fl

# Feature Engineering

In [3]:
def optimize_datatypes(df):
    """
    Convert numeric columns to more memory-efficient types.

    Float columns (float64/float32) are passed through
    ``pd.to_numeric(..., downcast='float')``. Integer columns (int64/int32) are
    downcast to the smallest unsigned type when they hold no negative values and
    to the smallest signed type otherwise. Columns of any other dtype are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in df.select_dtypes(include=['float64', 'float32']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    for col in df.select_dtypes(include=['int64', 'int32']).columns:
        # downcast='unsigned' leaves a column unchanged as soon as it contains a
        # negative value, so signed columns need downcast='integer' to shrink.
        target = 'integer' if (df[col] < 0).any() else 'unsigned'
        df[col] = pd.to_numeric(df[col], downcast=target)
    return df

def feat_eng(df, rolling_windows=None):
    """
    Perform feature engineering on the dataset with optimizations for memory efficiency.

    Steps:
      1. Downcast numeric columns via ``optimize_datatypes``.
      2. Forward-fill missing values over the whole frame (only when at least
         one sensor column is present).
      3. For every sensor column (name starting with ``RS_E_`` or ``RS_T_``),
         compute per-``mapped_veh_id`` aggregates: mean, median, std, skew,
         kurt, min and max, each stored as float32.
      4. Optionally add per-vehicle rolling means of each sensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``mapped_veh_id`` column. It is modified in
        place (dtype downcast, forward fill, optional rolling columns).
    rolling_windows : iterable of int, optional
        Window sizes for per-vehicle rolling means. For each sensor and window a
        float32 column ``{sensor}_roll_mean{window}`` is added to ``df`` and
        kept in the returned frame. With the default (``None``) no rolling
        column is computed, and any column whose name contains ``roll_mean`` is
        removed from ``df`` before returning.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df`` after the steps above, and ``seq_df``: one row per unique
        ``mapped_veh_id`` with one ``{sensor}_{statistic}`` column per
        sensor/aggregate pair.

    Notes
    -----
    NOTE(review): the forward fill is applied to the frame as a whole, not per
    vehicle, so a gap at the start of one vehicle's rows is filled from the
    preceding row even if that row belongs to another vehicle. Confirm that
    this is intended.
    """
    # Define aggregation functions
    agg_funcs = {
        'mean': 'mean',
        'median': 'median',
        'std': 'std',
        'skew': 'skew',
        'kurt': lambda x: pd.Series.kurt(x),
        'min': 'min',
        'max': 'max'
    }

    # Optimize data types first
    df = optimize_datatypes(df)

    # Identify sensor columns in the dataset
    sensors = [col for col in df.columns if col.startswith(('RS_E_', 'RS_T_'))]

    # Initialize the sequence DataFrame with the correct index
    seq_df = pd.DataFrame(index=df['mapped_veh_id'].unique())

    # A single forward fill covers every column; it is idempotent, so there is
    # no need to repeat it for each sensor. DataFrame.ffill replaces the
    # deprecated fillna(method='ffill').
    if sensors:
        df.ffill(inplace=True)

    windows = tuple(rolling_windows) if rolling_windows else ()

    for sensor in sensors:
        grouped = df.groupby('mapped_veh_id')[sensor]

        # Rolling means are computed only on request. The group level is
        # dropped from the result's index so the assignment aligns on df's
        # own row index.
        for window in windows:
            df[f'{sensor}_roll_mean{window}'] = (
                grouped.rolling(window=window, min_periods=1)
                .mean()
                .reset_index(level=0, drop=True)
                .astype('float32')
            )

        # Calculate aggregated features for each sensor and mapped_veh_id
        for func_name, func in agg_funcs.items():
            seq_df[f'{sensor}_{func_name}'] = grouped.agg(func).astype('float32')

    if not windows:
        # Without requested windows the returned frame carries no roll_mean
        # columns at all, including any that were already present on input.
        temp_cols = [col for col in df.columns if 'roll_mean' in col]
        df.drop(columns=temp_cols, inplace=True)

    # Manually trigger garbage collection to free up memory
    gc.collect()

    return df, seq_df


In [4]:
# feat_eng returns a (row-level frame, per-vehicle feature frame) tuple and this
# cell rebinds `data` to it. Run the transformation only while `data` is still
# the loaded DataFrame, so that re-running the cell redisplays the existing
# result instead of passing the tuple back into feat_eng.
if isinstance(data, pd.DataFrame):
    data = feat_eng(data)
data

(               timestamps_UTC  mapped_veh_id        lat   lon  \
 0         2023-01-23 07:25:08          102.0  51.020000  3.77   
 1         2023-01-23 07:25:16          102.0  51.020000  3.77   
 2         2023-01-23 07:25:37          102.0  51.020000  3.77   
 3         2023-01-23 07:25:41          102.0  51.020000  3.77   
 4         2023-01-23 07:26:10          102.0  51.020000  3.77   
 ...                       ...            ...        ...   ...   
 17677332  2023-09-13 17:33:03          197.0  50.400002  4.45   
 17677333  2023-09-13 17:33:58          197.0  50.400002  4.45   
 17677334  2023-09-13 17:34:03          197.0  50.400002  4.45   
 17677335  2023-09-13 17:34:58          197.0  50.400002  4.46   
 17677336  2023-09-13 17:35:04          197.0  50.400002  4.46   
 
           RS_E_InAirTemp_PC1  RS_E_InAirTemp_PC2  RS_E_OilPress_PC1  \
 0                       17.0                18.0              210.0   
 1                       17.0                20.0             

In [5]:
# Confirm that `data` now holds the tuple returned by feat_eng rather than a single DataFrame.
type(data)

tuple

# Save to Parquet

In [6]:
# data.to_csv('labeled_augumented_cleaned_ar41_for_ulb.csv', index=True)

In [7]:
# Unpack the two frames produced by feat_eng and persist each one as Parquet.
(df1, df2) = data

for frame, target in (
    (df1, 'enriched_data.parquet'),
    (df2, 'feature_engineering.parquet'),
):
    frame.to_parquet(target)