# Data Import

In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from geopy.distance import geodesic

from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import MiniBatchKMeans

from sklearn.decomposition import PCA

from mpl_toolkits.mplot3d import Axes3D

import plotly.graph_objs as go

import dask.dataframe as dd

import gc

import pyarrow

In [2]:
dataset_path = '/kaggle/input/naive-label-ar41/labeled_augumented_cleaned_ar41_for_ulb.csv'

# Read the CSV only when the file is present, so a missing input produces a
# readable message instead of a pandas traceback.
if os.path.exists(dataset_path):
    data = pd.read_csv(dataset_path)

    # DataFrame.info() prints its summary itself and returns None, so it is
    # called for its side effect; wrapping it in print() only adds a stray
    # "None" line to the output.
    data.info()

    # First rows of the frame, kept in a variable and echoed to the console.
    data_head = data.head()
    print(data_head)
else:
    print(f"The file {dataset_path} does not exist.")

# data = data.drop(['Unnamed: 0', 'dayofweek', 'datetime', 'date_hour'], axis=1)

  data = pd.read_csv(dataset_path)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17677337 entries, 0 to 17677336
Data columns (total 32 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Unnamed: 0               int64  
 1   timestamps_UTC           object 
 2   mapped_veh_id            float64
 3   lat                      float64
 4   lon                      float64
 5   RS_E_InAirTemp_PC1       float64
 6   RS_E_InAirTemp_PC2       float64
 7   RS_E_OilPress_PC1        float64
 8   RS_E_OilPress_PC2        float64
 9   RS_E_RPM_PC1             float64
 10  RS_E_RPM_PC2             float64
 11  RS_E_WatTemp_PC1         float64
 12  RS_E_WatTemp_PC2         float64
 13  RS_T_OilTemp_PC1         float64
 14  RS_T_OilTemp_PC2         float64
 15  date                     object 
 16  hour                     float64
 17  weekday                  object 
 18  Distance                 float64
 19  Speed                    float64
 20  weather_main             object 
 21  temp  

# Feature Engineering

In [3]:
def optimize_datatypes(df):
    """
    Convert numeric columns to more memory-efficient types.

    Float columns (float64/float32) are downcast to the smallest float dtype
    pandas offers for ``downcast='float'``. Integer columns (int64/int32) are
    downcast to the smallest unsigned dtype when they hold no negative values
    and to the smallest signed dtype otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast numeric columns.
    """
    for col in df.select_dtypes(include=['float64', 'float32']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    for col in df.select_dtypes(include=['int64', 'int32']).columns:
        # pd.to_numeric silently skips the 'unsigned' downcast when a column
        # contains negative values, which would leave it at its original
        # width; fall back to a signed downcast in that case.
        has_negative = len(df[col]) > 0 and df[col].min() < 0
        target = 'integer' if has_negative else 'unsigned'
        df[col] = pd.to_numeric(df[col], downcast=target)
    return df

def feat_eng(df):
    """
    Add per-vehicle time-series and summary features for every sensor column.

    Sensor columns are those whose name starts with ``RS_E_`` or ``RS_T_``.
    For each sensor the following columns are added to ``df``:

    * ``<sensor>_lag1``: previous reading of the same ``mapped_veh_id``; the
      first reading of each vehicle is filled with the sensor's overall median.
    * ``<sensor>_diff``: current reading minus ``<sensor>_lag1``.
    * ``<sensor>_roll_mean3/6/9``: rolling mean over the last 3/6/9 readings
      of the same vehicle; incomplete windows are filled with the median of
      that rolling-mean column.
    * ``<sensor>_<stat>_by_mapped_veh_id`` for stat in mean, median, std,
      skew, kurt, min, max: the per-vehicle statistic broadcast to each row.

    ``timestamps_UTC`` is parsed to datetime and numeric columns are downcast
    through ``optimize_datatypes`` before the features are computed.

    NOTE(review): lag and rolling features follow the current row order; the
    rows are not sorted here, so the caller is assumed to pass data that is
    already in chronological order - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``timestamps_UTC``, ``mapped_veh_id`` and sensor columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame with the engineered feature columns appended.
    """
    # Parse timestamps.
    df['timestamps_UTC'] = pd.to_datetime(df['timestamps_UTC'])

    # Shrink numeric dtypes before creating many derived columns.
    df = optimize_datatypes(df)

    # Identify sensor columns by prefix.
    sensors = [col for col in df.columns if col.startswith(('RS_E_', 'RS_T_'))]

    # Group once and reuse the grouping for every sensor and statistic.
    by_vehicle = df.groupby('mapped_veh_id')

    for sensor in sensors:
        readings = by_vehicle[sensor]

        # Lag feature. Assigned rather than filled with inplace=True on
        # df[col]: that is chained assignment and does not update the frame
        # under pandas Copy-on-Write.
        lag_col = '{}_lag1'.format(sensor)
        df[lag_col] = readings.shift(1).fillna(df[sensor].median())

        # Difference feature.
        df['{}_diff'.format(sensor)] = df[sensor] - df[lag_col]

        # Rolling-window features, computed per vehicle like the lag so a
        # window never mixes readings from different vehicles.
        for window in (3, 6, 9):
            rolled = readings.transform(
                lambda s, w=window: s.rolling(window=w).mean()
            )
            df['{}_roll_mean{}'.format(sensor, window)] = rolled.fillna(rolled.median())

    # Aggregate statistics to compute per vehicle.
    agg_funcs = {
        'mean': 'mean',
        'median': 'median',
        'std': 'std',
        'skew': 'skew',
        'kurt': lambda x: x.kurt(),
        'min': 'min',
        'max': 'max'
    }

    # Broadcast each per-vehicle statistic of each sensor back onto the rows.
    for sensor in sensors:
        readings = by_vehicle[sensor]
        for func_name, func in agg_funcs.items():
            df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = readings.transform(func)

    return df

In [4]:
# Add the engineered sensor features to the loaded frame. Run this cell only
# once per kernel session: feat_eng selects sensor columns by their
# RS_E_/RS_T_ prefix, and the derived columns it adds keep that prefix, so a
# second run would treat those derived columns as sensors as well.
data = feat_eng(data)
# Show the resulting frame as the cell output.
data

  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
  df['{}_{}_by_mapped_veh_id'.format(sensor, func_name)] = df.groupby('mapped_veh_id')[sensor].transform(func)
 

Unnamed: 0.1,Unnamed: 0,timestamps_UTC,mapped_veh_id,lat,lon,RS_E_InAirTemp_PC1,RS_E_InAirTemp_PC2,RS_E_OilPress_PC1,RS_E_OilPress_PC2,RS_E_RPM_PC1,...,RS_T_OilTemp_PC1_kurt_by_mapped_veh_id,RS_T_OilTemp_PC1_min_by_mapped_veh_id,RS_T_OilTemp_PC1_max_by_mapped_veh_id,RS_T_OilTemp_PC2_mean_by_mapped_veh_id,RS_T_OilTemp_PC2_median_by_mapped_veh_id,RS_T_OilTemp_PC2_std_by_mapped_veh_id,RS_T_OilTemp_PC2_skew_by_mapped_veh_id,RS_T_OilTemp_PC2_kurt_by_mapped_veh_id,RS_T_OilTemp_PC2_min_by_mapped_veh_id,RS_T_OilTemp_PC2_max_by_mapped_veh_id
0,10309566,2023-01-23 02:29:50,160.0,51.040001,3.69,14.0,15.0,6.0,27.0,802.0,...,1.521257,2.0,115.0,78.355850,82.0,14.222122,-1.782691,3.714270,3.0,107.0
1,10309567,2023-01-23 02:30:49,160.0,51.040001,3.69,14.0,25.0,6.0,24.0,802.0,...,1.521257,2.0,115.0,78.355850,82.0,14.222122,-1.782691,3.714270,3.0,107.0
2,10309568,2023-01-23 02:31:50,160.0,51.040001,3.69,20.0,30.0,6.0,24.0,802.0,...,1.521257,2.0,115.0,78.355850,82.0,14.222122,-1.782691,3.714270,3.0,107.0
3,479843,2023-01-23 02:50:41,106.0,50.410000,4.52,41.0,41.0,6.0,3.0,802.0,...,-0.081694,1.0,103.0,76.451469,80.0,13.087856,-2.243729,6.085453,1.0,101.0
4,479844,2023-01-23 02:50:48,106.0,50.410000,4.52,41.0,41.0,3.0,3.0,802.0,...,-0.081694,1.0,103.0,76.451469,80.0,13.087856,-2.243729,6.085453,1.0,101.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17677332,2319481,2023-09-13 21:51:55,115.0,50.139999,4.50,38.0,36.0,182.0,213.0,802.0,...,11.059318,0.5,100.0,78.336197,83.0,12.844290,-2.200371,5.962202,1.0,108.0
17677333,16731733,2023-09-13 21:51:57,191.0,51.150002,4.61,36.0,37.0,424.0,420.0,1501.0,...,7.413821,0.5,120.0,78.207191,81.0,11.416896,-2.719661,10.042174,1.0,101.0
17677334,3190108,2023-09-13 21:52:16,120.0,50.150002,4.50,37.0,31.0,196.0,241.0,801.0,...,9.111080,2.0,100.0,74.114502,77.0,12.495768,-2.126366,5.778567,0.5,99.0
17677335,17418908,2023-09-13 21:52:22,196.0,50.419998,4.54,18.0,23.0,690.0,300.0,800.0,...,0.219329,1.0,99.0,72.191246,76.5,15.855944,-0.956632,0.491501,1.0,101.0


# Save to Parquet

In [5]:
# data.to_csv('feature_engineeringed_labeled_augumented_cleaned_ar41_for_ulb.csv', index=True)

In [6]:
# Write the feature-engineered frame to a Parquet file; the path is relative,
# so it lands in the current working directory.
# NOTE(review): presumably uses the pyarrow engine imported above - confirm.
data.to_parquet('feature_engineeringed_labeled_augumented_cleaned_ar41_for_ulb.parquet') 

In [7]:
# df1, df2 = data 

# df1.to_parquet('enriched_data.parquet') 
# df2.to_parquet('feature_engineering.parquet')  