In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns

In [21]:
# Load the prepared dataset into `df` and report its (rows, columns) size.
# NOTE(review): this is an absolute, machine-specific path — consider reading it
# from a configurable data directory so the notebook runs on other machines.
csv_path = '/Users/Sam/Downloads/Data-Preparation-07.csv'
df = pd.read_csv(csv_path)
print(df.shape)

  df = pd.read_csv('/Users/Sam/Downloads/Data-Preparation-07.csv')


(2949338, 225)


In [22]:
# Build the list of columns to discard one thematic group at a time; the
# resulting flat list keeps the same order as the groups below.
irrelevant_columns = []

# Climate Control & Cabin Features
irrelevant_columns += [
    'is_climate_on',
    'driver_temp_setting',
    'passenger_temp_setting',
    'seat_heater_left',
    'seat_heater_right',
    'steering_wheel_heater',
    'side_mirror_heaters',
    'fan_status',
    'auto_seat_climate_left',
    'auto_seat_climate_right',
]

# Safety Systems & Warnings (vehicle modes plus per-wheel TPMS readings)
irrelevant_columns += [
    'sentry_mode',
    'service_mode',
    'valet_mode',
    'tpms_hard_warning_fl',
    'tpms_hard_warning_fr',
    'tpms_hard_warning_rl',
    'tpms_hard_warning_rr',
    'tpms_last_seen_pressure_time_fl',
    'tpms_last_seen_pressure_time_fr',
    'tpms_last_seen_pressure_time_rl',
    'tpms_last_seen_pressure_time_rr',
    'tpms_pressure_fl',
    'tpms_pressure_fr',
    'tpms_pressure_rl',
    'tpms_pressure_rr',
    'tpms_rcp_front_value',
    'tpms_rcp_rear_value',
    'tpms_soft_warning_fl',
    'tpms_soft_warning_fr',
    'tpms_soft_warning_rl',
    'tpms_soft_warning_rr',
]

# Entertainment & Miscellaneous
irrelevant_columns += [
    'webcam_selfie_supported',
    'webcam_supported',
    'santa_mode',
    'notifications_supported',
]

# Charging Scheduling (Detailed)
irrelevant_columns += [
    'off_peak_charging_enabled',
    'off_peak_hours_end_time',
    'preconditioning_times',
]

# Appearance & Vehicle Metadata
irrelevant_columns += [
    'exterior_color',
    'wheel_type',
    'interior_trim_type',
    'roof_color',
    'vehicle_name',
    'car_special_type',
]

# Remove the listed columns; errors='ignore' skips any name that is not
# present in the frame instead of raising.
df = df.drop(labels=irrelevant_columns, axis='columns', errors='ignore')

In [23]:
# Display the frame's (rows, columns) dimensions after the column drop above.
df.shape

(2949338, 181)

In [24]:
# Preview the first rows of the frame to inspect the remaining columns.
df.head()

Unnamed: 0,battery_heater_on,battery_level,battery_range,charge_amps,charge_current_request,charge_current_request_max,charge_enable_request,charge_energy_added,charge_limit_soc,charge_limit_soc_max,...,Folder Name,Folder Month,tpms_last_seen_pressure_time_fl_utc,tpms_last_seen_pressure_time_fl_utc_date_only,tpms_last_seen_pressure_time_fr_utc,tpms_last_seen_pressure_time_fr_utc_date_only,tpms_last_seen_pressure_time_rl_utc,tpms_last_seen_pressure_time_rl_utc_date_only,tpms_last_seen_pressure_time_rr_utc,tpms_last_seen_pressure_time_rr_utc_date_only
0,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
1,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
2,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
3,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
4,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30


In [25]:
# Mapping of {group label: [column names]} for columns considered irrelevant
# to the recommendations. Groups are inserted in the order they appear below.
columns_to_remove = {}

# General Metadata & Diagnostic Data
columns_to_remove["Metadata & Diagnostics"] = [
    'valet_mode',
    'vehicle_name',
    'vehicle_self_test_progress',
    'vehicle_self_test_requested',
    'webcam_available',
]

# Detailed TPMS (Tire Pressure Monitoring System) Data
columns_to_remove["TPMS Details"] = [
    'tpms_hard_warning_fl',
    'tpms_hard_warning_fr',
    'tpms_hard_warning_rl',
    'tpms_hard_warning_rr',
    'tpms_last_seen_pressure_time_fl',
    'tpms_last_seen_pressure_time_fr',
    'tpms_last_seen_pressure_time_rl',
    'tpms_last_seen_pressure_time_rr',
    'tpms_pressure_fl',
    'tpms_pressure_fr',
    'tpms_pressure_rl',
    'tpms_pressure_rr',
    'tpms_rcp_front_value',
    'tpms_rcp_rear_value',
    'tpms_soft_warning_fl',
    'tpms_soft_warning_fr',
    'tpms_soft_warning_rl',
    'tpms_soft_warning_rr',
]

# Charging Metadata
columns_to_remove["Charging Metadata"] = [
    'off_peak_charging_enabled',
    'off_peak_hours_end_time',
    'preconditioning_times',
]

# Cabin Climate & Features
columns_to_remove["Cabin Climate & Features"] = [
    'driver_temp_setting',
    'passenger_temp_setting',
    'seat_heater_left',
    'seat_heater_right',
    'steering_wheel_heater',
    'side_mirror_heaters',
    'auto_seat_climate_left',
    'auto_seat_climate_right',
]

# Entertainment & Miscellaneous Features
columns_to_remove["Entertainment & Miscellaneous"] = [
    'webcam_selfie_supported',
    'webcam_supported',
    'santa_mode',
    'notifications_supported',
]

# Appearance & Customization Features
columns_to_remove["Appearance & Customization"] = [
    'exterior_color',
    'wheel_type',
    'interior_trim_type',
    'roof_color',
    'car_special_type',
]


In [26]:
# Drop additional columns from grouped categories.
# `columns_to_remove` maps group labels to lists of column names. Handing the
# dict itself to `drop(columns=...)` makes pandas use the dict's keys (the group
# labels) as the labels to drop; none of those are column names, so with
# errors='ignore' the call silently removes nothing. Flatten the grouped lists
# into one list of column names before dropping.
grouped_columns = [col for group in columns_to_remove.values() for col in group]

# errors='ignore' is kept because many of these names were already removed by
# the earlier drop and would otherwise raise a KeyError.
df = df.drop(columns=grouped_columns, errors='ignore')


In [27]:
# Display the frame's (rows, columns) dimensions again after the grouped-column drop.
df.shape

(2949338, 181)