In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns

In [42]:
df = pd.read_csv('/Users/Sam/Downloads/Data-Preparation-07.csv')
print(df.shape)

  df = pd.read_csv('/Users/Sam/Downloads/Data-Preparation-07.csv')


(2949338, 225)


In [43]:
irrelevant_columns = [
    # Climate Control & Cabin Features
    'is_climate_on', 'driver_temp_setting', 'passenger_temp_setting',
    'seat_heater_left', 'seat_heater_right', 'steering_wheel_heater',
    'side_mirror_heaters', 'fan_status', 'auto_seat_climate_left', 'auto_seat_climate_right',
    
    # Safety Systems & Warnings
    'sentry_mode', 'service_mode', 'valet_mode',
    'tpms_hard_warning_fl', 'tpms_hard_warning_fr', 'tpms_hard_warning_rl', 'tpms_hard_warning_rr',
    'tpms_last_seen_pressure_time_fl', 'tpms_last_seen_pressure_time_fr',
    'tpms_last_seen_pressure_time_rl', 'tpms_last_seen_pressure_time_rr',
    'tpms_pressure_fl', 'tpms_pressure_fr', 'tpms_pressure_rl', 'tpms_pressure_rr',
    'tpms_rcp_front_value', 'tpms_rcp_rear_value',
    'tpms_soft_warning_fl', 'tpms_soft_warning_fr', 'tpms_soft_warning_rl', 'tpms_soft_warning_rr',
    
    # Entertainment & Miscellaneous
    'webcam_selfie_supported', 'webcam_supported', 'santa_mode', 'notifications_supported',
    
    # Charging Scheduling (Detailed)
    'off_peak_charging_enabled', 'off_peak_hours_end_time', 'preconditioning_times',
    'charge_current_request', 'charge_current_request_max',  # Unnecessary charging diagnostics

    # Appearance & Vehicle Metadata
    'exterior_color', 'wheel_type', 'interior_trim_type',
    'roof_color', 'vehicle_name', 'car_special_type',

    # Diagnostic & Metadata
    'vehicle_self_test_progress',  # Unrelated to current features
    'vehicle_self_test_requested',  # Duplicate from diagnostics
    'dashcam_statesentry_mode',  # Specific to safety but not prioritized

    # Redundant and Granular Features
    'battery_energy_remaining',  # Not specified in battery recommendations
    'active_route_speed_limit_mph',  # Too detailed for route guidance

    # Other redundant or rarely used columns
    'valet_pin_needed',  # Unnecessary metadata
    'speed_limit_mode',  # Not related to current route guidance
    'steering_angle',  # Not specified for any recommendations
    'time_to_full_charge',  # Rarely referenced and redundant
    'service_interval_remaining',  # Unrelated to recommendations
    'powertrain_state'  # Too granular for high-level recommendations
]


In [44]:
# Drop the irrelevant columns
cleaned_data = df.drop(columns=irrelevant_columns, errors='ignore')

In [45]:
cleaned_data.shape

(2949338, 174)

In [46]:
cleaned_data.head()

Unnamed: 0,battery_heater_on,battery_level,battery_range,charge_amps,charge_enable_request,charge_energy_added,charge_limit_soc,charge_limit_soc_max,charge_limit_soc_min,charge_limit_soc_std,...,Folder Name,Folder Month,tpms_last_seen_pressure_time_fl_utc,tpms_last_seen_pressure_time_fl_utc_date_only,tpms_last_seen_pressure_time_fr_utc,tpms_last_seen_pressure_time_fr_utc_date_only,tpms_last_seen_pressure_time_rl_utc,tpms_last_seen_pressure_time_rl_utc_date_only,tpms_last_seen_pressure_time_rr_utc,tpms_last_seen_pressure_time_rr_utc_date_only
0,False,91.0,241.54,32.0,True,40.72,100,100,50,80,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
1,False,91.0,241.54,32.0,True,40.72,100,100,50,80,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
2,False,91.0,241.54,32.0,True,40.72,100,100,50,80,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
3,False,91.0,241.54,32.0,True,40.72,100,100,50,80,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
4,False,91.0,241.54,32.0,True,40.72,100,100,50,80,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30


In [47]:
df.head()

Unnamed: 0,battery_heater_on,battery_level,battery_range,charge_amps,charge_current_request,charge_current_request_max,charge_enable_request,charge_energy_added,charge_limit_soc,charge_limit_soc_max,...,Folder Name,Folder Month,tpms_last_seen_pressure_time_fl_utc,tpms_last_seen_pressure_time_fl_utc_date_only,tpms_last_seen_pressure_time_fr_utc,tpms_last_seen_pressure_time_fr_utc_date_only,tpms_last_seen_pressure_time_rl_utc,tpms_last_seen_pressure_time_rl_utc_date_only,tpms_last_seen_pressure_time_rr_utc,tpms_last_seen_pressure_time_rr_utc_date_only
0,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
1,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
2,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
3,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30
4,False,91.0,241.54,32.0,32.0,32.0,True,40.72,100,100,...,IN/CA/2024/07/None,7,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:07+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30,2024-06-30 08:26:06+00:00,2024-06-30


In [48]:
#1. Route and Driving Guidance
route_and_driving_guidance_columns = [
    'scheduled_departure_time',
    'active_route_minutes_to_arrival',
    'active_route_traffic_minutes_delay',
    'can_accept_navigation_requests',
    'gps_latitude',
    'gps_longitude',
    'navigation_state',
    'route_distance_remaining',
    'cruise_control_speed'
]


In [49]:
#2. Weather Recommendations
weather_recommendations_columns = [
    'ambient_temperature',
    'humidity',
    'rain_sensor',
    'wind_speed',
    'external_temperature'
]

In [50]:
#3. Battery
battery_columns = [
    'battery_heater_on',
    'battery_range',
    'charge_miles_added_ideal',
    'charge_port_cold_weather_mode',
    'charge_port_door_open',
    'fast_charger_present',
    'max_range_charge_counter',
    'state_of_charge',
    'energy_consumption_rate',
    'charging_power',
    'regen_energy_recovered',
    'vehicle_thermally_protected'
]


In [51]:
#4. Event Recommendations
event_recommendations_columns = [
    'odometer',
    'location_history',
    'drive_mode',
    'navigation_suggestions'
]


In [52]:
#5. Cabin Environment
cabin_environment_columns = [
    'allow_cabin_overheat_protection',
    'bioweapon_mode',
    'defrost_mode',
    'side_mirror_heaters',
    'vehicle_self_test_requested',
    'cabin_temperature',
    'hvac_state',
    'seat_belt_status',
    'air_quality_index'
]


In [53]:
#6. Idle Time Detection
idle_time_detection_columns = [
    'shift_state',
    'speed',
    'power',
    'timestamp',
    'odometer',
    'engine_state',
    'fuel_consumption_rate',
    'duration_in_current_state',
    'gps_speed'
]
