In [None]:
import pandas as pd
import os
import re

# Purpose: read one monthly "processed_data_<month>_24.csv" export, keep only
# the columns listed in `columns_to_keep` (those that exist in the file),
# drop columns that are entirely empty, and write the result next to the input
# file as "filtered_data_<month>_24.csv".

# Set the input file path (modify this to your actual path)
input_file_path = "/Users/Sam/Downloads/processed_data_aug_24.csv"

# Extract the month token from the file name. The pattern only accepts
# lower-case letters for the month and the literal "_24" suffix; anything else
# is rejected before any data is read.
match = re.search(r'processed_data_([a-z]+)_24\.csv', os.path.basename(input_file_path))
if match:
    month = match.group(1)  # Extracted month
else:
    raise ValueError("Unable to parse the month from the file name. Please check the file name format!")

# Read the CSV file
df = pd.read_csv(input_file_path)

# Columns to retain
columns_to_keep = [
    "battery_heater_on", "battery_range", "charge_limit_soc", "charge_limit_soc_max",
    "charge_miles_added_ideal", "charge_miles_added_rated", "charge_port_cold_weather_mode",
    "charge_port_door_open", "charger_pilot_current", "charger_power", "charger_voltage",
    "fast_charger_present", "preconditioning_enabled", "scheduled_departure_time",
    "scheduled_departure_time_minutes", "timestamp", "auto_steering_wheel_heat",
    "battery_heater", "bioweapon_mode", "cabin_overheat_protection", "defrost_mode",
    "inside_temp", "outside_temp", "remote_heater_control_enabled", "active_route_energy_at_arrival",
    "active_route_miles_to_arrival", "active_route_minutes_to_arrival",
    "active_route_traffic_minutes_delay", "speed", "can_accept_navigation_requests",
    "cop_user_set_temp_supported", "driver_assist", "api_version", "autopark_state_v2",
    "dashcam_clip_save_available", "dashcam_state", "df", "dr", "fd_window",
    "fp_window", "ft", "is_user_present", "locked", "pf",
    "pr", "rd_window", "remote_start", "remote_start_enabled", "rp_window", "rt",
    "sentry_mode_available", "speed_limit_mode", "vehicle_self_test_progress",
    "vehicle_self_test_requested", "active_route_latitude", "latitude",
    "active_route_longitude", "longitude", "max_range_charge_counter", "power", "odometer", "shift_state", "model_name"
]

# Keep only the specified columns, in the order given by `columns_to_keep`.
# Columns missing from the file are skipped silently instead of raising.
df_filtered = df[[col for col in columns_to_keep if col in df.columns]]

# Drop columns that are completely empty
df_filtered = df_filtered.dropna(axis=1, how='all')

# Build the output path in the SAME directory as the input file. The directory
# used to be hard-coded, so changing `input_file_path` (as the comment above
# invites) would still write to the original author's Downloads folder.
output_file_path = os.path.join(
    os.path.dirname(input_file_path), f"filtered_data_{month}_24.csv"
)

# Save the filtered DataFrame to a CSV file (without the pandas index column)
df_filtered.to_csv(output_file_path, index=False)

# Print success message
print(f"Filtered data has been successfully saved to: {output_file_path}")


In [5]:
# NOTE: this entire cell is commented out and does not run. It is a disabled
# experiment that would collapse the filtered September file into one row per
# 7 input rows, using the mode for the columns in `mode_columns` and the mean
# for the columns in `mean_columns`.
# NOTE(review): if re-enabled, `aggregated_df[df.columns]` selects every column
# of the input frame, but the aggregation only produces the mode/mean columns;
# any input column outside those two lists would presumably raise a KeyError.
# Confirm before reviving.
# import pandas as pd

# # Read the file
# df = pd.read_csv('/Users/Sam/Downloads/filtered_data_sep_24.csv')

# # Define the columns to aggregate by mode and by mean
# mode_columns = [
#     'battery_heater_on', 'charge_port_cold_weather_mode', 'charge_port_door_open',
#     'fast_charger_present', 'preconditioning_enabled', 'auto_steering_wheel_heat',
#     'battery_heater', 'bioweapon_mode', 'cabin_overheat_protection', 'defrost_mode',
#     'remote_heater_control_enabled', 'can_accept_navigation_requests', 'cop_user_set_temp_supported',
#     'driver_assist', 'api_version', 'autopark_state_v2', 'dashcam_clip_save_available',
#     'dashcam_state', 'df', 'dr', 'fd_window', 'fp_window', 'ft', 'is_user_present',
#     'locked', 'pf', 'pr', 'rd_window', 'remote_start', 'remote_start_enabled',
#     'rp_window', 'rt', 'sentry_mode_available', 'speed_limit_mode', 
#     'vehicle_self_test_progress', 'vehicle_self_test_requested'
# ]

# mean_columns = [
#     'battery_range', 'charge_limit_soc', 'charge_limit_soc_max', 'charge_miles_added_ideal',
#     'charge_miles_added_rated', 'charger_pilot_current', 'charger_power', 'charger_voltage',
#     'scheduled_departure_time', 'scheduled_departure_time_minutes', 'timestamp',
#     'inside_temp', 'outside_temp', 'active_route_energy_at_arrival',
#     'active_route_miles_to_arrival', 'active_route_minutes_to_arrival',
#     'active_route_traffic_minutes_delay', 'speed', 'active_route_latitude',
#     'latitude', 'active_route_longitude', 'longitude'
# ]

# # Keep only the mode/mean columns that actually exist in the data
# available_mode_columns = [col for col in mode_columns if col in df.columns]
# available_mean_columns = [col for col in mean_columns if col in df.columns]

# # Sort by timestamp
# df_sorted = df.sort_values(['timestamp'])

# # Define the aggregation function: mode for the mode columns, mean for the
# # mean columns, applied to one group of rows at a time
# def aggregate_group(group):
#     if group['timestamp'].is_monotonic_increasing:
#         return pd.Series({
#             **{col: group[col].mode().iloc[0] if not group[col].mode().empty else None for col in available_mode_columns},
#             **{col: group[col].mean() for col in available_mean_columns}
#         })
#     else:
#         # If the timestamps are not increasing, return an all-None (NaN) row
#         return pd.Series({col: None for col in available_mode_columns + available_mean_columns})

# # Aggregate every 7 rows
# # NOTE(review): the group key is built from the index labels of the sorted
# # frame, i.e. the original row labels, not the sorted positions -- verify that
# # this is the intended grouping.
# aggregated_df = df_sorted.groupby(df_sorted.index // 7).apply(aggregate_group).reset_index(drop=True)

# # Ensure the column order matches the original file
# aggregated_df = aggregated_df[df.columns]

# # Save the result to a new file
# aggregated_df.to_csv('/Users/Sam/Downloads/aggregated_filtered_data_sep_24.csv', index=False)


In [18]:
import pandas as pd

# Purpose: down-sample the filtered May file to at most one record per vehicle
# per minute, tagging each row with a synthetic 'vehicle_id' and a readable
# 'time' column, then write the result to a new CSV.

# Read the data
df = pd.read_csv('/Users/Sam/Downloads/filtered_data_may_24.csv')

# Parse the epoch-millisecond 'timestamp' column once and truncate it to whole
# seconds. Every later step (the 'time' string, the vehicle boundary check and
# the minute bucket) works at this second resolution.
time_parsed = pd.to_datetime(df['timestamp'], unit='ms').dt.floor('s')

# Human-readable time in 'YYYY-MM-DD HH:MM:SS' format
df['time'] = time_parsed.dt.strftime('%Y-%m-%d %H:%M:%S')

# Create a 'vehicle_id' column; increment the ID when the timestamp is earlier
# than the previous row's. Assumes the file is a concatenation of per-vehicle,
# time-ordered logs -- TODO confirm against the upstream export.
# This is a vectorised shift/compare/cumsum rather than a Python loop that
# re-parsed two strings per row. The first row compares against NaT (False),
# so numbering starts at 1; it also works on an empty frame.
went_backwards = time_parsed < time_parsed.shift()
df['vehicle_id'] = went_backwards.cumsum() + 1

# Extract minute-level timestamps (used only as a grouping key)
df['minute'] = time_parsed.dt.strftime('%Y-%m-%d %H:%M')

# Keep the first record per ('vehicle_id', 'minute').
# GroupBy.first() is deliberately NOT used: it returns the first non-null value
# of each column independently, so one output row could combine values from
# several different records. drop_duplicates keeps whole rows intact.
# - dropna: rows without a parsable timestamp have no minute bucket; a groupby
#   on that key would have discarded them, so they are discarded here too.
# - stable sort: ties within the same second keep their file order, making the
#   "first" record deterministic.
# - sorting by vehicle then time gives the same row order a sorted groupby
#   would produce.
aggregated_df = (
    df.dropna(subset=['minute'])
      .sort_values(['vehicle_id', 'time'], kind='stable')
      .drop_duplicates(subset=['vehicle_id', 'minute'], keep='first')
      .reset_index(drop=True)
)

# Drop the auxiliary 'minute' column
aggregated_df = aggregated_df.drop(columns=['minute'])

# Reorder columns to place 'time' right after 'vehicle_id'
columns_order = ['vehicle_id', 'time'] + [col for col in aggregated_df.columns if col not in ['vehicle_id', 'time']]
aggregated_df = aggregated_df[columns_order]

# Save the result to a new file
aggregated_df.to_csv('/Users/Sam/Downloads/aggregated_filtered_data_may_24.csv', index=False)

# Print the first few rows of the result
print(aggregated_df.head())


  df = pd.read_csv('/Users/Sam/Downloads/filtered_data_may_24.csv')


   vehicle_id                 time  battery_heater_on  battery_range  \
0           1  2024-05-19 06:26:38              False         259.27   
1           1  2024-05-19 06:29:45              False         259.27   
2           1  2024-05-19 06:30:02              False         259.27   
3           1  2024-05-19 06:31:13              False         259.27   
4           1  2024-05-19 06:32:06              False         259.27   

   charge_limit_soc  charge_limit_soc_max  charge_miles_added_ideal  \
0               100                   100                     106.0   
1               100                   100                     106.0   
2               100                   100                     106.0   
3               100                   100                     106.0   
4               100                   100                     106.0   

   charge_miles_added_rated charge_port_cold_weather_mode  \
0                     106.0                         False   
1                 

In [7]:
# NOTE: this entire cell is commented out and does not run. It is a disabled
# variant of the per-minute down-sampling that groups by `model_name` instead
# of a derived vehicle id, reading the January file.
# NOTE(review): if re-enabled, GroupBy.first() takes the first non-null value
# per column, so a result row is not necessarily one original record -- confirm
# that is acceptable.
# import pandas as pd

# # Read the data
# df = pd.read_csv('/Users/Sam/Downloads/filtered_data_jan_24.csv')

# # Convert 'timestamp' to datetime (unit: milliseconds) and keep the 'YYYY-MM-DD HH:MM:SS' format
# df['time'] = pd.to_datetime(df['timestamp'], unit='ms').dt.strftime('%Y-%m-%d %H:%M:%S')

# # Extract minute-level timestamps
# df['minute'] = pd.to_datetime(df['time']).dt.strftime('%Y-%m-%d %H:%M')

# # Group by `model_name` and `minute`, keeping the first record per minute
# aggregated_df = df.sort_values('time').groupby(['model_name', 'minute'], as_index=False).first()

# # Drop the auxiliary 'minute' column
# aggregated_df = aggregated_df.drop(columns=['minute'])

# # Reorder columns to place 'time' right after `model_name`
# columns_order = ['model_name', 'time'] + [col for col in aggregated_df.columns if col not in ['model_name', 'time']]
# aggregated_df = aggregated_df[columns_order]

# # Save the result to a new file
# aggregated_df.to_csv('/Users/Sam/Downloads/aggregated_filtered_data_jan_24_final.csv', index=False)

# # Print the first few rows of the result
# print(aggregated_df.head())
