In [1]:
import pandas as pd
import numpy as np

"""
vehicle_id STRING 车辆唯一标志码
charge_start_time INT 充电开始时间
charge_end_time INT 充电结束时间
mileage FLOAT 充电开始时刻车辆仪表里程（km）
charge_start_soc INT 充电开始时刻动力电池 SOC
charge_end_soc INT 充电结束时刻动力电池 SOC
charge_start_U FLOAT 充电开始时刻动力电池总电压（V）
charge_end_U FLOAT 充电结束时刻动力电池总电压（V）
charge_start_I FLOAT 充电开始时刻动力电池总电流（A）
charge_end_I FLOAT 充电结束时刻动力电池总电流（A）
charge_max_temp FLOAT 充电过程中电池系统温度探针最大值（℃）
charge_min_temp FLOAT 充电过程中电池系统温度探针最小值（℃）
charge_energy FLOAT 此充电过程的充电能量（kWh）
"""

data = pd.read_csv('data/predict_data_e_train.csv')
data[data == 0] = np.nan
data = data.dropna(subset=['charge_energy'])
data.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp,charge_energy
0,1,20171101113839,20171101144639,69054.6,61.2,90.0,367.8,377.3,-7.7,-7.8,21.0,16.0,9.071
1,1,20171101212850,20171102045945,69180.2,34.0,100.0,363.4,400.6,-7.7,-7.7,19.0,15.0,21.745
2,1,20171102121246,20171102154916,69284.2,54.0,87.2,367.4,377.0,-7.7,-7.8,23.0,17.0,10.434
3,1,20171102204546,20171103025825,69377.0,46.0,100.0,366.6,399.0,-7.7,-7.8,21.0,16.0,17.99
4,1,20171103164054,20171103170824,69431.6,75.2,97.2,372.8,385.8,-29.9,-40.1,18.0,15.0,6.99


In [2]:
# 找出缺失值的列
data.isnull().sum()

vehicle_id             0
charge_start_time      0
charge_end_time        0
mileage                0
charge_start_soc      27
charge_end_soc       198
charge_start_U         0
charge_end_U         137
charge_start_I         0
charge_end_I         317
charge_max_temp        7
charge_min_temp      448
charge_energy          0
dtype: int64

In [3]:
# 计算平均值
charge_start_soc_mean = data.groupby(['vehicle_id']).charge_start_soc.mean()
charge_end_soc_mean = data.groupby(['vehicle_id']).charge_end_soc.mean()
charge_end_U_mean = data.groupby(['vehicle_id']).charge_end_U.mean()
charge_end_I_mean = data.groupby(['vehicle_id']).charge_end_I.mean()
charge_max_temp_mean = data.groupby(['vehicle_id']).charge_max_temp.mean()
charge_min_temp_mean = data.groupby(['vehicle_id']).charge_min_temp.mean()

# 设置索引
data.set_index(['vehicle_id'], inplace=True)

# 填充缺失值
data.charge_start_soc.fillna(charge_start_soc_mean, inplace=True)
data.charge_end_soc.fillna(charge_end_soc_mean, inplace=True)
data.charge_end_U.fillna(charge_end_U_mean, inplace=True)
data.charge_end_I.fillna(charge_end_I_mean, inplace=True)
data.charge_max_temp.fillna(charge_max_temp_mean, inplace=True)
data.charge_min_temp.fillna(charge_min_temp_mean, inplace=True)

# 重置索引
data.reset_index(inplace=True)

In [4]:
# 两列相减
data['time'] = data.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
data['soc'] = data.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
data['U'] = data.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
data['I'] = data.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
data['temp'] = data.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [13]:
data.isnull().any()


vehicle_id           False
charge_start_time    False
charge_end_time      False
mileage              False
charge_start_soc     False
charge_end_soc       False
charge_start_U       False
charge_end_U         False
charge_start_I       False
charge_end_I         False
charge_max_temp      False
charge_min_temp      False
charge_energy        False
time                 False
soc                  False
U                    False
I                    False
temp                 False
dtype: bool

In [15]:
#print((len(data)))

      vehicle_id  charge_start_time  charge_end_time   mileage  \
0              1     20171101113839   20171101144639   69054.6   
1              1     20171101212850   20171102045945   69180.2   
2              1     20171102121246   20171102154916   69284.2   
3              1     20171102204546   20171103025825   69377.0   
4              1     20171103164054   20171103170824   69431.6   
5              1     20171103213445   20171104035405   69546.0   
6              1     20171104140007   20171104143907   69612.0   
7              1     20171104210147   20171105042347   69759.2   
8              1     20171105211022   20171106044343   69914.8   
9              1     20171106100653   20171106125343   69959.5   
10             1     20171106213204   20171107020133   70049.0   
11             1     20171107152248   20171107155308   70109.9   
12             1     20171107214835   20171108052214   70251.3   
13             1     20171108212640   20171109044350   70399.4   
14        

In [10]:
selected_features = ['vehicle_id', 'time', 'mileage', 'soc', 'U', 'I', 'temp']

# 划分数据集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[selected_features], data['charge_energy'], random_state=0)

print(X_train)
#print(len(X_train), len(X_test))


      vehicle_id      time   mileage        soc          U           I  \
220            1  841510.0   94486.3  29.600000  40.100000    0.000000   
1436           4    5280.0   93961.0  29.772214   8.600000    5.234894   
17             1  831099.0   70808.9  64.400000  33.700000    0.000000   
58             1  838409.0   74593.1  69.200000  37.300000    0.100000   
2100           4    6400.0  118263.2  36.000000  19.100000  -18.000000   
1512           4    6130.0   97421.7  29.000000  25.100000  164.000000   
2125           4    2670.0  119365.0  39.000000   9.100000   58.000000   
411            3   14690.0  138713.0  18.000000  32.600000   68.600000   
2361           5   15770.0   75847.0  72.000000  49.300000   12.000000   
1991           4    9100.0  114159.7  54.000000  12.200000   99.000000   
2407           5    2660.0   81792.0  35.000000   5.400000  119.844011   
1199           4  772260.0   83788.5  67.000000  29.300000   53.000000   
1793           4      30.0  106537.4  