In [1]:
import pandas as pd
import numpy as np

"""
vehicle_id STRING 车辆唯一标志码
charge_start_time INT 充电开始时间
charge_end_time INT 充电结束时间
mileage FLOAT 充电开始时刻车辆仪表里程（km）
charge_start_soc INT 充电开始时刻动力电池 SOC
charge_end_soc INT 充电结束时刻动力电池 SOC
charge_start_U FLOAT 充电开始时刻动力电池总电压（V）
charge_end_U FLOAT 充电结束时刻动力电池总电压（V）
charge_start_I FLOAT 充电开始时刻动力电池总电流（A）
charge_end_I FLOAT 充电结束时刻动力电池总电流（A）
charge_max_temp FLOAT 充电过程中电池系统温度探针最大值（℃）
charge_min_temp FLOAT 充电过程中电池系统温度探针最小值（℃）
charge_energy FLOAT 此充电过程的充电能量（kWh）
"""

data = pd.read_csv('data/predict_data_e_train.csv')
data[data == 0] = np.nan
data = data.dropna(subset=['charge_energy'])
data.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp,charge_energy
0,1,20171101113839,20171101144639,69054.6,61.2,90.0,367.8,377.3,-7.7,-7.8,21.0,16.0,9.071
1,1,20171101212850,20171102045945,69180.2,34.0,100.0,363.4,400.6,-7.7,-7.7,19.0,15.0,21.745
2,1,20171102121246,20171102154916,69284.2,54.0,87.2,367.4,377.0,-7.7,-7.8,23.0,17.0,10.434
3,1,20171102204546,20171103025825,69377.0,46.0,100.0,366.6,399.0,-7.7,-7.8,21.0,16.0,17.99
4,1,20171103164054,20171103170824,69431.6,75.2,97.2,372.8,385.8,-29.9,-40.1,18.0,15.0,6.99


In [2]:
# 找出缺失值的列
data.isnull().sum()

vehicle_id             0
charge_start_time      0
charge_end_time        0
mileage                0
charge_start_soc      27
charge_end_soc       198
charge_start_U         0
charge_end_U         137
charge_start_I         0
charge_end_I         317
charge_max_temp        7
charge_min_temp      448
charge_energy          0
dtype: int64

In [3]:
# 计算平均值
charge_start_soc_mean = data.groupby(['vehicle_id']).charge_start_soc.mean()
charge_end_soc_mean = data.groupby(['vehicle_id']).charge_end_soc.mean()
charge_end_U_mean = data.groupby(['vehicle_id']).charge_end_U.mean()
charge_end_I_mean = data.groupby(['vehicle_id']).charge_end_I.mean()
charge_max_temp_mean = data.groupby(['vehicle_id']).charge_max_temp.mean()
charge_min_temp_mean = data.groupby(['vehicle_id']).charge_min_temp.mean()

# 设置索引
data.set_index(['vehicle_id'], inplace=True)

# 填充缺失值
data.charge_start_soc.fillna(charge_start_soc_mean, inplace=True)
data.charge_end_soc.fillna(charge_end_soc_mean, inplace=True)
data.charge_end_U.fillna(charge_end_U_mean, inplace=True)
data.charge_end_I.fillna(charge_end_I_mean, inplace=True)
data.charge_max_temp.fillna(charge_max_temp_mean, inplace=True)
data.charge_min_temp.fillna(charge_min_temp_mean, inplace=True)

# 重置索引
data.reset_index(inplace=True)

In [4]:
# 两列相减
data['time'] = data.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
data['soc'] = data.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
data['U'] = data.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
data['I'] = data.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
data['temp'] = data.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [5]:
selected_features = ['vehicle_id', 'time', 'mileage', 'soc', 'U', 'I', 'temp']

# 划分数据集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[selected_features], data['charge_energy'], random_state=0)

print(len(X_train), len(X_test))

2073 692


In [6]:
# 特征向量化
from sklearn.feature_extraction import DictVectorizer

dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='record'))
X_test = dict_vec.transform(X_test.to_dict(orient='record'))

In [7]:
# 网格搜索 自动调参
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

params = {'n_estimators':list(range(100,1001,100))}

grid = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=5)
grid.fit(X_train, y_train)

grid.grid_scores_, grid.best_params_, grid.best_score_



([mean: 0.98753, std: 0.00756, params: {'n_estimators': 100},
  mean: 0.98746, std: 0.00764, params: {'n_estimators': 200},
  mean: 0.98743, std: 0.00757, params: {'n_estimators': 300},
  mean: 0.98742, std: 0.00752, params: {'n_estimators': 400},
  mean: 0.98735, std: 0.00757, params: {'n_estimators': 500},
  mean: 0.98740, std: 0.00743, params: {'n_estimators': 600},
  mean: 0.98745, std: 0.00755, params: {'n_estimators': 700},
  mean: 0.98734, std: 0.00756, params: {'n_estimators': 800},
  mean: 0.98736, std: 0.00759, params: {'n_estimators': 900},
  mean: 0.98745, std: 0.00757, params: {'n_estimators': 1000}],
 {'n_estimators': 100},
 0.9875323374182303)

In [19]:
# 预测
model = RandomForestRegressor(n_estimators= 100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 评估
e = 0
for a, r in zip(y_test, y_pred):
    e += ((r- a) / a) ** 2

e ** 0.5

12.60062093473373

![title](0.png)

In [9]:
test = pd.read_csv('data/testA.csv')
test.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp
0,1,20171116161033,20171116170903,71710.6,44.0,90.8,370.5,383.9,-29.8,-40.1,21.0,13.0
1,1,20171210190533,20171210193743,75816.9,34.0,60.4,365.1,379.9,-25.9,-40.4,14.0,9.0
2,1,20180104213535,20180105042334,79867.3,39.2,100.0,364.0,399.9,-7.6,-7.8,11.0,2.0
3,1,20180215211935,20180216051456,86867.6,28.8,,362.9,399.9,-7.6,-7.6,8.0,3.0
4,2,20180525192134,20180526025112,67266.3,35.0,99.0,350.8,397.0,-9.4,-5.0,32.0,27.0


In [10]:
# 找出缺失值的列
test.isnull().any()

vehicle_id           False
charge_start_time    False
charge_end_time      False
mileage              False
charge_start_soc     False
charge_end_soc        True
charge_start_U       False
charge_end_U         False
charge_start_I       False
charge_end_I         False
charge_max_temp      False
charge_min_temp      False
dtype: bool

In [11]:
# 填充缺失值的列
test['charge_end_soc'].fillna(test['charge_end_soc'].mean(), inplace=True)

In [12]:
# 两列相减
test['time'] = test.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
test['soc'] = test.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
test['U'] = test.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
test['I'] = test.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
test['temp'] = test.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [13]:
x = dict_vec.transform(test[selected_features].to_dict(orient='record'))
y = model.predict(x)

In [14]:
ids = list(test['vehicle_id'])
len(ids)

27

In [15]:
with open('submit-A_张彬城_暨南大学_15521106350_20181011.csv', 'w') as f:
    f.write('vehicle_id,charge_energy\n')
    for _id, _y in zip(ids, y):
        line = '%d,%d\n' % (_id, _y)
        f.write(line)