In [2]:
import pandas as pd
import numpy as np

"""
vehicle_id STRING 车辆唯一标志码
charge_start_time INT 充电开始时间
charge_end_time INT 充电结束时间
mileage FLOAT 充电开始时刻车辆仪表里程（km）
charge_start_soc INT 充电开始时刻动力电池 SOC
charge_end_soc INT 充电结束时刻动力电池 SOC
charge_start_U FLOAT 充电开始时刻动力电池总电压（V）
charge_end_U FLOAT 充电结束时刻动力电池总电压（V）
charge_start_I FLOAT 充电开始时刻动力电池总电流（A）
charge_end_I FLOAT 充电结束时刻动力电池总电流（A）
charge_max_temp FLOAT 充电过程中电池系统温度探针最大值（℃）
charge_min_temp FLOAT 充电过程中电池系统温度探针最小值（℃）
charge_energy FLOAT 此充电过程的充电能量（kWh）
"""

data = pd.read_csv('决赛数据/energy_train1029.csv')
data[data == 0] = np.nan
data = data.dropna(subset=['charge_energy'])
data.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp,charge_energy,Unnamed: 13
0,1,20171103203711,20171104035411,82965.6,34.0,100.0,363.8,398.2,-7.7,-7.7,18.0,10.0,21.11,
1,1,20171104152512,20171104163402,83082.5,43.2,99.2,369.3,389.8,-26.8,-40.4,23.0,14.0,17.705,
2,1,20171104202445,20171105015835,83184.8,50.4,100.0,367.4,397.5,-7.6,-7.8,18.0,12.0,16.166,
3,1,20171105135203,20171105151323,83339.0,23.2,88.8,361.0,382.0,-23.2,-40.5,27.0,18.0,20.595,
4,1,20171105204456,20171106045655,83459.4,27.2,100.0,359.0,399.9,-7.7,-7.6,22.0,13.0,23.765,


In [3]:
# 找出缺失值的列
data.isnull().sum()

vehicle_id              0
charge_start_time       0
charge_end_time         0
mileage                 0
charge_start_soc       58
charge_end_soc        477
charge_start_U          0
charge_end_U          316
charge_start_I          0
charge_end_I          473
charge_max_temp        12
charge_min_temp       838
charge_energy           0
Unnamed: 13          5168
dtype: int64

In [4]:
# 计算平均值
charge_start_soc_mean = data.groupby(['vehicle_id']).charge_start_soc.mean()
charge_end_soc_mean = data.groupby(['vehicle_id']).charge_end_soc.mean()
charge_end_U_mean = data.groupby(['vehicle_id']).charge_end_U.mean()
charge_end_I_mean = data.groupby(['vehicle_id']).charge_end_I.mean()
charge_max_temp_mean = data.groupby(['vehicle_id']).charge_max_temp.mean()
charge_min_temp_mean = data.groupby(['vehicle_id']).charge_min_temp.mean()

# 设置索引
data.set_index(['vehicle_id'], inplace=True)

# 填充缺失值
data.charge_start_soc.fillna(charge_start_soc_mean, inplace=True)
data.charge_end_soc.fillna(charge_end_soc_mean, inplace=True)
data.charge_end_U.fillna(charge_end_U_mean, inplace=True)
data.charge_end_I.fillna(charge_end_I_mean, inplace=True)
data.charge_max_temp.fillna(charge_max_temp_mean, inplace=True)
data.charge_min_temp.fillna(charge_min_temp_mean, inplace=True)

# 重置索引
data.reset_index(inplace=True)

In [5]:
# 两列相减
data['time'] = data.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
data['soc'] = data.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
data['U'] = data.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
data['I'] = data.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
data['temp'] = data.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [6]:
selected_features = ['vehicle_id', 'time', 'mileage', 'soc', 'U', 'I', 'temp']

# 划分数据集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[selected_features], data['charge_energy'], random_state=0)

print(len(X_train), len(X_test))

3876 1292


In [7]:
# 特征向量化
from sklearn.feature_extraction import DictVectorizer

dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='record'))
X_test = dict_vec.transform(X_test.to_dict(orient='record'))

In [8]:
# 网格搜索 自动调参
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

params = {'n_estimators':list(range(100,1001,100))}

grid = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=5)
grid.fit(X_train, y_train)

grid.grid_scores_, grid.best_params_, grid.best_score_

  from numpy.core.umath_tests import inner1d


([mean: 0.92146, std: 0.07688, params: {'n_estimators': 100},
  mean: 0.91671, std: 0.07754, params: {'n_estimators': 200},
  mean: 0.92348, std: 0.07650, params: {'n_estimators': 300},
  mean: 0.92476, std: 0.07600, params: {'n_estimators': 400},
  mean: 0.92611, std: 0.07546, params: {'n_estimators': 500},
  mean: 0.92166, std: 0.07701, params: {'n_estimators': 600},
  mean: 0.92407, std: 0.07601, params: {'n_estimators': 700},
  mean: 0.92319, std: 0.07645, params: {'n_estimators': 800},
  mean: 0.92383, std: 0.07691, params: {'n_estimators': 900},
  mean: 0.92299, std: 0.07623, params: {'n_estimators': 1000}],
 {'n_estimators': 500},
 0.9261062691246597)

In [9]:
# 预测
model = RandomForestRegressor(n_estimators= 100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 评估
e = 0
for a, r in zip(y_test, y_pred):
    e += ((r- a) / a) ** 2

e ** 0.5

59.79328828044302

![title](0.png)

In [10]:
test = pd.read_csv('决赛数据/energy_test1029.csv')
test.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp
0,1,20171101133105,20171101142515,82418.1,44.4,88.4,370.9,382.0,-39.7,-40.7,25,18
1,1,20171101203400,20171102043250,82540.5,29.2,100.0,360.5,399.6,-7.7,-7.8,22,15
2,1,20171102141440,20171102152900,82669.1,39.2,99.2,368.8,390.0,-29.8,-40.4,28,19
3,1,20171102203417,20171103022936,82779.2,48.0,100.0,367.4,396.7,-7.7,-7.8,24,15
4,1,20171103171340,20171103174250,82883.3,51.2,74.4,368.9,381.4,-31.6,-40.3,19,14


In [11]:
# 找出缺失值的列
test.isnull().any()

vehicle_id           False
charge_start_time    False
charge_end_time      False
mileage              False
charge_start_soc     False
charge_end_soc        True
charge_start_U       False
charge_end_U         False
charge_start_I       False
charge_end_I         False
charge_max_temp      False
charge_min_temp      False
dtype: bool

In [12]:
# 填充缺失值的列
test['charge_end_soc'].fillna(test['charge_end_soc'].mean(), inplace=True)

In [13]:
# 两列相减
test['time'] = test.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
test['soc'] = test.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
test['U'] = test.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
test['I'] = test.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
test['temp'] = test.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [14]:
x = dict_vec.transform(test[selected_features].to_dict(orient='record'))
y = model.predict(x)

In [15]:
ids = list(test['vehicle_id'])
len(ids)
y

array([ 13.23725,  23.01508,  19.08019,  16.84651,   7.54113,  13.10819,
        11.43081,  21.36827,  23.85305,  20.16095,  21.65209,  11.39809,
        10.27959,  20.16909,  20.01976,  16.49225,  17.26412,  21.88967,
         8.13798,  17.62306,  21.07629,  20.06152,  21.00521,  19.44465,
        24.85693,  20.24069,  18.44069,  23.2727 ,  22.25424,  20.79716,
        12.93894,  16.52132,  19.94527,  22.09112,  15.15311,   7.29963,
        24.06384,  19.32535,   5.89041,  20.12829,  19.65657,  19.55936,
        15.87869,  17.92019,  20.12377,  19.9949 ,  20.59495,  18.80078,
        20.09635,  19.91891,  20.46094,   8.54243,  20.11688,  20.34509,
        20.70745,  18.95638,  12.45312,  20.52026,   4.11064,  16.71903,
        93.55716,   5.80671,  31.97594,  17.7691 ,  23.88085,   3.74518,
        24.83261,   8.3827 ,  10.46495,  21.74508,  18.19967,  12.39853,
        10.12443,  28.85729,  14.04189,  31.85465,  23.13443,  26.30811,
        23.08835,  23.79413,  27.96762,  27.1884 , 

In [16]:
with open(' energy-submit_张彬城_暨南大学_15521106350_20181108.csv', 'w') as f:
    f.write('vehicle_id,charge_energy\n')
    for _id, _y in zip(ids, y):
        line = '%d,%f\n' % (_id, _y)
        f.write(line)